# ID3 decision-tree construction demo on the diabetes dataset.

import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
import math

# Load the dataset (fixed: the original used curly quotes, a paste
# artifact that is a Python syntax error).
df = pd.read_csv('diabetes.csv')

# Display the first few rows to sanity-check the load.
print(df.head())

def calculate_entropy(data, target_column):
    """Return the Shannon entropy (base 2) of *target_column* in *data*.

    H = -sum_v p(v) * log2(p(v)) over the distinct values v of the column.
    A single-valued column yields 0.0 (a "pure" node).
    """
    n = len(data)
    counts = data[target_column].value_counts()
    # value_counts only reports values that occur, so every count is >= 1
    # and no proportion is ever zero.
    return -sum((c / n) * math.log2(c / n) for c in counts)

# Entropy of the target column over the full dataset (fixed: the
# original curly quotes were a syntax error).
entropy_outcome = calculate_entropy(df, 'Outcome')
print(f"Entropy of the dataset: {entropy_outcome}")

def calculate_information_gain(data, feature, target_column):
    """Return the information gain of splitting *data* on *feature*.

    Gain = H(target | data) - sum_v (|subset_v| / |data|) * H(target | subset_v),
    where the subsets partition *data* by the distinct values of *feature*.

    Fixes vs. the original:
    - the subtraction used an en-dash (a paste artifact), which is a
      Python syntax error; replaced with a real minus sign;
    - the parent entropy was the module-global ``entropy_outcome``
      (entropy of the FULL dataset), which is wrong for the subsets the
      recursive ``id3`` builder passes in; it is now computed from
      *data* itself. Behavior on the full dataset is unchanged.
    """
    total_rows = len(data)
    weighted_entropy = 0.0

    for value in data[feature].unique():
        subset = data[data[feature] == value]
        proportion = len(subset) / total_rows
        weighted_entropy += proportion * calculate_entropy(subset, target_column)

    return calculate_entropy(data, target_column) - weighted_entropy

# Report entropy and information gain for each feature (the last column
# is the target). Fixed: curly quotes in the original were syntax errors.
for column in df.columns[:-1]:
    entropy = calculate_entropy(df, column)
    information_gain = calculate_information_gain(df, column, 'Outcome')
    print(f"{column} - Entropy: {entropy:.3f}, Information Gain: {information_gain:.3f}")

# Feature chosen for the single-split illustration (fixed: curly quotes
# in the original were syntax errors).
selected_feature = 'DiabetesPedigreeFunction'

# Fit a depth-1 decision tree (a "stump") on the single selected feature,
# using entropy so it matches the hand-rolled calculations above.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=1)
X = df[[selected_feature]]
y = df['Outcome']
clf.fit(X, y)

# Visualize the fitted stump.
plt.figure(figsize=(8, 6))
plot_tree(clf, feature_names=[selected_feature], class_names=['0', '1'], filled=True, rounded=True)
plt.show()

def id3(data, target_column, features):
    """Recursively build an ID3 decision tree as nested dicts.

    A leaf is a class label; an internal node is ``{feature: {value: subtree}}``
    with one branch per observed value of the chosen feature.
    """
    labels = data[target_column]

    # Pure node: every row carries the same label -> leaf.
    if labels.nunique() == 1:
        return labels.iloc[0]

    # Features exhausted: predict the majority label -> leaf.
    if not features:
        return labels.mode().iloc[0]

    # Greedily split on the feature with the highest information gain.
    best = max(features, key=lambda f: calculate_information_gain(data, f, target_column))
    remaining = [f for f in features if f != best]

    return {
        best: {
            value: id3(data[data[best] == value], target_column, remaining)
            for value in data[best].unique()
        }
    }

# (removed stray webpage text that was pasted in with the code)