{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "390f5c1c-7797-4071-ae9c-9725007c4fb7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from math import log2\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "b714cb3e-1f56-4230-972c-fa281aa6037a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CSV file read!\n"
     ]
    }
   ],
   "source": [
    "# Load the examples used to build the tree; the path is relative, so it is\n",
    "# resolved against the kernel's current working directory.\n",
    "df = pd.read_csv(\"data/play_tennis.csv\")\n",
    "print(\"CSV file read!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "9247ff29-db71-4902-a231-a4fd281a56cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def entropy(target_col):\n",
    "    elements, counts = np.unique(target_col, return_counts=True)\n",
    "    entropy_value = 0\n",
    "    for i in range(len(elements)):\n",
    "        p = counts[i] / np.sum(counts)\n",
    "        entropy_value += -p * log2(p)\n",
    "    return entropy_value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "87f1a152-e728-451b-85df-22ca7873a0a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def info_gain(data, split_attribute, target_name=\"PlayTennis\"):\n",
    "    \"\"\"Information gain obtained by partitioning ``data`` on ``split_attribute``.\n",
    "\n",
    "    The result is the entropy of the ``target_name`` column over all rows minus\n",
    "    the size-weighted sum of that column's entropy within each group of rows\n",
    "    sharing one value of ``split_attribute``.\n",
    "    \"\"\"\n",
    "    base_entropy = entropy(data[target_name])\n",
    "\n",
    "    attr_values, attr_counts = np.unique(data[split_attribute], return_counts=True)\n",
    "    n_rows = np.sum(attr_counts)\n",
    "\n",
    "    remainder = 0\n",
    "    for attr_value, attr_count in zip(attr_values, attr_counts):\n",
    "        matching_rows = data[data[split_attribute] == attr_value]\n",
    "        remainder += (attr_count / n_rows) * entropy(matching_rows[target_name])\n",
    "\n",
    "    return base_entropy - remainder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "8ca2b9ea-e301-43f2-aa45-de428f9c3af4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def id3(data, original_data, features, target=\"PlayTennis\", parent_node_class=None):\n",
    "    \n",
    "    # If all target values same → return that class\n",
    "    if len(np.unique(data[target])) <= 1:\n",
    "        return np.unique(data[target])[0]\n",
    "    \n",
    "    # If dataset empty → return most common class\n",
    "    elif len(data) == 0:\n",
    "        return np.unique(original_data[target])[np.argmax(\n",
    "            np.unique(original_data[target], return_counts=True)[1])]\n",
    "    \n",
    "    # If no features left → return parent node class\n",
    "    elif len(features) == 0:\n",
    "        return parent_node_class\n",
    "    \n",
    "    else:\n",
    "        parent_node_class = np.unique(data[target])[np.argmax(\n",
    "            np.unique(data[target], return_counts=True)[1])]\n",
    "        \n",
    "        # Select best feature\n",
    "        item_values = [info_gain(data, feature, target) for feature in features]\n",
    "        best_feature_index = np.argmax(item_values)\n",
    "        best_feature = features[best_feature_index]\n",
    "        \n",
    "        tree = {best_feature: {}}\n",
    "        \n",
    "        features = [i for i in features if i != best_feature]\n",
    "        \n",
    "        for value in np.unique(data[best_feature]):\n",
    "            subset = data[data[best_feature] == value]\n",
    "            subtree = id3(subset, data, features, target, parent_node_class)\n",
    "            tree[best_feature][value] = subtree\n",
    "        \n",
    "        return tree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "8e303fa7-96d3-43cb-a43f-0d7fdd41dece",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"Outlook\": {\n",
      "    \"Overcast\": \"Yes\",\n",
      "    \"Rain\": {\n",
      "      \"Wind\": {\n",
      "        \"Strong\": \"No\",\n",
      "        \"Weak\": \"Yes\"\n",
      "      }\n",
      "    },\n",
      "    \"Sunny\": \"No\"\n",
      "  }\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "# Every column except the last is treated as a candidate split feature.\n",
    "# NOTE(review): this presumes the label column is the last one; id3 is called\n",
    "# with its default target name \"PlayTennis\" -- confirm against the CSV header.\n",
    "features = df.columns[:-1].tolist()\n",
    "# The full frame is passed both as the node data and as the fallback data.\n",
    "tree = id3(df, df, features)\n",
    "\n",
    "# Pretty-print the nested-dict tree as indented JSON.\n",
    "print(json.dumps(tree, indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9656d059-3955-42b7-bbcc-45f9937ee472",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}