# 记一次探索ID3算法的过程 (Notes from an exploration of the ID3 algorithm)


import math
from collections import Counter
from itertools import product

import pandas as pd
#import openpyxl

# Scratch notes kept from the exploration:
#   math.log(x, base) takes the base as its second argument, e.g.
#   math.log(4, 2); math.log(10) without a base is the natural logarithm;
#   math.log2 is the direct base-2 form.
#
# An earlier attempt went through openpyxl instead of pandas:
#   wb = openpyxl.load_workbook('配眼镜.xlsx')
#   ws_data = wb['Sheet1']
#   ws_data.delete_rows(1)   # delete the first row

# header=None: do not treat any row as the header, so the title row is read
# as ordinary data (row label 0) and the columns are addressed as 0, 1, 2, ...
df = pd.read_excel('配眼镜.xlsx', header=None)
# Handy lookups while exploring:
#   df.loc[0][0]            -> a single cell
#   df[0].values.tolist()   -> one column as a plain list

# The first row holds the column titles.
namespace = df.loc[0].tolist()
# Remove the title row. Without inplace=True, drop() would return a new
# frame and leave df untouched.
df.drop([0], inplace=True)
# Total number of samples (data rows left after removing the title row).
counts = len(df)
# Number of columns. The name `rows` is kept because the functions below read
# it. Taken from the frame's shape rather than from len(df.loc[1]), so a sheet
# that contains only the title row no longer fails with a KeyError here.
rows = df.shape[1]
# Dropping a column instead of a row would be:
#   dz = df.drop([0], axis=1)   # pass inplace=True to modify df itself
def get_data():
    """Wrap every column of the module-level ``df`` in a bookkeeping dict.

    Reads the module globals ``rows`` (number of columns), ``namespace``
    (column titles) and ``df`` (the data without its title row).

    Returns:
        A list with one dict per column, in column order. Every dict has
        'name', 'values', 'count' and 'keys'. All columns except the last
        one additionally get empty 'procount' and 'lastcount' lists.
    """
    final_column = rows - 1
    records = []
    for column in range(rows):
        record = {
            'name': namespace[column],
            'values': df[column].values.tolist(),
            'count': [],
            'keys': [],
        }
        # Only the columns before the last one carry the per-class tallies.
        if column != final_column:
            record['procount'] = []
            record['lastcount'] = []
        records.append(record)
    return records
def get_info_entropy(data):
    """Compute the entropy, in bits, of the last column of *data*.

    As a side effect the last entry's 'keys' list is filled with the distinct
    values of its 'values' list (first-seen order) and its 'count' list with
    the matching frequencies. Both lists are overwritten in place, so calling
    the function again gives the same result instead of appending duplicates.

    The sample total is taken from the column itself, so the function does
    not depend on any module-level variable.

    Args:
        data: list of column dicts as produced by ``get_data``; only the last
            entry is read and updated.

    Returns:
        A tuple ``(result_count, Information_entropy)``: the number of
        distinct values and the entropy. An empty column yields ``(0, 0)``.
    """
    target = data[-1]
    # Counter keeps first-seen order, matching the order of a manual scan.
    tally = Counter(target['values'])
    target['keys'][:] = list(tally)
    target['count'][:] = list(tally.values())

    total = len(target['values'])
    Information_entropy = 0
    for occurrences in target['count']:
        probability = occurrences / total
        Information_entropy += probability * math.log2(1 / probability)
    return len(target['keys']), Information_entropy
def get_cond_info(data):
    """Collect, for every column but the last, the tallies used for conditional entropy.

    For each such column dict this fills, in place:
      * 'keys'     - its distinct values in first-seen order,
      * 'count'    - how often each of those values occurs,
      * 'procount' - one list per value holding, for every entry of the last
                     column's 'keys', how many samples have that value
                     together with that key.

    The last column's 'keys' must already be populated (``get_info_entropy``
    does that). Everything is derived from *data* itself, so the function no
    longer needs ``rows``, ``counts`` or ``result_count`` to exist at module
    level, and repeated calls overwrite rather than extend the lists.

    Args:
        data: list of column dicts as produced by ``get_data``.
    """
    class_keys = data[-1]['keys']
    labels = data[-1]['values']
    for feature in data[:-1]:
        column = feature['values']
        # One pass each for the marginal and the joint (value, class) counts.
        marginal = Counter(column)
        joint = Counter(zip(column, labels))
        feature['keys'][:] = list(marginal)
        feature['count'][:] = list(marginal.values())
        # A Counter returns 0 for combinations that never occur.
        feature['procount'][:] = [
            [joint[(value, label)] for label in class_keys]
            for value in feature['keys']
        ]
def get_cond_entropy(data, out=None):
    """Append the conditional entropy of the last column given each other column.

    Uses the 'count' and 'procount' tallies that ``get_cond_info`` stored on
    every column dict. The logarithm is base 2 so that the result is in the
    same unit (bits) as ``get_info_entropy``; subtracting a base-10 value from
    a base-2 entropy would give a meaningless information gain.

    Args:
        data: list of column dicts; the last entry is the class column.
        out: list that receives one value per non-class column, in column
            order. When omitted, the module-level ``Conditional_entropy``
            list is used, which is what the script entry point relies on.
    """
    if out is None:
        out = Conditional_entropy
    total = len(data[-1]['values'])
    for feature in data[:-1]:
        entropy = 0
        for occurrences, class_counts in zip(feature['count'], feature['procount']):
            weight = occurrences / total          # P(feature == value)
            for hits in class_counts:
                share = hits / occurrences        # P(class | feature == value)
                # A zero share contributes nothing and has no defined log.
                if share != 0:
                    entropy += weight * share * math.log2(1 / share)
        out.append(entropy)
def get_mutu_info(cond_info, info_entropy=None):
    """Compute the information gain per column and rank the columns by it.

    The previous hand-written bubble sort swapped only the index list while
    always comparing the unswapped gains, which produced a wrong order for
    inputs such as gains ``[0.5, 0.875, 0.75]``. A stable sort is used
    instead, so ties keep their original column order.

    Args:
        cond_info: conditional entropies, one per column, in column order.
        info_entropy: entropy of the class column. When omitted, the
            module-level ``Information_entropy`` is used, which is what the
            script entry point relies on.

    Returns:
        A tuple ``(list_index, Mutual_information)``: the column indices
        ordered from highest to lowest gain, and the gains themselves in the
        original column order.
    """
    if info_entropy is None:
        info_entropy = Information_entropy
    Mutual_information = [info_entropy - item for item in cond_info]
    list_index = sorted(
        range(len(Mutual_information)),
        key=Mutual_information.__getitem__,
        reverse=True,
    )
    return list_index, Mutual_information

def _count_branch_outcomes(data, path):
    """Tally class outcomes for every combination of values along *path*.

    Args:
        data: list of column dicts whose 'keys' lists are already filled.
        path: column indices, outermost split first.

    Returns:
        One list per combination of the columns' 'keys' (first column of
        *path* varying slowest, as in nested for-loops). Each list holds,
        for every key of the last (class) column, the number of samples that
        match the whole combination and that class.
    """
    class_keys = data[-1]['keys']
    columns = [data[col]['values'] for col in path]
    # Single pass over the samples; unseen combinations count as 0.
    tally = Counter(zip(*columns, data[-1]['values']))
    return [
        [tally[combo + (label,)] for label in class_keys]
        for combo in product(*(data[col]['keys'] for col in path))
    ]


if __name__ == '__main__':
    # These names stay at module level: the helper functions above fall back
    # to Conditional_entropy / Information_entropy when not given explicitly.
    data = get_data()
    result_count, Information_entropy = get_info_entropy(data)
    get_cond_info(data)
    Conditional_entropy = []
    get_cond_entropy(data)
    list_index, Mutual_information = get_mutu_info(Conditional_entropy)
    # Debug output (disabled):
    # print('信息熵:' + str(Information_entropy))
    # print('条件熵:' + '\n' + str(Conditional_entropy))
    # print('信息增益:' + '\n' + str(Mutual_information))
    # print('位次索引:' + '\n' + str(list_index))

    # Tally the child nodes of the decision tree. The split order follows the
    # ranking in list_index instead of the hard-coded columns 3, 2, 1, 0, so
    # any number of feature columns works; for a ranking of [3, 2, 1, 0] the
    # result is identical to the previous hard-coded loops.
    for depth in range(2, len(list_index) + 1):
        path = list_index[:depth]
        data[path[-1]]['lastcount'].extend(_count_branch_outcomes(data, path))

    for position, i in enumerate(list_index):
        feature = data[i]
        print(feature['name'])
        print(feature['keys'])
        print(data[-1]['keys'])
        # The top-ranked column shows its plain value counts; every later
        # column shows the per-branch class tallies.
        print(feature['count'] if position == 0 else feature['lastcount'])
        print('\n')