import pandas as pd
#import openpyxl
import math
# Ways to compute a logarithm: the second argument is the base, or call log2 directly
#print(math.log(4,2))
#print(math.log(10))# with a single argument this is the natural logarithm
# Manipulating the worksheet with openpyxl
#wb = openpyxl.load_workbook('配眼镜.xlsx')
#ws_data = wb['Sheet1']
# Delete the first row
#ws_data.delete_rows(1)
# header = None: no row is used as the header, so the sheet's first row is read as data row 0
df = pd.read_excel('配眼镜.xlsx',header = None)
# Read the value at a given index
#print(df.loc[0][0])
# Output a given column as a list
#print(df[0].values.tolist())
# Keep row 0 as a plain list; get_data() uses these entries as the column names
namespace = df.loc[0].tolist()
# Drop the given row; without inplace=True the original frame would be left untouched
df.drop([0],inplace = True)
# Total number of records (rows remaining after row 0 was dropped)
counts = len(df)
# Length of the row labelled 1, i.e. the number of columns (despite the name "rows")
rows = len(df.loc[1])
# Dropping a given column
#dz = df.drop([0],axis = 1)# pass inplace=True to modify the original frame instead
#print(dz)
def get_data(frame=None, names=None):
    """Turn every column of the sheet into a record dict.

    Args:
        frame: DataFrame holding one sample per row; the last column is the
            class label. Defaults to the module-level ``df``.
        names: Sequence of column names, one per column of ``frame``.
            Defaults to the module-level ``namespace``.

    Returns:
        A list with one dict per column, in column order. Every dict has
        ``'name'``, ``'values'`` (the column as a list), and empty ``'count'``
        and ``'keys'`` lists. Every dict except the last one (the label
        column) additionally has empty ``'procount'`` and ``'lastcount'``
        lists.
    """
    if frame is None:
        frame = df
    if names is None:
        names = namespace

    last = len(frame.columns) - 1
    data = []
    for position, column in enumerate(frame.columns):
        record = {
            'name': names[position],
            'values': frame[column].values.tolist(),
            'count': [],
            'keys': [],
        }
        # Only feature columns carry the per-label statistics; the label
        # column (last one) does not.
        if position != last:
            record['procount'] = []
            record['lastcount'] = []
        data.append(record)
    return data
def get_info_entropy(data):
    """Compute the base-2 entropy of the label column (the last record).

    Fills ``data[-1]['keys']`` with the distinct label values in first-seen
    order and ``data[-1]['count']`` with how often each one occurs. Both
    lists are overwritten in place, so calling the function again does not
    accumulate duplicates.

    Args:
        data: List of record dicts as produced by ``get_data``; only the
            last record is read and modified.

    Returns:
        A tuple ``(result_count, Information_entropy)``: the number of
        distinct labels and the entropy in bits. An empty label column
        yields ``(0, 0)``.
    """
    label = data[-1]
    values = label['values']

    # A dict keeps insertion order, which gives the first-seen key order.
    tally = {}
    for value in values:
        tally[value] = tally.get(value, 0) + 1
    label['keys'][:] = list(tally)
    label['count'][:] = list(tally.values())

    # Use the column's own length rather than a module global so the result
    # always matches the data that was actually passed in.
    total = len(values)
    Information_entropy = 0
    for occurrences in label['count']:
        probability = occurrences / total
        Information_entropy += probability * math.log2(1 / probability)
    return len(label['keys']), Information_entropy
def get_cond_info(data):
    """Collect the statistics needed for the conditional entropies.

    For every feature record (all records except the last) this fills, in
    place:

    * ``'keys'``     - distinct feature values in first-seen order,
    * ``'count'``    - number of samples having each of those values,
    * ``'procount'`` - for each feature value, a list with one entry per
      label key giving the number of samples with that feature value and
      that label.

    The label keys are read from ``data[-1]['keys']``, so they must already
    be populated (``get_info_entropy`` does that). The lists are overwritten
    rather than appended to, so a repeated call does not duplicate entries.

    Args:
        data: List of record dicts as produced by ``get_data``.
    """
    label_keys = data[-1]['keys']
    label_values = data[-1]['values']

    for feature in data[:-1]:
        values = feature['values']

        # A dict keeps insertion order, which gives the first-seen key order.
        tally = {}
        for value in values:
            tally[value] = tally.get(value, 0) + 1
        feature['keys'][:] = list(tally)
        feature['count'][:] = list(tally.values())

        # Joint counts: rows are feature values, columns are label keys.
        feature['procount'][:] = [
            [
                sum(
                    1
                    for value, label in zip(values, label_values)
                    if value == wanted and label == label_key
                )
                for label_key in label_keys
            ]
            for wanted in tally
        ]
def get_cond_entropy(data, out=None):
    """Compute the conditional entropy H(label | feature) for every feature.

    The logarithm is taken to base 2 so the result is in the same unit
    (bits) as the entropy returned by ``get_info_entropy``; the two values
    are subtracted from each other to obtain the information gain, which is
    only meaningful when both use the same base.

    Args:
        data: List of record dicts whose feature records already carry
            ``'count'`` and ``'procount'`` (see ``get_cond_info``).
        out: List to append the results to, one value per feature in
            feature order. Defaults to the module-level
            ``Conditional_entropy`` list.

    Returns:
        The list the results were appended to.
    """
    if out is None:
        out = Conditional_entropy

    total = len(data[-1]['values'])
    for feature in data[:-1]:
        entropy = 0
        for occurrences, per_label in zip(feature['count'], feature['procount']):
            weight = occurrences / total  # P(feature == value)
            for joint in per_label:
                # A zero joint count contributes nothing (0 * log(1/0) -> 0).
                if joint != 0:
                    conditional = joint / occurrences  # P(label | value)
                    entropy += weight * conditional * math.log2(1 / conditional)
        out.append(entropy)
    return out
def get_mutu_info(cond_info, info_entropy=None):
    """Compute each feature's information gain and rank the features by it.

    Args:
        cond_info: Conditional entropies, one per feature in feature order.
        info_entropy: Entropy of the label column. Defaults to the
            module-level ``Information_entropy``.

    Returns:
        A tuple ``(list_index, Mutual_information)``.
        ``Mutual_information[i]`` is ``info_entropy - cond_info[i]`` and
        stays in feature order; ``list_index`` holds the feature indices
        sorted by descending gain, ties keeping their original order.
    """
    if info_entropy is None:
        info_entropy = Information_entropy

    Mutual_information = [info_entropy - item for item in cond_info]
    # Rank indices by their gain. The gains themselves are left in feature
    # order so that Mutual_information[i] still belongs to feature i.
    # sorted() is stable even with reverse=True, so equal gains keep their
    # original relative order.
    list_index = sorted(
        range(len(Mutual_information)),
        key=Mutual_information.__getitem__,
        reverse=True,
    )
    return list_index, Mutual_information
def _count_labels_on_path(data, path, combo):
    """Count, per label key, the samples whose features in *path* equal *combo*."""
    labels = data[-1]['values']
    columns = [data[feature]['values'] for feature in path]
    per_label = []
    for label_key in data[-1]['keys']:
        matched = 0
        for j, label in enumerate(labels):
            if label == label_key and all(
                column[j] == wanted for column, wanted in zip(columns, combo)
            ):
                matched += 1
        per_label.append(matched)
    return per_label


if __name__ == '__main__':
    import itertools

    # These names are assigned at module level on purpose: the functions
    # above may read result_count, Information_entropy and
    # Conditional_entropy as globals.
    data = get_data()
    result_count, Information_entropy = get_info_entropy(data)
    get_cond_info(data)
    Conditional_entropy = []
    get_cond_entropy(data)
    list_index, Mutual_information = get_mutu_info(Conditional_entropy)
    # Debug output, disabled:
    # print('信息熵:' + str(Information_entropy))
    # print('条件熵:' + '\n' + str(Conditional_entropy))
    # print('信息增益:' + '\n' + str(Mutual_information))
    # print('位次索引:' + '\n' + str(list_index))

    # Count the child nodes of the decision tree. Features are taken in
    # ranked order (list_index); the feature at depth d receives one label
    # distribution per combination of values of the features at depths 0..d,
    # with the highest-ranked feature varying slowest. This works for any
    # number of features instead of assuming the ranking is 3, 2, 1, 0.
    for depth in range(1, len(list_index)):
        path = list_index[:depth + 1]
        key_lists = [data[feature]['keys'] for feature in path]
        target = data[list_index[depth]]
        for combo in itertools.product(*key_lists):
            target['lastcount'].append(_count_labels_on_path(data, path, combo))

    # The root feature reports its plain value counts; every other feature
    # reports the per-path label distributions collected above.
    for i in list_index:
        print(data[i]['name'])
        print(data[i]['keys'])
        print(data[-1]['keys'])
        if i == list_index[0]:
            print(data[i]['count'])
        else:
            print(data[i]['lastcount'])
        print('\n')
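# Source: blog post "记一次探索ID3算法的过程" (Notes on exploring the ID3 algorithm)
# Page residue from the blog: "发表评论" (Leave a comment)
# Page residue from the blog: 670 views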