"""Lab report: topic-based text sentiment analysis.

Report header (translated from the original document):

    Course:      Data Mining Course Project (数据挖掘课设)
    Experiment:  Topic-based text sentiment analysis (基于主题的文本情感分析)
    Location:    Xingyuan Building (行远楼)
    Class:       Software 1533 (软件1533)
    Student ID:  2015005677
    Student:     Gao Congjiang (高聪江)
    Instructor:  Wang Li (王莉)
    Date:        2017-01-01

Report sections: project code (this script), followed by project screenshots.

The script reads the training CSV, builds a sentiment lexicon from two of its
columns, extracts TextRank keywords and lexicon hits from each row's text, and
prints the share of rows whose extracted keywords equal the labelled themes.
"""

import pandas as pd

TRAIN_CSV_PATH = ('/home/jiangshen/Downloads/'
                  '数据挖掘_大数据/数据挖掘程序题/基于主题的文本情感分析/train.csv')

# Positional columns read from each CSV row.
# NOTE(review): the meanings below are inferred from how the script uses each
# column — confirm against the actual train.csv header.
CONTENT_COL = 1          # text passed to jieba
THEME_COL = 2            # ';'-separated labels compared with the keywords
SENTIMENT_WORD_COL = 3   # ';'-separated lexicon keys
SENTIMENT_VALUE_COL = 4  # ';'-separated lexicon values, aligned with the keys


def split_terms(cell):
    """Split a ';'-separated cell into pieces and drop the last piece.

    The cell is converted with ``str`` first, so a missing value (NaN) becomes
    ``'nan'`` and yields an empty list.
    """
    # presumably every cell ends with ';', making the last piece empty — verify
    return str(cell).split(';')[:-1]


def build_sentiment_lexicon(rows):
    """Return a dict mapping each sentiment word to its labelled value.

    Args:
        rows: iterable of indexable rows holding the word and value columns.

    Returns:
        dict of word -> value (both strings). When a word occurs more than
        once, the value seen last wins.
    """
    lexicon = {}
    for row in rows:
        words = split_terms(row[SENTIMENT_WORD_COL])
        values = split_terms(row[SENTIMENT_VALUE_COL])
        # zip() truncates to the shorter list, so unpaired entries are ignored.
        lexicon.update(zip(words, values))
    return lexicon


def select_keywords(weighted_terms, low=0.6, high=1):
    """Keep the terms whose weight lies strictly between *low* and *high*.

    Args:
        weighted_terms: iterable of ``(term, weight)`` pairs.
        low: exclusive lower weight bound.
        high: exclusive upper weight bound.

    Returns:
        The selected terms in input order, or ``['']`` when none qualify.
    """
    # NOTE(review): jieba's TextRank appears to normalise the best weight to
    # 1.0, so the strict upper bound likely drops the top-ranked term —
    # confirm whether that is intended.
    picked = [term for term, weight in weighted_terms if low < weight < high]
    return picked or ['']


def extract_keywords(text, top_k=5):
    """Return the TextRank keywords (nouns and verbs) selected for *text*."""
    # Imported lazily so the pure helpers stay importable without jieba.
    from jieba import analyse

    ranked = analyse.textrank(text, topK=top_k, allowPOS=('n', 'v'),
                              withFlag=False, withWeight=True)
    return select_keywords(ranked)


def find_sentiment_words(tokens, lexicon):
    """Return the distinct lexicon words found in *tokens* and their values.

    Args:
        tokens: iterable of segmented words.
        lexicon: dict of sentiment word -> value.

    Returns:
        ``(words, values)``: two parallel lists in order of first occurrence.
    """
    found = {}
    for token in tokens:
        if token in lexicon and token not in found:
            found[token] = lexicon[token]
    return list(found), list(found.values())


def parse_themes(cell):
    """Split a ';'-separated theme cell, dropping the ``'NULL'`` placeholder."""
    # NOTE(review): unlike split_terms(), a trailing empty piece is kept here,
    # so 'x;' parses to ['x', ''] and cannot equal the keyword list ['x'] —
    # confirm whether that is intended.
    return [theme for theme in str(cell).split(';') if theme != 'NULL']


def accuracy(expected, predicted):
    """Return the fraction of positions where *expected* equals *predicted*.

    Returns 0.0 for empty input instead of dividing by zero.
    """
    if not expected:
        return 0.0
    hits = sum(1 for want, got in zip(expected, predicted) if want == got)
    return hits / len(expected)


def main(csv_path=TRAIN_CSV_PATH):
    """Run the whole analysis on *csv_path* and print every intermediate result."""
    import jieba

    # Materialise the rows once; DataFrame.values is not free to recompute.
    rows = pd.read_csv(csv_path).values

    lexicon = build_sentiment_lexicon(rows)
    print(lexicon)

    keywords = [extract_keywords(row[CONTENT_COL]) for row in rows]
    print('各个行的关键词:------------》')
    print(keywords)

    sentiment_words = []
    sentiment_values = []
    for row in rows:
        words, values = find_sentiment_words(jieba.lcut(row[CONTENT_COL]),
                                             lexicon)
        sentiment_words.append(words)
        sentiment_values.append(values)
    print('各个行的感情词:----------->')
    print(sentiment_words)
    print('各个行的属性:——————————>')
    print(sentiment_values)

    themes = [parse_themes(row[THEME_COL]) for row in rows]
    print(themes)

    print('总的正确率是:', accuracy(themes, keywords))


if __name__ == "__main__":
    main()