Contents
- I. Chinese Word Segmentation
- II. Data Cleaning
- III. Feature Extraction and TF-IDF Computation
  - 1. Basic Concepts
  - 2. Code Implementation
  - 3. MemoryError: Out-of-Memory Errors
- IV. Sentiment Classification with Logistic Regression
- V. Algorithm Performance Evaluation
- VI. Algorithm Comparison Experiments
  - 1. RandomForest
  - 2. SVM
  - 3. Naive Bayes
  - 4. KNN
  - 5. Decision Tree
  - 6. SGD
  - 7. MLP
  - 8. GradientBoosting
  - 9. AdaBoost
- VII. Summary
I. Chinese Word Segmentation
A segmenter must decide how to split a sentence into tokens. Take 我是程序员 ("I am a programmer") as input; three possible segmentations are:
Input: 我是程序员
Output 1 (single characters): 我 / 是 / 程 / 序 / 员
Output 2 (bigrams): 我是 / 是程 / 程序 / 序员
Output 3 (word segmentation): 我 / 是 / 程序员
#encoding=utf-8
import jieba

text = "北京理工大学生前来应聘"

data = jieba.cut(text, cut_all=True)   # full mode
print("[Full mode]: ", " ".join(data))

data = jieba.cut(text, cut_all=False)  # precise mode
print("[Precise mode]: ", " ".join(data))

data = jieba.cut(text)                 # precise mode is the default
print("[Default mode]: ", " ".join(data))

data = jieba.cut_for_search(text)      # search-engine mode
print("[Search-engine mode]: ", " ".join(data))
II. Data Cleaning
# -*- coding:utf-8 -*-
import csv
import pandas as pd
import numpy as np
import jieba
import jieba.analyse

# Load a custom user dictionary and the stop-word list
jieba.load_userdict("user_dict.txt")
stop_list = pd.read_csv('stop_words.txt',
                        engine='python',
                        encoding='utf-8',
                        delimiter="\n",
                        names=['t'])['t'].tolist()

# Chinese word segmentation with stop-word filtering
def txt_cut(juzi):
    return [w for w in jieba.lcut(juzi) if w not in stop_list]

# Write the segmentation results; utf-8 so that the later scripts,
# which open this file with encoding='UTF-8', can read it back
fw = open('fenci_data.csv', "a+", newline='', encoding='utf-8')
writer = csv.writer(fw)
writer.writerow(['content', 'label'])

# Read the raw reviews with csv.DictReader
labels = []
contents = []
file = "data.csv"
with open(file, "r", encoding="UTF-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        # Map the label: 0 = positive review (好评), 1 = negative review
        if row['label'] == '好评':
            res = 0
        else:
            res = 1
        labels.append(res)
        content = row['content']
        seglist = txt_cut(content)
        output = ' '.join(seglist)  # join the tokens with spaces
        contents.append(output)
        # Write one cleaned row: segmented text plus label
        writer.writerow([output, res])
print(labels[:5])
print(contents[:5])
fw.close()
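As a quick sanity check (a sketch, not part of the original post), the cleaned file can be reloaded with pandas to confirm that both columns were written as expected:

# Reload the cleaned corpus and inspect the first rows
df = pd.read_csv('fenci_data.csv', encoding='utf-8')
print(df.shape)
print(df.head())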
III. Feature Extraction and TF-IDF Computation
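1. Basic Concepts
TF-IDF (term frequency-inverse document frequency) weights each word by how often it occurs in a document, discounted by how common the word is across the whole corpus. For a term t in document d, with N documents in total of which df(t) contain t:

    tf-idf(t, d) = tf(t, d) × log(N / df(t))

A word that is frequent in one review but rare elsewhere therefore gets a high weight, which is exactly what makes it a discriminative feature for classification. (scikit-learn's TfidfTransformer computes a smoothed variant of this formula and L2-normalizes each document vector.)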
2. Code Implementation
# -*- coding:utf-8 -*-
import csv
import pandas as pd
import numpy as np
import jieba
import jieba.analyse
from scipy.sparse import coo_matrix
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

#---------------------------- Step 1: read the file ----------------------------
with open('fenci_data.csv', 'r', encoding='UTF-8') as f:
    reader = csv.DictReader(f)
    labels = []
    contents = []
    for row in reader:
        labels.append(row['label'])  # 0 = positive, 1 = negative
        contents.append(row['content'])
print(labels[:5])
print(contents[:5])

#---------------------------- Step 2: preprocessing ----------------------------
# Convert the texts into a term-frequency matrix: element a[i][j]
# is the frequency of word j in document i
vectorizer = CountVectorizer()
# Compute the tf-idf weight of every word
transformer = TfidfTransformer()
# The inner fit_transform builds the term-frequency matrix;
# the outer fit_transform computes tf-idf from it
tfidf = transformer.fit_transform(vectorizer.fit_transform(contents))
for n in tfidf[:5]:
    print(n)
print(type(tfidf))

# All words in the bag-of-words vocabulary
# (use get_feature_names_out() on scikit-learn >= 1.2)
word = vectorizer.get_feature_names()
for n in word[:10]:
    print(n)
print("Number of words:", len(word))

# Extract the tf-idf matrix: element w[i][j] is the tf-idf weight
# of word j in document i
#X = tfidf.toarray()
X = coo_matrix(tfidf, dtype=np.float32).toarray()  # note the float32 dtype
print(X.shape)
print(X[:10])
<class 'scipy.sparse.csr.csr_matrix'>
aaaaa
achievements
amazing
ananananan
ancient
anshun
aperture
app
Number of words: 20254
(6074, 20254)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
3. MemoryError: Out-of-Memory Errors
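Converting the tf-idf matrix to a dense array with toarray() materializes a 6074 × 20254 float matrix here; on a larger corpus, that call is where MemoryError typically appears. Two mitigations are already visible in the code: CountVectorizer(min_df=5) (used from Section IV onward) shrinks the vocabulary by discarding words that occur in fewer than five documents, and dtype=np.float32 halves the size of the dense array. A third option, sketched below assuming the tfidf and labels objects from the script above, is to skip the dense conversion entirely, since scikit-learn's splitter and linear models accept sparse input:

from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

# Keep the tf-idf matrix sparse instead of calling toarray();
# train_test_split, LogisticRegression, and LinearSVC all accept
# scipy sparse matrices directly
X_sparse = csr_matrix(tfidf, dtype=np.float32)
X_train, X_test, y_train, y_test = train_test_split(
    X_sparse, labels, test_size=0.3, random_state=1)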
IV. Sentiment Classification with Logistic Regression
# -*- coding:utf-8 -*-
import csv
import pandas as pd
import numpy as np
import jieba
import jieba.analyse
from scipy.sparse import coo_matrix
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn import neighbors
from sklearn.naive_bayes import MultinomialNB

#---------------------------- Step 1: read the file ----------------------------
with open('fenci_data.csv', 'r', encoding='UTF-8') as f:
    reader = csv.DictReader(f)
    labels = []
    contents = []
    for row in reader:
        labels.append(row['label'])  # 0 = positive, 1 = negative
        contents.append(row['content'])
print(labels[:5])
print(contents[:5])

#---------------------------- Step 2: preprocessing ----------------------------
# Convert the texts into a term-frequency matrix; min_df=5 drops words
# that appear in fewer than five documents (see the MemoryError section)
vectorizer = CountVectorizer(min_df=5)
# Compute the tf-idf weight of every word
transformer = TfidfTransformer()
# The inner fit_transform builds the term-frequency matrix;
# the outer fit_transform computes tf-idf from it
tfidf = transformer.fit_transform(vectorizer.fit_transform(contents))
for n in tfidf[:5]:
    print(n)
print(type(tfidf))

# All words in the bag-of-words vocabulary
word = vectorizer.get_feature_names()
for n in word[:10]:
    print(n)
print("Number of words:", len(word))

# Extract the tf-idf matrix: element w[i][j] is the tf-idf weight
# of word j in document i
#X = tfidf.toarray()
X = coo_matrix(tfidf, dtype=np.float32).toarray()  # note the float32 dtype
print(X.shape)
print(X[:10])

#---------------------------- Step 3: train/test split ----------------------------
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    labels,
                                                    test_size=0.3,
                                                    random_state=1)

#---------------------------- Step 4: classification ----------------------------
# Logistic regression model
LR = LogisticRegression(solver='liblinear')
LR.fit(X_train, y_train)
print('Model accuracy: {}'.format(LR.score(X_test, y_test)))
pre = LR.predict(X_test)
print("Logistic regression classification")
print(len(pre), len(y_test))
print(classification_report(y_test, pre))
print("\n")
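To score new text with the trained model, the fitted vectorizer and transformer must be reused with transform (not fit_transform), so the new document is projected into the same vocabulary. A minimal sketch, using a hypothetical already-segmented review:

# A hypothetical review, already segmented with tokens joined by spaces
new_doc = ["手机 不错 电池 耐用 拍照 清晰"]
new_tfidf = transformer.transform(vectorizer.transform(new_doc))
new_X = coo_matrix(new_tfidf, dtype=np.float32).toarray()
print(LR.predict(new_X))  # '0' = positive, '1' = negative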
V. Algorithm Performance Evaluation
# -*- coding:utf-8 -*-
import csv
import pandas as pd
import numpy as np
import jieba
import jieba.analyse
from scipy.sparse import coo_matrix
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn import neighbors
from sklearn.naive_bayes import MultinomialNB

#---------------------------- Step 1: read the file ----------------------------
with open('fenci_data.csv', 'r', encoding='UTF-8') as f:
    reader = csv.DictReader(f)
    labels = []
    contents = []
    for row in reader:
        labels.append(row['label'])  # 0 = positive, 1 = negative
        contents.append(row['content'])
print(labels[:5])
print(contents[:5])

#---------------------------- Step 2: preprocessing ----------------------------
# Convert the texts into a term-frequency matrix; min_df=5 drops words
# that appear in fewer than five documents
vectorizer = CountVectorizer(min_df=5)
# Compute the tf-idf weight of every word
transformer = TfidfTransformer()
# The inner fit_transform builds the term-frequency matrix;
# the outer fit_transform computes tf-idf from it
tfidf = transformer.fit_transform(vectorizer.fit_transform(contents))
for n in tfidf[:5]:
    print(n)
print(type(tfidf))

# All words in the bag-of-words vocabulary
word = vectorizer.get_feature_names()
for n in word[:10]:
    print(n)
print("Number of words:", len(word))

# Extract the tf-idf matrix: element w[i][j] is the tf-idf weight
# of word j in document i
#X = tfidf.toarray()
X = coo_matrix(tfidf, dtype=np.float32).toarray()  # note the float32 dtype
print(X.shape)
print(X[:10])

#---------------------------- Step 3: train/test split ----------------------------
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    labels,
                                                    test_size=0.3,
                                                    random_state=1)

#---------------------------- Step 4: classification ----------------------------
# Logistic regression model
LR = LogisticRegression(solver='liblinear')
LR.fit(X_train, y_train)
print('Model accuracy: {}'.format(LR.score(X_test, y_test)))
pre = LR.predict(X_test)
print("Logistic regression classification")
print(len(pre), len(y_test))
print(classification_report(y_test, pre))
#---------------------------- Step 5: evaluation ----------------------------
def classification_pj(name, y_test, pre):
    print("Algorithm evaluation:", name)

    # Precision = correctly identified / all identified as that class
    # Recall    = correctly identified / all that truly belong to that class
    # F-measure = 2 * Precision * Recall / (Precision + Recall)

    YC_B, YC_G = 0, 0  # predicted bad / good
    ZQ_B, ZQ_G = 0, 0  # correctly predicted
    CZ_B, CZ_G = 0, 0  # actually present
    # 0 = good, 1 = bad; counting both classes at once guards against
    # the class labels changing
    i = 0
    while i < len(pre):
        z = int(y_test[i])  # true label
        y = int(pre[i])     # predicted label
        if z == 0:
            CZ_G += 1
        else:
            CZ_B += 1
        if y == 0:
            YC_G += 1
        else:
            YC_B += 1
        if z == y and z == 0:
            ZQ_G += 1
        elif z == y and z == 1:
            ZQ_B += 1
        i = i + 1
    print(ZQ_B, ZQ_G, YC_B, YC_G, CZ_B, CZ_G)
    print("")

    # Report the results
    P_G = ZQ_G * 1.0 / YC_G
    P_B = ZQ_B * 1.0 / YC_B
    print("Precision Good 0:", P_G)
    print("Precision Bad 1:", P_B)
    R_G = ZQ_G * 1.0 / CZ_G
    R_B = ZQ_B * 1.0 / CZ_B
    print("Recall Good 0:", R_G)
    print("Recall Bad 1:", R_B)
    F_G = 2 * P_G * R_G / (P_G + R_G)
    F_B = 2 * P_B * R_B / (P_B + R_B)
    print("F-measure Good 0:", F_G)
    print("F-measure Bad 1:", F_B)

# Evaluate the logistic regression predictions
classification_pj("LogisticRegression", y_test, pre)
Logistic regression classification
1823 1823
              precision    recall  f1-score   support

           0       0.94      0.99      0.97      1520
           1       0.93      0.70      0.80       303

    accuracy                           0.94      1823
   macro avg       0.94      0.85      0.88      1823
weighted avg       0.94      0.94      0.94      1823

Algorithm evaluation: LogisticRegression
213 1504 229 1594 303 1520

Precision Good 0: 0.9435382685069009
Precision Bad 1: 0.9301310043668122
Recall Good 0: 0.9894736842105263
Recall Bad 1: 0.7029702970297029
F-measure Good 0: 0.9659601798330122
F-measure Bad 1: 0.800751879699248
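The hand-computed values match the classification_report table above. As a cross-check, the same per-class numbers can be obtained directly from sklearn.metrics (a sketch, assuming the labels are the strings '0' and '1' as read from the CSV):

from sklearn.metrics import precision_recall_fscore_support

p, r, f, s = precision_recall_fscore_support(y_test, pre, labels=['0', '1'])
print("precision:", p)  # per class: good (0), then bad (1)
print("recall:   ", r)
print("f1:       ", f)
print("support:  ", s)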
VI. Algorithm Comparison Experiments
1. RandomForest
# Random forest model; n_estimators is the number of trees in the forest
clf = RandomForestClassifier(n_estimators=20)
clf.fit(X_train, y_train)
print('Model accuracy: {}'.format(clf.score(X_test, y_test)))
print("\n")
pre = clf.predict(X_test)
print('Predictions:', pre[:10])
print(len(pre), len(y_test))
print(classification_report(y_test, pre))
classification_pj("RandomForest", y_test, pre)
print("\n")
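A random forest also exposes feature_importances_, which can be mapped back to the vocabulary to see which words drive the predictions (a sketch, assuming the fitted clf and the word list from the TF-IDF step are still in scope):

import numpy as np

# Indices of the ten most important features, highest first
top = np.argsort(clf.feature_importances_)[::-1][:10]
for idx in top:
    print(word[idx], clf.feature_importances_[idx])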
2. SVM
# SVM model: the LinearSVC support-vector classifier
SVM = svm.LinearSVC()
SVM.fit(X_train, y_train)
print('Model accuracy: {}'.format(SVM.score(X_test, y_test)))
pre = SVM.predict(X_test)
print("SVM classification")
print(len(pre), len(y_test))
print(classification_report(y_test, pre))
classification_pj("LinearSVC", y_test, pre)
print("\n")
3. Naive Bayes
# Naive Bayes model
nb = MultinomialNB()
nb.fit(X_train, y_train)
print('Model accuracy: {}'.format(nb.score(X_test, y_test)))
pre = nb.predict(X_test)
print("Naive Bayes classification")
print(len(pre), len(y_test))
print(classification_report(y_test, pre))
classification_pj("MultinomialNB", y_test, pre)
print("\n")
4. KNN
# k-nearest-neighbors model
knn = neighbors.KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
print('Model accuracy: {}'.format(knn.score(X_test, y_test)))
pre = knn.predict(X_test)
print("KNN classification")
print(classification_report(y_test, pre))
classification_pj("KNeighbors", y_test, pre)
print("\n")
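The n_neighbors=7 above is fixed by hand; a small grid search (a sketch, not part of the original post) would choose it by cross-validation instead:

from sklearn.model_selection import GridSearchCV
from sklearn import neighbors

search = GridSearchCV(neighbors.KNeighborsClassifier(),
                      {'n_neighbors': [3, 5, 7, 9, 11]},
                      cv=5)
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)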
5. Decision Tree
# Decision tree model
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
print('Model accuracy: {}'.format(dtc.score(X_test, y_test)))
pre = dtc.predict(X_test)
print("Decision tree classification")
print(len(pre), len(y_test))
print(classification_report(y_test, pre))
classification_pj("DecisionTreeClassifier", y_test, pre)
print("\n")
6. SGD
# SGD classifier model (imported from the public sklearn.linear_model path)
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
print('Model accuracy: {}'.format(sgd.score(X_test, y_test)))
pre = sgd.predict(X_test)
print("SGD classification")
print(len(pre), len(y_test))
print(classification_report(y_test, pre))
classification_pj("SGDClassifier", y_test, pre)
print("\n")
7. MLP
# MLP (multi-layer perceptron) classifier model
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier()
mlp.fit(X_train, y_train)
print('Model accuracy: {}'.format(mlp.score(X_test, y_test)))
pre = mlp.predict(X_test)
print("MLP classification")
print(len(pre), len(y_test))
print(classification_report(y_test, pre))
classification_pj("MLPClassifier", y_test, pre)
print("\n")
8. GradientBoosting
# GradientBoosting classifier model
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
print('Model accuracy: {}'.format(gb.score(X_test, y_test)))
pre = gb.predict(X_test)
print("GradientBoosting classification")
print(len(pre), len(y_test))
print(classification_report(y_test, pre))
classification_pj("GradientBoostingClassifier", y_test, pre)
print("\n")
9. AdaBoost
# AdaBoost classifier model
from sklearn.ensemble import AdaBoostClassifier
AdaBoost = AdaBoostClassifier()
AdaBoost.fit(X_train, y_train)
print('Model accuracy: {}'.format(AdaBoost.score(X_test, y_test)))
pre = AdaBoost.predict(X_test)
print("AdaBoost classification")
print(len(pre), len(y_test))
print(classification_report(y_test, pre))
classification_pj("AdaBoostClassifier", y_test, pre)
print("\n")
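The nine experiments above all repeat the same fit/predict/report pattern, so on the same X_train/X_test split they can be consolidated into a single loop (a sketch that re-imports every model used in this section):

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import (RandomForestClassifier,
                              GradientBoostingClassifier, AdaBoostClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn import svm, neighbors

models = {
    'LogisticRegression': LogisticRegression(solver='liblinear'),
    'RandomForest': RandomForestClassifier(n_estimators=20),
    'LinearSVC': svm.LinearSVC(),
    'MultinomialNB': MultinomialNB(),
    'KNeighbors': neighbors.KNeighborsClassifier(n_neighbors=7),
    'DecisionTree': DecisionTreeClassifier(),
    'SGD': SGDClassifier(),
    'MLP': MLPClassifier(),
    'GradientBoosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
}
for name, model in models.items():
    model.fit(X_train, y_train)
    pre = model.predict(X_test)
    print(name, "accuracy:", model.score(X_test, y_test))
    classification_pj(name, y_test, pre)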
VII. Summary
This post walked through a complete Chinese sentiment-classification pipeline: jieba word segmentation, data cleaning with a custom dictionary and stop-word list, TF-IDF feature extraction (including how to avoid MemoryError on large vocabularies), a logistic regression baseline evaluated with precision, recall, and F-measure, and a comparison against nine other scikit-learn classifiers.
Originally published on the WeChat official account 娜璋AI安全之家: Python人工智能 | 二十三.基于机器学习和TFIDF的情感分类(含详细的NLP数据清洗)