|
| 1 | +# -*- coding:utf-8 -*- |
| 2 | +# By:Eastmount CSDN 2021-03-19 |
| 3 | +import csv |
| 4 | +import pandas as pd |
| 5 | +import numpy as np |
| 6 | +import jieba |
| 7 | +import jieba.analyse |
| 8 | + |
# Load the custom user dictionary and the stop-word list.
jieba.load_userdict("user_dict.txt")

# Read the stop words with plain file I/O, one entry per line, rather than
# through pandas.read_csv: recent pandas releases reject delimiter="\n"
# outright, and CSV parsing gives special meaning to quote characters and
# turns NA-like tokens (e.g. "null", "NA") into float NaN, which would
# silently corrupt entries of a stop-word list.
# "utf-8-sig" decodes plain UTF-8 unchanged and drops a leading BOM if present.
with open('stop_words.txt', encoding='utf-8-sig') as stop_file:
    # Keep the result a list (as before) and skip empty lines.
    stop_list = [
        entry
        for entry in (line.rstrip('\r\n') for line in stop_file)
        if entry
    ]
| 16 | + |
| 17 | +#----------------------------------------------------------------------- |
# Jieba word-segmentation helper.
def txt_cut(juzi):
    """Segment a sentence with jieba and drop stop words.

    Args:
        juzi: Text to segment; passed unchanged to ``jieba.lcut``.

    Returns:
        list: The tokens produced by ``jieba.lcut(juzi)``, in their original
        order, excluding every token found in the module-level ``stop_list``.
    """
    # Hash the stop words once per call so that each token costs an O(1)
    # lookup instead of a linear scan over the whole stop-word list.
    stop_words = set(stop_list)
    return [w for w in jieba.lcut(juzi) if w not in stop_words]
| 21 | + |
| 22 | +#----------------------------------------------------------------------- |
# Segment a labelled CSV file and write the segmented rows to a new CSV.
def fenci(filename, result):
    """Segment the ``content`` column of a CSV file and save the result.

    Each row of ``filename`` is read with ``csv.DictReader``; its ``content``
    value is passed through ``txt_cut`` and the tokens are joined with single
    spaces. The row's ``label`` and the joined tokens are written to
    ``result`` under the header ``label,cutword``. After all rows have been
    processed, the first five labels and the first five segmented contents
    are printed.

    Args:
        filename: Path of the input CSV; it must provide ``label`` and
            ``content`` columns.
        result: Path of the output CSV; an existing file is overwritten.

    Raises:
        OSError: If either file cannot be opened.
        KeyError: If the input has no ``label`` or ``content`` column.
    """
    preview_size = 5
    labels = []
    contents = []

    # Open the input first so that a missing input file does not truncate an
    # existing output file; the with-statement closes both files even when
    # reading or segmentation raises.
    # "utf-8-sig" reads plain UTF-8 unchanged and strips a leading BOM, which
    # would otherwise become part of the first header name.
    with open(filename, "r", encoding="utf-8-sig") as src, \
            open(result, "w", newline='', encoding='UTF-8') as dst:
        writer = csv.writer(dst)
        writer.writerow(['label', 'cutword'])

        for row in csv.DictReader(src):
            label = row['label']
            # Segment the text and join the tokens with spaces.
            output = ' '.join(txt_cut(row['content']))
            writer.writerow([label, output])

            # Only the first few rows are ever printed, so keep just those
            # instead of holding the whole dataset in memory.
            if len(labels) < preview_size:
                labels.append(label)
                contents.append(output)

    print(labels)
    print(contents)
| 53 | + |
| 54 | +#----------------------------------------------------------------------- |
# Entry point: segment the training, test and validation splits in turn.
if __name__ == '__main__':
    # (input CSV, segmented output CSV) pairs, processed in this order.
    dataset_pairs = (
        ("news_dataset_train.csv", "news_dataset_train_fc.csv"),
        ("news_dataset_test.csv", "news_dataset_test_fc.csv"),
        ("news_dataset_val.csv", "news_dataset_val_fc.csv"),
    )
    for source_csv, segmented_csv in dataset_pairs:
        fenci(source_csv, segmented_csv)
0 commit comments