用Python统计词频

2025-05-01 10:22:34
推荐回答(2个)
回答1:

def statistics(astr):
    """Return the distinct tab-separated tokens of one line, in first-seen order.

    Args:
        astr: A single line of text whose fields are separated by tab
            characters; it may end with a newline.

    Returns:
        A list of the line's tokens with duplicates removed, keeping the
        order of first occurrence. An empty string yields ``[""]``.
    """
    slist = astr.split("\t")
    # Strip the newline from the last field *before* de-duplicating; otherwise
    # a trailing "word\n" is treated as different from an earlier "word" and
    # the same token is reported twice for one line.
    slist[-1] = slist[-1].replace("\n", "")
    # dict keys keep insertion order, giving an O(n) order-preserving dedup.
    return list(dict.fromkeys(slist))

if __name__ == "__main__":
    # Imported locally so this script section brings its own dependency.
    from collections import Counter

    # Maps each token to the number of times statistics() reported it.
    # statistics() de-duplicates tokens within a line, so this is meant to be
    # the number of lines that contain the token.
    code_doc = Counter()
    with open("test_data.txt", "r", encoding='utf-8') as fs:
        # Iterate the file lazily instead of loading every line up front.
        for ln in fs:
            # Skip empty tokens (blank lines, doubled tabs): they are not words.
            code_doc.update(t for t in statistics(ln) if t)

    # Counter keeps first-seen order, so output follows the input order.
    for word, count in code_doc.items():
        print(word + ' ' + str(count))

回答2:

import pandas as pd

# Sample input used by the pipeline below: one record per line, each made of
# an id followed by one to three words. The blank lines between records are
# part of the string, so splitting on '\n' also yields empty entries.
# NOTE(review): the code that consumes this splits each line on '\t'; confirm
# the separators in this literal are real tab characters -- they may have been
# turned into spaces when the snippet was pasted.
a='''123456 人性 尴尬 啊哈

147852 哈哈 不好看

123456 啊哈

147852 哈哈

147852 嗯嗯 二刷

147852 略略略 人性

123456 尴尬

147963 人性 极端

123456 啊哈

147963 不好看'''

# Put every line of the sample text into its own one-element row so the
# resulting frame has a single column labelled 0.
arr = [[line] for line in a.split('\n')]
df = pd.DataFrame(arr)

# Break each line into tab-separated fields, one field per column, and name
# the columns: the first is the id, the remaining three hold the words.
df1 = df[0].str.split('\t', expand=True)
df1.columns = 'id a b c'.split()

# Go from wide to long form: one row per (id, source column, word), then
# discard rows that are exact repeats of an earlier row.
stacked = df1.set_index('id').stack()
df2 = stacked.reset_index().drop_duplicates()

# Column 0 holds the stacked words; show how often each one occurs.
word_counts = df2[0].value_counts()
print(word_counts)

## Input file: d:/txt.txt (read as GBK); output file: d:/txt_out.txt

import pandas as pd

# Load the GBK-encoded input without a header row.
# NOTE(review): read_csv is called with its default separator, so a line only
# stays whole in column 0 if it contains no commas -- confirm for real data.
df = pd.read_csv(r'd:/txt.txt', header=None, encoding='gbk')

# Break each line into tab-separated fields, one field per column, and name
# the columns: the first is the id, the remaining three hold the words.
df1 = df[0].str.split('\t', expand=True)
df1.columns = 'id a b c'.split()

# Go from wide to long form: one row per (id, source column, word), then
# discard rows that are exact repeats of an earlier row.
stacked = df1.set_index('id').stack()
df2 = stacked.reset_index().drop_duplicates()

# Column 0 holds the stacked words; write "word<TAB>count" lines, no header.
word_counts = df2[0].value_counts()
word_counts.to_csv(r'd:/txt_out.txt', sep='\t', header=None)