def statistics(astr: str) -> list:
    """Return the distinct tab-separated fields of one line, in first-seen order.

    The line is split on tab characters and repeated fields are dropped.
    Newline characters are then removed from the final remaining field, which
    is where the line terminator of a text-file line ends up.

    Args:
        astr: One line of text, typically still carrying its trailing "\\n".

    Returns:
        A list of unique fields in order of first appearance. An empty input
        yields ``[""]`` because ``"".split("\\t")`` is ``[""]``.
    """
    # dict.fromkeys de-duplicates in linear time and keeps insertion order.
    unique = list(dict.fromkeys(astr.split("\t")))
    # str.split never returns an empty list, so pop() is always safe here.
    last = unique.pop().replace("\n", "")
    # Once the newline is gone the final field may equal an earlier one
    # ("a\tb\ta\n"); re-check so it is not reported twice.
    if last not in unique:
        unique.append(last)
    return unique
if __name__ == "__main__":
    from collections import Counter

    # Tally every field that statistics() returns, across all lines of the
    # input file. Counter is a dict subclass, so the report below still comes
    # out in first-seen order.
    code_doc = Counter()
    with open("test_data.txt", "r", encoding='utf-8') as fs:
        # Iterate the file object directly instead of readlines() so the whole
        # file is never held in memory at once.
        for ln in fs:
            code_doc.update(statistics(ln))
    # One "<field> <count>" line per distinct field.
    for token, count in code_doc.items():
        print(f"{token} {count}")
import pandas as pd
# Sample input: one record per line, laid out as "<id>\t<word>[\t<word>...]".
# The separators are written as \t escapes so they are real tab characters at
# runtime and cannot be silently turned into spaces by an editor or a
# copy/paste, which would break the '\t' split below.
a = '''123456\t人性\t尴尬\t啊哈
147852\t哈哈\t不好看
123456\t啊哈
147852\t哈哈
147852\t嗯嗯\t二刷
147852\t略略略\t人性
123456\t尴尬
147963\t人性\t极端
123456\t啊哈
147963\t不好看'''
# One raw line per row, held in column 0.
arr = [[x] for x in a.split('\n')]
df = pd.DataFrame(arr)
# Wide frame: column 0 becomes 'id', the remaining columns hold that line's
# words (padded with missing values for shorter lines). Only the id column is
# renamed, so any number of words per line is accepted; assigning a fixed
# 'id a b c' list raises unless the widest line has exactly four fields.
df1 = df[0].str.split('\t', expand=True).rename(columns={0: 'id'})
# Long frame with columns: id, level_1 (word position), 0 (the word).
df2 = df1.set_index('id').stack().reset_index()
# Padding cells carry no word; drop them explicitly rather than relying on
# stack() to discard them.
df2 = df2.dropna(subset=[0])
# Count a word once per id. Comparing whole rows would also compare the
# position column, so the same id/word pair appearing in two different
# positions would be counted twice.
df2 = df2.drop_duplicates(subset=['id', 0])
print(df2[0].value_counts())
## Source file: txt.txt; output file: txt_out.txt
import pandas as pd
# NOTE(review): read_csv keeps its default ',' separator here, so column 0 is
# the complete line only as long as the file contains no commas -- confirm
# this holds for the real input.
df = pd.read_csv(r'd:/txt.txt', encoding='gbk', header=None)
# Wide frame: column 0 becomes 'id', the remaining columns hold that line's
# tab-separated words. Only the id column is renamed, so any number of words
# per line is accepted; assigning a fixed 'id a b c' list raises unless the
# widest line has exactly four fields.
df1 = df[0].str.split('\t', expand=True).rename(columns={0: 'id'})
# Long frame with columns: id, level_1 (word position), 0 (the word).
df2 = df1.set_index('id').stack().reset_index()
# Drop padding cells of short lines, then count a word once per id. Comparing
# whole rows would also compare the position column and count the same
# id/word pair twice when it appears in different positions.
df2 = df2.dropna(subset=[0]).drop_duplicates(subset=['id', 0])
# Write "<word>\t<count>" rows with no header line.
df2[0].value_counts().to_csv(r'd:/txt_out.txt', header=False, sep='\t')