我目前正在尝试编写一个脚本,用于遍历文件中的文本,统计单词总数和不同单词的数量,列出出现频率最高的前 10 个单词及其出现次数,并按出现频率从高到低对字符进行排序。
以下是我目前写出的代码:
import os
import string
import sys
from collections import Counter

# Word and character frequency counter for a plain-text file.
#
# The script asks for a filename, tallies how often each word and each
# character occurs (case-insensitively, ignoring punctuation and digits), and
# ranks the characters from most to least frequent.

# Directory that holds the text files. Leave empty to stay in the current
# working directory.
path = ""

# Built once instead of once per line: deletes punctuation and digits.
# Whitespace is deliberately NOT deleted here, because it is what separates
# the words; removing it would glue every line into a single "word".
STRIP_TABLE = str.maketrans('', '', string.punctuation + string.digits)


def tally_text(lines):
    """Count word and character occurrences in an iterable of text lines.

    Each line is lower-cased, stripped of ASCII punctuation and digits, and
    split on runs of whitespace. Empty tokens (blank lines, tokens made only
    of digits or punctuation) are never counted as words. Characters are
    counted from the resulting words only, so whitespace, punctuation and
    digits never appear in the character tally.

    Args:
        lines: Any iterable of strings, e.g. an open text file.

    Returns:
        A ``(word_counts, char_counts)`` tuple of ``collections.Counter``
        objects (``Counter`` is a ``dict`` subclass; ``most_common(10)`` on
        ``word_counts`` yields the ten most frequent words).
    """
    word_counts = Counter()
    char_counts = Counter()
    for raw_line in lines:
        cleaned = raw_line.lower().translate(STRIP_TABLE)
        # split() with no argument collapses repeated whitespace and drops
        # empty strings, unlike split(" ").
        tokens = cleaned.split()
        word_counts.update(tokens)
        char_counts.update("".join(tokens))
    return word_counts, char_counts


def rank_characters(char_counts):
    """Rank characters by how often they occur.

    Args:
        char_counts: Mapping of character -> occurrence count.

    Returns:
        A ``(pairs, percentages, total)`` tuple where ``pairs`` is a list of
        ``(count, character)`` tuples sorted from most to least frequent,
        ``percentages`` is the share of ``total`` for each entry of ``pairs``
        (same order), and ``total`` is the sum of all counts. For an empty
        mapping the result is ``([], [], 0)``.
    """
    total = sum(char_counts.values())
    pairs = sorted(
        ((count, char) for char, count in char_counts.items()),
        reverse=True,
    )
    # No division happens when char_counts is empty, so total == 0 is safe.
    percentages = [(count / total) * 100 for count, _ in pairs]
    return pairs, percentages, total


if __name__ == "__main__":
    # os.chdir("") raises FileNotFoundError, so only change directory when a
    # path has actually been configured.
    if path:
        os.chdir(path)

    #Prompt for user to input filename:
    fname = input('Enter the filename: ')
    try:
        fhand = open(fname)
    except IOError:
        #Invalid filename error
        print('\n')
        print("Sorry, file can't be opened! Please check your spelling.")
        sys.exit()

    # The with-statement guarantees the file is closed once it has been read.
    with fhand:
        worddict, counts = tally_text(fhand)

    # (count, letter) pairs and percentage frequencies, most frequent first,
    # plus the total number of counted characters.
    lst, freqlst, totalcount = rank_characters(counts)
    # Raw counts in first-seen order of the characters.
    countlst = list(counts.values())
扬帆大鱼
跃然一笑
相关分类