# -*- coding: utf-8 -*-
""" Created on Mon Oct 03 11:07:58 2016 @author: liqi """
# Characters that survive normalization: lowercase ASCII letters, the space,
# and the two punctuation marks that can occur inside a word (hyphen and
# apostrophe).
keep = set("abcdefghijklmnopqrstuvwxyz -'")


def normalize(s):
    """Return a lowercased copy of ``s`` reduced to the characters in ``keep``.

    Every whitespace character that is not a plain space (newline, tab,
    carriage return, ...) is replaced by a single space instead of being
    dropped. Dropping them would fuse the last word of one line with the
    first word of the next line and corrupt any word count built on the
    result. All other characters that are not in ``keep`` are removed.
    """
    kept = []
    for c in s.lower():
        if c in keep:
            kept.append(c)
        elif c.isspace():
            # Preserve the word boundary that this whitespace represents.
            kept.append(' ')
    return ''.join(kept)
def make_freq_dict(s):
    """Return a dict mapping each word in ``s`` to its number of occurrences.

    ``s`` is first passed through ``normalize``; the normalized text is then
    split on whitespace to obtain the words. An input without any words
    yields an empty dict.
    """
    # Imported locally because this file has no top-level import block.
    from collections import Counter

    words = normalize(s).split()
    # Convert back to a plain dict so callers get the same type as before
    # (a missing word still raises KeyError rather than returning 0).
    return dict(Counter(words))
def print_file_stats(fname, top_n=20):
    """Print size statistics and the most frequent words of a text file.

    Args:
        fname: Path of the text file to analyse.
        top_n: Maximum number of most-frequent words to list. Defaults to
            20, the number of entries this function has always printed.

    The printed figures are computed as follows:
        characters -- length of the file contents as read in text mode
        lines      -- number of newline characters in the contents
        words      -- sum of all word frequencies from make_freq_dict
    """
    # Context manager guarantees the file handle is closed, even on error.
    with open(fname, 'r') as f:
        s = f.read()

    num_chars = len(s)
    num_lines = s.count('\n')

    d = make_freq_dict(s)
    num_words = sum(d.values())

    # Highest count first; words with equal counts come out in reverse
    # alphabetical order because the (count, word) tuples are compared whole.
    ranked = sorted(((count, word) for word, count in d.items()), reverse=True)
    top = ranked[:top_n]

    print("The file '%s' has:" % fname)
    print(" %s characters" % num_chars)
    print(" %s lines" % num_lines)
    print(" %s words" % num_words)
    # The heading used to say "top 10" although 20 entries were printed;
    # report the number of entries that are actually listed.
    print("\nThe top %s most frequant words are:" % len(top))
    for i, (count, word) in enumerate(top, 1):
        print('%2s. %4s %s' % (i, count, word))
def main(fname='bill.txt'):
    """Print the file statistics report for ``fname``.

    Args:
        fname: Path of the text file to report on. Defaults to 'bill.txt',
            the file this script has always analysed.
    """
    print_file_stats(fname)
# Run the report only when this file is executed as a script, so importing
# the module for its helper functions has no side effects.
if __name__ == "__main__":
    main()
输出结果 (Output):
The file 'bill.txt' has:
34426 characters
94 lines
6215 words
The top 10 most frequant words are:
1. 320 the
2. 260 i
3. 202 and
4. 183 to
5. 148 of
6. 147 a
7. 131 was
8. 124 in
9. 81 my
10. 64 he
11. 61 for
12. 57 had
13. 56 that
14. 51 it
15. 50 with
16. 50 me
17. 48 his
18. 47 on
19. 35 when
20. 35 but