4
\$\begingroup\$

I have built a tiny script to count the word frequencies of all text files within a specified directory. I would be glad to receive any suggestions and feedback.

import re, glob,os
from collections import OrderedDict, Counter
class CountWords:
    """Count word frequencies across all files with a given extension.

    Files are searched recursively below ``dir``. A word is a run of ASCII
    letters and underscores; words are lowercased before counting.
    """

    def __init__(self, dir, ext):
        # `dir` shadows the builtin, but the parameter name is kept so that
        # existing keyword callers keep working.
        self.dir = dir
        self.ext = ext

    @property
    def files(self):
        """Return the paths of all ``*.<ext>`` files below ``self.dir``."""
        # Build the pattern from the directory instead of calling os.chdir():
        # changing the working directory is a process-wide side effect and
        # makes a second access fail when self.dir is a relative path.
        # glob.escape keeps special characters in the directory name literal.
        root = glob.escape(os.fspath(self.dir))
        pattern = os.path.join(root, "**", f"*.{self.ext}")
        return glob.glob(pattern, recursive=True)

    @property
    def contents(self):
        """Return the text of every matched file, joined with commas."""
        texts = []
        for path in self.files:
            with open(path, 'r') as f:
                texts.append(f.read())
        # A comma is not a word character, so the last word of one file
        # cannot merge with the first word of the next.
        return ",".join(texts)

    @property
    def words(self):
        """Return every word in the matched files, lowercased."""
        # The range must be A-Z: "A-z" also spans the punctuation characters
        # [ \ ] ^ and backtick that sit between 'Z' and 'a' in ASCII.
        return [word.lower() for word in re.findall(r"[a-zA-Z_]+", self.contents)]

    def count(self):
        """Return an OrderedDict of word -> count, most frequent first."""
        # most_common() performs the same stable descending sort by count.
        return OrderedDict(Counter(self.words).most_common())

The function version:

def count_words(dir, ext):
    """Count word frequencies in all ``*.<ext>`` files below ``dir``.

    Args:
        dir: Root directory, searched recursively. (The name shadows the
            builtin but is kept so keyword callers keep working.)
        ext: File extension without the leading dot, e.g. ``"txt"``.

    Returns:
        An OrderedDict mapping each lowercased word to its count, ordered
        from most to least frequent.
    """
    # Collect all the files with the requested extension. The original
    # pattern was f"**/*.ext", which never interpolated `ext` and therefore
    # only matched files literally ending in ".ext".
    # The directory is part of the pattern rather than set via os.chdir(),
    # so the caller's working directory is left untouched.
    root = glob.escape(os.fspath(dir))
    files = glob.glob(os.path.join(root, "**", f"*.{ext}"), recursive=True)

    # Concatenate the files; the comma separator keeps words at file
    # boundaries from merging.
    texts = []
    for path in files:
        with open(path, 'r') as f:
            texts.append(f.read())
    contents = ",".join(texts)

    # Extract the words. The range must be A-Z: "A-z" also matches the
    # punctuation [ \ ] ^ and backtick that lie between 'Z' and 'a'.
    words = [word.lower() for word in re.findall(r"[a-zA-Z_]+", contents)]

    # most_common() performs the same stable descending sort by count.
    return OrderedDict(Counter(words).most_common())
Jamal
35.2k13 gold badges134 silver badges238 bronze badges
asked May 31, 2018 at 13:10
\$\endgroup\$

1 Answer 1

3
\$\begingroup\$
#!/usr/bin/python3
import os
import re
from os.path import join
from collections import Counter, OrderedDict
def count_words(directory, ext):  # don't use the name dir, it's a builtin function
    """Count word frequencies in a directory of files.

    Args:
        directory: Root directory; it is searched recursively with os.walk.
        ext: Extension of the files to count, with or without the leading
            dot ("txt" and ".txt" are equivalent). An empty string matches
            every file.

    Returns:
        An OrderedDict mapping each lowercased word to its count, ordered
        from most to least frequent.
    """
    # Anchor the extension on a dot so that ext="txt" matches "notes.txt"
    # but not a file that merely ends in those letters, such as "datatxt".
    if ext and not ext.startswith("."):
        suffix = "." + ext
    else:
        suffix = ext

    # The text is lowercased before matching, which keeps the regex simple.
    find_words = re.compile(r"[a-z_]+").findall

    word_counter = Counter()
    # Using os.walk instead of glob.
    for root, _dirs, files in os.walk(directory):
        for fname in files:
            if not fname.endswith(suffix):
                continue
            filename = join(root, fname)
            try:
                with open(filename, 'r') as f:
                    text = f.read()
            except UnicodeDecodeError:
                # read() raises when the file cannot be decoded with the
                # default encoding; warn and carry on with the other files
                # rather than aborting the whole count.
                print("Warning: couldn't decode", filename)
                continue
            # Update the counter per file instead of building one enormous
            # string: each file's text only lives long enough to be searched.
            word_counter.update(find_words(text.lower()))

    # most_common() already yields (word, count) pairs sorted by count.
    return OrderedDict(word_counter.most_common())
answered May 31, 2018 at 18:53
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.