The challenge
Write a function that, given a string of text (possibly with punctuation and line-breaks), returns an array of the top-3 most occurring words, in descending order of the number of occurrences.
Assumptions:
- A word is a string of letters (A to Z) optionally containing one or more apostrophes (‘) in ASCII. (No need to handle fancy punctuation.)
- Matches should be case-insensitive, and the words in the result should be lowercased.
- Ties may be broken arbitrarily.
- If a text contains fewer than three unique words, then either the top-2 or top-1 words should be returned, or an empty array if a text contains no words.
Examples:
top_3_words("In a village of La Mancha, the name of which I have no desire to call to
mind, there lived not long since one of those gentlemen that keep a lance
in the lance-rack, an old buckler, a lean hack, and a greyhound for
coursing. An olla of rather more beef than mutton, a salad on most
nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so extra
on Sundays, made away with three-quarters of his income.")
# => ["a", "of", "on"]
top_3_words("e e e e DDD ddd DdD: ddd ddd aa aA Aa, bb cc cC e e e")
# => ["e", "ddd", "aa"]
top_3_words(" //wont won't won't")
# => ["won't", "wont"]
Bonus points:
- Avoid creating an array whose memory footprint is roughly as big as the input text.
- Avoid sorting the entire array of unique words.
Test cases
from random import choice, randint, sample, shuffle, choices
import re
from collections import Counter
def check(s, this=None): # this: only for debugging purpose
returned_result = top_3_words(s) if this is None else this
fs = Counter(w for w in re.findall(r"[a-zA-Z']+", s.lower()) if w != "'" * len(w))
exp,expected_frequencies = map(list,zip(*fs.most_common(3))) if fs else ([],[])
msg = ''
wrong_words = [w for w in returned_result if not fs[w]]
actual_freq = [fs[w] for w in returned_result]
if wrong_words:
msg = 'Incorrect match: words not present in the string. Your output: {}. One possible valid answer: {}'.format(returned_result, exp)
elif len(set(returned_result)) != len(returned_result):
msg = 'The result should not contain copies of the same word. Your output: {}. One possible output: {}'.format(returned_result, exp)
elif actual_freq!=expected_frequencies:
msg = "Incorrect frequencies: {} should be {}. Your output: {}. One possible output: {}".format(actual_freq, expected_frequencies, returned_result, exp)
Test.expect(not msg, msg)
@test.describe("Fixed tests")
def fixed_tests():
TESTS = (
"a a a b c c d d d d e e e e e",
"e e e e DDD ddd DdD: ddd ddd aa aA Aa, bb cc cC e e e",
" //wont won't won't ",
" , e .. ",
" ... ",
" ' ",
" ''' ",
"""In a village of La Mancha, the name of which I have no desire to cao
mind, there lived not long since one of those gentlemen that keep a lance
in the lance-rack, an old buckler, a lean hack, and a greyhound for
coursing. An olla of rather more beef than mutton, a salad on most
nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so extra
on Sundays, made away with three-quarters of his income.""",
"a a a b c c X",
"a a c b b",
)
for s in TESTS: check(s)
@test.describe("Random tests")
def random_tests():
def gen_word():
return "".join(choice("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'") for _ in range(randint(3, 10)))
def gen_string():
words = []
nums = choices(range(1, 31), k=20)
for _ in range(randint(0, 20)):
words += [gen_word()] * nums.pop()
shuffle(words)
s = ""
while words:
s += words.pop() + "".join(choice("-,.?!_:;/ ") for _ in range(randint(1, 5)))
return s
@test.it("Tests")
def it_1():
for _ in range(100): check(gen_string())
The solution using Python
Option 1:
# use the Counter module
from collections import Counter
# use the regex module
import re
def top_3_words(text):
# count the input, pass through a regex and lowercase it
c = Counter(re.findall(r"[a-z']+", re.sub(r" '+ ", " ", text.lower())))
# return the `most common` 3 items
return [w for w,_ in c.most_common(3)]
Option 2:
def top_3_words(text):
# loop through each character in the string
for c in text:
# if it's not alphanumeric or an apostrophe
if not (c.isalpha() or c=="'"):
# replace with a space
text = text.replace(c,' ')
# create some `list` variables
words,counts,out = [],[],[]
# loop through the words in the text
for word in list(filter(None,text.lower().split())):
# if in all, then continue
if all([not c.isalpha() for c in word]):
continue
# if the word is in the words list
if word in words:
# increment the count
counts[words.index(word)] += 1
else:
# otherwise create a new entry
words.append(word); counts.append(0)
# loop while bigger than 0 and less than 3
while len(words)>0 and len(out)<3:
# append the counts
out.append(words.pop(counts.index(max(counts))).lower())
counts.remove(max(counts))
# return the counts
return out
Option 3:
def top_3_words(text):
wrds = {}
for p in r'!"#$%&()*+,./:;<=>?@[\]^_`{|}~-':
text = text.replace(p, ' ')
for w in text.lower().split():
if w.replace("'", '') != '':
wrds[w] = wrds.get(w, 0) + 1
return [y[0] for y in sorted(wrds.items(), key=lambda x: x[1], reverse=True)[:3]]