The challenge

Write a function that, given a string of text (possibly with punctuation and line-breaks), returns an array of the top-3 most occurring words, in descending order of the number of occurrences.

Assumptions:

  • A word is a string of letters (A to Z) optionally containing one or more apostrophes (‘) in ASCII. (No need to handle fancy punctuation.)
  • Matches should be case-insensitive, and the words in the result should be lowercased.
  • Ties may be broken arbitrarily.
  • If a text contains fewer than three unique words, then either the top-2 or top-1 words should be returned, or an empty array if a text contains no words.

Examples:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
top_3_words("In a village of La Mancha, the name of which I have no desire to call to
mind, there lived not long since one of those gentlemen that keep a lance
in the lance-rack, an old buckler, a lean hack, and a greyhound for
coursing. An olla of rather more beef than mutton, a salad on most
nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so extra
on Sundays, made away with three-quarters of his income.")
# => ["a", "of", "on"]

top_3_words("e e e e DDD ddd DdD: ddd ddd aa aA Aa, bb cc cC e e e")
# => ["e", "ddd", "aa"]

top_3_words("  //wont won't won't")
# => ["won't", "wont"]

Bonus points:

  1. Avoid creating an array whose memory footprint is roughly as big as the input text.
  2. Avoid sorting the entire array of unique words.

Test cases

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from random import choice, randint, sample, shuffle, choices
import re
from collections import Counter


def check(s, this=None):                                            # this: only for debugging purpose
    returned_result = top_3_words(s) if this is None else this
    fs = Counter(w for w in re.findall(r"[a-zA-Z']+", s.lower()) if w != "'" * len(w))
    exp,expected_frequencies = map(list,zip(*fs.most_common(3))) if fs else ([],[])
    
    msg = ''
    wrong_words = [w for w in returned_result if not fs[w]]
    actual_freq = [fs[w] for w in returned_result]
    
    if wrong_words:
        msg = 'Incorrect match: words not present in the string. Your output: {}. One possible valid answer: {}'.format(returned_result, exp)
    elif len(set(returned_result)) != len(returned_result):
        msg = 'The result should not contain copies of the same word. Your output: {}. One possible output: {}'.format(returned_result, exp)
    elif actual_freq!=expected_frequencies:
        msg = "Incorrect frequencies: {} should be {}. Your output: {}. One possible output: {}".format(actual_freq, expected_frequencies, returned_result, exp)
    
    Test.expect(not msg, msg)



@test.describe("Fixed tests")
def fixed_tests():

    TESTS = (
    "a a a  b  c c  d d d d  e e e e e",
    "e e e e DDD ddd DdD: ddd ddd aa aA Aa, bb cc cC e e e",
    "  //wont won't won't ",
    "  , e   .. ",
    "  ...  ",
    "  '  ",
    "  '''  ",
    """In a village of La Mancha, the name of which I have no desire to cao
    mind, there lived not long since one of those gentlemen that keep a lance
    in the lance-rack, an old buckler, a lean hack, and a greyhound for
    coursing. An olla of rather more beef than mutton, a salad on most
    nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so extra
    on Sundays, made away with three-quarters of his income.""",
    "a a a  b  c c X",
    "a a c b b",
    )
    for s in TESTS: check(s)
    
@test.describe("Random tests")
def random_tests():
    
    def gen_word():
        return "".join(choice("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'") for _ in range(randint(3, 10)))
    
    def gen_string():
        words = []
        nums = choices(range(1, 31), k=20)
        for _ in range(randint(0, 20)):
            words += [gen_word()] * nums.pop()
        shuffle(words)
        s = ""
        while words:
            s += words.pop() + "".join(choice("-,.?!_:;/ ") for _ in range(randint(1, 5)))
        return s
    
    @test.it("Tests")
    def it_1():
        for _ in range(100): check(gen_string())
            

The solution using Python

Option 1:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
# use the Counter module
from collections import Counter
# use the regex module
import re

def top_3_words(text):
    # count the input, pass through a regex and lowercase it
    c = Counter(re.findall(r"[a-z']+", re.sub(r" '+ ", " ", text.lower())))
    # return the `most common` 3 items
    return [w for w,_ in c.most_common(3)]

Option 2:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
def top_3_words(text):
    # loop through each character in the string
    for c in text:
        # if it's not alphanumeric or an apostrophe
        if not (c.isalpha() or c=="'"):
            # replace with a space
            text = text.replace(c,' ')
    # create some `list` variables
    words,counts,out = [],[],[]

    # loop through the words in the text
    for word in list(filter(None,text.lower().split())):
        # if in all, then continue
        if all([not c.isalpha() for c in word]):
            continue
        # if the word is in the words list
        if word in words:
            # increment the count
            counts[words.index(word)] += 1
        else:
            # otherwise create a new entry
            words.append(word); counts.append(0)

    # loop while bigger than 0 and less than 3
    while len(words)>0 and len(out)<3:
        # append the counts
        out.append(words.pop(counts.index(max(counts))).lower())
        counts.remove(max(counts))
    # return the counts
    return out

Option 3:

1
2
3
4
5
6
7
8
def top_3_words(text):
    wrds = {}
    for p in r'!"#$%&()*+,./:;<=>?@[\]^_`{|}~-':
        text = text.replace(p, ' ')
    for w in text.lower().split():
        if w.replace("'", '') != '':
            wrds[w] = wrds.get(w, 0) + 1
    return [y[0] for y in sorted(wrds.items(), key=lambda x: x[1], reverse=True)[:3]]