From 2fc009341aa940ccb66940c63d909fad5b486563 Mon Sep 17 00:00:00 2001 From: BethanyG Date: Tue, 21 Mar 2023 02:41:21 -0700 Subject: [PATCH 1/4] First draft of Word Count approaches. --- .../word-count/..articles/config.json | 11 + .../..articles/performance/code/Benchmark.py | 0 .../..articles/performance/content.md | 0 .../..articles/performance/snippet.md | 0 .../word-count/.approaches/config.json | 56 +++ .../filter-with-counter/content.md | 19 + .../filter-with-counter/snippet.txt | 8 + .../word-count/.approaches/introduction.md | 368 ++++++++++++++++++ .../str-iteration-with-dict/content.md | 23 ++ .../str-iteration-with-dict/snippet.txt | 8 + .../str-replace-with-collections/content.md | 50 +++ .../str-replace-with-collections/snippet.txt | 8 + .../content.md | 36 ++ .../snippet.txt | 8 + .../str-replace-with-dict/content.md | 31 ++ .../str-replace-with-dict/snippet.txt | 8 + .../str-translate-with-counter/content.md | 23 ++ .../str-translate-with-counter/snippet.txt | 7 + .../using-the-re-module/content.md | 57 +++ .../using-the-re-module/snippet.txt | 7 + 20 files changed, 728 insertions(+) create mode 100644 exercises/practice/word-count/..articles/config.json create mode 100644 exercises/practice/word-count/..articles/performance/code/Benchmark.py create mode 100644 exercises/practice/word-count/..articles/performance/content.md create mode 100644 exercises/practice/word-count/..articles/performance/snippet.md create mode 100644 exercises/practice/word-count/.approaches/config.json create mode 100644 exercises/practice/word-count/.approaches/filter-with-counter/content.md create mode 100644 exercises/practice/word-count/.approaches/filter-with-counter/snippet.txt create mode 100644 exercises/practice/word-count/.approaches/introduction.md create mode 100644 exercises/practice/word-count/.approaches/str-iteration-with-dict/content.md create mode 100644 exercises/practice/word-count/.approaches/str-iteration-with-dict/snippet.txt create mode 100644 
exercises/practice/word-count/.approaches/str-replace-with-collections/content.md create mode 100644 exercises/practice/word-count/.approaches/str-replace-with-collections/snippet.txt create mode 100644 exercises/practice/word-count/.approaches/str-replace-with-comprehensions/content.md create mode 100644 exercises/practice/word-count/.approaches/str-replace-with-comprehensions/snippet.txt create mode 100644 exercises/practice/word-count/.approaches/str-replace-with-dict/content.md create mode 100644 exercises/practice/word-count/.approaches/str-replace-with-dict/snippet.txt create mode 100644 exercises/practice/word-count/.approaches/str-translate-with-counter/content.md create mode 100644 exercises/practice/word-count/.approaches/str-translate-with-counter/snippet.txt create mode 100644 exercises/practice/word-count/.approaches/using-the-re-module/content.md create mode 100644 exercises/practice/word-count/.approaches/using-the-re-module/snippet.txt diff --git a/exercises/practice/word-count/..articles/config.json b/exercises/practice/word-count/..articles/config.json new file mode 100644 index 0000000000..0b87f8bfd8 --- /dev/null +++ b/exercises/practice/word-count/..articles/config.json @@ -0,0 +1,11 @@ +{ + "articles": [ + { + "uuid": "c8434a98-716f-48d8-ba34-bb1d159fb420", + "slug": "performance", + "title": "Performance deep dive", + "blurb": "Deep dive to find out the most performant approach for counting words.", + "authors": ["bethanyg"] + } + ] +} \ No newline at end of file diff --git a/exercises/practice/word-count/..articles/performance/code/Benchmark.py b/exercises/practice/word-count/..articles/performance/code/Benchmark.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/exercises/practice/word-count/..articles/performance/content.md b/exercises/practice/word-count/..articles/performance/content.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/exercises/practice/word-count/..articles/performance/snippet.md 
b/exercises/practice/word-count/..articles/performance/snippet.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/exercises/practice/word-count/.approaches/config.json b/exercises/practice/word-count/.approaches/config.json new file mode 100644 index 0000000000..b74f5f9a7a --- /dev/null +++ b/exercises/practice/word-count/.approaches/config.json @@ -0,0 +1,56 @@ +{ + "introduction": { + "authors": ["bethanyg"] + }, + "approaches": [ + { + "uuid": "a3dd190d-ff92-4190-b8aa-2b4b46171b46\n\n", + "slug": "string-iteration-with-dict", + "title": "String Iteration with Dictionary", + "blurb": "Iterate over the sentence to filter out punctuation and sum word counts with a dict.", + "authors": ["bethanyg"] + }, + { + "uuid": "5a5f4f89-b016-48ed-9316-a49cadf9735f\n\n", + "slug": "str-replace-with-dict", + "title": "String Replace with Dictionary", + "blurb": "Use str.replace and str.strip to filter out punctuation and sum word counts with a dict.", + "authors": ["bethanyg"] + }, + { + "uuid": "3d5d629f-58b6-43c5-9470-30b30b193892\n\n", + "slug": "str-replace-with-comprehensions", + "title": "String Replace with Comprehensions", + "blurb": "Use str.replace and str.strip to filter out punctuation and sum word counts with comprehensions.", + "authors": ["bethanyg"] + }, + { + "uuid": "373182b1-8d21-4ee6-aa49-94ff6a9c57a1\n\n", + "slug": "str-replace-with-collections", + "title": "String Replace with Collections", + "blurb": "Use str.replace and str.strip to filter out punctuation and let collections.Counter do the rest.", + "authors": ["bethanyg"] + }, + { + "uuid": "2afed504-500d-4415-a507-b78311bde777\n\n", + "slug": "using-the-re-module", + "title": "Using the Re Module", + "blurb": "Python's Re module is both versatile and powerful.", + "authors": ["bethanyg"] + }, + { + "uuid": "0aa44eec-1c71-4937-8e3f-c45cbba22cc1\n\n", + "slug": "str-translate-with-counter", + "title": "String Translate with Counter", + "blurb": "Use str.translate to filter out punctuation 
and then tally words with collections.Counter.", + "authors": ["bethanyg"] + }, + { + "uuid": "182463b2-fcfa-4445-967c-8b185b7853c9\n\n", + "slug": "filter-with-counter", + "title": "Filter with Counter", + "blurb": "Use the built-in filter function to clean the sentence and collections.Counter to count the words.", + "authors": ["bethanyg"] + } + ] +} \ No newline at end of file diff --git a/exercises/practice/word-count/.approaches/filter-with-counter/content.md b/exercises/practice/word-count/.approaches/filter-with-counter/content.md new file mode 100644 index 0000000000..3ff0b8ce8a --- /dev/null +++ b/exercises/practice/word-count/.approaches/filter-with-counter/content.md @@ -0,0 +1,19 @@ +# Filter with Counter + + +```python +from collections import Counter +import string + +def count_words(sentence): + words = filter(None, (word.strip(string.punctuation) + for word in sentence + .lower() + .replace("_"," ") + .replace(",", " ") + .split()) + ) + return Counter(words) +``` + + diff --git a/exercises/practice/word-count/.approaches/filter-with-counter/snippet.txt b/exercises/practice/word-count/.approaches/filter-with-counter/snippet.txt new file mode 100644 index 0000000000..db209ab074 --- /dev/null +++ b/exercises/practice/word-count/.approaches/filter-with-counter/snippet.txt @@ -0,0 +1,8 @@ +def count_words(sentence): + words = filter(None, (word.strip(string.punctuation) + for word in sentence.lower() + .replace("_"," ") + .replace(",", " ") + .split())) + + return Counter(words) \ No newline at end of file diff --git a/exercises/practice/word-count/.approaches/introduction.md b/exercises/practice/word-count/.approaches/introduction.md new file mode 100644 index 0000000000..c09a990862 --- /dev/null +++ b/exercises/practice/word-count/.approaches/introduction.md @@ -0,0 +1,368 @@ +# Introduction + +There are many Pythonic ways to solve the Word Count exercise. 
+Among them are: + +- String iteration with `dict.get()` +- `str.replace()` for cleaning with: + - `collections.Counter` or `collections.defaultdict()` + - `dict.get()`or `dict.setdefault()` + - `list-comprehension` with `dictionary-comprehension` +- Regex (`re`) for cleaning (using`finditer`, `findall`, or `split`): +- `str.translate()` with `collections.Counter` and `walrus operator` +- `filter()` and `collections.Counter` + +Various parts of these strategies can also be combined or re-combined. + + +## General guidance + + +The goal of the Word Count exercise is to count the number of words used in a given phrase. + +Before an accurate count can be done, the phrase needs to be cleaned of non-word characters (punctuation, whitespace, tabs, etc.) and lowercased. + +This can be thought of in three parts: + +1. Remove unwanted characters from the phrase, +2. Lowercase and split the phrase into a list of words, +3. Count up the word groups in the word list. + +Because strings are immutable in Python, any cleaning action will return a new string. + + +Most idiomatic solutions use either a chain of `str.replace()`, or a regex via the `re` module to clean the phrase. +`str.srtip()` is also very useful for dropping unwanted characters. + +For efficiency, it is best to either lowercase the phrase prior to cleaning, or just before splitting. +Otherwise you risk adding overhead as you call `str.lower()` on individual words. + +To split, `str.split()` or `findall()`| `finditer()`| `split()` from the `re` module are usually the best strategies. +However, it is entirely possible to complete this exercise without splitting the phrase into a word list. + +Counting words most often employs the `Counter()` from `collections`, although `collections.defaultdict()`, `dict.get()` , `dict.setdefault()`, or a `dictionary comprehension` work just as well. 
+ + +The temptation here is to go straight to regex, but `str.replace()` and `str.strip()` are surprisingly performant, and also easier to read for those unfamiliar with regex. + + + +## Approach: String Iteration with `dict.get()` or `dict.setdefault()` + + + +```python +def count_words(sentence): + sentence = sentence.lower() + '\n' #lowercase the sentence and add a carriage return to it. + word_list = {} + new_word = '' + + for pos, value in enumerate(sentence): #enumerate() hands back both index and value + if value.isalpha() or value.isdigit(): + new_word += value + elif new_word and value == "'" and sentence[pos + 1].isalpha(): + new_word += value + else: + if new_word: + word_list[new_word] = word_list.get(new_word, 0) + 1 + new_word = '' + + return word_list +``` + + +This approach avoids splitting the phrase into a word list by iterating over each _character_ in the sentence to determine if it belongs in a valid word (_only letters and numbers are allowed_). + +As each valid word is built up, it is added to the dictionary. + +Using `word_list.get()` allows for setting newly added words to value 1 while incrementing values for words already added to the dictionary. + +Once added to the dictionary, the variable `new_word` is reset to empty, ready for the next word. + + + +## Approach: `str.replace()` and `dict.get()` or `dict.setdefault()` + + + +```python +IGNORE = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c' + +def count_words(phrase): + words = (phrase.replace(',', ' ') + .replace('_', ' ') + .lower() + .split()) #Returns a list, ready for further processing. + + counts = {} + + for word in words: + word = word.strip(IGNORE) + if word: + # counts[word] = counts.setdefault(word, 0) + 1 can be used here instead. 
+ # https://stackoverflow.com/questions/7423428/python-dict-get-vs-setdefault + counts[word] = counts.get(word, 0) + 1 + + return counts + +``` + + + +This approach replaces unwanted characters and lowercases the phrase before splitting it into a word list. + +It then loops through the word list and uses `str.strip()` to remove characters from the IGNORE string. + +If there is a word left after the `str.strip()`operation, it is added to the _counts_ dictionary. + + + + + +## Approach: `str.replace()` and `comprehensions` + + + +```python +IGNORE = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c' + +def count_words(phrase): + cleaned = (phrase.replace(',', ' ') + .replace('_', ' ') + .lower() + .split()) #Returns a list + + words = [word.strip(IGNORE) for word in cleaned] #Reprocesses cleaned, dropping unwaned characters. + + return {word : words.count(word) for word in words} + + + ###Alternatively, lines 4 and 9 can be combined### + + def count_words(phrase): + cleaned = [word.strip(IGNORE) for word in + phrase.replace(',', ' ') + .replace('_', ' ') + .lower() + .split() if word] + + return {word : cleaned.count(word) for word in cleaned} +``` + + + +This approach replaces the `for` loop in the previous approach with a `list comprehension`. + +The `dictionary comprehension` then processes the words list, calling word.count(word) for each word (key) in the words list. + +Note that a `generator expression` cannot be used in this scenario due to the use of `count()` in the dictionary comprehension. 
+ + + + + +## Approach: `str.replace()` and `collections` + + + +```python +from string import punctuation, whitespace +from collections import Counter + +IGNORE = punctuation + whitespace + +def count_words(phrase): + words = (word.strip(IGNORE) for word in + phrase.replace(',', ' ') + .replace('_', ' ') + .lower() + .split() if word) + + return Counter(words) +``` + + + +This approach uses a `generator expression` to lowercase, clean, strip, and split the input phrase into words. + +Words is then consumed by the `Counter()` from the collections module, which counts up the words. + +For convenience, `string.punctuation` `string.whitespace` are imported to replace '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'. + + + +Alternatively, `collections.defaultdict()` can be used, but requires that the words generator be iterated through in an explicit loop: + +```python +from string import punctuation, whitespace +from collections import defaultdict + +IGNORE = punctuation + whitespace + +def count_words(phrase): + words = (word.strip(IGNORE) for word in + phrase.replace(',', ' ') + .replace('_', ' ') + .lower() + .split() if word) + + counts = defaultdict(int) + + for word in words: + counts[word] +=1 + + return counts +``` + + + +This a + + + +## Approach: Using the `re` module + + + +```python +import re +from collections import Counter +from string import punctuation + + +def count_words(phrase): + conditions = re.compile(r"[^_\s,:]+") + return Counter((match.group().strip(punctuation) for match in + re.finditer(conditions, phrase.lower()))) +``` + + + + + +```python +import re +from collections import Counter + + +def count_words(phrase): + conditions = re.compile(r"[a-z0-9]+(?:'[a-z]+)?") + return Counter(conditions.findall(phrase.lower())) +``` + + + + + +```python +import re +from collections import Counter + + +def count_words(sentence): + conditions = re.compile(r"[\s_,\"]+") + words = (re.sub(r'\A\W+', '', re.sub(r'\W+\Z', '', word)) + for word in 
conditions.split(sentence.lower()) if word) + + return Counter(words) +``` + + + +There are many variations that can be used in a regular expressions. + +This approach shows several different regexs, in combination with several different `re` methods. + +The first variation uses `re.findite()` to return a lazy iterator that is then fed to `collections.Counter`. + +The second variation uses `re.findall()`to return a list that is fed to `collections.Counter` + +The final variation uses `re.sub()` and `re.split()`to return a generator that is fed to `collections.Counter`. + + + +## Approach: `str.translate()`with `collections.Counter()` + + + +```python +from collections import Counter + +def count_words(phrase): + cleaner = phrase.maketrans({key: ' ' for key in ".,:-_!@$%^&"}) + cleaned = phrase.translate(cleaner).lower().split() + results = Counter((stripped for word in cleaned if + (stripped := word.strip("\"'")))) + + return results +``` + + + +This approach (somewhat unusually) uses `str.translate()` to filter out unwanted characters. + +`collections.Counter` is then used to count the words. + +The generator expression uses an assignment operator (_othewise know as the "walrus" operator `:=`_) to ensure that empty strings are excluded from the count. + + + +## Approach `filter()` with `collections.Counter()` + + + +```pyth +from collections import Counter +import string + +def count_words(sentence): + words = filter(None, (word.strip(string.punctuation) + for word in sentence + .lower() + .replace("_"," ") + .replace(",", " ") + .split()) + ) + return Counter(words) +``` + + + + + +This approach uses the built-in `filter()` to clean the phrase and `collections.Counter` to count the words. + +Filter is fed the same general generator expression that uses `str.replace()`, `str.lower()`, and `str.split()` seen in other approaches. + + + +For more information, see the [Filter with ] + + + + + +## Which approach to use? 
+ + + +The most performant of these approaches is __ for longer text, and __ for smaller text. Memory-wise, ___ has the lease overhead. + +Overall, using `str.replace()` with a `Counter` or `dict` might be the most readable to those unfamiliar with regex, and iterating through the string might be more readable to those not familiar with Python. + + + +To compare the performance and other tradeoffs of the approaches, take a look at the [Performance article][article-performance]. + + + + + +[approach-string-iteration-with-dict]: https://exercism.org/tracks/python/exercises/word-count/approaches/string-iteration-with-dict +[approach-str-replace-with-dict]: https://exercism.org/tracks/python/exercises/word-count/approaches/str-replace-with-dict +[approach-str-replace-with-comprehensions]: https://exercism.org/tracks/python/exercises/word-count/approaches/str-replace-with-comprehensions +[approach-str-replace-with-collections]: https://exercism.org/tracks/python/exercises/word-count/approaches/str-replace-with-collections +[approach-using-the-re-module]: https://exercism.org/tracks/python/exercises/word-count/approaches/using-the-re-module +[approach-str-translate-with-counter]: https://exercism.org/tracks/python/exercises/word-count/approaches/str-translate-with-counter +[approach-filter-with-counter]: https://exercism.org/tracks/python/exercises/word-count/approaches/filter-with-counter +[article-performance]:https://exercism.org/tracks/python/exercises/word-count/articles/performance \ No newline at end of file diff --git a/exercises/practice/word-count/.approaches/str-iteration-with-dict/content.md b/exercises/practice/word-count/.approaches/str-iteration-with-dict/content.md new file mode 100644 index 0000000000..87bf0f0005 --- /dev/null +++ b/exercises/practice/word-count/.approaches/str-iteration-with-dict/content.md @@ -0,0 +1,23 @@ +# String Iteration with Dictionary + + + +```python +def count_words(sentence): + sentence = sentence.lower() + '\n' #lowercase 
the sentence and add a carriage return to it. + word_list = {} + new_word = '' + + for pos, value in enumerate(sentence): #enumerate() hands back both index and value + if value.isalpha() or value.isdigit(): + new_word += value + elif new_word and value == "'" and sentence[pos + 1].isalpha(): + new_word += value + else: + if new_word: + word_list[new_word] = word_list.get(new_word, 0) + 1 + new_word = '' + + return word_list +``` + diff --git a/exercises/practice/word-count/.approaches/str-iteration-with-dict/snippet.txt b/exercises/practice/word-count/.approaches/str-iteration-with-dict/snippet.txt new file mode 100644 index 0000000000..1e50deb4ab --- /dev/null +++ b/exercises/practice/word-count/.approaches/str-iteration-with-dict/snippet.txt @@ -0,0 +1,8 @@ +for pos, value in enumerate(sentence): + if value.isalpha() or value.isdigit(): + new_word += value + elif new_word and value == "'" and sentence[pos + 1].isalpha(): + new_word += value + else: + if new_word: + word_list[new_word] = word_list.get(new_word, 0) + 1; new_word = '' \ No newline at end of file diff --git a/exercises/practice/word-count/.approaches/str-replace-with-collections/content.md b/exercises/practice/word-count/.approaches/str-replace-with-collections/content.md new file mode 100644 index 0000000000..058f11adb0 --- /dev/null +++ b/exercises/practice/word-count/.approaches/str-replace-with-collections/content.md @@ -0,0 +1,50 @@ +# String Replace with Collections + + +```python +from string import punctuation, whitespace +from collections import Counter + +IGNORE = punctuation + whitespace + +def count_words(phrase): + words = (word.strip(IGNORE) for word in + phrase.replace(',', ' ') + .replace('_', ' ') + .lower() + .split() if word) + + return Counter(words) +``` + +This approach uses a `generator expression` to lowercase, clean, strip, and split the input phrase into words. + +Words is then consumed by the `Counter()` from the collections module, which counts up the words. 
+ +For convenience, `string.punctuation` `string.whitespace` are imported to replace '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'. + + + +Alternatively, `collections.defaultdict()` can be used, but requires that the words generator be iterated through in an explicit loop: + +```python +from string import punctuation, whitespace +from collections import defaultdict + +IGNORE = punctuation + whitespace + +def count_words(phrase): + words = (word.strip(IGNORE) for word in + phrase.replace(',', ' ') + .replace('_', ' ') + .lower() + .split() if word) + + counts = defaultdict(int) + + for word in words: + counts[word] +=1 + + return counts +``` + diff --git a/exercises/practice/word-count/.approaches/str-replace-with-collections/snippet.txt b/exercises/practice/word-count/.approaches/str-replace-with-collections/snippet.txt new file mode 100644 index 0000000000..1bef2f45ea --- /dev/null +++ b/exercises/practice/word-count/.approaches/str-replace-with-collections/snippet.txt @@ -0,0 +1,8 @@ +IGNORE = punctuation + whitespace +words = (word.strip(IGNORE) for word in + phrase.replace(',', ' ') + .replace('_', ' ') + .lower() + .split() if word) + +return Counter(words) \ No newline at end of file diff --git a/exercises/practice/word-count/.approaches/str-replace-with-comprehensions/content.md b/exercises/practice/word-count/.approaches/str-replace-with-comprehensions/content.md new file mode 100644 index 0000000000..60027d376e --- /dev/null +++ b/exercises/practice/word-count/.approaches/str-replace-with-comprehensions/content.md @@ -0,0 +1,36 @@ +# String Replace with Comprehensions + + + +```python +IGNORE = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c' + +def count_words(phrase): + cleaned = (phrase.replace(',', ' ') + .replace('_', ' ') + .lower().split()) #Returns a list + + words = [word.strip(IGNORE) for word in cleaned] #Reprocesses cleaned, dropping unwaned characters. 
+ + return {word : words.count(word) for word in words} + + + ###Alternatively, lines 4 and 9 can be combined### + + def count_words(phrase): + cleaned = [word.strip(IGNORE) for word in + phrase.replace(',', ' ') + .replace('_', ' ') + .lower() + .split() if word] + + return {word : cleaned.count(word) for word in cleaned} +``` + + + +This approach replaces the `for` loop in the previous approach with a `list comprehension`. + +The `dictionary comprehension` then processes the words list, calling `words.count(word)` for each word (key) in the words list. + +Note that a `generator expression` cannot be used in this scenario due to the use of `count()` in the dictionary comprehension. \ No newline at end of file diff --git a/exercises/practice/word-count/.approaches/str-replace-with-comprehensions/snippet.txt b/exercises/practice/word-count/.approaches/str-replace-with-comprehensions/snippet.txt new file mode 100644 index 0000000000..71430c680f --- /dev/null +++ b/exercises/practice/word-count/.approaches/str-replace-with-comprehensions/snippet.txt @@ -0,0 +1,8 @@ +IGNORE = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c' + +def count_words(phrase): + cleaned = (phrase.replace(',', ' ') + .replace('_', ' ') + .lower().split()) + words = [word.strip(IGNORE) for word in cleaned] + return {word : words.count(word) for word in words} \ No newline at end of file diff --git a/exercises/practice/word-count/.approaches/str-replace-with-dict/content.md b/exercises/practice/word-count/.approaches/str-replace-with-dict/content.md new file mode 100644 index 0000000000..2fe90f5688 --- /dev/null +++ b/exercises/practice/word-count/.approaches/str-replace-with-dict/content.md @@ -0,0 +1,31 @@ +# String Replace with Dictionary + + +```python +IGNORE = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c' + +def count_words(phrase): + words = (phrase.replace(',', ' ') + .replace('_', ' ') + .lower() + .split()) #Returns a list, ready for further processing. 
+ + counts = {} + + for word in words: + word = word.strip(IGNORE) + if word: + # counts[word] = counts.setdefault(word, 0) + 1 can be used here instead. + # https://stackoverflow.com/questions/7423428/python-dict-get-vs-setdefault + counts[word] = counts.get(word, 0) + 1 + + return counts +``` + + + +This approach replaces unwanted characters and lowercases the phrase before splitting it into a word list. + +It then loops through the word list and uses `str.strip()` to remove characters from the IGNORE string. + +If there is a word left after the `str.strip()`operation, it is added to the _counts_ dictionary. \ No newline at end of file diff --git a/exercises/practice/word-count/.approaches/str-replace-with-dict/snippet.txt b/exercises/practice/word-count/.approaches/str-replace-with-dict/snippet.txt new file mode 100644 index 0000000000..269b632221 --- /dev/null +++ b/exercises/practice/word-count/.approaches/str-replace-with-dict/snippet.txt @@ -0,0 +1,8 @@ +counts = {} + +for word in words: + word = word.strip(IGNORE) + if word: + counts[word] = counts.get(word, 0) + 1 + +return counts \ No newline at end of file diff --git a/exercises/practice/word-count/.approaches/str-translate-with-counter/content.md b/exercises/practice/word-count/.approaches/str-translate-with-counter/content.md new file mode 100644 index 0000000000..276f1d2c8a --- /dev/null +++ b/exercises/practice/word-count/.approaches/str-translate-with-counter/content.md @@ -0,0 +1,23 @@ +# String Translate with Counter + + + +```python +from collections import Counter + +def count_words(phrase): + cleaner = phrase.maketrans({key: ' ' for key in ".,:-_!@$%^&"}) + cleaned = phrase.translate(cleaner).lower().split() + results = Counter((stripped for word in cleaned if + (stripped := word.strip("\"'")))) + + return results +``` + + + +This approach (somewhat unusually) uses `str.translate()` to filter out unwanted characters. + +`collections.Counter` is then used to count the words. 
+ +The generator expression uses an assignment operator (_otherwise known as the "walrus" operator `:=`_) to ensure that empty strings are excluded from the count. \ No newline at end of file diff --git a/exercises/practice/word-count/.approaches/str-translate-with-counter/snippet.txt b/exercises/practice/word-count/.approaches/str-translate-with-counter/snippet.txt new file mode 100644 index 0000000000..35965c2206 --- /dev/null +++ b/exercises/practice/word-count/.approaches/str-translate-with-counter/snippet.txt @@ -0,0 +1,7 @@ +def count_words(phrase): + cleaner = phrase.maketrans({key: ' ' for key in ".,:-_!@$%^&"}) + cleaned = phrase.translate(cleaner).lower().split() + results = Counter((stripped for word in cleaned if + (stripped := word.strip("\"'")))) + + return results \ No newline at end of file diff --git a/exercises/practice/word-count/.approaches/using-the-re-module/content.md b/exercises/practice/word-count/.approaches/using-the-re-module/content.md new file mode 100644 index 0000000000..ad56f0323a --- /dev/null +++ b/exercises/practice/word-count/.approaches/using-the-re-module/content.md @@ -0,0 +1,57 @@ +# Using the Re Module + + + +```python +import re +from collections import Counter +from string import punctuation + + +def count_words(phrase): + conditions = re.compile(r"[^_\s,:]+") + return Counter((match.group().strip(punctuation) for match in + re.finditer(conditions, phrase.lower()))) +``` + + + + +```python +import re +from collections import Counter + + +def count_words(phrase): + conditions = re.compile(r"[a-z0-9]+(?:'[a-z]+)?") + return Counter(conditions.findall(phrase.lower())) +``` + + + + + +```python +import re +from collections import Counter + + +def count_words(sentence): + conditions = re.compile(r"[\s_,\"]+") + words = (re.sub(r'\A\W+', '', re.sub(r'\W+\Z', '', word)) + for word in conditions.split(sentence.lower()) if word) + + return Counter(words) +``` + + + +There are many variations that can be used in regular
expressions. + +This approach shows several different regexes, in combination with several different `re` methods. + +The first variation uses `re.finditer()` to return a lazy iterator that is then fed to `collections.Counter`. + +The second variation uses `re.findall()` to return a list that is fed to `collections.Counter`. + +The final variation uses `re.sub()` and `re.split()` to return a generator that is fed to `collections.Counter`. \ No newline at end of file diff --git a/exercises/practice/word-count/.approaches/using-the-re-module/snippet.txt b/exercises/practice/word-count/.approaches/using-the-re-module/snippet.txt new file mode 100644 index 0000000000..623a3b78e0 --- /dev/null +++ b/exercises/practice/word-count/.approaches/using-the-re-module/snippet.txt @@ -0,0 +1,7 @@ +import re +from collections import Counter + + +def count_words(phrase): + conditions = re.compile(r"[a-z0-9]+(?:'[a-z]+)?") + return Counter(conditions.findall(phrase.lower())) \ No newline at end of file From 283741163e1ee11b13c5f5bf45263d0d1e506ed5 Mon Sep 17 00:00:00 2001 From: BethanyG Date: Tue, 21 Mar 2023 02:46:22 -0700 Subject: [PATCH 2/4] Removed the \n characters from the UUIDs. 
--- .../practice/word-count/.approaches/config.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/exercises/practice/word-count/.approaches/config.json b/exercises/practice/word-count/.approaches/config.json index b74f5f9a7a..299be22627 100644 --- a/exercises/practice/word-count/.approaches/config.json +++ b/exercises/practice/word-count/.approaches/config.json @@ -4,49 +4,49 @@ }, "approaches": [ { - "uuid": "a3dd190d-ff92-4190-b8aa-2b4b46171b46\n\n", + "uuid": "a3dd190d-ff92-4190-b8aa-2b4b46171b46", "slug": "string-iteration-with-dict", "title": "String Iteration with Dictionary", "blurb": "Iterate over the sentence to filter out punctuation and sum word counts with a dict.", "authors": ["bethanyg"] }, { - "uuid": "5a5f4f89-b016-48ed-9316-a49cadf9735f\n\n", + "uuid": "5a5f4f89-b016-48ed-9316-a49cadf9735f", "slug": "str-replace-with-dict", "title": "String Replace with Dictionary", "blurb": "Use str.replace and str.strip to filter out punctuation and sum word counts with a dict.", "authors": ["bethanyg"] }, { - "uuid": "3d5d629f-58b6-43c5-9470-30b30b193892\n\n", + "uuid": "3d5d629f-58b6-43c5-9470-30b30b193892", "slug": "str-replace-with-comprehensions", "title": "String Replace with Comprehensions", "blurb": "Use str.replace and str.strip to filter out punctuation and sum word counts with comprehensions.", "authors": ["bethanyg"] }, { - "uuid": "373182b1-8d21-4ee6-aa49-94ff6a9c57a1\n\n", + "uuid": "373182b1-8d21-4ee6-aa49-94ff6a9c57a1", "slug": "str-replace-with-collections", "title": "String Replace with Collections", "blurb": "Use str.replace and str.strip to filter out punctuation and let collections.Counter do the rest.", "authors": ["bethanyg"] }, { - "uuid": "2afed504-500d-4415-a507-b78311bde777\n\n", + "uuid": "2afed504-500d-4415-a507-b78311bde777", "slug": "using-the-re-module", "title": "Using the Re Module", "blurb": "Python's Re module is both versatile and powerful.", "authors": ["bethanyg"] }, { - "uuid": 
"0aa44eec-1c71-4937-8e3f-c45cbba22cc1\n\n", + "uuid": "0aa44eec-1c71-4937-8e3f-c45cbba22cc1", "slug": "str-translate-with-counter", "title": "String Translate with Counter", "blurb": "Use str.translate to filter out punctuation and then tally words with collections.Counter.", "authors": ["bethanyg"] }, { - "uuid": "182463b2-fcfa-4445-967c-8b185b7853c9\n\n", + "uuid": "182463b2-fcfa-4445-967c-8b185b7853c9", "slug": "filter-with-counter", "title": "Filter with Counter", "blurb": "Use the built-in filter function to clean the sentence and collections.Counter to count the words.", From f56c8634f772299c17ea3276e8ed9a7e206eb03c Mon Sep 17 00:00:00 2001 From: BethanyG Date: Tue, 21 Mar 2023 02:48:30 -0700 Subject: [PATCH 3/4] Fixed slug for str-iteration-with-dict. --- exercises/practice/word-count/.approaches/config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exercises/practice/word-count/.approaches/config.json b/exercises/practice/word-count/.approaches/config.json index 299be22627..9c72ccbccd 100644 --- a/exercises/practice/word-count/.approaches/config.json +++ b/exercises/practice/word-count/.approaches/config.json @@ -5,7 +5,7 @@ "approaches": [ { "uuid": "a3dd190d-ff92-4190-b8aa-2b4b46171b46", - "slug": "string-iteration-with-dict", + "slug": "str-iteration-with-dict", "title": "String Iteration with Dictionary", "blurb": "Iterate over the sentence to filter out punctuation and sum word counts with a dict.", "authors": ["bethanyg"] From 45dd93904c0005d7bf85c736f603037229e72785 Mon Sep 17 00:00:00 2001 From: BethanyG Date: Thu, 6 Apr 2023 12:45:24 -0700 Subject: [PATCH 4/4] Cleaned up intro and added specific links to detailed approaches. 
--- .../word-count/.approaches/introduction.md | 126 +++++++----------- 1 file changed, 50 insertions(+), 76 deletions(-) diff --git a/exercises/practice/word-count/.approaches/introduction.md b/exercises/practice/word-count/.approaches/introduction.md index c09a990862..53624ae6e6 100644 --- a/exercises/practice/word-count/.approaches/introduction.md +++ b/exercises/practice/word-count/.approaches/introduction.md @@ -4,15 +4,20 @@ There are many Pythonic ways to solve the Word Count exercise. Among them are: - String iteration with `dict.get()` -- `str.replace()` for cleaning with: - - `collections.Counter` or `collections.defaultdict()` - - `dict.get()`or `dict.setdefault()` - - `list-comprehension` with `dictionary-comprehension` -- Regex (`re`) for cleaning (using`finditer`, `findall`, or `split`): -- `str.translate()` with `collections.Counter` and `walrus operator` -- `filter()` and `collections.Counter` +- Using `str.replace()` for cleaning with: + - `collections.Counter` or `collections.defaultdict()` for counting + - `dict.get()` or `dict.setdefault()` for counting + - A `list-comprehension` with a `dictionary-comprehension` +- Employing regex (_the `re` module_) for cleaning: + - `re.finditer()` with `collections.Counter()`, + - `re.findall()` with `collections.Counter()`, + - Or `re.split()` with `collections.Counter()`. +- Using `str.translate()` with `collections.Counter` and a `walrus operator` +- Combining the built-in `filter()` with `collections.Counter` Various parts of these strategies can also be combined or re-combined. +For example, `collections.defaultdict()`, `dict.get()`/`dict.setdefault()`, or a `dictionary-comprehension` can be swapped for `collections.Counter()`, or vice-versa in most, if not all, of the solutions. +Likewise, `str.replace()` can be swapped with regex. ## General guidance @@ -20,7 +25,7 @@ Various parts of these strategies can also be combined or re-combined.
The goal of the Word Count exercise is to count the number of words used in a given phrase. -Before an accurate count can be done, the phrase needs to be cleaned of non-word characters (punctuation, whitespace, tabs, etc.) and lowercased. +Before an accurate count can be done, the phrase needs to be cleaned of non-word characters (_punctuation, whitespace, tabs, etc._) and lower-cased. This can be thought of in three parts: @@ -35,15 +40,16 @@ Most idiomatic solutions use either a chain of `str.replace()`, or a regex via `str.srtip()` is also very useful for dropping unwanted characters. For efficiency, it is best to either lowercase the phrase prior to cleaning, or just before splitting. -Otherwise you risk adding overhead as you call `str.lower()` on individual words. +Otherwise, you risk adding overhead as you call `str.lower()` on individual words. -To split, `str.split()` or `findall()`| `finditer()`| `split()` from the `re` module are usually the best strategies. +To split, `str.split()` or `findall()`/`finditer()`/`split()` from the `re` module are usually the best strategies. However, it is entirely possible to complete this exercise without splitting the phrase into a word list. Counting words most often employs the `Counter()` from `collections`, although `collections.defaultdict()`, `dict.get()` , `dict.setdefault()`, or a `dictionary comprehension` work just as well. The temptation here is to go straight to regex, but `str.replace()` and `str.strip()` are surprisingly performant, and also easier to read for those unfamiliar with regex. +A complex regular expression that involves backtracking can also be much slower than `str.replace()` or `str.translate()`, so regexes should be composed and tested carefully. @@ -72,13 +78,12 @@ def count_words(sentence): This approach avoids splitting the phrase into a word list by iterating over each _character_ in the sentence to determine if it belongs in a valid word (_only letters and numbers are allowed_).
- As each valid word is built up, it is added to the dictionary. Using `word_list.get()` allows for setting newly added words to value 1 while incrementing values for words already added to the dictionary. - Once added to the dictionary, the variable `new_word` is reset to empty, ready for the next word. +For more details, see the [String iteration with Dictionary Methods][approach-string-iteration-with-dict] approach. ## Approach: `str.replace()` and `dict.get()` or `dict.setdefault()` @@ -107,16 +112,11 @@ def count_words(phrase): ``` - - -This approach replaces unwanted characters and lowercases the phrase before splitting it into a word list. - +This approach replaces unwanted characters and lower-cases the phrase before splitting it into a word list. It then loops through the word list and uses `str.strip()` to remove characters from the IGNORE string. - If there is a word left after the `str.strip()`operation, it is added to the _counts_ dictionary. - - +For more information, read the [`str.replace()` with Dictionary Methods][approach-str-replace-with-dict] approach. ## Approach: `str.replace()` and `comprehensions` @@ -150,16 +150,19 @@ def count_words(phrase): ``` - This approach replaces the `for` loop in the previous approach with a `list comprehension`. - The `dictionary comprehension` then processes the words list, calling word.count(word) for each word (key) in the words list. -Note that a `generator expression` cannot be used in this scenario due to the use of `count()` in the dictionary comprehension. +~~~~exercism/note +A `generator expression` cannot be used in this scenario due to the use of `count()` in the dictionary comprehension. +Generator expressions are not indexable, and can only be iterated through once. +~~~~ +For more information, read the [`str.replace()` with Comprehensions][approach-str-replace-with-comprehensions] approach.
+ ## Approach: `str.replace()` and `collections` @@ -182,16 +185,13 @@ def count_words(phrase): ``` - -This approach uses a `generator expression` to lowercase, clean, strip, and split the input phrase into words. - -Words is then consumed by the `Counter()` from the collections module, which counts up the words. - +This approach uses a `generator expression` to lower-case, clean, strip, and split the input phrase into words. +The words generator is then consumed by `collections.Counter()`, which counts up the words. For convenience, `string.punctuation` `string.whitespace` are imported to replace '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'. +Alternatively, `collections.defaultdict()` can be used in place of `Counter()`, but requires that the generator be iterated through in an explicit loop: -Alternatively, `collections.defaultdict()` can be used, but requires that the words generator be iterated through in an explicit loop: ```python from string import punctuation, whitespace @@ -214,10 +214,7 @@ def count_words(phrase): return counts ``` - - -This a - +For details on these two approaches, see the [`str.replace()` with collections][approach-str-replace-with-collections] approach. ## Approach: Using the `re` module @@ -237,9 +234,6 @@ def count_words(phrase): ``` - - - ```python import re from collections import Counter @@ -252,8 +246,6 @@ def count_words(phrase): - - ```python import re from collections import Counter @@ -268,20 +260,17 @@ def count_words(sentence): ``` +Regular Expressions have an almost endless variety. +This approach shows several different regular expressions in combination with several different `re` module methods. -There are many variations that can be used in a regular expressions. +1. `re.finditer()` to return a lazy iterator that is then fed to `collections.Counter()`. +2. `re.findall()` to return a list that is fed to `collections.Counter()`. +3.
`re.sub()` and `re.split()` to return a generator that is fed to `collections.Counter()`. -This approach shows several different regexs, in combination with several different `re` methods. +For all the details on these variations, take a look at the [Using the `re` Module][approach-using-the-re-module] approach. -The first variation uses `re.findite()` to return a lazy iterator that is then fed to `collections.Counter`. -The second variation uses `re.findall()`to return a list that is fed to `collections.Counter` - -The final variation uses `re.sub()` and `re.split()`to return a generator that is fed to `collections.Counter`. - - - -## Approach: `str.translate()`with `collections.Counter()` +## Approach: `str.translate()` with `collections.Counter()` @@ -298,20 +287,21 @@ def count_words(phrase): ``` - -This approach (somewhat unusually) uses `str.translate()` to filter out unwanted characters. - +This approach uses (the somewhat unusual) `str.translate()` to filter out unwanted characters. `collections.Counter` is then used to count the words. + The generator expression uses an assignment operator (_othewise know as the "walrus" operator `:=`_) to ensure that empty strings are excluded from the count. +For more details, see the [Using `str.translate()` with `collections.Counter()`][approach-str-translate-with-counter] approach. + ## Approach `filter()` with `collections.Counter()` -```pyth +```python from collections import Counter import string @@ -326,43 +316,27 @@ def count_words(sentence): return Counter(words) ``` - - - - -This approach uses the built-in `filter()` to clean the phrase and `collections.Counter` to count the words. - +This approach uses the built-in `filter()` to clean the phrase and `collections.Counter()` to count the words. Filter is fed the same general generator expression that uses `str.replace()`, `str.lower()`, and `str.split()` seen in other approaches.
- -For more information, see the [Filter with ] - - - +For more information, read the [Filter with `collections.Counter()`][approach-filter-with-counter] approach. ## Which approach to use? - The most performant of these approaches is __ for longer text, and __ for smaller text. Memory-wise, ___ has the lease overhead. -Overall, using `str.replace()` with a `Counter` or `dict` might be the most readable to those unfamiliar with regex, and iterating through the string might be more readable to those not familiar with Python. - - +Overall, using `str.replace()` with a `collections.Counter()` or `dict` might be the most readable to those unfamiliar with regex, and iterating through the string might be more readable to those not familiar with Python. To compare the performance and other tradeoffs of the approaches, take a look at the [Performance article][article-performance]. - - - - -[approach-string-iteration-with-dict]: https://exercism.org/tracks/python/exercises/word-count/approaches/string-iteration-with-dict -[approach-str-replace-with-dict]: https://exercism.org/tracks/python/exercises/word-count/approaches/str-replace-with-dict -[approach-str-replace-with-comprehensions]: https://exercism.org/tracks/python/exercises/word-count/approaches/str-replace-with-comprehensions +[approach-filter-with-counter]: https://exercism.org/tracks/python/exercises/word-count/approaches/filter-with-counter [approach-str-replace-with-collections]: https://exercism.org/tracks/python/exercises/word-count/approaches/str-replace-with-collections -[approach-using-the-re-module]: https://exercism.org/tracks/python/exercises/word-count/approaches/using-the-re-module +[approach-str-replace-with-comprehensions]: https://exercism.org/tracks/python/exercises/word-count/approaches/str-replace-with-comprehensions +[approach-str-replace-with-dict]: https://exercism.org/tracks/python/exercises/word-count/approaches/str-replace-with-dict [approach-str-translate-with-counter]: 
https://exercism.org/tracks/python/exercises/word-count/approaches/str-translate-with-counter -[approach-filter-with-counter]: https://exercism.org/tracks/python/exercises/word-count/approaches/filter-with-counter -[article-performance]:https://exercism.org/tracks/python/exercises/word-count/articles/performance \ No newline at end of file +[approach-string-iteration-with-dict]: https://exercism.org/tracks/python/exercises/word-count/approaches/str-iteration-with-dict +[approach-using-the-re-module]: https://exercism.org/tracks/python/exercises/word-count/approaches/using-the-re-module +[article-performance]: https://exercism.org/tracks/python/exercises/word-count/articles/performance