Bug fix for duplicate words
This word list is still pretty bad ...
commit f88c7f856d
parent 2b514a981d
@@ -11,9 +11,8 @@ import geohash
 def get_google_words():
     lines = open("words/google-ngram-list")
     words = []
-    for line in lines:
-        _, word = line.split()
-        words.append(word)
+    for word in lines:
+        words.append(word.strip())
 
     lines.close()
     random.seed(634634)
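For reference, a minimal sketch of the patched function, assuming the regenerated `words/google-ngram-list` now holds one bare word per line (the old list held `count word` pairs, which is why the `line.split()` step goes away):

    # Sketch only: the function as it reads after this hunk; anything
    # past random.seed() lies outside the diff and is not shown.
    import random

    def get_google_words():
        lines = open("words/google-ngram-list")
        words = []
        for word in lines:
            words.append(word.strip())  # drop the trailing newline

        lines.close()
        random.seed(634634)  # fixed seed keeps later shuffling reproducible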
@@ -21,7 +21,7 @@ To get a list of the top 300 words:
 
 To create the wordlist used by `These3Words` run:
 
-    sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | tail -n32768 > google-ngram-list
+    sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | sort | uniq | tail -n32768 > google-ngram-list
 
 Check that your list is long enough by counting the lines
 in `google-ngram-list`, you need exactly 32768 words
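The new `sort | uniq` stage is the actual fix: it collapses repeated words before `tail -n32768` takes the final 32768. A small check, under the assumption that the finished list holds one word per line, that the list is both long enough and duplicate-free:

    # Sanity check for words/google-ngram-list: exactly 32768 lines,
    # all of them distinct. Assumes one word per line.
    words = [line.strip() for line in open("words/google-ngram-list")]
    assert len(words) == 32768, "expected 32768 words, got %d" % len(words)
    assert len(set(words)) == len(words), "list still contains duplicates"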
words/google-ngram-list: 65536 changes (diff suppressed because it is too large)
|
@ -109,4 +109,4 @@ for line in fileinput.input():
|
||||||
if len(word) == 4:
|
if len(word) == 4:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
print count, word.lower()
|
print word.lower()
|
||||||
|
|
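Dropping `count` from the output is what makes the pipeline fix work: with counts still attached, two lines carrying the same word could differ in their count and slip past `uniq`. A hedged sketch of the script's tail after this change; the `count, word = line.split()` step is an assumption, since the line that binds `word` sits outside the hunk:

    # Python 2, matching the print statement in the diff.
    import fileinput

    for line in fileinput.input():
        count, word = line.split()  # assumed input: "count word" pairs
        if len(word) == 4:          # four-letter words are skipped
            continue

        print word.lower()          # no count, so sort | uniq can deduplicate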