Bug fix for duplicate words
This word list is still pretty bad ...
This commit is contained in:
parent
2b514a981d
commit
f88c7f856d
|
@ -11,9 +11,8 @@ import geohash
|
|||
def get_google_words():
|
||||
lines = open("words/google-ngram-list")
|
||||
words = []
|
||||
for line in lines:
|
||||
_, word = line.split()
|
||||
words.append(word)
|
||||
for word in lines:
|
||||
words.append(word.strip())
|
||||
|
||||
lines.close()
|
||||
random.seed(634634)
|
||||
|
|
|
@ -21,7 +21,7 @@ To get a list of the top 300 words:
|
|||
|
||||
To create the wordlist used by `These3Words` run:
|
||||
|
||||
sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | tail -n32768 > google-ngram-list
|
||||
sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | sort | uniq | tail -n32768 > google-ngram-list
|
||||
|
||||
Check that your list is long enough by counting the lines
|
||||
in `google-ngram-list`; you need exactly 32768 words.
|
65536
words/google-ngram-list
65536
words/google-ngram-list
File diff suppressed because it is too large
Load Diff
|
@ -109,4 +109,4 @@ for line in fileinput.input():
|
|||
if len(word) == 4:
|
||||
continue
|
||||
|
||||
print count, word.lower()
|
||||
print word.lower()
|
||||
|
|
Loading…
Reference in New Issue