Bug fix for duplicate words
This word list is still pretty bad ...
commit f88c7f856d
parent 2b514a981d
@@ -11,9 +11,8 @@ import geohash
 def get_google_words():
     lines = open("words/google-ngram-list")
     words = []
-    for line in lines:
-        _, word = line.split()
-        words.append(word)
+    for word in lines:
+        words.append(word.strip())
 
     lines.close()
     random.seed(634634)
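For reference, a minimal sketch of the patched function, assuming the regenerated `words/google-ngram-list` now holds one bare word per line (the old list held `count word` pairs, which is why the `line.split()` step goes away):

    # Sketch only: the function as it reads after this hunk; anything
    # past random.seed() lies outside the diff and is not shown.
    import random

    def get_google_words():
        lines = open("words/google-ngram-list")
        words = []
        for word in lines:
            words.append(word.strip())  # drop the trailing newline

        lines.close()
        random.seed(634634)  # fixed seed keeps later shuffling reproducible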
@@ -21,7 +21,7 @@ To get a list of the top 300 words:
 
 To create the wordlist used by `These3Words` run:
 
-    sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | tail -n32768 > google-ngram-list
+    sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | sort | uniq | tail -n32768 > google-ngram-list
 
 Check that your list is long enough by counting the lines
 in `google-ngram-list`, you need exactly 32768 words
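The new `sort | uniq` stage is the actual fix: it collapses repeated words before `tail -n32768` takes the final 32768. A small check, under the assumption that the finished list holds one word per line, that the list is both long enough and duplicate-free:

    # Sanity check for words/google-ngram-list: exactly 32768 lines,
    # all of them distinct. Assumes one word per line.
    words = [line.strip() for line in open("words/google-ngram-list")]
    assert len(words) == 32768, "expected 32768 words, got %d" % len(words)
    assert len(set(words)) == len(words), "list still contains duplicates"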
words/google-ngram-list: 65536 changes (diff suppressed because it is too large)
|
@ -109,4 +109,4 @@ for line in fileinput.input():
|
||||||
if len(word) == 4:
|
if len(word) == 4:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
print count, word.lower()
|
print word.lower()
|
||||||
|
|
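Dropping `count` from the output is what makes the pipeline fix work: with counts still attached, two lines carrying the same word could differ in their count and slip past `uniq`. A hedged sketch of the script's tail after this change; the `count, word = line.split()` step is an assumption, since the line that binds `word` sits outside the hunk:

    # Python 2, matching the print statement in the diff.
    import fileinput

    for line in fileinput.input():
        count, word = line.split()  # assumed input: "count word" pairs
        if len(word) == 4:          # four-letter words are skipped
            continue

        print word.lower()          # no count, so sort | uniq can deduplicate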