New word list based on google ngram from 2008

In addition, made the handling of word lists in thesethreewords.py
more modular.
Tim Head 2014-11-21 09:23:07 +01:00
parent 4aa7e921e7
commit e89bbe2399
5 changed files with 32782 additions and 32795 deletions

View File

@@ -22,7 +22,7 @@ def get_words(fname):
return words
# These read like alien races from a sci-fi book
GOOGLE_WORDLIST = get_words("words/google-ngram-list")
GOOGLE_WORDLIST = get_words("words/google-ngram-list2")
# current best list for the three word hash
WORDNET_LEMMAS = get_words("words/wordnet-list")
@@ -74,7 +74,8 @@ class WordHasher(object):
self._symbols = "0123456789bcdefghjkmnpqrstuvwxyz"
self._decode_symbols = dict((ch, i) for (i, ch) in enumerate(self._symbols))
self._encode_symbols = dict((i, ch) for (i, ch) in enumerate(self._symbols))
self.six_wordlist = HUMAN_WORDLIST
self.three_wordlist = GOOGLE_WORDLIST
def three_words(self, (lat, lon)):
"""Convert coordinate to a combination of three words
@@ -83,7 +84,7 @@ class WordHasher(object):
in degrees.
"""
gh = geohash.encode(lat, lon, 9)
words = "-".join(WORDNET_LEMMAS[p] for p in self.to_rugbits(self.geo_to_int(gh)))
words = "-".join(self.three_wordlist[p] for p in self.to_rugbits(self.geo_to_int(gh)))
return words
def six_words(self, (lat, lon)):
@@ -96,17 +97,17 @@ class WordHasher(object):
which are short, easy to pronounce and easy to distinguish.
"""
gh = geohash.encode(lat, lon, 9)
words = "-".join(HUMAN_WORDLIST[p] for p in self.to_bytes(self.pad(gh)))
words = "-".join(self.six_wordlist[p] for p in self.to_bytes(self.pad(gh)))
return words
def decode(self, words):
"""Decode words back to latitude and longitude"""
words = words.split("-")
if len(words) == 3:
i = self.rugbits_to_int([WORDNET_LEMMAS.index(w) for w in words])
i = self.rugbits_to_int([self.three_wordlist.index(w) for w in words])
elif len(words) == 6:
i = self.bytes_to_int([HUMAN_WORDLIST.index(w) for w in words])
i = self.bytes_to_int([self.six_wordlist.index(w) for w in words])
i = self.unpad(i)
else:
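
A note on the hunks above: a 9-character geohash carries 45 bits (5 bits per base-32 character), and the `to_rugbits`/`rugbits_to_int` helpers referenced in the diff split that integer into three 15-bit indices, which is why the word list has to hold exactly 2^15 = 32768 entries. The six-word form appears to pad the 45 bits out to 48 (see `pad`/`unpad`) and index what is presumably a 256-entry list byte by byte. Below is a minimal sketch of the three-word split; it illustrates the idea and is not a copy of the repository's helpers.

    # Sketch: three 15-bit "rugbits" in and out of a 45-bit geohash integer.
    # 2**15 == 32768, hence the required word-list length.
    def to_rugbits(geo_int):
        fifteen_bits = (1 << 15) - 1
        return [(geo_int >> 30) & fifteen_bits,
                (geo_int >> 15) & fifteen_bits,
                geo_int & fifteen_bits]

    def rugbits_to_int(rugbits):
        return (rugbits[0] << 30) | (rugbits[1] << 15) | rugbits[2]

    assert rugbits_to_int(to_rugbits(0x1ABCDEF0123)) == 0x1ABCDEF0123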

View File

@@ -20,7 +20,8 @@ Download the corpus from [google ngram][googlengram] with:
[wordnet]: http://wordnet.princeton.edu/
[googlengram]: http://storage.googleapis.com/books/ngrams/books/datasetsv2.html
then you can filter the words like this:
Filter out unpopular words, words that are not between four and seven
characters long, words containing punctuation or digits, etc., like this:
for L in a b c d e f g h i j k l m n o p q r s t u v w x y z; do
gzcat googlebooks-eng-all-1gram-20120701-$L.gz | python ngram-filter.py > googlebooks-eng-all-1gram-20120701-$L-filtered;
@@ -30,9 +31,9 @@ To get a list of the top 300 words:
sort -n googlebooks-eng-all-1gram-20120701-*-filtered | tail -n 300
To create the wordlist used by `These3Words` run:
The final step in creating a wordlist usable by `These3Words` is to run:
sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | sort | uniq | tail -n32768 > google-ngram-list
sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | sort -k 2 | uniq -f 1 | sort -n | tail -n32768 | awk '{print $2}' > google-ngram-list
Check that your list is long enough by counting the lines
in `google-ngram-list`, you need exactly 32768 words
in `google-ngram-list`; you need exactly 32768 words.
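
For reference, the new post-processing step works like this: `sort -k 2 | uniq -f 1` collapses duplicate words (comparing only the word field), the second `sort -n | tail -n32768` keeps the 32768 most frequent entries, and `awk '{print $2}'` strips the counts. A rough Python equivalent, assuming `count word` lines on stdin, is sketched below; it is illustrative only and not part of the repository.

    import sys

    # Rough equivalent of: sort -k 2 | uniq -f 1 | sort -n | tail -n32768 | awk '{print $2}'
    best = {}
    for line in sys.stdin:
        count, word = line.split()
        # keep a single count per word; here the largest one
        best[word] = max(int(count), best.get(word, 0))

    # emit the 32768 most frequent words, counts dropped
    for word, _ in sorted(best.items(), key=lambda kv: kv[1])[-32768:]:
        print(word)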

File diff suppressed because it is too large

View File

@@ -1,4 +1,9 @@
"""Filter and reformat the google ngram corpus"""
"""Filter and reformat the google ngram corpus
Remove really unpopular words, use the 2008 count,
try to remove abbreviations and words containing
punctuation and digits.
"""
import string
import fileinput
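
The new docstring summarises the filtering criteria; the sketch below shows what such a filter can look like. The popularity threshold and the exact field handling are assumptions for illustration, not necessarily the script's values.

    import fileinput
    import string

    LETTERS = set(string.ascii_lowercase)

    # Google 1-gram lines are: ngram TAB year TAB match_count TAB volume_count
    for line in fileinput.input():
        word, year, match_count, _ = line.rstrip("\n").split("\t")
        if year != "2008":                      # only keep 2008's count
            continue
        if word.isupper():                      # crude attempt to drop abbreviations
            continue
        word = word.lower()
        if not (4 <= len(word) <= 7):           # keep words of four to seven letters
            continue
        if set(word) - LETTERS:                 # drop punctuation, digits, POS tags like "_NOUN"
            continue
        if int(match_count) < 10000:            # assumed cut-off for "really unpopular"
            continue
        print("%s %s" % (match_count, word))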

View File

@@ -1,5 +1,6 @@
import difflib
import fileinput
import unicodedata
import nltk.stem as stem
@@ -11,7 +12,8 @@ fellatio felching fuck fudgepacker fudge packer flange Goddamn God
damn hell homo jerk jizz knobend knob end labia lmao lmfao muff nigger
nigga omg penis piss poop prick pube pussy queer scrotum sex shit sh1t
slut smegma spunk suicide tit tosser turd twat vagina wank whore wtf
xxx""".split()
xxx sexual sexily sexist sexing sexta sextet sexier sexton sextus
wessex sexism sussex sexes sexual""".split()
# Words that sound similar to others
HOMOPHONES = """there their than then hear here capital capitol won to too lose
@@ -102,46 +104,24 @@ wile whine, wine whirl, whorl whirled, world whit, wit white, wight
who's, whose woe, whoa wood, would yaw, yore, your, you're yoke, yolk
you'll, yule""".replace(",", " ").lower().split()
HANDPICKED = """buttel buttle wholes""".lower().split()
wnl = stem.WordNetLemmatizer()
REMOVE = HOMOPHONES + MORE_HOMOPHONES + RUDEWORDS
REMOVE = set(wnl.lemmatize(R) for R in REMOVE)
seen_words = []
N = 0
for line in fileinput.input():
count, word = line.split()
word = word.lower()
#try:
# word = wnl.lemmatize(word)
#
#except UnicodeDecodeError:
# continue
if word.startswith("z"):
continue
if word in REMOVE:
continue
if len(word) == 4:
try:
word.decode('ascii')
except UnicodeDecodeError:
continue
reject = False
s = difflib.SequenceMatcher(None, word, "A")
for w in seen_words:
s.set_seq2(w)
if s.ratio() > 0.8:
reject = True
break
if reject:
continue
seen_words.append(word)
N += 1
if N >= 10000:
seen_words = seen_words[-N:]
N = 0
print word
print count, word
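
End to end, the new list plugs into the hasher through the `three_wordlist` attribute introduced above. A hypothetical usage sketch follows; it assumes the module is importable as `thesethreewords`, that `WordHasher()` takes no arguments, and that `decode` returns a `(lat, lon)` pair as its docstring suggests.

    import thesethreewords as t3w

    hasher = t3w.WordHasher()

    # encode a coordinate to three words from the google ngram list...
    words = hasher.three_words((53.0686, 8.8004))

    # ...and decode them back; a 9-character geohash is accurate to well under 1e-3 degrees
    lat, lon = hasher.decode(words)
    assert abs(lat - 53.0686) < 1e-3 and abs(lon - 8.8004) < 1e-3

    # swapping in a different 32768-word list only requires changing the attribute
    hasher.three_wordlist = t3w.WORDNET_LEMMAS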