Merge pull request #14 from betatim/google-ngram-wordlist

New word list based on google ngram from 2008
Kevin Dungs 2014-11-21 10:06:08 +01:00
commit 1311a32577
5 changed files with 32781 additions and 32794 deletions

View File

@@ -74,7 +74,8 @@ class WordHasher(object):
        self._symbols = "0123456789bcdefghjkmnpqrstuvwxyz"
        self._decode_symbols = dict((ch, i) for (i, ch) in enumerate(self._symbols))
        self._encode_symbols = dict((i, ch) for (i, ch) in enumerate(self._symbols))
        self.six_wordlist = HUMAN_WORDLIST
        self.three_wordlist = GOOGLE_WORDLIST

    def three_words(self, (lat, lon)):
        """Convert coordinate to a combination of three words
@@ -83,7 +84,7 @@ class WordHasher(object):
        in degrees.
        """
        gh = geohash.encode(lat, lon, 9)
        words = "-".join(WORDNET_LEMMAS[p] for p in self.to_rugbits(self.geo_to_int(gh)))
        words = "-".join(self.three_wordlist[p] for p in self.to_rugbits(self.geo_to_int(gh)))
        return words

    def six_words(self, (lat, lon)):
@@ -96,17 +97,17 @@ class WordHasher(object):
        which are short, easy to pronounce and easy distinguish.
        """
        gh = geohash.encode(lat, lon, 9)
        words = "-".join(HUMAN_WORDLIST[p] for p in self.to_bytes(self.pad(gh)))
        words = "-".join(self.six_wordlist[p] for p in self.to_bytes(self.pad(gh)))
        return words

    def decode(self, words):
        """Decode words back to latitude and longitude"""
        words = words.split("-")
        if len(words) == 3:
            i = self.rugbits_to_int([WORDNET_LEMMAS.index(w) for w in words])
            i = self.rugbits_to_int([self.three_wordlist.index(w) for w in words])
        elif len(words) == 6:
            i = self.bytes_to_int([HUMAN_WORDLIST.index(w) for w in words])
            i = self.bytes_to_int([self.six_wordlist.index(w) for w in words])
            i = self.unpad(i)
        else:
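
For orientation, a minimal usage sketch of the class as it stands after this change. The import path and sample coordinate are assumptions (the diff only shows the WordHasher class), and decode() is assumed to return a (lat, lon) pair as its docstring says:

    # Hypothetical usage sketch; the module name is an assumption.
    from these3words import WordHasher  # assumed import path

    hasher = WordHasher()

    # Three words now come from GOOGLE_WORDLIST: 2**15 = 32768 entries,
    # one 15-bit "rugbit" per word, 3 x 15 = 45 bits for a 9-character geohash.
    print hasher.three_words((52.52, 13.40))

    # Six words still come from HUMAN_WORDLIST, unchanged by this PR.
    print hasher.six_words((52.52, 13.40))

    # decode() accepts either form; assumed to return the coordinate pair.
    print hasher.decode(hasher.three_words((52.52, 13.40)))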

View File

@@ -20,7 +20,8 @@ Download the corpus from [google ngram][googlengram] with:
[wordnet]: http://wordnet.princeton.edu/
[googlengram]: http://storage.googleapis.com/books/ngrams/books/datasetsv2.html
then you can filter the words like this:
Filter out unpopular words, words that are not between four and seven
characters long, and words containing punctuation or digits, like this:

    for L in a b c d e f g h i j k l m n o p q r s t u v w x y z; do
        gzcat googlebooks-eng-all-1gram-20120701-$L.gz | python ngram-filter.py > googlebooks-eng-all-1gram-20120701-$L-filtered;
@@ -30,9 +31,9 @@ To get a list of the top 300 words:

    sort -n googlebooks-eng-all-1gram-20120701-*-filtered | tail -n 300

To create the wordlist used by `These3Words` run:
The final step in creating a wordlist usable by `These3Words` is to run:

    sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | sort | uniq | tail -n32768 > google-ngram-list
    sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | sort -k 2 | uniq -f 1 | sort -n | tail -n32768 | awk '{print $2}' > google-ngram-list
Check that your list is long enough by counting the lines
in `google-ngram-list`, you need exactly 32768 words
in `google-ngram-list`; you need exactly 32768 words.
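
The revised pipeline sorts on the word field and deduplicates with `uniq -f 1` (which ignores the leading count field when comparing), so a word that appears with several counts after normalisation is kept only once; it then re-sorts by count, keeps the 32768 most frequent words and strips the counts with `awk`. A quick sanity check on the result, assuming only the file name from the README (the rest is an illustrative sketch):

    # Sanity check: exactly 2**15 unique words, one per line.
    with open("google-ngram-list") as f:
        words = [line.strip() for line in f if line.strip()]

    assert len(words) == 2 ** 15, "need exactly 32768 words, got %d" % len(words)
    assert len(set(words)) == len(words), "duplicate words would break decoding"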

File diff suppressed because it is too large

View File

@@ -1,4 +1,9 @@
"""Filter and reformat the google ngram corpus"""
"""Filter and reformat the google ngram corpus
Remove really unpopular words, use 2008's count,
try to remove abbreviations and words containing
punctuation and digits.
"""
import string
import fileinput
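
The body of `ngram-filter.py` is not shown in this diff. As a rough sketch of the kind of filter the new docstring describes: the v2 1-gram files are tab-separated as `word year match_count volume_count`, while the script's real popularity threshold and abbreviation test are not shown here, so the values below are placeholders:

    # Illustrative sketch only; thresholds and details are assumptions.
    import string
    import fileinput

    ALLOWED = set(string.ascii_lowercase)
    MIN_COUNT = 100000  # placeholder popularity cut-off

    for line in fileinput.input():
        word, year, match_count, volume_count = line.split("\t")
        if year != "2008":               # keep only the 2008 counts
            continue
        if word.isupper():               # crude attempt to drop abbreviations
            continue
        word = word.lower()
        if not (4 <= len(word) <= 7):    # keep four- to seven-letter words
            continue
        if set(word) - ALLOWED:          # drop punctuation, digits, POS tags
            continue
        count = int(match_count)
        if count < MIN_COUNT:
            continue
        print count, word                # "count word" lines, ready for sort -n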

View File

@@ -1,5 +1,6 @@
import difflib
import fileinput
import unicodedata
import nltk.stem as stem
@@ -11,7 +12,8 @@ fellatio felching fuck fudgepacker fudge packer flange Goddamn God
damn hell homo jerk jizz knobend knob end labia lmao lmfao muff nigger
nigga omg penis piss poop prick pube pussy queer scrotum sex shit sh1t
slut smegma spunk suicide tit tosser turd twat vagina wank whore wtf
xxx""".split()
xxx sexual sexily sexist sexing sexta sextet sexier sexton sextus
wessex sexism sussex sexes sexual""".split()
# Words that sound similar to others
HOMOPHONES = """there their than then hear here capital capitol won to too lose
@@ -102,46 +104,24 @@ wile whine, wine whirl, whorl whirled, world whit, wit white, wight
who's, whose woe, whoa wood, would yaw, yore, your, you're yoke, yolk
you'll, yule""".replace(",", " ").lower().split()
HANDPICKED = """buttel buttle wholes""".lower().split()
wnl = stem.WordNetLemmatizer()
REMOVE = HOMOPHONES + MORE_HOMOPHONES + RUDEWORDS
REMOVE = set(wnl.lemmatize(R) for R in REMOVE)
seen_words = []
N = 0
for line in fileinput.input():
    count, word = line.split()
    word = word.lower()
    #try:
    #    word = wnl.lemmatize(word)
    #
    #except UnicodeDecodeError:
    #    continue
    if word.startswith("z"):
        continue
    if word in REMOVE:
        continue
    if len(word) == 4:
        try:
            word.decode('ascii')
        except UnicodeDecodeError:
            continue
    reject = False
    s = difflib.SequenceMatcher(None, word, "A")
    for w in seen_words:
        s.set_seq2(w)
        if s.ratio() > 0.8:
            reject = True
            break
    if reject:
        continue
    seen_words.append(word)
    N += 1
    if N >= 10000:
        seen_words = seen_words[-N:]
        N = 0
    print word
    print count, word
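
The loop above relies on difflib to drop a word when it looks too much like one that has already been accepted. A standalone illustration of that similarity test (example words made up, threshold 0.8 as in the script):

    # Standalone illustration of the similarity filter used above.
    import difflib

    s = difflib.SequenceMatcher(None, "through", "thorough")
    print s.ratio()   # ~0.93, above 0.8: "thorough" is rejected if "through" was seen first

    s.set_seq1("apple")
    s.set_seq2("orange")
    print s.ratio()   # ~0.36, below 0.8: both words can stay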