Merge pull request #14 from betatim/google-ngram-wordlist
New word list based on google ngram from 2008
This commit is contained in:
commit 1311a32577
@@ -74,7 +74,8 @@ class WordHasher(object):
         self._symbols = "0123456789bcdefghjkmnpqrstuvwxyz"
         self._decode_symbols = dict((ch, i) for (i, ch) in enumerate(self._symbols))
         self._encode_symbols = dict((i, ch) for (i, ch) in enumerate(self._symbols))
 
+        self.six_wordlist = HUMAN_WORDLIST
+        self.three_wordlist = GOOGLE_WORDLIST
 
     def three_words(self, (lat, lon)):
         """Convert coordinate to a combination of three words
@@ -83,7 +84,7 @@ class WordHasher(object):
         in degrees.
         """
         gh = geohash.encode(lat, lon, 9)
-        words = "-".join(WORDNET_LEMMAS[p] for p in self.to_rugbits(self.geo_to_int(gh)))
+        words = "-".join(self.three_wordlist[p] for p in self.to_rugbits(self.geo_to_int(gh)))
         return words
 
     def six_words(self, (lat, lon)):
@@ -96,17 +97,17 @@ class WordHasher(object):
         which are short, easy to pronounce and easy distinguish.
         """
         gh = geohash.encode(lat, lon, 9)
-        words = "-".join(HUMAN_WORDLIST[p] for p in self.to_bytes(self.pad(gh)))
+        words = "-".join(self.six_wordlist[p] for p in self.to_bytes(self.pad(gh)))
        return words
 
     def decode(self, words):
         """Decode words back to latitude and longitude"""
         words = words.split("-")
         if len(words) == 3:
-            i = self.rugbits_to_int([WORDNET_LEMMAS.index(w) for w in words])
+            i = self.rugbits_to_int([self.three_wordlist.index(w) for w in words])
 
         elif len(words) == 6:
-            i = self.bytes_to_int([HUMAN_WORDLIST.index(w) for w in words])
+            i = self.bytes_to_int([self.six_wordlist.index(w) for w in words])
             i = self.unpad(i)
 
         else:
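For orientation, the round trip after this change looks roughly like the sketch below. The `three_words`, `six_words` and `decode` calls and the wordlist constants come straight from the diff; constructing `WordHasher()` with no arguments, the instance name `hasher`, and the example coordinates are assumptions for illustration only.

    # Hedged sketch (Python 2, matching the repo's syntax); assumes WordHasher,
    # GOOGLE_WORDLIST and HUMAN_WORDLIST are importable from the project module.
    hasher = WordHasher()

    lat, lon = 52.205, 0.119                 # any coordinate pair, in degrees
    three = hasher.three_words((lat, lon))   # three words indexed from GOOGLE_WORDLIST
    six = hasher.six_words((lat, lon))       # six words indexed from HUMAN_WORDLIST

    # decode() splits on "-" and picks the wordlist by the number of words
    print hasher.decode(three)
    print hasher.decode(six)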
@@ -20,7 +20,8 @@ Download the corpus from [google ngram][googlengram] with:
 [wordnet]: http://wordnet.princeton.edu/
 [googlengram]: http://storage.googleapis.com/books/ngrams/books/datasetsv2.html
 
-then you can filter the words like this:
+Filter out unpopular words, words not between four and seven characters,
+and words containing punctuation or numbers, like this:
 
     for L in a b c d e f g h i j k l m n o p q r s t u v w x y z; do
         gzcat googlebooks-eng-all-1gram-20120701-$L.gz | python ngram-filter.py > googlebooks-eng-all-1gram-20120701-$L-filtered;
@@ -30,9 +31,9 @@ To get a list of the top 300 words:
 
     sort -n googlebooks-eng-all-1gram-20120701-*-filtered | tail -n 300
 
-To create the wordlist used by `These3Words` run:
+The final step in creating a wordlist usable by `These3Words` is to run:
 
-    sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | sort | uniq | tail -n32768 > google-ngram-list
+    sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | sort -k 2 | uniq -f 1 | sort -n | tail -n32768 | awk '{print $2}' > google-ngram-list
 
 Check that your list is long enough by counting the lines
-in `google-ngram-list`, you need exactly 32768 words
+in `google-ngram-list`; you need exactly 32768 words.
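The reworked pipeline deduplicates on the word column (`sort -k 2 | uniq -f 1`) before taking the 32768 highest counts and stripping the count column. As a sanity check, a rough Python equivalent of that post-processing is sketched below; it glosses over the exact tie-breaking `uniq -f 1` applies when a word appears with several counts.

    # Rough equivalent of: sort -k 2 | uniq -f 1 | sort -n | tail -n32768 | awk '{print $2}'
    # Input lines are "<count> <word>"; emits 32768 words, one per line.
    import fileinput

    per_word = {}
    for line in fileinput.input():
        count, word = line.split()
        per_word.setdefault(word, int(count))   # keep a single count per word

    for word, count in sorted(per_word.items(), key=lambda kv: kv[1])[-32768:]:
        print word

The 32768 figure is presumably because each three-word address encodes a 9-character geohash (45 bits) split into three 15-bit "rugbits", and 2**15 = 32768; a shorter list would leave some rugbit values without a word to index.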
words/google-ngram-list (65506): file diff suppressed because it is too large.
@@ -1,4 +1,9 @@
-"""Filter and reformat the google ngram corpus"""
+"""Filter and reformat the google ngram corpus
+
+Remove really unpopular words, use 2008's count,
+try to remove abbreviations and words containing
+punctuation and digits.
+"""
 import string
 import fileinput
 
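The body of `ngram-filter.py` is not part of this diff, only its docstring is. Purely as an illustration of the behaviour the new docstring describes, a filter along those lines could look like the sketch below; it assumes the google 1-gram v2 line format `word<TAB>year<TAB>match_count<TAB>volume_count`, and the popularity cut-off and length range are invented values, not the script's actual thresholds.

    # Illustrative sketch only -- not the repository's ngram-filter.py.
    import string
    import fileinput

    for line in fileinput.input():
        word, year, match_count, volume_count = line.rstrip("\n").split("\t")

        if year != "2008":                    # use 2008's count only
            continue
        if int(match_count) < 1000:           # drop really unpopular words (threshold invented)
            continue
        if not 4 <= len(word) <= 7:           # keep four to seven character words
            continue
        if any(c in string.punctuation or c.isdigit() for c in word):
            continue                          # no punctuation or digits
        if word.isupper():                    # crude attempt to drop abbreviations
            continue

        print match_count, word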
@@ -1,5 +1,6 @@
 import difflib
 import fileinput
 import unicodedata
 
+import nltk.stem as stem
 
@@ -11,7 +12,8 @@ fellatio felching fuck fudgepacker fudge packer flange Goddamn God
 damn hell homo jerk jizz knobend knob end labia lmao lmfao muff nigger
 nigga omg penis piss poop prick pube pussy queer scrotum sex shit sh1t
 slut smegma spunk suicide tit tosser turd twat vagina wank whore wtf
-xxx""".split()
+xxx sexual sexily sexist sexing sexta sextet sexier sexton sextus
+wessex sexism sussex sexes sexual""".split()
 
 # Words that sound similar to others
 HOMOPHONES = """there their than then hear here capital capitol won to too lose
@@ -102,46 +104,24 @@ wile whine, wine whirl, whorl whirled, world whit, wit white, wight
 who's, whose woe, whoa wood, would yaw, yore, your, you're yoke, yolk
 you'll, yule""".replace(",", " ").lower().split()
 
+HANDPICKED = """buttel buttle wholes""".lower().split()
+
+wnl = stem.WordNetLemmatizer()
+
+REMOVE = HOMOPHONES + MORE_HOMOPHONES + RUDEWORDS
+REMOVE = set(wnl.lemmatize(R) for R in REMOVE)
+
-seen_words = []
-N = 0
 for line in fileinput.input():
     count, word = line.split()
     word = word.lower()
-    #try:
-    #    word = wnl.lemmatize(word)
-    #
-    #except UnicodeDecodeError:
-    #    continue
-
     if word.startswith("z"):
         continue
 
     if word in REMOVE:
         continue
 
-    if len(word) == 4:
-
     try:
         word.decode('ascii')
-
     except UnicodeDecodeError:
         continue
 
-    reject = False
-    s = difflib.SequenceMatcher(None, word, "A")
-    for w in seen_words:
-        s.set_seq2(w)
-        if s.ratio() > 0.8:
-            reject = True
-            break
-
-    if reject:
-        continue
-
-    seen_words.append(word)
-    N += 1
-    if N >= 10000:
-        seen_words = seen_words[-N:]
-        N = 0
-
-    print word
+    print count, word
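The new lemmatization step normalizes the hand-written block lists themselves: each entry is reduced to its lemma before going into the `REMOVE` set, so plural and singular spellings of the same entry collapse to one (the corpus words being filtered are left as-is, since the commented-out lemmatize call on `word` was dropped). A quick check of what that buys, assuming NLTK and its WordNet corpus are installed:

    import nltk.stem as stem

    wnl = stem.WordNetLemmatizer()
    # Plural entries in the lists collapse onto their singular form, so e.g.
    # "sexes" and "sex" become a single entry in the REMOVE set.
    print wnl.lemmatize("sexes")    # sex
    print wnl.lemmatize("words")    # word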