Merge pull request #14 from betatim/google-ngram-wordlist

New word list based on google ngram from 2008
Kevin Dungs 2014-11-21 10:06:08 +01:00
commit 1311a32577
5 changed files with 32781 additions and 32794 deletions

View File

@@ -74,7 +74,8 @@ class WordHasher(object):
        self._symbols = "0123456789bcdefghjkmnpqrstuvwxyz"
        self._decode_symbols = dict((ch, i) for (i, ch) in enumerate(self._symbols))
        self._encode_symbols = dict((i, ch) for (i, ch) in enumerate(self._symbols))
        self.six_wordlist = HUMAN_WORDLIST
        self.three_wordlist = GOOGLE_WORDLIST

    def three_words(self, (lat, lon)):
        """Convert coordinate to a combination of three words
@@ -83,7 +84,7 @@ class WordHasher(object):
        in degrees.
        """
        gh = geohash.encode(lat, lon, 9)
        words = "-".join(WORDNET_LEMMAS[p] for p in self.to_rugbits(self.geo_to_int(gh)))
        words = "-".join(self.three_wordlist[p] for p in self.to_rugbits(self.geo_to_int(gh)))
        return words

    def six_words(self, (lat, lon)):
@@ -96,17 +97,17 @@ class WordHasher(object):
        which are short, easy to pronounce and easy distinguish.
        """
        gh = geohash.encode(lat, lon, 9)
        words = "-".join(HUMAN_WORDLIST[p] for p in self.to_bytes(self.pad(gh)))
        words = "-".join(self.six_wordlist[p] for p in self.to_bytes(self.pad(gh)))
        return words

    def decode(self, words):
        """Decode words back to latitude and longitude"""
        words = words.split("-")
        if len(words) == 3:
            i = self.rugbits_to_int([WORDNET_LEMMAS.index(w) for w in words])
            i = self.rugbits_to_int([self.three_wordlist.index(w) for w in words])
        elif len(words) == 6:
            i = self.bytes_to_int([HUMAN_WORDLIST.index(w) for w in words])
            i = self.bytes_to_int([self.six_wordlist.index(w) for w in words])
            i = self.unpad(i)
        else:
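
For orientation, a minimal usage sketch of the class as it stands after this change. The import path and sample coordinate are assumptions (the diff only shows the WordHasher class), and decode() is assumed to return a (lat, lon) pair as its docstring says:

    # Hypothetical usage sketch; the module name is an assumption.
    from these3words import WordHasher  # assumed import path

    hasher = WordHasher()

    # Three words now come from GOOGLE_WORDLIST: 2**15 = 32768 entries,
    # one 15-bit "rugbit" per word, 3 x 15 = 45 bits for a 9-character geohash.
    print hasher.three_words((52.52, 13.40))

    # Six words still come from HUMAN_WORDLIST, unchanged by this PR.
    print hasher.six_words((52.52, 13.40))

    # decode() accepts either form; assumed to return the coordinate pair.
    print hasher.decode(hasher.three_words((52.52, 13.40)))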

View File

@@ -20,7 +20,8 @@ Download the corpus from [google ngram][googlengram] with:
[wordnet]: http://wordnet.princeton.edu/
[googlengram]: http://storage.googleapis.com/books/ngrams/books/datasetsv2.html
then you can filter the words like this:
Filter out unpopular words, words that are not between four and seven
characters long, and words containing punctuation or digits, like this:

    for L in a b c d e f g h i j k l m n o p q r s t u v w x y z; do
        gzcat googlebooks-eng-all-1gram-20120701-$L.gz | python ngram-filter.py > googlebooks-eng-all-1gram-20120701-$L-filtered;
@@ -30,9 +31,9 @@ To get a list of the top 300 words:

    sort -n googlebooks-eng-all-1gram-20120701-*-filtered | tail -n 300

To create the wordlist used by `These3Words` run:
The final step in creating a wordlist usable by `These3Words` is to run:

    sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | sort | uniq | tail -n32768 > google-ngram-list
    sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | sort -k 2 | uniq -f 1 | sort -n | tail -n32768 | awk '{print $2}' > google-ngram-list
Check that your list is long enough by counting the lines
in `google-ngram-list`, you need exactly 32768 words
in `google-ngram-list`; you need exactly 32768 words.
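
The revised pipeline sorts on the word field and deduplicates with `uniq -f 1` (which ignores the leading count field when comparing), so a word that appears with several counts after normalisation is kept only once; it then re-sorts by count, keeps the 32768 most frequent words and strips the counts with `awk`. A quick sanity check on the result, assuming only the file name from the README (the rest is an illustrative sketch):

    # Sanity check: exactly 2**15 unique words, one per line.
    with open("google-ngram-list") as f:
        words = [line.strip() for line in f if line.strip()]

    assert len(words) == 2 ** 15, "need exactly 32768 words, got %d" % len(words)
    assert len(set(words)) == len(words), "duplicate words would break decoding"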

File diff suppressed because it is too large

View File

@@ -1,4 +1,9 @@
"""Filter and reformat the google ngram corpus"""
"""Filter and reformat the google ngram corpus
Remove really unpopular words, use 2008's count,
try to remove abbreviations and words containing
punctuation and digits.
"""
import string
import fileinput
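
The body of `ngram-filter.py` is not shown in this diff. As a rough sketch of the kind of filter the new docstring describes: the v2 1-gram files are tab-separated as `word year match_count volume_count`, while the script's real popularity threshold and abbreviation test are not shown here, so the values below are placeholders:

    # Illustrative sketch only; thresholds and details are assumptions.
    import string
    import fileinput

    ALLOWED = set(string.ascii_lowercase)
    MIN_COUNT = 100000  # placeholder popularity cut-off

    for line in fileinput.input():
        word, year, match_count, volume_count = line.split("\t")
        if year != "2008":               # keep only the 2008 counts
            continue
        if word.isupper():               # crude attempt to drop abbreviations
            continue
        word = word.lower()
        if not (4 <= len(word) <= 7):    # keep four- to seven-letter words
            continue
        if set(word) - ALLOWED:          # drop punctuation, digits, POS tags
            continue
        count = int(match_count)
        if count < MIN_COUNT:
            continue
        print count, word                # "count word" lines, ready for sort -n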

View File

@@ -1,5 +1,6 @@
import difflib
import fileinput
import unicodedata
import nltk.stem as stem
@@ -11,7 +12,8 @@ fellatio felching fuck fudgepacker fudge packer flange Goddamn God
damn hell homo jerk jizz knobend knob end labia lmao lmfao muff nigger
nigga omg penis piss poop prick pube pussy queer scrotum sex shit sh1t
slut smegma spunk suicide tit tosser turd twat vagina wank whore wtf
xxx""".split()
xxx sexual sexily sexist sexing sexta sextet sexier sexton sextus
wessex sexism sussex sexes sexual""".split()
# Words that sound similar to others
HOMOPHONES = """there their than then hear here capital capitol won to too lose
@@ -102,46 +104,24 @@ wile whine, wine whirl, whorl whirled, world whit, wit white, wight
who's, whose woe, whoa wood, would yaw, yore, your, you're yoke, yolk
you'll, yule""".replace(",", " ").lower().split()
HANDPICKED = """buttel buttle wholes""".lower().split()
wnl = stem.WordNetLemmatizer()
REMOVE = HOMOPHONES + MORE_HOMOPHONES + RUDEWORDS
REMOVE = set(wnl.lemmatize(R) for R in REMOVE)
seen_words = []
N = 0
for line in fileinput.input():
    count, word = line.split()
    word = word.lower()
    #try:
    #    word = wnl.lemmatize(word)
    #
    #except UnicodeDecodeError:
    #    continue
    if word.startswith("z"):
        continue
    if word in REMOVE:
        continue
    if len(word) == 4:
        try:
            word.decode('ascii')
        except UnicodeDecodeError:
            continue
    reject = False
    s = difflib.SequenceMatcher(None, word, "A")
    for w in seen_words:
        s.set_seq2(w)
        if s.ratio() > 0.8:
            reject = True
            break
    if reject:
        continue
    seen_words.append(word)
    N += 1
    if N >= 10000:
        seen_words = seen_words[-N:]
        N = 0
    print word
    print count, word
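
The loop above relies on difflib to drop a word when it looks too much like one that has already been accepted. A standalone illustration of that similarity test (example words made up, threshold 0.8 as in the script):

    # Standalone illustration of the similarity filter used above.
    import difflib

    s = difflib.SequenceMatcher(None, "through", "thorough")
    print s.ratio()   # ~0.93, above 0.8: "thorough" is rejected if "through" was seen first

    s.set_seq1("apple")
    s.set_seq2("orange")
    print s.ratio()   # ~0.36, below 0.8: both words can stay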