Yet another wordlist, based on WordNet this time

parent 7d657e99ca
commit ff5d158a4e

@@ -16,7 +16,7 @@ example
 
 >>> three = these.three_words(CERN)
 >>> print three
-'zeljka-worry-suhai'
+'spitting-ripple-fontanel'
 >>> these.decode(three)
 (46.232335567474365, 6.055419445037842)
 
@@ -8,8 +8,8 @@ import random
 import geohash
 
 
-def get_google_words():
-    lines = open("words/google-ngram-list")
+def get_words(fname):
+    lines = open(fname)
     words = []
     for word in lines:
         words.append(word.strip())
@@ -17,10 +17,18 @@ def get_google_words():
     lines.close()
     random.seed(634634)
     random.shuffle(words)
     words = words[:2**15]
     assert len(words) == len(set(words))
     return words
-GOOGLE_WORDLIST = get_google_words()
 
+# These read like alien races from a sci-fi book
+GOOGLE_WORDLIST = get_words("words/google-ngram-list")
+# current best list for the three word hash
+WORDNET_LEMMAS = get_words("words/wordnet-list")
+
 # Human friendly word list, taken directly from humanhash project
+# these are the best words but there are not enough of
+# them so we only use them for the six word hash
 HUMAN_WORDLIST = (
     'ack', 'alabama', 'alanine', 'alaska', 'alpha', 'angel', 'apart', 'april',
     'arizona', 'arkansas', 'artist', 'asparagus', 'aspen', 'august', 'autumn',
 
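A side note on the fixed seed in `get_words`, not part of the diff: shuffling with `random.seed(634634)` before truncating to 2**15 entries keeps the index-to-word mapping identical from run to run (for a given Python version), which matters because decoding looks words up by position. A minimal sketch of that property, with a hypothetical helper name:

    import random

    def stable_sample(words, seed=634634, size=2**15):
        # A fixed seed gives the same shuffle order every run, so the
        # word at index i (and thus every encoded address) never moves.
        words = list(words)
        random.seed(seed)
        random.shuffle(words)
        return words[:size]

    a = stable_sample("w%d" % i for i in range(40000))
    b = stable_sample("w%d" % i for i in range(40000))
    assert a == b  # deterministic across runs

It also means the wordlist file itself must keep its order and content, or existing three-word addresses stop resolving to the same place.
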
@@ -75,7 +83,7 @@ class WordHasher(object):
         in degrees.
         """
         gh = geohash.encode(lat, lon, 9)
-        words = "-".join(GOOGLE_WORDLIST[p] for p in self.to_rugbits(self.geo_to_int(gh)))
+        words = "-".join(WORDNET_LEMMAS[p] for p in self.to_rugbits(self.geo_to_int(gh)))
         return words
 
     def six_words(self, (lat, lon)):
 
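For context, an illustration rather than anything from the commit: a 9-character geohash carries 9 × 5 = 45 bits, so `geo_to_int` presumably yields a 45-bit integer and `to_rugbits` splits it into three 15-bit indices, which is why each wordlist must supply 2**15 = 32768 words. A rough sketch of that split, with hypothetical helper names:

    def to_rugbits_sketch(n):
        # Split a 45-bit integer into three 15-bit chunks ("rugbits");
        # each chunk indexes one word in a 32768-entry list.
        mask = 2**15 - 1
        return [(n >> 30) & mask, (n >> 15) & mask, n & mask]

    def rugbits_to_int_sketch(chunks):
        # Inverse used on decode: recombine the three indices.
        return (chunks[0] << 30) | (chunks[1] << 15) | chunks[2]

    n = (0b10110 << 40) | 12345
    assert rugbits_to_int_sketch(to_rugbits_sketch(n)) == n
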
@@ -95,7 +103,7 @@ class WordHasher(object):
         """Decode words back to latitude and longitude"""
         words = words.split("-")
         if len(words) == 3:
-            i = self.rugbits_to_int([GOOGLE_WORDLIST.index(w) for w in words])
+            i = self.rugbits_to_int([WORDNET_LEMMAS.index(w) for w in words])
 
         elif len(words) == 6:
             i = self.bytes_to_int([HUMAN_WORDLIST.index(w) for w in words])
 
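Again purely as an illustration: decoding reverses the mapping with `list.index`, a linear scan over the 32768-entry list for each of the three words. A tiny sketch with stand-in names:

    WORDLIST = ["alpha", "bravo", "charlie"]  # stand-in for WORDNET_LEMMAS

    def encode_indices(indices):
        return "-".join(WORDLIST[i] for i in indices)

    def decode_indices(phrase):
        # list.index is O(n) per word; a precomputed {word: index} dict
        # would make the reverse lookup constant time if it ever matters.
        return [WORDLIST.index(w) for w in phrase.split("-")]

    assert decode_indices(encode_indices([2, 0, 1])) == [2, 0, 1]
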
@@ -25,7 +25,7 @@
       </p>
       <p class="text-warning">This is a very early pre-alpha release. Don't expect anything to work properly or reproducible. Especially since the underlying word-list will most likely change soon.</p>
       <p id="input3wordsContainer">
-        <input type="text" class="form-control" id="input3words" placeholder="zeljka-worry-suhai">
+        <input type="text" class="form-control" id="input3words" placeholder="spitting-ripple-fontanel">
       </p>
       <p>
         <button id="button3words" class="btn btn-lg btn-primary">Find on Map</button>
 
@@ -36,7 +36,7 @@
 </footer>
 <script>
 (function() {
-    var default3words = 'zeljka-worry-suhai';
+    var default3words = 'spitting-ripple-fontanel';
     var threewordsField = document.getElementById('input3words');
     document.getElementById('button3words').addEventListener('click', function(evt) {
         var threewords = threewordsField.value;
 
@@ -1,5 +1,15 @@
-Creating a word list
-====================
+Wordnet wordlist
+================
+
+This is a wordlist based on the lemmas in [WordNet][wordnet]. It
+produces a list of words much less esoteric than the google ngram
+list below.
+
+Run `wordnet.py` to create the wordnet wordlist.
+
+
+Creating the google word list
+=============================
 
 Download the corpus from [google ngram][googlengram] with:
 
@@ -7,6 +17,7 @@ Download the corpus from [google ngram][googlengram] with:
     wget http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-$a.gz;
 done
 
+[wordnet]: http://wordnet.princeton.edu/
 [googlengram]: http://storage.googleapis.com/books/ngrams/books/datasetsv2.html
 
 then you can filter the words like this:
 
@@ -24,4 +35,4 @@ To create the wordlist used by `These3Words` run:
     sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | sort | uniq | tail -n32768 > google-ngram-list
 
 Check that your list is long enough by counting the lines
-in `google-ngram-list`, you need exactly 32768 words
+in `google-ngram-list`, you need exactly 32768 words
 
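One way to do that check, a suggestion rather than part of the README: count the lines and confirm they are unique, mirroring the assert that `get_words` performs when it loads the file:

    # Hypothetical check, run from the words/ directory.
    words = [w.strip() for w in open("google-ngram-list")]
    assert len(words) == 32768, "expected 2**15 words, got %d" % len(words)
    assert len(words) == len(set(words)), "duplicate words in the list"
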
@@ -1,5 +1,8 @@
+import difflib
 import fileinput
 
+import nltk.stem as stem
+
 
 RUDEWORDS = """anal anus arse ass ballsack balls bastard bitch biatch bloody
 blowjob blow job bollock bollok boner boob bugger bum butt buttplug
 
@@ -13,7 +16,7 @@ xxx""".split()
 # Words that sound similar to others
 HOMOPHONES = """there their than then hear here capital capitol won to too lose
 loose dessert desert affect effect beech beet beat blew chili chilly
-dear deer days daze die dye lie lye""".split()
+dear deer days daze die dye lie lye""".lower().split()
 MORE_HOMOPHONES = """accessary, accessory ad, add ail, ale air, heir
 aisle, I'll, isle all, awl allowed, aloud alms, arms altar, alter arc,
 ark aren't, aunt ate, eight auger, augur auk, orc aural, oral away,
 
@@ -97,16 +100,48 @@ we'll, wheel wean, ween weather, whether weaver, weever weir, we're
 were, whirr wet, whet wheald, wheeled which, witch whig, wig while,
 wile whine, wine whirl, whorl whirled, world whit, wit white, wight
 who's, whose woe, whoa wood, would yaw, yore, your, you're yoke, yolk
-you'll, yule""".replace(",", " ").split()
+you'll, yule""".replace(",", " ").lower().split()
 
+wnl = stem.WordNetLemmatizer()
+
 REMOVE = HOMOPHONES + MORE_HOMOPHONES + RUDEWORDS
+REMOVE = set(wnl.lemmatize(R) for R in REMOVE)
 
+seen_words = []
+N = 0
 for line in fileinput.input():
     count, word = line.split()
     word = word.lower()
+    #try:
+    #    word = wnl.lemmatize(word)
+    #
+    #except UnicodeDecodeError:
+    #    continue
+
+    if word.startswith("z"):
+        continue
+
     if word in REMOVE:
         continue
 
     if len(word) == 4:
         continue
 
-    print word.lower()
+    reject = False
+    s = difflib.SequenceMatcher(None, word, "A")
+    for w in seen_words:
+        s.set_seq2(w)
+        if s.ratio() > 0.8:
+            reject = True
+            break
+
+    if reject:
+        continue
+
+    seen_words.append(word)
+    N += 1
+    if N >= 10000:
+        seen_words = seen_words[-N:]
+        N = 0
+
+    print word
 
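For intuition, not from the commit itself: `difflib.SequenceMatcher.ratio()` returns a similarity score between 0 and 1, so the 0.8 cut-off above drops a candidate word that differs from an already-accepted word by only a character or two:

    import difflib

    def too_similar(a, b, cutoff=0.8):
        # The same comparison the loop above makes against each seen word.
        return difflib.SequenceMatcher(None, a, b).ratio() > cutoff

    assert too_similar("ripple", "ripples")       # near-duplicate, rejected
    assert not too_similar("ripple", "fontanel")  # distinct enough, kept

Trimming `seen_words` once `N` reaches 10000 presumably keeps that pairwise comparison from growing without bound over the full corpus.
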
File diff suppressed because it is too large

@@ -0,0 +1,13 @@
+import nltk
+import string
+
+
+lemmas = nltk.corpus.wordnet.all_lemma_names()
+wordnet_lemmas = list(w.lower() for w in lemmas if 4<=len(w)<9 and
+                      not any(c in string.punctuation for c in w) and
+                      not any(c in string.digits for c in w))
+assert len(wordnet_lemmas) == len(set(wordnet_lemmas))
+
+f = open("wordnet-list", "w")
+f.write("\n".join(wordnet_lemmas))
+f.close()
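A quick follow-up, again my suggestion rather than part of the commit: the script keeps lowercased lemmas of 4 to 8 characters with no punctuation or digits, one per line in `wordnet-list`. Since `get_words` later truncates whatever it reads to 2**15 entries, it is worth confirming the file is at least that long:

    # Hypothetical sanity check, run next to the freshly written wordnet-list.
    words = [w.strip() for w in open("wordnet-list")]
    assert len(words) >= 2**15, "need at least 32768 lemmas, got %d" % len(words)
    assert all(4 <= len(w) < 9 for w in words)  # mirrors the filter above
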