Yet another wordlist, based on WordNet this time
commit ff5d158a4e (parent 7d657e99ca)
@@ -16,7 +16,7 @@ example

     >>> three = these.three_words(CERN)
     >>> print three
-    'zeljka-worry-suhai'
+    'spitting-ripple-fontanel'
     >>> these.decode(three)
     (46.232335567474365, 6.055419445037842)

@@ -8,8 +8,8 @@ import random
 import geohash


-def get_google_words():
-    lines = open("words/google-ngram-list")
+def get_words(fname):
+    lines = open(fname)
     words = []
     for word in lines:
         words.append(word.strip())
@@ -17,10 +17,18 @@ def get_google_words():
     lines.close()
     random.seed(634634)
     random.shuffle(words)
+    words = words[:2**15]
+    assert len(words) == len(set(words))
     return words
-GOOGLE_WORDLIST = get_google_words()
+
+# These read like alien races from a sci-fi book
+GOOGLE_WORDLIST = get_words("words/google-ngram-list")
+# current best list for the three word hash
+WORDNET_LEMMAS = get_words("words/wordnet-list")

 # Human friendly word list, taken directly from humanhash project
+# these are the best words but there are not enough of
+# them so we only use them for the six word hash
 HUMAN_WORDLIST = (
     'ack', 'alabama', 'alanine', 'alaska', 'alpha', 'angel', 'apart', 'april',
     'arizona', 'arkansas', 'artist', 'asparagus', 'aspen', 'august', 'autumn',
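A note on the fixed seed in the hunk above: calling random.seed(634634) before the shuffle makes the word order deterministic across runs, which is what keeps each word's index stable between encoding and decoding. A minimal sketch of that property (illustrative only, not code from the repo):

    import random

    def shuffled(words, seed=634634):
        # Same seed -> same permutation on every run, so a word keeps its index.
        words = list(words)
        random.seed(seed)
        random.shuffle(words)
        return words

    sample = ["spitting", "ripple", "fontanel", "alpha"]
    assert shuffled(sample) == shuffled(sample)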
@@ -75,7 +83,7 @@ class WordHasher(object):
         in degrees.
         """
         gh = geohash.encode(lat, lon, 9)
-        words = "-".join(GOOGLE_WORDLIST[p] for p in self.to_rugbits(self.geo_to_int(gh)))
+        words = "-".join(WORDNET_LEMMAS[p] for p in self.to_rugbits(self.geo_to_int(gh)))
         return words

     def six_words(self, (lat, lon)):
@@ -95,7 +103,7 @@ class WordHasher(object):
         """Decode words back to latitude and longitude"""
         words = words.split("-")
         if len(words) == 3:
-            i = self.rugbits_to_int([GOOGLE_WORDLIST.index(w) for w in words])
+            i = self.rugbits_to_int([WORDNET_LEMMAS.index(w) for w in words])

         elif len(words) == 6:
             i = self.bytes_to_int([HUMAN_WORDLIST.index(w) for w in words])
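Why the lists are cut to 2**15 entries: geohash.encode(lat, lon, 9) yields nine base-32 characters, i.e. 45 bits, and three words at 15 bits each cover exactly those 45 bits, so every word indexes a 32768-entry list. The repo's own geo_to_int, to_rugbits and rugbits_to_int are not shown in this diff; the helpers below are stand-ins that only illustrate the bit arithmetic, not the project's actual implementation:

    def to_rugbits(integer):
        # Split a 45-bit integer into three 15-bit chunks ("rugbits").
        mask = 2**15 - 1
        return [(integer >> 30) & mask, (integer >> 15) & mask, integer & mask]

    def rugbits_to_int(rugbits):
        # Inverse: glue three 15-bit chunks back into one 45-bit integer.
        a, b, c = rugbits
        return (a << 30) | (b << 15) | c

    n = 2**45 - 12345
    assert rugbits_to_int(to_rugbits(n)) == n
    assert all(rugbit < 2**15 for rugbit in to_rugbits(n))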
@@ -25,7 +25,7 @@
 </p>
 <p class="text-warning">This is a very early pre-alpha release. Don't expect anything to work properly or reproducibly, especially since the underlying word list will most likely change soon.</p>
 <p id="input3wordsContainer">
-  <input type="text" class="form-control" id="input3words" placeholder="zeljka-worry-suhai">
+  <input type="text" class="form-control" id="input3words" placeholder="spitting-ripple-fontanel">
 </p>
 <p>
 <button id="button3words" class="btn btn-lg btn-primary">Find on Map</button>
@@ -36,7 +36,7 @@
 </footer>
 <script>
 (function() {
-  var default3words = 'zeljka-worry-suhai';
+  var default3words = 'spitting-ripple-fontanel';
  var threewordsField = document.getElementById('input3words');
  document.getElementById('button3words').addEventListener('click', function(evt) {
    var threewords = threewordsField.value;
@@ -1,5 +1,15 @@
-Creating a word list
-====================
+Wordnet wordlist
+================

+This is a wordlist based on the lemmas in [WordNet][wordnet]. It
+produces a list of words much less esoteric than the google ngram
+list below.
+
+Run `wordnet.py` to create the wordnet wordlist.
+
+
+Creating the google word list
+=============================
+
 Download the corpus from [google ngram][googlengram] with:

@@ -7,6 +17,7 @@ Download the corpus from [google ngram][googlengram] with:
     wget http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-$a.gz;
     done

+[wordnet]: http://wordnet.princeton.edu/
 [googlengram]: http://storage.googleapis.com/books/ngrams/books/datasetsv2.html

 then you can filter the words like this:
@@ -24,4 +35,4 @@ To create the wordlist used by `These3Words` run:
     sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | sort | uniq | tail -n32768 > google-ngram-list

 Check that your list is long enough by counting the lines
 in `google-ngram-list`; you need exactly 32768 words.
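Since get_words() truncates the list to 2**15 entries and each of the three words has to carry a full 15 bits, the generated file needs exactly 32768 unique lines, as the README says. A quick sanity check to that effect (my own sketch, not a script in the repo):

    # Count and de-duplicate the generated list; 2**15 == 32768.
    with open("words/google-ngram-list") as f:
        words = [line.strip() for line in f if line.strip()]

    assert len(words) == 2**15, "expected 32768 words, got %d" % len(words)
    assert len(words) == len(set(words)), "duplicate words in the list"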
@@ -1,5 +1,8 @@
+import difflib
 import fileinput

+import nltk.stem as stem
+

 RUDEWORDS = """anal anus arse ass ballsack balls bastard bitch biatch bloody
 blowjob blow job bollock bollok boner boob bugger bum butt buttplug
@@ -13,7 +16,7 @@ xxx""".split()
 # Words that sound similar to others
 HOMOPHONES = """there their than then hear here capital capitol won to too lose
 loose dessert desert affect effect beech beet beat blew chili chilly
-dear deer days daze die dye lie lye""".split()
+dear deer days daze die dye lie lye""".lower().split()
 MORE_HOMOPHONES = """accessary, accessory ad, add ail, ale air, heir
 aisle, I'll, isle all, awl allowed, aloud alms, arms altar, alter arc,
 ark aren't, aunt ate, eight auger, augur auk, orc aural, oral away,
@@ -97,16 +100,48 @@ we'll, wheel wean, ween weather, whether weaver, weever weir, we're
 were, whirr wet, whet wheald, wheeled which, witch whig, wig while,
 wile whine, wine whirl, whorl whirled, world whit, wit white, wight
 who's, whose woe, whoa wood, would yaw, yore, your, you're yoke, yolk
-you'll, yule""".replace(",", " ").split()
+you'll, yule""".replace(",", " ").lower().split()

+wnl = stem.WordNetLemmatizer()

 REMOVE = HOMOPHONES + MORE_HOMOPHONES + RUDEWORDS
+REMOVE = set(wnl.lemmatize(R) for R in REMOVE)

+seen_words = []
+N = 0
 for line in fileinput.input():
     count, word = line.split()
+    word = word.lower()
+    #try:
+    #    word = wnl.lemmatize(word)
+    #
+    #except UnicodeDecodeError:
+    #    continue

+    if word.startswith("z"):
+        continue

     if word in REMOVE:
         continue

     if len(word) == 4:
         continue

-    print word.lower()
+    reject = False
+    s = difflib.SequenceMatcher(None, word, "A")
+    for w in seen_words:
+        s.set_seq2(w)
+        if s.ratio() > 0.8:
+            reject = True
+            break
+
+    if reject:
+        continue
+
+    seen_words.append(word)
+    N += 1
+    if N >= 10000:
+        seen_words = seen_words[-N:]
+        N = 0
+
+    print word
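The new rejection pass compares each candidate against the recently accepted words with difflib.SequenceMatcher and drops anything scoring above 0.8. A tiny illustration of what that threshold does (the word pair is hypothetical, not taken from the list):

    import difflib

    s = difflib.SequenceMatcher(None, "ripple", "ripples")
    print(s.ratio())    # ~0.92 -> too similar, would be rejected

    s.set_seq2("fontanel")
    print(s.ratio())    # ~0.14 -> distinct enough, kept

Worth noting: SequenceMatcher caches detail about its second sequence, so re-setting seq2 inside the inner loop (as the script does) is the slower direction; keeping the candidate word as seq2 and calling set_seq1(w) per comparison would avoid recomputing that cache.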
(File diff suppressed because it is too large.)
@@ -0,0 +1,13 @@
+import nltk
+import string
+
+
+lemmas = nltk.corpus.wordnet.all_lemma_names()
+wordnet_lemmas = list(w.lower() for w in lemmas if 4<=len(w)<9 and
+                      not any(c in string.punctuation for c in w) and
+                      not any(c in string.digits for c in w))
+assert len(wordnet_lemmas) == len(set(wordnet_lemmas))
+
+f = open("wordnet-list", "w")
+f.write("\n".join(wordnet_lemmas))
+f.close()
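Running wordnet.py assumes the WordNet corpus is already installed for NLTK. A small pre-flight sketch (the download step is my addition, not part of the commit), plus a check that the output is large enough to survive the 2**15 truncation in get_words():

    import nltk

    # Fetch the WordNet corpus if NLTK cannot find it locally.
    try:
        nltk.data.find("corpora/wordnet")
    except LookupError:
        nltk.download("wordnet")

    # After running wordnet.py, the list should hold at least 2**15 entries.
    with open("wordnet-list") as f:
        assert sum(1 for line in f if line.strip()) >= 2**15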