From ff5d158a4e1d830296fe61534ee1138d9fde7c4c Mon Sep 17 00:00:00 2001
From: Tim Head
Date: Mon, 17 Nov 2014 18:25:37 +0100
Subject: [PATCH] Yet another wordlist, based on WordNet this time
---
README.md | 2 +-
thesethreewords.py | 18 +-
views/index.html | 4 +-
words/README.md | 17 +-
words/normalise-words.py | 43 +-
words/wordnet-list | 38190 +++++++++++++++++++++++++++++++++++++
words/wordnet.py | 13 +
7 files changed, 38272 insertions(+), 15 deletions(-)
create mode 100644 words/wordnet-list
create mode 100644 words/wordnet.py
diff --git a/README.md b/README.md
index c126e61..4f7f4a8 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ example
>>> three = these.three_words(CERN)
>>> print three
- 'zeljka-worry-suhai'
+ 'spitting-ripple-fontanel'
>>> these.decode(three)
(46.232335567474365, 6.055419445037842)
diff --git a/thesethreewords.py b/thesethreewords.py
index d88ed2e..0afa919 100644
--- a/thesethreewords.py
+++ b/thesethreewords.py
@@ -8,8 +8,8 @@ import random
import geohash
-def get_google_words():
- lines = open("words/google-ngram-list")
+def get_words(fname):
+ lines = open(fname)
words = []
for word in lines:
words.append(word.strip())
@@ -17,10 +17,18 @@ def get_google_words():
lines.close()
random.seed(634634)
random.shuffle(words)
+ words = words[:2**15]
+ assert len(words) == len(set(words))
return words
-GOOGLE_WORDLIST = get_google_words()
+
+# These read like alien races from a sci-fi book
+GOOGLE_WORDLIST = get_words("words/google-ngram-list")
+# current best list for the three-word hash
+WORDNET_LEMMAS = get_words("words/wordnet-list")
# Human friendly word list, taken directly from humanhash project
+# these are the best words, but there are not enough of
+# them, so we only use them for the six-word hash
HUMAN_WORDLIST = (
'ack', 'alabama', 'alanine', 'alaska', 'alpha', 'angel', 'apart', 'april',
'arizona', 'arkansas', 'artist', 'asparagus', 'aspen', 'august', 'autumn',
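The 2**15 truncation above is the key constraint: a 9-character geohash carries 45 bits (5 bits per base-32 character), which split evenly into three 15-bit word indices, so the shuffled list must hold exactly 2**15 = 32768 distinct words. A standalone sketch of that arithmetic (not part of the patch):

    # 9 geohash characters at 5 bits each -> 45 bits total
    bits = 9 * 5
    assert bits == 45
    # shared evenly across three words -> 15 bits per word index
    assert bits // 3 == 15
    # so the wordlist needs exactly 2**15 entries
    assert 2 ** 15 == 32768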
@@ -75,7 +83,7 @@ class WordHasher(object):
in degrees.
"""
gh = geohash.encode(lat, lon, 9)
- words = "-".join(GOOGLE_WORDLIST[p] for p in self.to_rugbits(self.geo_to_int(gh)))
+ words = "-".join(WORDNET_LEMMAS[p] for p in self.to_rugbits(self.geo_to_int(gh)))
return words
def six_words(self, (lat, lon)):
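three_words relies on self.geo_to_int and self.to_rugbits, which this patch leaves untouched. As a hedged sketch of what the rugbit helpers presumably do, assuming the 45-bit geohash integer is split into three 15-bit chunks (hypothetical stand-ins, not the project's actual code):

    def to_rugbits(n):
        # hypothetical: split a 45-bit integer into three 15-bit 'rugbits'
        mask = 2 ** 15 - 1
        return [(n >> 30) & mask, (n >> 15) & mask, n & mask]

    def rugbits_to_int(rugbits):
        # hypothetical inverse: reassemble the 45-bit integer
        a, b, c = rugbits
        return (a << 30) | (b << 15) | c

    assert rugbits_to_int(to_rugbits(2 ** 45 - 1)) == 2 ** 45 - 1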
@@ -95,7 +103,7 @@ class WordHasher(object):
"""Decode words back to latitude and longitude"""
words = words.split("-")
if len(words) == 3:
- i = self.rugbits_to_int([GOOGLE_WORDLIST.index(w) for w in words])
+ i = self.rugbits_to_int([WORDNET_LEMMAS.index(w) for w in words])
elif len(words) == 6:
i = self.bytes_to_int([HUMAN_WORDLIST.index(w) for w in words])
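With both encode and decode now indexing WORDNET_LEMMAS, a round trip should stay lossless up to geohash precision. A usage sketch against the module-level helpers shown in the README (the 1e-4 degree tolerance is an assumption based on the roughly +/-2.4 m cell size of a 9-character geohash):

    import thesethreewords as these

    CERN = (46.232335, 6.055419)
    words = these.three_words(CERN)
    lat, lon = these.decode(words)
    # decoded coordinates should agree to ~4 decimal places
    assert abs(lat - CERN[0]) < 1e-4
    assert abs(lon - CERN[1]) < 1e-4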
diff --git a/views/index.html b/views/index.html
index 1c4618f..511c355 100644
--- a/views/index.html
+++ b/views/index.html
@@ -25,7 +25,7 @@
This is a very early pre-alpha release. Don't expect anything to work properly or reproducibly, especially since the underlying wordlist will most likely change soon.