From ff5d158a4e1d830296fe61534ee1138d9fde7c4c Mon Sep 17 00:00:00 2001
From: Tim Head
Date: Mon, 17 Nov 2014 18:25:37 +0100
Subject: [PATCH] Yet another wordlist, based on WordNet this time

---
 README.md                |     2 +-
 thesethreewords.py       |    18 +-
 views/index.html         |     4 +-
 words/README.md          |    17 +-
 words/normalise-words.py |    43 +-
 words/wordnet-list       | 38190 +++++++++++++++++++++++++++++++++++++
 words/wordnet.py         |    13 +
 7 files changed, 38272 insertions(+), 15 deletions(-)
 create mode 100644 words/wordnet-list
 create mode 100644 words/wordnet.py

diff --git a/README.md b/README.md
index c126e61..4f7f4a8 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ example
     >>> three = these.three_words(CERN)
     >>> print three
-    'zeljka-worry-suhai'
+    'spitting-ripple-fontanel'
     >>> these.decode(three)
     (46.232335567474365, 6.055419445037842)

diff --git a/thesethreewords.py b/thesethreewords.py
index d88ed2e..0afa919 100644
--- a/thesethreewords.py
+++ b/thesethreewords.py
@@ -8,8 +8,8 @@ import random
 import geohash
 
 
-def get_google_words():
-    lines = open("words/google-ngram-list")
+def get_words(fname):
+    lines = open(fname)
     words = []
     for word in lines:
         words.append(word.strip())
@@ -17,10 +17,18 @@ def get_google_words():
     lines.close()
     random.seed(634634)
     random.shuffle(words)
+    words = words[:2**15]
+    assert len(words) == len(set(words))
     return words
 
-GOOGLE_WORDLIST = get_google_words()
+
+# These read like alien races from a sci-fi book
+GOOGLE_WORDLIST = get_words("words/google-ngram-list")
+# current best list for the three word hash
+WORDNET_LEMMAS = get_words("words/wordnet-list")
 
 # Human friendly word list, taken directly from humanhash project
+# these are the best words but there are not enough of
+# them so we only use them for the six word hash
 HUMAN_WORDLIST = (
     'ack', 'alabama', 'alanine', 'alaska', 'alpha', 'angel', 'apart', 'april',
     'arizona', 'arkansas', 'artist', 'asparagus', 'aspen', 'august', 'autumn',
@@ -75,7 +83,7 @@ class WordHasher(object):
         in degrees.
         """
         gh = geohash.encode(lat, lon, 9)
-        words = "-".join(GOOGLE_WORDLIST[p] for p in self.to_rugbits(self.geo_to_int(gh)))
+        words = "-".join(WORDNET_LEMMAS[p] for p in self.to_rugbits(self.geo_to_int(gh)))
         return words
 
     def six_words(self, (lat, lon)):
@@ -95,7 +103,7 @@ class WordHasher(object):
         """Decode words back to latitude and longitude"""
         words = words.split("-")
         if len(words) == 3:
-            i = self.rugbits_to_int([GOOGLE_WORDLIST.index(w) for w in words])
+            i = self.rugbits_to_int([WORDNET_LEMMAS.index(w) for w in words])
         elif len(words) == 6:
             i = self.bytes_to_int([HUMAN_WORDLIST.index(w) for w in words])
 
diff --git a/views/index.html b/views/index.html
index 1c4618f..511c355 100644
--- a/views/index.html
+++ b/views/index.html
@@ -25,7 +25,7 @@

This is a very early pre-alpha release. Don't expect anything to work properly or reproducibly, especially since the underlying word-list will most likely change soon.

- +

@@ -36,7 +36,7 @@
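
Reviewer note: the switch to WORDNET_LEMMAS only works because get_words() now truncates each shuffled list to 2**15 entries. A 9-character geohash carries 9 x 5 = 45 bits, which to_rugbits() splits into three 15-bit indices, so every index must stay below 32768. The to_rugbits()/rugbits_to_int() helpers are not part of this diff; the sketch below is an assumed reconstruction from their call sites, not the repository's actual implementation.

    # Hypothetical stand-ins for WordHasher.to_rugbits / rugbits_to_int,
    # inferred from the call sites visible in this patch.
    FIFTEEN_BITS = 2**15 - 1

    def to_rugbits(integer):
        """Split a 45-bit integer into three 15-bit chunks ("rugbits")."""
        return [(integer >> 30) & FIFTEEN_BITS,
                (integer >> 15) & FIFTEEN_BITS,
                integer & FIFTEEN_BITS]

    def rugbits_to_int(rugbits):
        """Inverse of to_rugbits: reassemble the 45-bit integer."""
        a, b, c = rugbits
        return (a << 30) | (b << 15) | c

    # Round trip: any 45-bit value survives the split and reassembly,
    # and each rugbit is a valid index into a 2**15-entry word list.
    value = 0x1F2E3D4C5B
    assert rugbits_to_int(to_rugbits(value)) == value
    assert all(0 <= r < 2**15 for r in to_rugbits(value))

The new assert in get_words() matters for the same reason: decode() recovers indices with WORDNET_LEMMAS.index(w), which returns the first occurrence, so a duplicate word inside the first 2**15 shuffled entries would make some locations round-trip to the wrong coordinates.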