From 9c9f32c2a7160791e93de04d5316ac1b8d82c2f9 Mon Sep 17 00:00:00 2001
From: Tim Head
Date: Mon, 17 Nov 2014 16:09:10 +0100
Subject: [PATCH] Added a word list generator using Google ngrams as input
---
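The word list in words/google-ngram-list is built from the raw Google Books 1-gram dumps that .gitignore now excludes; the actual pipeline lives in words/ngram-filter.py and words/normalise-words.py, which this patch adds but which are not reproduced in this excerpt. The sketch below only illustrates that kind of filtering step: the tab-separated 1-gram column layout, the 5-7 character length bound carried over from the old /usr/share/dict/words filter, and keeping the 2**15 most frequent words are assumptions, not the scripts' actual logic.

    import collections
    import glob

    counts = collections.Counter()
    for path in glob.glob("googlebooks-eng-all-1gram-20120701*"):
        with open(path) as ngrams:
            for line in ngrams:
                # Assumed 2012 1-gram layout: ngram, year, match_count, volume_count
                word, _year, match_count, _volumes = line.rstrip("\n").split("\t")
                word = word.lower()
                # Length bound carried over from the old /usr/share/dict/words filter
                if word.isalpha() and 5 <= len(word) < 8:
                    counts[word] += int(match_count)

    # Keep the 2**15 most frequent words, matching the 32768-line output file,
    # with the word in the second column as get_google_words() expects.
    with open("words/google-ngram-list", "w") as out:
        for word, count in counts.most_common(2 ** 15):
            out.write("%d %s\n" % (count, word))

The loader in thesethreewords.py only relies on each line having two whitespace-separated fields with the word second, so the exact contents of the count column do not matter.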
.gitignore | 3 +
README.md | 4 +-
thesethreewords.py | 31 +-
views/index.html | 4 +-
words/README.md | 27 +
words/google-ngram-list | 32768 +++++++++++++++++++++++++++++++++++++
words/ngram-filter.py | 24 +
words/normalise-words.py | 112 +
8 files changed, 32952 insertions(+), 21 deletions(-)
create mode 100644 words/README.md
create mode 100644 words/google-ngram-list
create mode 100644 words/ngram-filter.py
create mode 100644 words/normalise-words.py
diff --git a/.gitignore b/.gitignore
index db4561e..8a341b0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -52,3 +52,6 @@ docs/_build/
# PyBuilder
target/
+
+# google ngram input files
+googlebooks-eng-all-1gram-20120701*
diff --git a/README.md b/README.md
index f34e9f0..fb07019 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ example
>>> three = these.three_words(CERN)
>>> print three
- 'engirt-aleutic-canun'
+ 'treaty-crane-seldes'
>>> these.decode(three)
(46.232335567474365, 6.055419445037842)
@@ -74,7 +74,7 @@ have similar `these-3-words` hashes
>>> other_CERN_site = (46.256811, 6.056792)
>>> six = these.six_words(other_CERN_site)
>>> print six
- ''spaghetti-carolina-kentucky-utah-seventeen-neptune'
+ 'spaghetti-carolina-kentucky-utah-seventeen-neptune'
>>> these.decode(six)
(46.256797313690186, 6.056792736053467)
diff --git a/thesethreewords.py b/thesethreewords.py
index 1009266..e95e9d6 100644
--- a/thesethreewords.py
+++ b/thesethreewords.py
@@ -8,21 +8,18 @@ import random
import geohash
-def get_random_words():
- words = open("/usr/share/dict/words")
- random.seed(3346346)
- useful = []
- for w in words:
- w = w.strip()
- if 5 <= len(w) < 8:
- useful.append(w.lower())
-
- words.close()
- useful = useful[:2**15]
- random.shuffle(useful)
- assert len(useful) == 2**15
- return useful
-RANDOM_WORDLIST = get_random_words()
+def get_google_words():
+ lines = open("words/google-ngram-list")
+ words = []
+ for line in lines:
+ _, word = line.split()
+ words.append(word)
+
+ lines.close()
+ random.seed(634634)
+ random.shuffle(words)
+ return words
+GOOGLE_WORDLIST = get_google_words()
# Human friendly word list, taken directly from humanhash project
HUMAN_WORDLIST = (
@@ -79,7 +76,7 @@ class WordHasher(object):
in degrees.
"""
gh = geohash.encode(lat, lon, 9)
- words = "-".join(RANDOM_WORDLIST[p] for p in self.to_rugbits(self.geo_to_int(gh)))
+ words = "-".join(GOOGLE_WORDLIST[p] for p in self.to_rugbits(self.geo_to_int(gh)))
return words
def six_words(self, (lat, lon)):
@@ -99,7 +96,7 @@ class WordHasher(object):
"""Decode words back to latitude and longitude"""
words = words.split("-")
if len(words) == 3:
- i = self.rugbits_to_int([RANDOM_WORDLIST.index(w) for w in words])
+ i = self.rugbits_to_int([GOOGLE_WORDLIST.index(w) for w in words])
elif len(words) == 6:
i = self.bytes_to_int([HUMAN_WORDLIST.index(w) for w in words])
diff --git a/views/index.html b/views/index.html
index 04fb4a8..9a0c195 100644
--- a/views/index.html
+++ b/views/index.html
@@ -19,7 +19,7 @@
Find a location anywhere in the world identified by three simple words.
-
+
@@ -27,7 +27,7 @@