Yet another wordlist, based on WordNet this time

parent 7d657e99ca
commit ff5d158a4e

@@ -16,7 +16,7 @@ example
 
 >>> three = these.three_words(CERN)
 >>> print three
-'zeljka-worry-suhai'
+'spitting-ripple-fontanel'
 >>> these.decode(three)
 (46.232335567474365, 6.055419445037842)
 
@@ -8,8 +8,8 @@ import random
 import geohash
 
 
-def get_google_words():
-    lines = open("words/google-ngram-list")
+def get_words(fname):
+    lines = open(fname)
     words = []
     for word in lines:
         words.append(word.strip())
@@ -17,10 +17,18 @@ def get_google_words():
     lines.close()
     random.seed(634634)
     random.shuffle(words)
     words = words[:2**15]
     assert len(words) == len(set(words))
     return words
-GOOGLE_WORDLIST = get_google_words()
 
+# These read like alien races from a sci-fi book
+GOOGLE_WORDLIST = get_words("words/google-ngram-list")
+# current best list for the three word hash
+WORDNET_LEMMAS = get_words("words/wordnet-list")
+
 # Human friendly word list, taken directly from humanhash project
+# these are the best words but there are not enough of
+# them so we only use them for the six word hash
 HUMAN_WORDLIST = (
     'ack', 'alabama', 'alanine', 'alaska', 'alpha', 'angel', 'apart', 'april',
     'arizona', 'arkansas', 'artist', 'asparagus', 'aspen', 'august', 'autumn',
 
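A side note on the fixed seed in `get_words`, not part of the diff: shuffling with `random.seed(634634)` before truncating to 2**15 entries keeps the index-to-word mapping identical from run to run (for a given Python version), which matters because decoding looks words up by position. A minimal sketch of that property, with a hypothetical helper name:

    import random

    def stable_sample(words, seed=634634, size=2**15):
        # A fixed seed gives the same shuffle order every run, so the
        # word at index i (and thus every encoded address) never moves.
        words = list(words)
        random.seed(seed)
        random.shuffle(words)
        return words[:size]

    a = stable_sample("w%d" % i for i in range(40000))
    b = stable_sample("w%d" % i for i in range(40000))
    assert a == b  # deterministic across runs

It also means the wordlist file itself must keep its order and content, or existing three-word addresses stop resolving to the same place.
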
@@ -75,7 +83,7 @@ class WordHasher(object):
         in degrees.
         """
         gh = geohash.encode(lat, lon, 9)
-        words = "-".join(GOOGLE_WORDLIST[p] for p in self.to_rugbits(self.geo_to_int(gh)))
+        words = "-".join(WORDNET_LEMMAS[p] for p in self.to_rugbits(self.geo_to_int(gh)))
         return words
 
     def six_words(self, (lat, lon)):
 
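For context, an illustration rather than anything from the commit: a 9-character geohash carries 9 × 5 = 45 bits, so `geo_to_int` presumably yields a 45-bit integer and `to_rugbits` splits it into three 15-bit indices, which is why each wordlist must supply 2**15 = 32768 words. A rough sketch of that split, with hypothetical helper names:

    def to_rugbits_sketch(n):
        # Split a 45-bit integer into three 15-bit chunks ("rugbits");
        # each chunk indexes one word in a 32768-entry list.
        mask = 2**15 - 1
        return [(n >> 30) & mask, (n >> 15) & mask, n & mask]

    def rugbits_to_int_sketch(chunks):
        # Inverse used on decode: recombine the three indices.
        return (chunks[0] << 30) | (chunks[1] << 15) | chunks[2]

    n = (0b10110 << 40) | 12345
    assert rugbits_to_int_sketch(to_rugbits_sketch(n)) == n
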
@@ -95,7 +103,7 @@ class WordHasher(object):
         """Decode words back to latitude and longitude"""
         words = words.split("-")
         if len(words) == 3:
-            i = self.rugbits_to_int([GOOGLE_WORDLIST.index(w) for w in words])
+            i = self.rugbits_to_int([WORDNET_LEMMAS.index(w) for w in words])
 
         elif len(words) == 6:
             i = self.bytes_to_int([HUMAN_WORDLIST.index(w) for w in words])
 
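Again purely as an illustration: decoding reverses the mapping with `list.index`, a linear scan over the 32768-entry list for each of the three words. A tiny sketch with stand-in names:

    WORDLIST = ["alpha", "bravo", "charlie"]  # stand-in for WORDNET_LEMMAS

    def encode_indices(indices):
        return "-".join(WORDLIST[i] for i in indices)

    def decode_indices(phrase):
        # list.index is O(n) per word; a precomputed {word: index} dict
        # would make the reverse lookup constant time if it ever matters.
        return [WORDLIST.index(w) for w in phrase.split("-")]

    assert decode_indices(encode_indices([2, 0, 1])) == [2, 0, 1]
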
@@ -25,7 +25,7 @@
       </p>
       <p class="text-warning">This is a very early pre-alpha release. Don't expect anything to work properly or reproducible. Especially since the underlying word-list will most likely change soon.</p>
       <p id="input3wordsContainer">
-        <input type="text" class="form-control" id="input3words" placeholder="zeljka-worry-suhai">
+        <input type="text" class="form-control" id="input3words" placeholder="spitting-ripple-fontanel">
       </p>
       <p>
         <button id="button3words" class="btn btn-lg btn-primary">Find on Map</button>
 
@@ -36,7 +36,7 @@
 </footer>
 <script>
 (function() {
-    var default3words = 'zeljka-worry-suhai';
+    var default3words = 'spitting-ripple-fontanel';
     var threewordsField = document.getElementById('input3words');
     document.getElementById('button3words').addEventListener('click', function(evt) {
         var threewords = threewordsField.value;
 
@@ -1,5 +1,15 @@
-Creating a word list
-====================
+Wordnet wordlist
+================
+
+This is a wordlist based on the lemmas in [WordNet][wordnet]. It
+produces a list of words much less esoteric than the google ngram
+list below.
+
+Run `wordnet.py` to create the wordnet wordlist.
+
+
+Creating the google word list
+=============================
 
 Download the corpus from [google ngram][googlengram] with:
 
@@ -7,6 +17,7 @@ Download the corpus from [google ngram][googlengram] with:
     wget http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-$a.gz;
 done
 
+[wordnet]: http://wordnet.princeton.edu/
 [googlengram]: http://storage.googleapis.com/books/ngrams/books/datasetsv2.html
 
 then you can filter the words like this:
 
@@ -24,4 +35,4 @@ To create the wordlist used by `These3Words` run:
     sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | sort | uniq | tail -n32768 > google-ngram-list
 
 Check that your list is long enough by counting the lines
-in `google-ngram-list`, you need exactly 32768 words
+in `google-ngram-list`, you need exactly 32768 words
 
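One way to do that check, a suggestion rather than part of the README: count the lines and confirm they are unique, mirroring the assert that `get_words` performs when it loads the file:

    # Hypothetical check, run from the words/ directory.
    words = [w.strip() for w in open("google-ngram-list")]
    assert len(words) == 32768, "expected 2**15 words, got %d" % len(words)
    assert len(words) == len(set(words)), "duplicate words in the list"
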
@@ -1,5 +1,8 @@
+import difflib
 import fileinput
 
+import nltk.stem as stem
+
 
 RUDEWORDS = """anal anus arse ass ballsack balls bastard bitch biatch bloody
 blowjob blow job bollock bollok boner boob bugger bum butt buttplug
 
@@ -13,7 +16,7 @@ xxx""".split()
 # Words that sound similar to others
 HOMOPHONES = """there their than then hear here capital capitol won to too lose
 loose dessert desert affect effect beech beet beat blew chili chilly
-dear deer days daze die dye lie lye""".split()
+dear deer days daze die dye lie lye""".lower().split()
 MORE_HOMOPHONES = """accessary, accessory ad, add ail, ale air, heir
 aisle, I'll, isle all, awl allowed, aloud alms, arms altar, alter arc,
 ark aren't, aunt ate, eight auger, augur auk, orc aural, oral away,
 
@@ -97,16 +100,48 @@ we'll, wheel wean, ween weather, whether weaver, weever weir, we're
 were, whirr wet, whet wheald, wheeled which, witch whig, wig while,
 wile whine, wine whirl, whorl whirled, world whit, wit white, wight
 who's, whose woe, whoa wood, would yaw, yore, your, you're yoke, yolk
-you'll, yule""".replace(",", " ").split()
+you'll, yule""".replace(",", " ").lower().split()
 
+wnl = stem.WordNetLemmatizer()
+
 REMOVE = HOMOPHONES + MORE_HOMOPHONES + RUDEWORDS
+REMOVE = set(wnl.lemmatize(R) for R in REMOVE)
 
+seen_words = []
+N = 0
 for line in fileinput.input():
     count, word = line.split()
     word = word.lower()
+    #try:
+    #    word = wnl.lemmatize(word)
+    #
+    #except UnicodeDecodeError:
+    #    continue
+
+    if word.startswith("z"):
+        continue
+
     if word in REMOVE:
         continue
 
     if len(word) == 4:
         continue
 
-    print word.lower()
+    reject = False
+    s = difflib.SequenceMatcher(None, word, "A")
+    for w in seen_words:
+        s.set_seq2(w)
+        if s.ratio() > 0.8:
+            reject = True
+            break
+
+    if reject:
+        continue
+
+    seen_words.append(word)
+    N += 1
+    if N >= 10000:
+        seen_words = seen_words[-N:]
+        N = 0
+
+    print word
 
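For intuition, not from the commit itself: `difflib.SequenceMatcher.ratio()` returns a similarity score between 0 and 1, so the 0.8 cut-off above drops a candidate word that differs from an already-accepted word by only a character or two:

    import difflib

    def too_similar(a, b, cutoff=0.8):
        # The same comparison the loop above makes against each seen word.
        return difflib.SequenceMatcher(None, a, b).ratio() > cutoff

    assert too_similar("ripple", "ripples")       # near-duplicate, rejected
    assert not too_similar("ripple", "fontanel")  # distinct enough, kept

Trimming `seen_words` once `N` reaches 10000 presumably keeps that pairwise comparison from growing without bound over the full corpus.
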
File diff suppressed because it is too large

@@ -0,0 +1,13 @@
+import nltk
+import string
+
+
+lemmas = nltk.corpus.wordnet.all_lemma_names()
+wordnet_lemmas = list(w.lower() for w in lemmas if 4<=len(w)<9 and
+                      not any(c in string.punctuation for c in w) and
+                      not any(c in string.digits for c in w))
+assert len(wordnet_lemmas) == len(set(wordnet_lemmas))
+
+f = open("wordnet-list", "w")
+f.write("\n".join(wordnet_lemmas))
+f.close()
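A quick follow-up, again my suggestion rather than part of the commit: the script keeps lowercased lemmas of 4 to 8 characters with no punctuation or digits, one per line in `wordnet-list`. Since `get_words` later truncates whatever it reads to 2**15 entries, it is worth confirming the file is at least that long:

    # Hypothetical sanity check, run next to the freshly written wordnet-list.
    words = [w.strip() for w in open("wordnet-list")]
    assert len(words) >= 2**15, "need at least 32768 lemmas, got %d" % len(words)
    assert all(4 <= len(w) < 9 for w in words)  # mirrors the filter above
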