Yet another wordlist, based on WordNet this time
commit ff5d158a4e (parent 7d657e99ca)
@@ -16,7 +16,7 @@ example

     >>> three = these.three_words(CERN)
     >>> print three
-    'zeljka-worry-suhai'
+    'spitting-ripple-fontanel'
     >>> these.decode(three)
     (46.232335567474365, 6.055419445037842)

@@ -8,8 +8,8 @@ import random
 import geohash


-def get_google_words():
-    lines = open("words/google-ngram-list")
+def get_words(fname):
+    lines = open(fname)
     words = []
     for word in lines:
         words.append(word.strip())
@@ -17,10 +17,18 @@ def get_google_words():
     lines.close()
     random.seed(634634)
     random.shuffle(words)
+    words = words[:2**15]
+    assert len(words) == len(set(words))
     return words
-GOOGLE_WORDLIST = get_google_words()
+
+# These read like alien races from a sci-fi book
+GOOGLE_WORDLIST = get_words("words/google-ngram-list")
+# current best list for the three word hash
+WORDNET_LEMMAS = get_words("words/wordnet-list")

 # Human friendly word list, taken directly from humanhash project
+# these are the best words but there are not enough of
+# them so we only use them for the six word hash
 HUMAN_WORDLIST = (
     'ack', 'alabama', 'alanine', 'alaska', 'alpha', 'angel', 'apart', 'april',
     'arizona', 'arkansas', 'artist', 'asparagus', 'aspen', 'august', 'autumn',
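A note on the fixed seed in the hunk above: calling random.seed(634634) before the shuffle makes the word order deterministic across runs, which is what keeps each word's index stable between encoding and decoding. A minimal sketch of that property (illustrative only, not code from the repo):

    import random

    def shuffled(words, seed=634634):
        # Same seed -> same permutation on every run, so a word keeps its index.
        words = list(words)
        random.seed(seed)
        random.shuffle(words)
        return words

    sample = ["spitting", "ripple", "fontanel", "alpha"]
    assert shuffled(sample) == shuffled(sample)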
@@ -75,7 +83,7 @@ class WordHasher(object):
         in degrees.
         """
         gh = geohash.encode(lat, lon, 9)
-        words = "-".join(GOOGLE_WORDLIST[p] for p in self.to_rugbits(self.geo_to_int(gh)))
+        words = "-".join(WORDNET_LEMMAS[p] for p in self.to_rugbits(self.geo_to_int(gh)))
         return words

     def six_words(self, (lat, lon)):
@@ -95,7 +103,7 @@ class WordHasher(object):
         """Decode words back to latitude and longitude"""
         words = words.split("-")
         if len(words) == 3:
-            i = self.rugbits_to_int([GOOGLE_WORDLIST.index(w) for w in words])
+            i = self.rugbits_to_int([WORDNET_LEMMAS.index(w) for w in words])

         elif len(words) == 6:
             i = self.bytes_to_int([HUMAN_WORDLIST.index(w) for w in words])
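Why the lists are cut to 2**15 entries: geohash.encode(lat, lon, 9) yields nine base-32 characters, i.e. 45 bits, and three words at 15 bits each cover exactly those 45 bits, so every word indexes a 32768-entry list. The repo's own geo_to_int, to_rugbits and rugbits_to_int are not shown in this diff; the helpers below are stand-ins that only illustrate the bit arithmetic, not the project's actual implementation:

    def to_rugbits(integer):
        # Split a 45-bit integer into three 15-bit chunks ("rugbits").
        mask = 2**15 - 1
        return [(integer >> 30) & mask, (integer >> 15) & mask, integer & mask]

    def rugbits_to_int(rugbits):
        # Inverse: glue three 15-bit chunks back into one 45-bit integer.
        a, b, c = rugbits
        return (a << 30) | (b << 15) | c

    n = 2**45 - 12345
    assert rugbits_to_int(to_rugbits(n)) == n
    assert all(rugbit < 2**15 for rugbit in to_rugbits(n))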
@@ -25,7 +25,7 @@
 </p>
 <p class="text-warning">This is a very early pre-alpha release. Don't expect anything to work properly or reproducibly, especially since the underlying word list will most likely change soon.</p>
 <p id="input3wordsContainer">
-  <input type="text" class="form-control" id="input3words" placeholder="zeljka-worry-suhai">
+  <input type="text" class="form-control" id="input3words" placeholder="spitting-ripple-fontanel">
 </p>
 <p>
 <button id="button3words" class="btn btn-lg btn-primary">Find on Map</button>
@@ -36,7 +36,7 @@
 </footer>
 <script>
 (function() {
-  var default3words = 'zeljka-worry-suhai';
+  var default3words = 'spitting-ripple-fontanel';
  var threewordsField = document.getElementById('input3words');
  document.getElementById('button3words').addEventListener('click', function(evt) {
    var threewords = threewordsField.value;
@@ -1,5 +1,15 @@
-Creating a word list
-====================
+Wordnet wordlist
+================

+This is a wordlist based on the lemmas in [WordNet][wordnet]. It
+produces a list of words much less esoteric than the google ngram
+list below.
+
+Run `wordnet.py` to create the wordnet wordlist.
+
+
+Creating the google word list
+=============================
+
 Download the corpus from [google ngram][googlengram] with:

@@ -7,6 +17,7 @@ Download the corpus from [google ngram][googlengram] with:
     wget http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-$a.gz;
     done

+[wordnet]: http://wordnet.princeton.edu/
 [googlengram]: http://storage.googleapis.com/books/ngrams/books/datasetsv2.html

 then you can filter the words like this:
@@ -24,4 +35,4 @@ To create the wordlist used by `These3Words` run:
     sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | sort | uniq | tail -n32768 > google-ngram-list

 Check that your list is long enough by counting the lines
 in `google-ngram-list`; you need exactly 32768 words.
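Since get_words() truncates the list to 2**15 entries and each of the three words has to carry a full 15 bits, the generated file needs exactly 32768 unique lines, as the README says. A quick sanity check to that effect (my own sketch, not a script in the repo):

    # Count and de-duplicate the generated list; 2**15 == 32768.
    with open("words/google-ngram-list") as f:
        words = [line.strip() for line in f if line.strip()]

    assert len(words) == 2**15, "expected 32768 words, got %d" % len(words)
    assert len(words) == len(set(words)), "duplicate words in the list"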
@@ -1,5 +1,8 @@
+import difflib
 import fileinput

+import nltk.stem as stem
+

 RUDEWORDS = """anal anus arse ass ballsack balls bastard bitch biatch bloody
 blowjob blow job bollock bollok boner boob bugger bum butt buttplug
@@ -13,7 +16,7 @@ xxx""".split()
 # Words that sound similar to others
 HOMOPHONES = """there their than then hear here capital capitol won to too lose
 loose dessert desert affect effect beech beet beat blew chili chilly
-dear deer days daze die dye lie lye""".split()
+dear deer days daze die dye lie lye""".lower().split()
 MORE_HOMOPHONES = """accessary, accessory ad, add ail, ale air, heir
 aisle, I'll, isle all, awl allowed, aloud alms, arms altar, alter arc,
 ark aren't, aunt ate, eight auger, augur auk, orc aural, oral away,
@@ -97,16 +100,48 @@ we'll, wheel wean, ween weather, whether weaver, weever weir, we're
 were, whirr wet, whet wheald, wheeled which, witch whig, wig while,
 wile whine, wine whirl, whorl whirled, world whit, wit white, wight
 who's, whose woe, whoa wood, would yaw, yore, your, you're yoke, yolk
-you'll, yule""".replace(",", " ").split()
+you'll, yule""".replace(",", " ").lower().split()

+wnl = stem.WordNetLemmatizer()

 REMOVE = HOMOPHONES + MORE_HOMOPHONES + RUDEWORDS
+REMOVE = set(wnl.lemmatize(R) for R in REMOVE)

+seen_words = []
+N = 0
 for line in fileinput.input():
     count, word = line.split()
+    word = word.lower()
+    #try:
+    #    word = wnl.lemmatize(word)
+    #
+    #except UnicodeDecodeError:
+    #    continue

+    if word.startswith("z"):
+        continue

     if word in REMOVE:
         continue

     if len(word) == 4:
         continue

-    print word.lower()
+    reject = False
+    s = difflib.SequenceMatcher(None, word, "A")
+    for w in seen_words:
+        s.set_seq2(w)
+        if s.ratio() > 0.8:
+            reject = True
+            break
+
+    if reject:
+        continue
+
+    seen_words.append(word)
+    N += 1
+    if N >= 10000:
+        seen_words = seen_words[-N:]
+        N = 0
+
+    print word
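The new rejection pass compares each candidate against the recently accepted words with difflib.SequenceMatcher and drops anything scoring above 0.8. A tiny illustration of what that threshold does (the word pair is hypothetical, not taken from the list):

    import difflib

    s = difflib.SequenceMatcher(None, "ripple", "ripples")
    print(s.ratio())    # ~0.92 -> too similar, would be rejected

    s.set_seq2("fontanel")
    print(s.ratio())    # ~0.14 -> distinct enough, kept

Worth noting: SequenceMatcher caches detail about its second sequence, so re-setting seq2 inside the inner loop (as the script does) is the slower direction; keeping the candidate word as seq2 and calling set_seq1(w) per comparison would avoid recomputing that cache.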
(File diff suppressed because it is too large.)
@@ -0,0 +1,13 @@
+import nltk
+import string
+
+
+lemmas = nltk.corpus.wordnet.all_lemma_names()
+wordnet_lemmas = list(w.lower() for w in lemmas if 4<=len(w)<9 and
+                      not any(c in string.punctuation for c in w) and
+                      not any(c in string.digits for c in w))
+assert len(wordnet_lemmas) == len(set(wordnet_lemmas))
+
+f = open("wordnet-list", "w")
+f.write("\n".join(wordnet_lemmas))
+f.close()
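Running wordnet.py assumes the WordNet corpus is already installed for NLTK. A small pre-flight sketch (the download step is my addition, not part of the commit), plus a check that the output is large enough to survive the 2**15 truncation in get_words():

    import nltk

    # Fetch the WordNet corpus if NLTK cannot find it locally.
    try:
        nltk.data.find("corpora/wordnet")
    except LookupError:
        nltk.download("wordnet")

    # After running wordnet.py, the list should hold at least 2**15 entries.
    with open("wordnet-list") as f:
        assert sum(1 for line in f if line.strip()) >= 2**15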