Yet another wordlist, based on WordNet this time

Tim Head 2014-11-17 18:25:37 +01:00
parent 7d657e99ca
commit ff5d158a4e
7 changed files with 38272 additions and 15 deletions


@@ -16,7 +16,7 @@ example
 >>> three = these.three_words(CERN)
 >>> print three
-'zeljka-worry-suhai'
+'spitting-ripple-fontanel'
 >>> these.decode(three)
 (46.232335567474365, 6.055419445037842)
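
The decoded point is the centre of the 9-character geohash cell, not necessarily the exact input. A minimal sketch of that round trip, assuming the same geohash module these.py imports (e.g. python-geohash); this is illustrative, not part of the commit:

import geohash

# A 9-character geohash cell is roughly 5 m x 5 m, so decode() returns
# coordinates close to, but not exactly, what was encoded.
lat, lon = 46.2324, 6.0553   # illustrative coordinates near CERN
print(geohash.decode(geohash.encode(lat, lon, 9)))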


@@ -8,8 +8,8 @@ import random
 import geohash
 
-def get_google_words():
-    lines = open("words/google-ngram-list")
+def get_words(fname):
+    lines = open(fname)
     words = []
     for word in lines:
         words.append(word.strip())
@@ -17,10 +17,18 @@ def get_google_words():
     lines.close()
     random.seed(634634)
     random.shuffle(words)
+    words = words[:2**15]
+    assert len(words) == len(set(words))
     return words
 
-GOOGLE_WORDLIST = get_google_words()
+# These read like alien races from a sci-fi book
+GOOGLE_WORDLIST = get_words("words/google-ngram-list")
+# current best list for the three word hash
+WORDNET_LEMMAS = get_words("words/wordnet-list")
 
 # Human friendly word list, taken directly from humanhash project
+# these are the best words but there are not enough of
+# them so we only use them for the six word hash
 HUMAN_WORDLIST = (
     'ack', 'alabama', 'alanine', 'alaska', 'alpha', 'angel', 'apart', 'april',
     'arizona', 'arkansas', 'artist', 'asparagus', 'aspen', 'august', 'autumn',
@@ -75,7 +83,7 @@ class WordHasher(object):
         in degrees.
         """
         gh = geohash.encode(lat, lon, 9)
-        words = "-".join(GOOGLE_WORDLIST[p] for p in self.to_rugbits(self.geo_to_int(gh)))
+        words = "-".join(WORDNET_LEMMAS[p] for p in self.to_rugbits(self.geo_to_int(gh)))
         return words
 
     def six_words(self, (lat, lon)):
@@ -95,7 +103,7 @@ class WordHasher(object):
         """Decode words back to latitude and longitude"""
         words = words.split("-")
         if len(words) == 3:
-            i = self.rugbits_to_int([GOOGLE_WORDLIST.index(w) for w in words])
+            i = self.rugbits_to_int([WORDNET_LEMMAS.index(w) for w in words])
         elif len(words) == 6:
             i = self.bytes_to_int([HUMAN_WORDLIST.index(w) for w in words])
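
For context on the new words = words[:2**15] cap: a 9-character geohash is 9 base-32 digits, i.e. 45 bits, which splits evenly into three 15-bit indexes, one per word. The to_rugbits and rugbits_to_int methods are not shown in this diff, so the sketch below is an assumption about their behaviour inferred from that cap, not the project's actual code:

def to_rugbits(n):
    # Split a 45-bit integer into three 15-bit chunks ("rugbits").
    mask = 2**15 - 1
    return [(n >> 30) & mask, (n >> 15) & mask, n & mask]

def rugbits_to_int(rugbits):
    # Inverse: reassemble the 45-bit integer from the three chunks.
    a, b, c = rugbits
    return (a << 30) | (b << 15) | c

assert rugbits_to_int(to_rugbits(0x1234ABCD9)) == 0x1234ABCD9

Capping the list at exactly 2**15 entries means every 15-bit index is a valid list position, and the new uniqueness assert keeps the .index() lookups in decode() unambiguous.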


@@ -25,7 +25,7 @@
       </p>
       <p class="text-warning">This is a very early pre-alpha release. Don't expect anything to work properly or reproducible. Especially since the underlying word-list will most likely change soon.</p>
       <p id="input3wordsContainer">
-        <input type="text" class="form-control" id="input3words" placeholder="zeljka-worry-suhai">
+        <input type="text" class="form-control" id="input3words" placeholder="spitting-ripple-fontanel">
       </p>
       <p>
         <button id="button3words" class="btn btn-lg btn-primary">Find on Map</button>
@@ -36,7 +36,7 @@
     </footer>
     <script>
     (function() {
-      var default3words = 'zeljka-worry-suhai';
+      var default3words = 'spitting-ripple-fontanel';
      var threewordsField = document.getElementById('input3words');
      document.getElementById('button3words').addEventListener('click', function(evt) {
        var threewords = threewordsField.value;


@@ -1,5 +1,15 @@
-Creating a word list
-====================
+Wordnet wordlist
+================
+
+This is a wordlist based on the lemmas in [WordNet][wordnet]. It
+produces a list of words much less esoteric than the google ngram
+list below.
+
+Run `wordnet.py` to create the wordnet wordlist.
+
+Creating the google word list
+=============================
 
 Download the corpus from [google ngram][googlengram] with:
@@ -7,6 +17,7 @@ Download the corpus from [google ngram][googlengram] with:
     wget http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-$a.gz;
 done
 
+[wordnet]: http://wordnet.princeton.edu/
 [googlengram]: http://storage.googleapis.com/books/ngrams/books/datasetsv2.html
 
 then you can filter the words like this:


@@ -1,5 +1,8 @@
+import difflib
 import fileinput
 
+import nltk.stem as stem
+
 RUDEWORDS = """anal anus arse ass ballsack balls bastard bitch biatch bloody
 blowjob blow job bollock bollok boner boob bugger bum butt buttplug
@@ -13,7 +16,7 @@ xxx""".split()
 # Words that sound similar to others
 HOMOPHONES = """there their than then hear here capital capitol won to too lose
 loose dessert desert affect effect beech beet beat blew chili chilly
-dear deer days daze die dye lie lye""".split()
+dear deer days daze die dye lie lye""".lower().split()
 
 MORE_HOMOPHONES = """accessary, accessory ad, add ail, ale air, heir
 aisle, I'll, isle all, awl allowed, aloud alms, arms altar, alter arc,
 ark aren't, aunt ate, eight auger, augur auk, orc aural, oral away,
@@ -97,16 +100,48 @@ we'll, wheel wean, ween weather, whether weaver, weever weir, we're
 were, whirr wet, whet wheald, wheeled which, witch whig, wig while,
 wile whine, wine whirl, whorl whirled, world whit, wit white, wight
 who's, whose woe, whoa wood, would yaw, yore, your, you're yoke, yolk
-you'll, yule""".replace(",", " ").split()
+you'll, yule""".replace(",", " ").lower().split()
+
+wnl = stem.WordNetLemmatizer()
 
 REMOVE = HOMOPHONES + MORE_HOMOPHONES + RUDEWORDS
+REMOVE = set(wnl.lemmatize(R) for R in REMOVE)
+
+seen_words = []
+N = 0
 for line in fileinput.input():
     count, word = line.split()
+    word = word.lower()
+    #try:
+    #    word = wnl.lemmatize(word)
+    #
+    #except UnicodeDecodeError:
+    #    continue
+    if word.startswith("z"):
+        continue
     if word in REMOVE:
         continue
     if len(word) == 4:
         continue
-    print word.lower()
+    reject = False
+    s = difflib.SequenceMatcher(None, word, "A")
+    for w in seen_words:
+        s.set_seq2(w)
+        if s.ratio() > 0.8:
+            reject = True
+            break
+    if reject:
+        continue
+    seen_words.append(word)
+    N += 1
+    if N >= 10000:
+        seen_words = seen_words[-N:]
+        N = 0
+    print word
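
The difflib.SequenceMatcher pass is what makes the list pairwise dissimilar: each candidate is compared against every word kept so far and rejected when the similarity ratio exceeds 0.8. A self-contained illustration of that test (the word pairs and the helper name are illustrative, not from the commit):

import difflib

def too_similar(a, b, threshold=0.8):
    # ratio() is 2*M/T: matched characters over total characters of both words.
    return difflib.SequenceMatcher(None, a, b).ratio() > threshold

print(too_similar("desert", "dessert"))   # True, ratio ~0.92: rejected
print(too_similar("ripple", "fontanel"))  # False, ratio ~0.29: kept

Two design notes on the loop above: SequenceMatcher caches its analysis of seq2, so varying seq2 with set_seq2() discards that cache each iteration (setting the fixed candidate as seq2 and varying seq1 would be cheaper), and once N reaches 10000, seen_words[-N:] still keeps all N entries, so the comparison window never actually shrinks.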

words/wordnet-list (new file, 38190 lines)

File diff suppressed because it is too large.

words/wordnet.py (new file, 13 lines)

@@ -0,0 +1,13 @@
+import nltk
+import string
+
+lemmas = nltk.corpus.wordnet.all_lemma_names()
+wordnet_lemmas = list(w.lower() for w in lemmas if 4<=len(w)<9 and
+                      not any(c in string.punctuation for c in w) and
+                      not any(c in string.digits for c in w))
+
+assert len(wordnet_lemmas) == len(set(wordnet_lemmas))
+
+f = open("wordnet-list", "w")
+f.write("\n".join(wordnet_lemmas))
+f.close()
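
One environment note, an assumption about local setup rather than part of the commit: nltk.corpus.wordnet raises a LookupError until the WordNet data has been fetched once, e.g.:

import nltk
nltk.download("wordnet")   # one-time fetch of the WordNet corpus into nltk_data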