Added a word list generator using google ngram as input
This commit is contained in:
parent
6902b59dbf
commit
9c9f32c2a7
|
@ -52,3 +52,6 @@ docs/_build/
|
||||||
|
|
||||||
# PyBuilder
|
# PyBuilder
|
||||||
target/
|
target/
|
||||||
|
|
||||||
|
# google ngram input files
|
||||||
|
googlebooks-eng-all-1gram-20120701*
|
||||||
|
|
|
@ -16,7 +16,7 @@ example
|
||||||
|
|
||||||
>>> three = these.three_words(CERN)
|
>>> three = these.three_words(CERN)
|
||||||
>>> print three
|
>>> print three
|
||||||
'engirt-aleutic-canun'
|
'treaty-crane-seldes'
|
||||||
>>> these.decode(three)
|
>>> these.decode(three)
|
||||||
(46.232335567474365, 6.055419445037842)
|
(46.232335567474365, 6.055419445037842)
|
||||||
|
|
||||||
|
@ -74,7 +74,7 @@ have similar `these-3-words` hashes
|
||||||
>>> other_CERN_site = (46.256811, 6.056792)
|
>>> other_CERN_site = (46.256811, 6.056792)
|
||||||
>>> six = these.six_words(other_CERN_site)
|
>>> six = these.six_words(other_CERN_site)
|
||||||
>>> print six
|
>>> print six
|
||||||
''spaghetti-carolina-kentucky-utah-seventeen-neptune'
|
'spaghetti-carolina-kentucky-utah-seventeen-neptune'
|
||||||
>>> these.decode(six)
|
>>> these.decode(six)
|
||||||
(46.256797313690186, 6.056792736053467)
|
(46.256797313690186, 6.056792736053467)
|
||||||
|
|
||||||
|
|
|
@ -8,21 +8,18 @@ import random
|
||||||
import geohash
|
import geohash
|
||||||
|
|
||||||
|
|
||||||
def get_random_words():
|
def get_google_words():
|
||||||
words = open("/usr/share/dict/words")
|
lines = open("words/google-ngram-list")
|
||||||
random.seed(3346346)
|
words = []
|
||||||
useful = []
|
for line in lines:
|
||||||
for w in words:
|
_, word = line.split()
|
||||||
w = w.strip()
|
words.append(word)
|
||||||
if 5 <= len(w) < 8:
|
|
||||||
useful.append(w.lower())
|
|
||||||
|
|
||||||
words.close()
|
lines.close()
|
||||||
useful = useful[:2**15]
|
random.seed(634634)
|
||||||
random.shuffle(useful)
|
random.shuffle(words)
|
||||||
assert len(useful) == 2**15
|
return words
|
||||||
return useful
|
GOOGLE_WORDLIST = get_google_words()
|
||||||
RANDOM_WORDLIST = get_random_words()
|
|
||||||
|
|
||||||
# Human friendly word list, taken directly from humanhash project
|
# Human friendly word list, taken directly from humanhash project
|
||||||
HUMAN_WORDLIST = (
|
HUMAN_WORDLIST = (
|
||||||
|
@ -79,7 +76,7 @@ class WordHasher(object):
|
||||||
in degrees.
|
in degrees.
|
||||||
"""
|
"""
|
||||||
gh = geohash.encode(lat, lon, 9)
|
gh = geohash.encode(lat, lon, 9)
|
||||||
words = "-".join(RANDOM_WORDLIST[p] for p in self.to_rugbits(self.geo_to_int(gh)))
|
words = "-".join(GOOGLE_WORDLIST[p] for p in self.to_rugbits(self.geo_to_int(gh)))
|
||||||
return words
|
return words
|
||||||
|
|
||||||
def six_words(self, (lat, lon)):
|
def six_words(self, (lat, lon)):
|
||||||
|
@ -99,7 +96,7 @@ class WordHasher(object):
|
||||||
"""Decode words back to latitude and longitude"""
|
"""Decode words back to latitude and longitude"""
|
||||||
words = words.split("-")
|
words = words.split("-")
|
||||||
if len(words) == 3:
|
if len(words) == 3:
|
||||||
i = self.rugbits_to_int([RANDOM_WORDLIST.index(w) for w in words])
|
i = self.rugbits_to_int([GOOGLE_WORDLIST.index(w) for w in words])
|
||||||
|
|
||||||
elif len(words) == 6:
|
elif len(words) == 6:
|
||||||
i = self.bytes_to_int([HUMAN_WORDLIST.index(w) for w in words])
|
i = self.bytes_to_int([HUMAN_WORDLIST.index(w) for w in words])
|
||||||
|
|
|
@ -19,7 +19,7 @@
|
||||||
Find a location anywhere in the world identified by three simple words.
|
Find a location anywhere in the world identified by three simple words.
|
||||||
</p>
|
</p>
|
||||||
<p id="input3wordsContainer">
|
<p id="input3wordsContainer">
|
||||||
<input type="text" class="form-control" id="input3words" placeholder="engirt-aleutic-canun">
|
<input type="text" class="form-control" id="input3words" placeholder="treaty-crane-seldes">
|
||||||
</p>
|
</p>
|
||||||
<p>
|
<p>
|
||||||
<button id="button3words" class="btn btn-lg btn-primary">Find on Map</button>
|
<button id="button3words" class="btn btn-lg btn-primary">Find on Map</button>
|
||||||
|
@ -27,7 +27,7 @@
|
||||||
</main>
|
</main>
|
||||||
<script>
|
<script>
|
||||||
(function() {
|
(function() {
|
||||||
var default3words = 'engirt-aleutic-canun';
|
var default3words = 'treaty-crane-seldes';
|
||||||
var threewordsField = document.getElementById('input3words');
|
var threewordsField = document.getElementById('input3words');
|
||||||
document.getElementById('button3words').addEventListener('click', function(evt) {
|
document.getElementById('button3words').addEventListener('click', function(evt) {
|
||||||
var threewords = threewordsField.value;
|
var threewords = threewordsField.value;
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
Creating a word list
|
||||||
|
====================
|
||||||
|
|
||||||
|
Download the corpus from [google ngram][googlengram] with:
|
||||||
|
|
||||||
|
for a in a b c d e f g h i j k l m n o p q r s t u v w x y z; do
|
||||||
|
wget http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-$a.gz;
|
||||||
|
done
|
||||||
|
|
||||||
|
[googlengram]: http://storage.googleapis.com/books/ngrams/books/datasetsv2.html
|
||||||
|
|
||||||
|
Then you can filter the words like this:
|
||||||
|
|
||||||
|
for L in a b c d e f g h i j k l m n o p q r s t u v w x y z; do
|
||||||
|
gzcat googlebooks-eng-all-1gram-20120701-$L.gz | python ngram-filter.py > googlebooks-eng-all-1gram-20120701-$L-filtered;
|
||||||
|
done
|
||||||
|
|
||||||
|
To get a list of the top 300 words:
|
||||||
|
|
||||||
|
sort -n googlebooks-eng-all-1gram-20120701-*-filtered | tail -n 300
|
||||||
|
|
||||||
|
To create the wordlist used by `These3Words` run:
|
||||||
|
|
||||||
|
sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | tail -n32768 > google-ngram-list
|
||||||
|
|
||||||
|
Check that your list is long enough by counting the lines
|
||||||
|
in `google-ngram-list`; you need exactly 32768 words.
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,24 @@
|
||||||
|
"""Filter and reformat the google ngram corpus"""
|
||||||
|
import string
|
||||||
|
import fileinput
|
||||||
|
|
||||||
|
# Characters that disqualify a word when any of them appears in it.
REJECTED_CHARS = string.punctuation + string.digits

# Each input row is expected to hold four whitespace-separated fields:
# word, year, count and one further field that is ignored.
for row in fileinput.input():
    word, year, count, _ = row.strip().split()

    # Only the rows for the year 2008 are considered.
    if year != "2008":
        continue

    count = int(count)
    too_rare = count < 40
    if too_rare:
        continue

    # Keep words of 4, 5 or 6 characters.
    wrong_length = len(word) < 4 or len(word) >= 7
    if wrong_length:
        continue

    # Reject words that are entirely upper case or whose first two
    # characters are unchanged by upper-casing.
    shouting = word[:2].upper() == word[:2] or word.upper() == word
    if shouting:
        continue

    if any(char in REJECTED_CHARS for char in word):
        continue

    # Emit "<count> <word>", one entry per line.
    print("%d %s" % (count, word))
|
|
@ -0,0 +1,112 @@
|
||||||
|
import fileinput
|
||||||
|
|
||||||
|
|
||||||
|
RUDEWORDS = """anal anus arse ass ballsack balls bastard bitch biatch bloody
|
||||||
|
blowjob blow job bollock bollok boner boob bugger bum butt buttplug
|
||||||
|
clitoris cock coon crap cunt damn dick dildo dyke fag feck fellate
|
||||||
|
fellatio felching fuck fudgepacker fudge packer flange Goddamn God
|
||||||
|
damn hell homo jerk jizz knobend knob end labia lmao lmfao muff nigger
|
||||||
|
nigga omg penis piss poop prick pube pussy queer scrotum sex shit sh1t
|
||||||
|
slut smegma spunk suicide tit tosser turd twat vagina wank whore wtf
|
||||||
|
xxx""".split()
|
||||||
|
|
||||||
|
# Words that sound similar to others
|
||||||
|
HOMOPHONES = """there their than then hear here capital capitol won to too lose
|
||||||
|
loose dessert desert affect effect beech beet beat blew chili chilly
|
||||||
|
dear deer days daze die dye lie lye""".split()
|
||||||
|
MORE_HOMOPHONES = """accessary, accessory ad, add ail, ale air, heir
|
||||||
|
aisle, I'll, isle all, awl allowed, aloud alms, arms altar, alter arc,
|
||||||
|
ark aren't, aunt ate, eight auger, augur auk, orc aural, oral away,
|
||||||
|
aweigh awe, oar, or, ore axel, axle aye, eye, I bail, bale bait, bate
|
||||||
|
baize, bays bald, bawled ball, bawl band, banned bard, barred bare,
|
||||||
|
bear bark, barque baron, barren base, bass bay, bey bazaar, bizarre
|
||||||
|
be, bee beach, beech bean, been beat, beet beau, bow beer, bier bel,
|
||||||
|
bell, belle berry, bury berth, birth bight, bite, byte billed, build
|
||||||
|
bitten, bittern blew, blue bloc, block boar, bore board, bored
|
||||||
|
boarder, border bold, bowled boos, booze born, borne bough, bow boy,
|
||||||
|
buoy brae, bray braid, brayed braise, brays, braze brake, break bread,
|
||||||
|
bred brews, bruise bridal, bridle broach, brooch bur, burr but, butt
|
||||||
|
buy, by, bye buyer, byre calendar, calender call, caul canvas, canvass
|
||||||
|
cast, caste caster, castor caught, court caw, core, corps cede, seed
|
||||||
|
ceiling, sealing cell, sell censer, censor, sensor cent, scent, sent
|
||||||
|
cereal, serial cheap, cheep check, cheque choir, quire chord, cord
|
||||||
|
cite, sight, site clack, claque clew, clue climb, clime close, cloze
|
||||||
|
coal, kohl coarse, course coign, coin colonel, kernel complacent,
|
||||||
|
complaisant complement, compliment coo, coup cops, copse council,
|
||||||
|
counsel cousin, cozen creak, creek crews, cruise cue, kyu, queue curb,
|
||||||
|
kerb currant, current cymbol, symbol dam, damn days, daze dear, deer
|
||||||
|
descent, dissent desert, dessert deviser, divisor dew, due die, dye
|
||||||
|
discreet, discrete doe, doh, dough done, dun douse, dowse draft,
|
||||||
|
draught dual, duel earn, urn eery, eyrie ewe, yew, you faint, feint
|
||||||
|
fah, far fair, fare farther, father fate, fete faun, fawn fay, fey
|
||||||
|
faze, phase feat, feet ferrule, ferule few, phew fie, phi file, phial
|
||||||
|
find, fined fir, fur fizz, phiz flair, flare flaw, floor flea, flee
|
||||||
|
flex, flecks flew, flu, flue floe, flow flour, flower foaled, fold
|
||||||
|
for, fore, four foreword, forward fort, fought forth, fourth foul,
|
||||||
|
fowl franc, frank freeze, frieze friar, fryer furs, furze gait, gate
|
||||||
|
galipot, gallipot gallop, galop gamble, gambol gays, gaze genes, jeans
|
||||||
|
gild, guild gilt, guilt giro, gyro gnaw, nor gneiss, nice gorilla,
|
||||||
|
guerilla grate, great greave, grieve greys, graze grisly, grizzly
|
||||||
|
groan, grown guessed, guest hail, hale hair, hare hall, haul hangar,
|
||||||
|
hanger hart, heart haw, hoar, whore hay, hey heal, heel, he'll hear,
|
||||||
|
here heard, herd he'd, heed heroin, heroine hew, hue hi, high higher,
|
||||||
|
hire him, hymn ho, hoe hoard, horde hoarse, horse holey, holy, wholly
|
||||||
|
hour, our idle, idol in, inn indict, indite it's, its jewel, joule
|
||||||
|
key, quay knave, nave knead, need knew, new knight, night knit, nit
|
||||||
|
knob, nob knock, nock knot, not know, no knows, nose laager, lager
|
||||||
|
lac, lack lade, laid lain, lane lam, lamb laps, lapse larva, lava
|
||||||
|
lase, laze law, lore lay, ley lea, lee leach, leech lead, led leak,
|
||||||
|
leek lean, lien lessen, lesson levee, levy liar, lyre licence, license
|
||||||
|
licker, liquor lie, lye lieu, loo links, lynx lo, low load, lode loan,
|
||||||
|
lone locks, lox loop, loupe loot, lute made, maid mail, male main,
|
||||||
|
mane maize, maze mall, maul manna, manner mantel, mantle mare, mayor
|
||||||
|
mark, marque marshal, martial marten, martin mask, masque maw, more
|
||||||
|
me, mi mean, mien meat, meet, mete medal, meddle metal, mettle meter,
|
||||||
|
metre might, mite miner, minor, mynah mind, mined missed, mist moat,
|
||||||
|
mote mode, mowed moor, more moose, mousse morning, mourning muscle,
|
||||||
|
mussel naval, navel nay, neigh nigh, nye none, nun od, odd ode, owed
|
||||||
|
oh, owe one, won packed, pact packs, pax pail, pale pain, pane pair,
|
||||||
|
pare, pear palate, palette, pallet pascal, paschal paten, patten,
|
||||||
|
pattern pause, paws, pores, pours pawn, porn pea, pee peace, piece
|
||||||
|
peak, peek, peke, pique peal, peel pearl, purl pedal, peddle peer,
|
||||||
|
pier pi, pie pica, pika place, plaice plain, plane pleas, please plum,
|
||||||
|
plumb pole, poll poof, pouffe practice, practise praise, prays, preys
|
||||||
|
principal, principle profit, prophet quarts, quartz quean, queen rain,
|
||||||
|
reign, rein raise, rays, raze rap, wrap raw, roar read, reed read, red
|
||||||
|
real, reel reek, wreak rest, wrest retch, wretch review, revue rheum,
|
||||||
|
room right, rite, wright, write ring, wring road, rode roe, row role,
|
||||||
|
roll roo, roux, rue rood, rude root, route rose, rows rota, rotor
|
||||||
|
rote, wrote rough, ruff rouse, rows rung, wrung rye, wry saver, savour
|
||||||
|
spade, spayed sale, sail sane, seine satire, satyr sauce, source saw,
|
||||||
|
soar, sore scene, seen scull, skull sea, see seam, seem sear, seer,
|
||||||
|
sere seas, sees, seize sew, so, sow shake, sheikh shear, sheer shoe,
|
||||||
|
shoo sic, sick side, sighed sign, sine sink, synch slay, sleigh sloe,
|
||||||
|
slow sole, soul some, sum son, sun sort, sought spa, spar staid,
|
||||||
|
stayed stair, stare stake, steak stalk, stork stationary, stationery
|
||||||
|
steal, steel stile, style storey, story straight, strait sweet, suite
|
||||||
|
swat, swot tacks, tax tale, tail talk, torque tare, tear taught, taut,
|
||||||
|
tort te, tea, tee team, teem tear, tier teas, tease terce, terse tern,
|
||||||
|
turn there, their, they're threw, through throes, throws throne,
|
||||||
|
thrown thyme, time tic, tick tide, tied tire, tyre to, too, two toad,
|
||||||
|
toed, towed told, tolled tole, toll ton, tun tor, tore tough, tuff
|
||||||
|
troop, troupe tuba, tuber vain, vane, vein vale, veil vial, vile wail,
|
||||||
|
wale, whale wain, wane waist, waste wait, weight waive, wave wall,
|
||||||
|
waul war, wore ware, wear, where warn, worn wart, wort watt, what wax,
|
||||||
|
whacks way, weigh, whey we, wee, whee weak, week we'd, weed weal,
|
||||||
|
we'll, wheel wean, ween weather, whether weaver, weever weir, we're
|
||||||
|
were, whirr wet, whet wheald, wheeled which, witch whig, wig while,
|
||||||
|
wile whine, wine whirl, whorl whirled, world whit, wit white, wight
|
||||||
|
who's, whose woe, whoa wood, would yaw, yore, your, you're yoke, yolk
|
||||||
|
you'll, yule""".replace(",", " ").split()
|
||||||
|
|
||||||
|
REMOVE = HOMOPHONES + MORE_HOMOPHONES + RUDEWORDS
|
||||||
|
|
||||||
|
# Filter "<count> <word>" lines read via fileinput and print the surviving
# entries as "<count> <lowercased word>".
#
# The input still carries its original capitalisation and some REMOVE
# entries are capitalised too, so the comparison is done on lower-cased
# text on both sides. Otherwise "Damn" would slip past the filter and be
# printed as "damn".
banned = set(entry.lower() for entry in REMOVE)

kept = []
for line in fileinput.input():
    count, word = line.split()
    word = word.lower()

    if word in banned:
        continue

    # Four-letter words are dropped entirely.
    if len(word) == 4:
        continue

    kept.append((count, word))

# Case variants ("House" / "house") collapse to the same output word.
# The generated list is looked up by position, so every word must appear
# only once. Keep the LAST occurrence of each word: the documented
# pipeline feeds this script with `sort -n` output, so the last
# occurrence is presumably the most frequent spelling -- verify if the
# input order ever changes.
seen = set()
unique = []
for count, word in reversed(kept):
    if word in seen:
        continue
    seen.add(word)
    unique.append((count, word))

# Restore the original input order before printing.
for count, word in reversed(unique):
    print("%s %s" % (count, word))
|
Loading…
Reference in New Issue