Added a word list generator using google ngram as input

This commit is contained in:
Tim Head 2014-11-17 16:09:10 +01:00
parent 6902b59dbf
commit 9c9f32c2a7
8 changed files with 32952 additions and 21 deletions

3
.gitignore vendored
View File

@ -52,3 +52,6 @@ docs/_build/
# PyBuilder
target/
# google ngram input files
googlebooks-eng-all-1gram-20120701*

View File

@ -16,7 +16,7 @@ example
>>> three = these.three_words(CERN)
>>> print three
'engirt-aleutic-canun'
'treaty-crane-seldes'
>>> these.decode(three)
(46.232335567474365, 6.055419445037842)
@ -74,7 +74,7 @@ have similar `these-3-words` hashes
>>> other_CERN_site = (46.256811, 6.056792)
>>> six = these.six_words(other_CERN_site)
>>> print six
''spaghetti-carolina-kentucky-utah-seventeen-neptune'
'spaghetti-carolina-kentucky-utah-seventeen-neptune'
>>> these.decode(six)
(46.256797313690186, 6.056792736053467)

View File

@ -8,21 +8,18 @@ import random
import geohash
def get_random_words():
words = open("/usr/share/dict/words")
random.seed(3346346)
useful = []
for w in words:
w = w.strip()
if 5 <= len(w) < 8:
useful.append(w.lower())
def get_google_words():
lines = open("words/google-ngram-list")
words = []
for line in lines:
_, word = line.split()
words.append(word)
words.close()
useful = useful[:2**15]
random.shuffle(useful)
assert len(useful) == 2**15
return useful
RANDOM_WORDLIST = get_random_words()
lines.close()
random.seed(634634)
random.shuffle(words)
return words
GOOGLE_WORDLIST = get_google_words()
# Human friendly word list, taken directly from humanhash project
HUMAN_WORDLIST = (
@ -79,7 +76,7 @@ class WordHasher(object):
in degrees.
"""
gh = geohash.encode(lat, lon, 9)
words = "-".join(RANDOM_WORDLIST[p] for p in self.to_rugbits(self.geo_to_int(gh)))
words = "-".join(GOOGLE_WORDLIST[p] for p in self.to_rugbits(self.geo_to_int(gh)))
return words
def six_words(self, (lat, lon)):
@ -99,7 +96,7 @@ class WordHasher(object):
"""Decode words back to latitude and longitude"""
words = words.split("-")
if len(words) == 3:
i = self.rugbits_to_int([RANDOM_WORDLIST.index(w) for w in words])
i = self.rugbits_to_int([GOOGLE_WORDLIST.index(w) for w in words])
elif len(words) == 6:
i = self.bytes_to_int([HUMAN_WORDLIST.index(w) for w in words])

View File

@ -19,7 +19,7 @@
Find a location anywhere in the world identified by three simple words.
</p>
<p id="input3wordsContainer">
<input type="text" class="form-control" id="input3words" placeholder="engirt-aleutic-canun">
<input type="text" class="form-control" id="input3words" placeholder="treaty-crane-seldes">
</p>
<p>
<button id="button3words" class="btn btn-lg btn-primary">Find on Map</button>
@ -27,7 +27,7 @@
</main>
<script>
(function() {
var default3words = 'engirt-aleutic-canun';
var default3words = 'treaty-crane-seldes';
var threewordsField = document.getElementById('input3words');
document.getElementById('button3words').addEventListener('click', function(evt) {
var threewords = threewordsField.value;

27
words/README.md Normal file
View File

@ -0,0 +1,27 @@
Creating a word list
====================
Download the corpus from [google ngram][googlengram] with:
for a in a b c d e f g h i j k l m n o p q r s t u v w x y z; do
wget http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-$a.gz;
done
[googlengram]: http://storage.googleapis.com/books/ngrams/books/datasetsv2.html
Then filter the words like this:
for L in a b c d e f g h i j k l m n o p q r s t u v w x y z; do
gzcat googlebooks-eng-all-1gram-20120701-$L.gz | python ngram-filter.py > googlebooks-eng-all-1gram-20120701-$L-filtered;
done
To get a list of the top 300 words:
sort -n googlebooks-eng-all-1gram-20120701-*-filtered | tail -n 300
To create the wordlist used by `These3Words` run:
sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | tail -n32768 > google-ngram-list
Check that your list is long enough by counting the lines in
`google-ngram-list` (for example with `wc -l google-ngram-list`); you need exactly 32768 words.

32768
words/google-ngram-list Normal file

File diff suppressed because it is too large Load Diff

24
words/ngram-filter.py Normal file
View File

@ -0,0 +1,24 @@
"""Filter and reformat the google ngram corpus"""
import string
import fileinput
def filter_ngrams(lines, year="2008", min_count=40, min_length=4,
                  max_length=7):
    """Yield ``(count, word)`` for every 1-gram record worth keeping.

    Each input line is expected to hold four whitespace-separated fields:
    the word, the year, the match count and one further field that is
    ignored.  Lines with a different number of fields (for example blank
    lines) are skipped instead of aborting the whole run.

    A record is kept only if all of the following hold:

    * its year field equals ``year`` (compared as a string),
    * its count is at least ``min_count``,
    * ``min_length <= len(word) < max_length``,
    * its first two characters are not both unchanged by ``upper()``
      (this rejects all-caps words and words starting with two capitals),
    * it contains no ASCII punctuation and no digits.

    The defaults reproduce the values that were previously hard-coded.
    """
    rejected_chars = frozenset(string.punctuation + string.digits)

    for line in lines:
        fields = line.split()
        if len(fields) != 4:
            continue
        word, line_year, count, _ = fields

        # Check the year first so that the count is only parsed for the
        # records that can actually be kept.
        if line_year != year:
            continue

        count = int(count)
        if count < min_count:
            continue

        if not (min_length <= len(word) < max_length):
            continue

        # A fully upper-case word also has an upper-case two letter
        # prefix, so this single test covers both cases.
        if word[:2].upper() == word[:2]:
            continue

        if rejected_chars.intersection(word):
            continue

        yield count, word


def main():
    """Filter the files named on the command line (or stdin) to stdout."""
    import sys

    # Written through sys.stdout so the script runs unchanged on Python 2
    # and Python 3; the output format is "<count> <word>" per line.
    for count, word in filter_ngrams(fileinput.input()):
        sys.stdout.write("%d %s\n" % (count, word))


if __name__ == "__main__":
    main()

112
words/normalise-words.py Normal file
View File

@ -0,0 +1,112 @@
import fileinput
# Offensive words that must not appear in the generated word list.
# The literal is split on whitespace, so multi-word entries such as
# "blow job" contribute each of their words as a separate entry.
RUDEWORDS = """anal anus arse ass ballsack balls bastard bitch biatch bloody
blowjob blow job bollock bollok boner boob bugger bum butt buttplug
clitoris cock coon crap cunt damn dick dildo dyke fag feck fellate
fellatio felching fuck fudgepacker fudge packer flange Goddamn God
damn hell homo jerk jizz knobend knob end labia lmao lmfao muff nigger
nigga omg penis piss poop prick pube pussy queer scrotum sex shit sh1t
slut smegma spunk suicide tit tosser turd twat vagina wank whore wtf
xxx""".split()
# Words that sound similar to others (short list, split on whitespace
# into individual words).
HOMOPHONES = """there their than then hear here capital capitol won to too lose
loose dessert desert affect effect beech beet beat blew chili chilly
dear deer days daze die dye lie lye""".split()
# Longer list of homophone groups.  Members of a group are separated by
# commas; the commas are replaced by spaces before splitting, so every
# member of every group becomes its own entry.
# NOTE(review): "cymbol" looks like a typo for "cymbal" -- confirm.
MORE_HOMOPHONES = """accessary, accessory ad, add ail, ale air, heir
aisle, I'll, isle all, awl allowed, aloud alms, arms altar, alter arc,
ark aren't, aunt ate, eight auger, augur auk, orc aural, oral away,
aweigh awe, oar, or, ore axel, axle aye, eye, I bail, bale bait, bate
baize, bays bald, bawled ball, bawl band, banned bard, barred bare,
bear bark, barque baron, barren base, bass bay, bey bazaar, bizarre
be, bee beach, beech bean, been beat, beet beau, bow beer, bier bel,
bell, belle berry, bury berth, birth bight, bite, byte billed, build
bitten, bittern blew, blue bloc, block boar, bore board, bored
boarder, border bold, bowled boos, booze born, borne bough, bow boy,
buoy brae, bray braid, brayed braise, brays, braze brake, break bread,
bred brews, bruise bridal, bridle broach, brooch bur, burr but, butt
buy, by, bye buyer, byre calendar, calender call, caul canvas, canvass
cast, caste caster, castor caught, court caw, core, corps cede, seed
ceiling, sealing cell, sell censer, censor, sensor cent, scent, sent
cereal, serial cheap, cheep check, cheque choir, quire chord, cord
cite, sight, site clack, claque clew, clue climb, clime close, cloze
coal, kohl coarse, course coign, coin colonel, kernel complacent,
complaisant complement, compliment coo, coup cops, copse council,
counsel cousin, cozen creak, creek crews, cruise cue, kyu, queue curb,
kerb currant, current cymbol, symbol dam, damn days, daze dear, deer
descent, dissent desert, dessert deviser, divisor dew, due die, dye
discreet, discrete doe, doh, dough done, dun douse, dowse draft,
draught dual, duel earn, urn eery, eyrie ewe, yew, you faint, feint
fah, far fair, fare farther, father fate, fete faun, fawn fay, fey
faze, phase feat, feet ferrule, ferule few, phew fie, phi file, phial
find, fined fir, fur fizz, phiz flair, flare flaw, floor flea, flee
flex, flecks flew, flu, flue floe, flow flour, flower foaled, fold
for, fore, four foreword, forward fort, fought forth, fourth foul,
fowl franc, frank freeze, frieze friar, fryer furs, furze gait, gate
galipot, gallipot gallop, galop gamble, gambol gays, gaze genes, jeans
gild, guild gilt, guilt giro, gyro gnaw, nor gneiss, nice gorilla,
guerilla grate, great greave, grieve greys, graze grisly, grizzly
groan, grown guessed, guest hail, hale hair, hare hall, haul hangar,
hanger hart, heart haw, hoar, whore hay, hey heal, heel, he'll hear,
here heard, herd he'd, heed heroin, heroine hew, hue hi, high higher,
hire him, hymn ho, hoe hoard, horde hoarse, horse holey, holy, wholly
hour, our idle, idol in, inn indict, indite it's, its jewel, joule
key, quay knave, nave knead, need knew, new knight, night knit, nit
knob, nob knock, nock knot, not know, no knows, nose laager, lager
lac, lack lade, laid lain, lane lam, lamb laps, lapse larva, lava
lase, laze law, lore lay, ley lea, lee leach, leech lead, led leak,
leek lean, lien lessen, lesson levee, levy liar, lyre licence, license
licker, liquor lie, lye lieu, loo links, lynx lo, low load, lode loan,
lone locks, lox loop, loupe loot, lute made, maid mail, male main,
mane maize, maze mall, maul manna, manner mantel, mantle mare, mayor
mark, marque marshal, martial marten, martin mask, masque maw, more
me, mi mean, mien meat, meet, mete medal, meddle metal, mettle meter,
metre might, mite miner, minor, mynah mind, mined missed, mist moat,
mote mode, mowed moor, more moose, mousse morning, mourning muscle,
mussel naval, navel nay, neigh nigh, nye none, nun od, odd ode, owed
oh, owe one, won packed, pact packs, pax pail, pale pain, pane pair,
pare, pear palate, palette, pallet pascal, paschal paten, patten,
pattern pause, paws, pores, pours pawn, porn pea, pee peace, piece
peak, peek, peke, pique peal, peel pearl, purl pedal, peddle peer,
pier pi, pie pica, pika place, plaice plain, plane pleas, please plum,
plumb pole, poll poof, pouffe practice, practise praise, prays, preys
principal, principle profit, prophet quarts, quartz quean, queen rain,
reign, rein raise, rays, raze rap, wrap raw, roar read, reed read, red
real, reel reek, wreak rest, wrest retch, wretch review, revue rheum,
room right, rite, wright, write ring, wring road, rode roe, row role,
roll roo, roux, rue rood, rude root, route rose, rows rota, rotor
rote, wrote rough, ruff rouse, rows rung, wrung rye, wry saver, savour
spade, spayed sale, sail sane, seine satire, satyr sauce, source saw,
soar, sore scene, seen scull, skull sea, see seam, seem sear, seer,
sere seas, sees, seize sew, so, sow shake, sheikh shear, sheer shoe,
shoo sic, sick side, sighed sign, sine sink, synch slay, sleigh sloe,
slow sole, soul some, sum son, sun sort, sought spa, spar staid,
stayed stair, stare stake, steak stalk, stork stationary, stationery
steal, steel stile, style storey, story straight, strait sweet, suite
swat, swot tacks, tax tale, tail talk, torque tare, tear taught, taut,
tort te, tea, tee team, teem tear, tier teas, tease terce, terse tern,
turn there, their, they're threw, through throes, throws throne,
thrown thyme, time tic, tick tide, tied tire, tyre to, too, two toad,
toed, towed told, tolled tole, toll ton, tun tor, tore tough, tuff
troop, troupe tuba, tuber vain, vane, vein vale, veil vial, vile wail,
wale, whale wain, wane waist, waste wait, weight waive, wave wall,
waul war, wore ware, wear, where warn, worn wart, wort watt, what wax,
whacks way, weigh, whey we, wee, whee weak, week we'd, weed weal,
we'll, wheel wean, ween weather, whether weaver, weever weir, we're
were, whirr wet, whet wheald, wheeled which, witch whig, wig while,
wile whine, wine whirl, whorl whirled, world whit, wit white, wight
who's, whose woe, whoa wood, would yaw, yore, your, you're yoke, yolk
you'll, yule""".replace(",", " ").split()
# Every word to filter out, as one flat list.  The three source lists
# overlap (e.g. "there"), so entries can occur more than once.
REMOVE = HOMOPHONES + MORE_HOMOPHONES + RUDEWORDS
def normalise(lines, remove=None):
    """Yield ``(count, word)`` pairs with the word lower-cased and vetted.

    Each input line is expected to hold two whitespace-separated fields,
    a count and a word.  Lines with a different number of fields (for
    example blank lines) are skipped.  The count is passed through
    untouched as a string.

    A word is dropped when its lower-cased form is in ``remove`` or when
    it is exactly four characters long.  ``remove`` defaults to the
    module-level ``REMOVE`` list.

    The blacklist comparison is case-insensitive: the word is lower-cased
    before the lookup, so a capitalised input such as "There" can no
    longer slip past the filter and be emitted as "there".
    """
    if remove is None:
        remove = REMOVE

    # Lower-cased set: constant-time lookups, and entries such as "God"
    # match regardless of how the input is capitalised.
    banned = frozenset(entry.lower() for entry in remove)

    # NOTE(review): two inputs that differ only in case still produce the
    # same output word twice; confirm whether the consumer of the list
    # tolerates duplicates.
    for line in lines:
        fields = line.split()
        if len(fields) != 2:
            continue
        count, word = fields

        word = word.lower()
        if word in banned:
            continue
        if len(word) == 4:
            continue

        yield count, word


def main():
    """Normalise the files named on the command line (or stdin) to stdout."""
    import sys

    # Written through sys.stdout so the script runs unchanged on Python 2
    # and Python 3; the output format is "<count> <word>" per line.
    for count, word in normalise(fileinput.input()):
        sys.stdout.write("%s %s\n" % (count, word))


if __name__ == "__main__":
    main()