Added a word list generator using google ngram as input
This commit is contained in:
parent
6902b59dbf
commit
9c9f32c2a7
|
@ -52,3 +52,6 @@ docs/_build/
|
|||
|
||||
# PyBuilder
|
||||
target/
|
||||
|
||||
# google ngram input files
|
||||
googlebooks-eng-all-1gram-20120701*
|
||||
|
|
|
@ -16,7 +16,7 @@ example
|
|||
|
||||
>>> three = these.three_words(CERN)
|
||||
>>> print three
|
||||
'engirt-aleutic-canun'
|
||||
'treaty-crane-seldes'
|
||||
>>> these.decode(three)
|
||||
(46.232335567474365, 6.055419445037842)
|
||||
|
||||
|
@ -74,7 +74,7 @@ have similar `these-3-words` hashes
|
|||
>>> other_CERN_site = (46.256811, 6.056792)
|
||||
>>> six = these.six_words(other_CERN_site)
|
||||
>>> print six
|
||||
''spaghetti-carolina-kentucky-utah-seventeen-neptune'
|
||||
'spaghetti-carolina-kentucky-utah-seventeen-neptune'
|
||||
>>> these.decode(six)
|
||||
(46.256797313690186, 6.056792736053467)
|
||||
|
||||
|
|
|
@ -8,21 +8,18 @@ import random
|
|||
import geohash
|
||||
|
||||
|
||||
def get_random_words():
|
||||
words = open("/usr/share/dict/words")
|
||||
random.seed(3346346)
|
||||
useful = []
|
||||
for w in words:
|
||||
w = w.strip()
|
||||
if 5 <= len(w) < 8:
|
||||
useful.append(w.lower())
|
||||
def get_google_words():
|
||||
lines = open("words/google-ngram-list")
|
||||
words = []
|
||||
for line in lines:
|
||||
_, word = line.split()
|
||||
words.append(word)
|
||||
|
||||
words.close()
|
||||
useful = useful[:2**15]
|
||||
random.shuffle(useful)
|
||||
assert len(useful) == 2**15
|
||||
return useful
|
||||
RANDOM_WORDLIST = get_random_words()
|
||||
lines.close()
|
||||
random.seed(634634)
|
||||
random.shuffle(words)
|
||||
return words
|
||||
GOOGLE_WORDLIST = get_google_words()
|
||||
|
||||
# Human friendly word list, taken directly from humanhash project
|
||||
HUMAN_WORDLIST = (
|
||||
|
@ -79,7 +76,7 @@ class WordHasher(object):
|
|||
in degrees.
|
||||
"""
|
||||
gh = geohash.encode(lat, lon, 9)
|
||||
words = "-".join(RANDOM_WORDLIST[p] for p in self.to_rugbits(self.geo_to_int(gh)))
|
||||
words = "-".join(GOOGLE_WORDLIST[p] for p in self.to_rugbits(self.geo_to_int(gh)))
|
||||
return words
|
||||
|
||||
def six_words(self, (lat, lon)):
|
||||
|
@ -99,7 +96,7 @@ class WordHasher(object):
|
|||
"""Decode words back to latitude and longitude"""
|
||||
words = words.split("-")
|
||||
if len(words) == 3:
|
||||
i = self.rugbits_to_int([RANDOM_WORDLIST.index(w) for w in words])
|
||||
i = self.rugbits_to_int([GOOGLE_WORDLIST.index(w) for w in words])
|
||||
|
||||
elif len(words) == 6:
|
||||
i = self.bytes_to_int([HUMAN_WORDLIST.index(w) for w in words])
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
Find a location anywhere in the world identified by three simple words.
|
||||
</p>
|
||||
<p id="input3wordsContainer">
|
||||
<input type="text" class="form-control" id="input3words" placeholder="engirt-aleutic-canun">
|
||||
<input type="text" class="form-control" id="input3words" placeholder="treaty-crane-seldes">
|
||||
</p>
|
||||
<p>
|
||||
<button id="button3words" class="btn btn-lg btn-primary">Find on Map</button>
|
||||
|
@ -27,7 +27,7 @@
|
|||
</main>
|
||||
<script>
|
||||
(function() {
|
||||
var default3words = 'engirt-aleutic-canun';
|
||||
var default3words = 'treaty-crane-seldes';
|
||||
var threewordsField = document.getElementById('input3words');
|
||||
document.getElementById('button3words').addEventListener('click', function(evt) {
|
||||
var threewords = threewordsField.value;
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
Creating a word list
|
||||
====================
|
||||
|
||||
Download the corpus from [google ngram][googlengram] with:
|
||||
|
||||
for a in a b c d e f g h i j k l m n o p q r s t u v w x y z; do
|
||||
wget http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-$a.gz;
|
||||
done
|
||||
|
||||
[googlengram]: http://storage.googleapis.com/books/ngrams/books/datasetsv2.html
|
||||
|
||||
then you can filter the words like this:
|
||||
|
||||
for L in a b c d e f g h i j k l m n o p q r s t u v w x y z; do
|
||||
gzcat googlebooks-eng-all-1gram-20120701-$L.gz | python ngram-filter.py > googlebooks-eng-all-1gram-20120701-$L-filtered;
|
||||
done
|
||||
|
||||
To get a list of the top 300 words:
|
||||
|
||||
sort -n googlebooks-eng-all-1gram-20120701-*-filtered | tail -n 300
|
||||
|
||||
To create the wordlist used by `These3Words` run:
|
||||
|
||||
sort -n googlebooks-eng-all-1gram-20120701-*-filtered | python normalise-words.py | tail -n32768 > google-ngram-list
|
||||
|
||||
Check that your list is long enough by counting the lines
|
||||
in `google-ngram-list`; you need exactly 32768 words.
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,24 @@
|
|||
"""Filter and reformat the google ngram corpus"""
|
||||
import string
|
||||
import fileinput
|
||||
|
||||
def filter_lines(lines, year="2008", min_count=40):
    """Yield ``(count, word)`` for every corpus line that passes the filter.

    Each line must consist of exactly four whitespace-separated fields,
    read here as word, year, count and one ignored trailing field.

    A line is kept only when all of the following hold:

    * its year field equals ``year``;
    * its count is at least ``min_count``;
    * the word is 4, 5 or 6 characters long;
    * the word is not all upper case and does not start with two
      upper-case characters;
    * the word contains no ASCII punctuation or digits.

    Args:
        lines: iterable of text lines.
        year: year to keep, compared as a string (default ``"2008"``).
        min_count: smallest count that is still kept (default ``40``).

    Yields:
        ``(count, word)`` tuples, with ``count`` converted to ``int`` and
        ``word`` exactly as it appeared in the input.

    Raises:
        ValueError: if a line does not split into exactly four fields, or
            if the count of a line for the wanted year is not an integer.
    """
    rejected_chars = frozenset(string.punctuation + string.digits)

    for line in lines:
        word, line_year, count, _ = line.strip().split()

        if line_year != year:
            continue

        # Converted only after the year check, as in the original script.
        count = int(count)
        if count < min_count:
            continue

        if not (4 <= len(word) < 7):
            continue

        # Drops acronyms and anything starting with two capitals.
        if word.upper() == word or word[:2].upper() == word[:2]:
            continue

        if any(c in rejected_chars for c in word):
            continue

        yield count, word


def main():
    """Filter the input given via ``fileinput`` and print ``count word`` lines."""
    for count, word in filter_lines(fileinput.input()):
        # A single-argument print() call produces the same output on
        # Python 2 and Python 3, unlike the ``print a, b`` statement.
        print("%d %s" % (count, word))


if __name__ == "__main__":
    main()
|
|
@ -0,0 +1,112 @@
|
|||
import fileinput
|
||||
|
||||
|
||||
# Offensive words that must never appear in the generated word list.
RUDEWORDS = """anal anus arse ass ballsack balls bastard bitch biatch bloody
blowjob blow job bollock bollok boner boob bugger bum butt buttplug
clitoris cock coon crap cunt damn dick dildo dyke fag feck fellate
fellatio felching fuck fudgepacker fudge packer flange Goddamn God
damn hell homo jerk jizz knobend knob end labia lmao lmfao muff nigger
nigga omg penis piss poop prick pube pussy queer scrotum sex shit sh1t
slut smegma spunk suicide tit tosser turd twat vagina wank whore wtf
xxx""".split()

# Words that sound similar to others
HOMOPHONES = """there their than then hear here capital capitol won to too lose
loose dessert desert affect effect beech beet beat blew chili chilly
dear deer days daze die dye lie lye""".split()

# Longer homophone list; groups are comma separated in the text and the
# commas are stripped before splitting into individual words.
MORE_HOMOPHONES = """accessary, accessory ad, add ail, ale air, heir
aisle, I'll, isle all, awl allowed, aloud alms, arms altar, alter arc,
ark aren't, aunt ate, eight auger, augur auk, orc aural, oral away,
aweigh awe, oar, or, ore axel, axle aye, eye, I bail, bale bait, bate
baize, bays bald, bawled ball, bawl band, banned bard, barred bare,
bear bark, barque baron, barren base, bass bay, bey bazaar, bizarre
be, bee beach, beech bean, been beat, beet beau, bow beer, bier bel,
bell, belle berry, bury berth, birth bight, bite, byte billed, build
bitten, bittern blew, blue bloc, block boar, bore board, bored
boarder, border bold, bowled boos, booze born, borne bough, bow boy,
buoy brae, bray braid, brayed braise, brays, braze brake, break bread,
bred brews, bruise bridal, bridle broach, brooch bur, burr but, butt
buy, by, bye buyer, byre calendar, calender call, caul canvas, canvass
cast, caste caster, castor caught, court caw, core, corps cede, seed
ceiling, sealing cell, sell censer, censor, sensor cent, scent, sent
cereal, serial cheap, cheep check, cheque choir, quire chord, cord
cite, sight, site clack, claque clew, clue climb, clime close, cloze
coal, kohl coarse, course coign, coin colonel, kernel complacent,
complaisant complement, compliment coo, coup cops, copse council,
counsel cousin, cozen creak, creek crews, cruise cue, kyu, queue curb,
kerb currant, current cymbal, symbol dam, damn days, daze dear, deer
descent, dissent desert, dessert deviser, divisor dew, due die, dye
discreet, discrete doe, doh, dough done, dun douse, dowse draft,
draught dual, duel earn, urn eery, eyrie ewe, yew, you faint, feint
fah, far fair, fare farther, father fate, fete faun, fawn fay, fey
faze, phase feat, feet ferrule, ferule few, phew fie, phi file, phial
find, fined fir, fur fizz, phiz flair, flare flaw, floor flea, flee
flex, flecks flew, flu, flue floe, flow flour, flower foaled, fold
for, fore, four foreword, forward fort, fought forth, fourth foul,
fowl franc, frank freeze, frieze friar, fryer furs, furze gait, gate
galipot, gallipot gallop, galop gamble, gambol gays, gaze genes, jeans
gild, guild gilt, guilt giro, gyro gnaw, nor gneiss, nice gorilla,
guerilla grate, great greave, grieve greys, graze grisly, grizzly
groan, grown guessed, guest hail, hale hair, hare hall, haul hangar,
hanger hart, heart haw, hoar, whore hay, hey heal, heel, he'll hear,
here heard, herd he'd, heed heroin, heroine hew, hue hi, high higher,
hire him, hymn ho, hoe hoard, horde hoarse, horse holey, holy, wholly
hour, our idle, idol in, inn indict, indite it's, its jewel, joule
key, quay knave, nave knead, need knew, new knight, night knit, nit
knob, nob knock, nock knot, not know, no knows, nose laager, lager
lac, lack lade, laid lain, lane lam, lamb laps, lapse larva, lava
lase, laze law, lore lay, ley lea, lee leach, leech lead, led leak,
leek lean, lien lessen, lesson levee, levy liar, lyre licence, license
licker, liquor lie, lye lieu, loo links, lynx lo, low load, lode loan,
lone locks, lox loop, loupe loot, lute made, maid mail, male main,
mane maize, maze mall, maul manna, manner mantel, mantle mare, mayor
mark, marque marshal, martial marten, martin mask, masque maw, more
me, mi mean, mien meat, meet, mete medal, meddle metal, mettle meter,
metre might, mite miner, minor, mynah mind, mined missed, mist moat,
mote mode, mowed moor, more moose, mousse morning, mourning muscle,
mussel naval, navel nay, neigh nigh, nye none, nun od, odd ode, owed
oh, owe one, won packed, pact packs, pax pail, pale pain, pane pair,
pare, pear palate, palette, pallet pascal, paschal paten, patten,
pattern pause, paws, pores, pours pawn, porn pea, pee peace, piece
peak, peek, peke, pique peal, peel pearl, purl pedal, peddle peer,
pier pi, pie pica, pika place, plaice plain, plane pleas, please plum,
plumb pole, poll poof, pouffe practice, practise praise, prays, preys
principal, principle profit, prophet quarts, quartz quean, queen rain,
reign, rein raise, rays, raze rap, wrap raw, roar read, reed read, red
real, reel reek, wreak rest, wrest retch, wretch review, revue rheum,
room right, rite, wright, write ring, wring road, rode roe, row role,
roll roo, roux, rue rood, rude root, route rose, rows rota, rotor
rote, wrote rough, ruff rouse, rows rung, wrung rye, wry saver, savour
spade, spayed sale, sail sane, seine satire, satyr sauce, source saw,
soar, sore scene, seen scull, skull sea, see seam, seem sear, seer,
sere seas, sees, seize sew, so, sow shake, sheikh shear, sheer shoe,
shoo sic, sick side, sighed sign, sine sink, synch slay, sleigh sloe,
slow sole, soul some, sum son, sun sort, sought spa, spar staid,
stayed stair, stare stake, steak stalk, stork stationary, stationery
steal, steel stile, style storey, story straight, strait sweet, suite
swat, swot tacks, tax tale, tail talk, torque tare, tear taught, taut,
tort te, tea, tee team, teem tear, tier teas, tease terce, terse tern,
turn there, their, they're threw, through throes, throws throne,
thrown thyme, time tic, tick tide, tied tire, tyre to, too, two toad,
toed, towed told, tolled tole, toll ton, tun tor, tore tough, tuff
troop, troupe tuba, tuber vain, vane, vein vale, veil vial, vile wail,
wale, whale wain, wane waist, waste wait, weight waive, wave wall,
waul war, wore ware, wear, where warn, worn wart, wort watt, what wax,
whacks way, weigh, whey we, wee, whee weak, week we'd, weed weal,
we'll, wheel wean, ween weather, whether weaver, weever weir, we're
were, whirr wet, whet wheald, wheeled which, witch whig, wig while,
wile whine, wine whirl, whorl whirled, world whit, wit white, wight
who's, whose woe, whoa wood, would yaw, yore, your, you're yoke, yolk
you'll, yule""".replace(",", " ").split()

REMOVE = HOMOPHONES + MORE_HOMOPHONES + RUDEWORDS

# Lower-cased lookup set: words are emitted in lower case, so the
# blacklist has to be matched in lower case too, and a set keeps the
# per-line membership test constant time.
_REMOVE_LOWER = frozenset(entry.lower() for entry in REMOVE)


def normalise_lines(lines):
    """Yield ``(count, word)`` pairs with the word lower-cased and vetted.

    Each input line must hold exactly two whitespace-separated fields,
    read as count and word. A line is dropped when its word, compared
    case-insensitively, appears in ``REMOVE``, or when the word is
    exactly four characters long.

    Args:
        lines: iterable of text lines.

    Yields:
        ``(count, word)`` tuples; ``count`` is the unchanged string from
        the input and ``word`` is lower-cased.

    Raises:
        ValueError: if a line does not split into exactly two fields.
    """
    for line in lines:
        count, word = line.split()
        lowered = word.lower()

        # Compare the lower-cased form: otherwise "Their" or a
        # capitalised rude word slips past the blacklist and is then
        # emitted in lower case.
        if lowered in _REMOVE_LOWER:
            continue

        if len(word) == 4:
            continue

        # NOTE(review): different capitalisations of one word ("House",
        # "house") each yield the same lower-cased output here; confirm
        # whether the resulting list needs de-duplicating downstream.
        yield count, lowered


def main():
    """Normalise the input given via ``fileinput`` and print ``count word`` lines."""
    for count, word in normalise_lines(fileinput.input()):
        # A single-argument print() call produces the same output on
        # Python 2 and Python 3, unlike the ``print a, b`` statement.
        print("%s %s" % (count, word))


if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue