#! /usr/bin/env python
# OED - Search the online Oxford English Dictionary from the
# command line
# Written by Matthew Hall
# OED is public domain software.
# -matt

import httplib
import htmllib
import formatter
import urllib
import mimetools
import re
import sys
import getopt
import string

# Regular expression for the 8-digit substrings that find_nums collects.
numpattern='[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]'

# formats (...(flagname,queryname)...)
formats=(('p','p'),('e','d'),('s','sp'),('q','qt'),('t','ct'),('a','ad'))


def make_fmt(flags):
    """Build the query-string fragment that selects the entry sections.

    flags is a container of flag names (the first item of each pair in
    ``formats``).  Every query name in ``formats`` appears in the result,
    set to 1 when its flag is in flags and to 0 otherwise, joined with '&'.
    """
    pairs = []
    for flagname, queryname in formats:
        if flagname in flags:
            pairs.append(queryname + '=1')
        else:
            pairs.append(queryname + '=0')
    return '&'.join(pairs)


# literalize a char for urls, e.g. '?' -> '%3F'
def lit(ch):
    """Return the single character ch as a URL percent-escape.

    The character code is written as upper-case hex, zero-padded to two
    digits so that codes below 0x10 still give a well-formed escape
    (a tab becomes '%09', not '%9').
    """
    return '%%%02X' % ord(ch)


def literalize(str):
    """Return str with each '?', '%' and ' ' replaced by its URL escape.

    All other characters are copied through unchanged.
    """
    escaped = []
    for ch in str:
        if ch in '?% ':
            escaped.append(lit(ch))
        else:
            escaped.append(ch)
    return ''.join(escaped)


def stemp(dat):
    """Write dat to the file 'tmp' in the current directory, replacing it."""
    f = open('tmp', 'w')
    try:
        f.write(dat)
    finally:
        # Close the handle even when the write fails.
        f.close()


#Exception thrown by getpage if all is not well (result!=200)
class httperr(Exception):
    """Carries the three values handed over by the raiser.

    Derives from Exception so that it can be raised and caught like any
    other error.  The arguments are stored as err_code, err_msg and err_hdr
    (presumably status code, message and headers -- confirm against
    getpage).
    """

    def __init__(self, c, m, h):
        Exception.__init__(self, c, m, h)
        self.err_code = c
        self.err_msg = m
        self.err_hdr = h


# Warning- Lots of assumptions about how OED works here:
# find_nums scans the text str (which is a page of word entries,
# or a redirection if only one entry exists in oed) for all 8 digit
# substrings, which I assume to be (and only to be) database references
# to oed definitions
def find_nums(str):
    """Return every non-overlapping match of numpattern in str, in order."""
    return re.findall(numpattern, str)


# elim_trips. Another assumption I made is that if an entry id (one of
# them 8-digit things) is unique on the entry page, then it is an entry
# for that word. Seemingly, oed will also list other entries if those
# entries also contain that word: For instance, 'fever' produces 3
# entries to defs of fever (n,v,ad), but also refs to 'seasoning' and
# 'parrot' (spicy and psittacosis? who knows.).
# These pseudo entries seem to have their entry_id 3 times (successively)
# in the page, whereas true entries seem to have their id listed only once.
# This routine takes a list of entry id's and removes all (sequential)
# triplicates.
#
# FIXME(review): the text of this file is damaged at this point.  What is
# left of elim_trips and of the code that followed it is kept verbatim in
# the fragment below; everything between "while i" and "OED Online -" is
# gone.  The fragment and the code after it call elim_trips, uniquify,
# find_entries, getpage, print_page and print_wordlist, none of which is
# defined in the surviving text.  They have to be restored from an intact
# copy before this script can run.
#
#   def elim_trips(lst): i=0 result=[] while iOED Online -",dat)
#   pos2=re.search("",dat) print 'One entry found:'
#   print '\t'+dat[pos1.end():pos2.start()] else:
#   dat=string.replace(dat,"","
#
#   ") print_page(file,dat)


def _render_html(file, dat):
    """Feed the HTML text dat through a plain-text writer attached to file."""
    writer = formatter.DumbWriter(file)
    parser = htmllib.HTMLParser(formatter.AbstractFormatter(writer))
    parser.feed(dat)


def print_wordentries(file,q):
    """Write every entry found for q.queryword to file.

    The ids come from find_entries(q.queryword, q.max, q.first) and are
    passed through uniquify when q.extras is set, through elim_trips
    otherwise.  When nothing is left a message goes to stderr and nothing
    is written to file.
    """
    entries = find_entries(q.queryword, q.max, q.first)
    if q.extras:
        entries = uniquify(entries)
    else:
        entries = elim_trips(entries)
    if not entries:
        sys.stderr.write("Word '"+q.queryword+"' not found.\n")
        return
    if len(entries) == 1:
        file.write("1 entry for "+q.queryword+"\n")
    else:
        file.write("%d entries for %s\n" % (len(entries), q.queryword))
    idx = 0
    for entry_id in entries:
        idx = idx + 1
        file.write("Entry #%d\n" % idx)
        output_entry(file, entry_id, q.fmt)


# Note- tags for entry:
#   p=1&d=0&sp=1&qt=0&ct=0&ad=1&print=1
#   p  = pronunciation
#   d  = derivation
#   sp = spelling
#   qt = quotes
#   ct = timeline
#   ad = additions to 2nd ed.
def output_entry(file,entry_id,fmt=""):
    """Fetch the entry entry_id and write it to file as plain text.

    fmt is the tag string described above; it is placed in the query part
    of the request ahead of '&print=1'.  If getpage hands back nothing, a
    message is written to stderr instead (the same guard wotd applies).
    """
    page = '/cgi/entry/'+entry_id+'?'+fmt+'&print=1'
    dat = getpage('dictionary.oed.com', page)
    if not dat:
        sys.stderr.write("Entry "+entry_id+" could not be retrieved.\n")
        return
    _render_html(file, dat)


def prt(file,q):
    """Run the query q, as a word listing or as full entries.

    A q.max of -1 means "not given" and is replaced by the default: 100
    for listings, 20 for full entries.  Note that q is modified in place.
    """
    if q.max == -1:
        if q.listing:
            q.max = 100
        else:
            q.max = 20
    if q.listing:
        print('Printing wordlist:')
        print_wordlist(file, q)
    else:
        print('Printing entries:')
        print_wordentries(file, q)


def wotd(file,fmt):
    """Fetch the word-of-the-day page and write it to file as plain text.

    fmt is the tag string appended to the request.  If getpage hands back
    nothing, a message is written to stderr instead.
    """
    dat = getpage('dictionary.oed.com', '/cgi/display/wotd?print=1&'+fmt)
    if dat:
        _render_html(file, dat)
    else:
        sys.stderr.write("Word of the day not working.\n")


def usage():
    """Print the command-line help text to stdout."""
    # NOTE(review): the damaged source had gaps where the "<word>"
    # placeholders below now stand, and its line breaks were lost; both
    # were restored from context.  Confirm against an intact copy.
    print("""
Usage: oed
       oed [-p][-e][-s][-q][-t][-a][-x] [-m #] [-f #] <word>|wotd
       oed -l [-a][-m #] [-f #] <word>

where the flags enable printing of:
  p: pronunciation
  e: etymology
  s: alternate spellings
  q: quotes using the word
  t: graphical timeline. It ain't pretty yet.
  a: additions to the OED since the 2nd ed.
  x: related definitions (eXtras). These are entries which
     contain <word>, though I'm not sure how OED Online chooses.
     For instance, the only extra in 'oed -x the' is 'overrun'.
     You can verify this on their webpage.
     You may need -x to find suffixed words (i.e. posterized only
     shows up with -x) In general, strip off suffixes before you
     search.

oed -l prints only a listing of the entries found in OED,without
printing any definitions or other options.

For both options, '-m #' specifies the maximum number of entries you
wish displayed. The default is 20 for full definitions and 100 for
word listings. Similarly, '-f #' specifies the first entry to show.
I.e. 'oed -m 1 -f 1 fever' will show only the 2nd entry for fever.

The default is to print the definition only, without even the recent
additions. I may change this.

<word> can be either a word you wish to look up, or you can check out
OED's word of the day by entering 'wotd'. If you need to convince
yourself that 'wotd' is not a word, try: oed "wotd*".

Wildcards are okay, but you must surround the search string in quotes,
i.e. oed "fi?er" to return the defs of filer,finer,fiver,etc.

Send bug reports/accolades to mahall@ncsa.uiuc.edu.
""")


class oed_query:
    """The settings of one lookup, as filled in by parse_args."""

    def __init__(self):
        self.fmt = ""        # tag string produced by make_fmt
        self.extras = 0      # 1 when -x was given
        self.queryword = ""  # first non-option argument
        self.max = -1        # value of -m; -1 until prt picks a default
        self.first = 1       # value of -f plus one
        self.listing = 0     # 1 when -l was given


def parse_args(arglist):
    """Turn the command-line arguments arglist into an oed_query.

    Prints the usage text and exits with status 2 when an option is
    unknown, when -m or -f is not followed by an integer, or when no
    query word is given.
    """
    try:
        optlist, args = getopt.getopt(arglist, 'pesqtaxlm:f:')
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    # Create new query object
    result = oed_query()

    # Collect the letters of all options given without a value; make_fmt
    # picks the ones it knows out of this list.
    flags = []
    try:
        for name, value in optlist:
            if not value:
                flags.append(name[1])
            if name == '-x':
                result.extras = 1
            elif name == '-l':
                result.listing = 1
            elif name == '-f':
                result.first = int(value) + 1  # humans begin at 1.
            elif name == '-m':
                result.max = int(value)
    except ValueError:
        # -m / -f carried something that is not an integer.
        usage()
        sys.exit(2)

    # read in query word
    if not args:
        usage()
        sys.exit(2)
    result.queryword = args[0]
    result.fmt = make_fmt(flags)
    return result


if __name__=='__main__':
    query = parse_args(sys.argv[1:])
    if query.queryword == "":
        usage()
        # Same exit status as the other usage errors in parse_args.
        sys.exit(2)
    if query.queryword == 'wotd':
        wotd(sys.stdout, query.fmt)
    else:
        prt(sys.stdout, query)