{stdenv, fetchsvn, python, wordnet, writeScript}:

stdenv.mkDerivation rec {
  version = "542";
  name = "dict-db-wordnet-${version}";

  buildInputs = [python wordnet];
  convert = ./wordnet_structures.py;

  builder = writeScript "builder.sh" ''
    . ${stdenv}/setup
    ensureDir $out/share/dictd/
    cd $out/share/dictd
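
    # Every data.* file ships with a matching index.* file; the sed call
    # derives the index name, so $DATA accumulates "index data" pairs for
    # the converter below.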
    for i in ${wordnet}/dict/data.*; do
      DATA="$DATA `echo $i | sed -e s,data,index,` $i";
    done

    python ${convert} $DATA
    echo en_US.UTF-8 > locale
  '';

  meta = {
    description = "dictd-compatible version of WordNet";

    longDescription =
      '' WordNet® is a large lexical database of English. This package makes
         the WordNet data available to dictd and, by extension, for lookup
         with the dict command. '';

    homepage = http://wordnet.princeton.edu/;

    maintainers = [ stdenv.lib.maintainers.mornfall ];
    platforms = stdenv.lib.platforms.all;
  };
}
pkgs/servers/dict/wordnet_structures.py
#!/usr/bin/env python
# Copyright 2007 Sebastian Hagen
# This file is part of wordnet_tools.

# wordnet_tools is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2
# as published by the Free Software Foundation.

# wordnet_tools is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with wordnet_tools; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

# This program requires python >= 2.4.

# This program converts wordnet index/data file pairs into dict index/data
# files usable by dictd.
# This is basically a reimplementation of the wnfilter program by Rik Faith,
# which unfortunately doesn't work correctly for wordnet files in the newer
# formats. This version of wordnet_structures should parse wordnet 2.1 files
# correctly, and create output very similar to what wnfilter would have
# written.
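#
# Typical invocation (hypothetical file names; the Nix builder above passes
# the pairs automatically):
#   python wordnet_structures.py index.noun data.noun index.verb data.verb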

import datetime
from textwrap import TextWrapper

CAT_ADJECTIVE = 0
CAT_ADVERB = 1
CAT_NOUN = 2
CAT_VERB = 3

category_map = {
    'n': CAT_NOUN,
    'v': CAT_VERB,
    'a': CAT_ADJECTIVE,
    's': CAT_ADJECTIVE,   # adjective satellite; folded into adjectives
    'r': CAT_ADVERB
}


class WordIndex:
    def __init__(self, lemma, category, ptrs, synsets, tagsense_count):
        self.lemma = lemma
        self.category = category
        self.ptrs = ptrs
        self.synsets = synsets
        self.tagsense_count = tagsense_count

    @classmethod
    def build_from_line(cls, line_data, synset_map):
        # Index line layout (wnidx format):
        #   lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt
        #   synset_offset [synset_offset...]
        line_split = line_data.split()
        lemma = line_split[0]
        category = category_map[line_split[1]]
        synset_count = int(line_split[2], 10)
        ptr_count = int(line_split[3], 10)
        # The pointer symbols start at index 4, after the p_cnt field.
        ptrs = [line_split[i] for i in range(4, 4 + ptr_count)]
        tagsense_count = int(line_split[5 + ptr_count], 10)
        synsets = [synset_map[int(line_split[i], 10)] for i in range(6 + ptr_count, 6 + ptr_count + synset_count)]
        return cls(lemma, category, ptrs, synsets, tagsense_count)

    @classmethod
    def build_from_file(cls, f, synset_map, rv_base=None):
        if rv_base is None:
            rv = {}
        else:
            rv = rv_base

        for line in f:
            # Indented lines are the license header, not index entries.
            if line.startswith(' '):
                continue
            wi = cls.build_from_line(line, synset_map)
            word = wi.lemma.lower()
            if word not in rv:
                rv[word] = []
            rv[word].append(wi)
        return rv

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, (self.lemma, self.category, self.ptrs, self.synsets, self.tagsense_count))


class WordIndexDictFormatter(WordIndex):
    category_map_rev = {
        CAT_NOUN: 'n',
        CAT_VERB: 'v',
        CAT_ADJECTIVE: 'adj',
        CAT_ADVERB: 'adv'
    }
    linesep = '\n'
    LINE_WIDTH_MAX = 68
    prefix_fmtf_line_first = '%5s 1: '
    prefix_fmtn_line_first = ' '
    prefix_fmtf_line_nonfirst = '%5d: '
    prefix_fmtn_line_nonfirst = ' '

    def dict_str(self):
        tw = TextWrapper(width=self.LINE_WIDTH_MAX,
            initial_indent=(self.prefix_fmtf_line_first % self.category_map_rev[self.category]),
            subsequent_indent=self.prefix_fmtn_line_first)

        lines = tw.wrap(self.synsets[0].dict_str())
        i = 2
        for synset in self.synsets[1:]:
            tw = TextWrapper(width=self.LINE_WIDTH_MAX,
                initial_indent=(self.prefix_fmtf_line_nonfirst % i),
                subsequent_indent=self.prefix_fmtn_line_nonfirst)
            lines.extend(tw.wrap(synset.dict_str()))
            i += 1
        return self.linesep.join(lines)
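
    # Illustrative output shape (hypothetical glosses, a noun with two
    # senses): the first sense gets a category prefix, later senses get
    # numbered prefixes, each wrapped to 68 columns:
    #     n 1: first gloss text ...
    #     2: second gloss text ...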


class Synset:
    def __init__(self, offset, ss_type, words, ptrs, gloss, frames=()):
        self.offset = offset
        self.type = ss_type
        self.words = words
        self.ptrs = ptrs
        self.gloss = gloss
        self.frames = frames
        self.comments = []

    @classmethod
    def build_from_line(cls, line_data):
        # Data line layout (wndb format):
        #   synset_offset lex_filenum ss_type w_cnt word lex_id [word lex_id...]
        #   p_cnt [ptr...] [frames...] | gloss
        # where w_cnt is hexadecimal and each ptr is four fields.
        line_split = line_data.split()
        synset_offset = int(line_split[0], 10)
        ss_type = category_map[line_split[2]]
        word_count = int(line_split[3], 16)
        words = [line_split[i] for i in range(4, 4 + word_count*2, 2)]
        ptr_count = int(line_split[4 + word_count*2], 10)
        ptrs = [(line_split[i], line_split[i+1], line_split[i+2], line_split[i+3]) for i in range(5 + word_count*2, 4 + word_count*2 + ptr_count*4, 4)]

        # Verb synsets carry a frame count before the '|' gloss separator.
        tok = line_split[5 + word_count*2 + ptr_count*4]
        base = 6 + word_count*2 + ptr_count*4
        if tok != '|':
            frame_count = int(tok, 10)
            frames = [(int(line_split[i+1], 10), int(line_split[i+2], 16)) for i in range(base, base + frame_count*3, 3)]
            base += frame_count*3 + 1
        else:
            frames = []

        # Re-split with a bounded split so the gloss survives as one string;
        # exactly base tokens means there is nothing after the '|' separator.
        line_split2 = line_data.split(None, base)
        if len(line_split2) <= base:
            gloss = None
        else:
            gloss = line_split2[-1]

        return cls(synset_offset, ss_type, words, ptrs, gloss, frames)

    @classmethod
    def build_from_file(cls, f):
        rv = {}
        comments = []

        for line in f:
            if line.startswith(' '):
                # Indented lines are the license header; collect those that
                # begin with a line number, with that number stripped.
                line_s = line.lstrip().rstrip('\n')
                line_elements = line_s.split(None, 1)
                try:
                    int(line_elements[0])
                except ValueError:
                    continue
                if len(line_elements) == 1:
                    line_elements.append('')
                comments.append(line_elements[1])
                continue
            synset = cls.build_from_line(line.rstrip())
            rv[synset.offset] = synset

        return (rv, comments)

    def dict_str(self):
        rv = self.gloss
        if len(self.words) > 1:
            rv += ' [syn: %s]' % (', '.join([('{%s}' % word) for word in self.words]))
        return rv
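
    # For example (hypothetical synset): a gloss of "a tart fruit" with
    # words ['crabapple', 'crab'] yields:
    #   a tart fruit [syn: {crabapple}, {crab}]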

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, (self.offset, self.type, self.words, self.ptrs, self.gloss, self.frames))


class WordnetDict:
    db_info_fmt = '''This file was converted from the original database on:
 %(conversion_datetime)s

The original data is available from:
 %(wn_url)s

The original data was distributed with the notice shown below. No
additional restrictions are claimed. Please redistribute this changed
version under the same conditions and restrictions that apply to the
original version.\n\n
%(wn_license)s'''

    datetime_fmt = '%Y-%m-%dT%H:%M:%S'
    base64_map = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'

    def __init__(self, wn_url, desc_short, desc_long):
        self.word_data = {}
        self.wn_url = wn_url
        self.desc_short = desc_short
        self.desc_long = desc_long
        self.wn_license = None

    def wn_dict_add(self, file_index, file_data):
        file_data.seek(0)
        file_index.seek(0)
        (synsets, license_lines) = Synset.build_from_file(file_data)
        WordIndexDictFormatter.build_from_file(file_index, synsets, self.word_data)
        if license_lines:
            self.wn_license = '\n'.join(license_lines) + '\n'

    @classmethod
    def base64_encode(cls, i):
        """Encode a non-negative integer into a dictd-compatible base64 string."""
        if i < 0:
            raise ValueError('Value %r for i is negative' % (i,))
        # Determine how many base64 digits are needed to represent i.
        r = 63
        e = 1
        while r < i:
            e += 1
            r = 64**e - 1

        # Emit digits from the most significant position down.
        rv = ''
        while e > 0:
            e -= 1
            d = i // 64**e
            rv += cls.base64_map[d]
            i = i % (64**e)
        return rv
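
    # Worked examples: base64_encode(0) == 'A', base64_encode(63) == '/',
    # base64_encode(64) == 'BA' -- the same encoding dictd uses for its
    # index offsets and lengths.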

    @classmethod
    def dict_entry_write(cls, file_index, file_data, key, entry, linesep='\n'):
        """Write a single dict entry for <key> to index and data files."""
        entry_start = file_data.tell()
        file_data.write(entry)
        entry_len = len(entry)
        file_index.write('%s\t%s\t%s%s' % (key, cls.base64_encode(entry_start),
            cls.base64_encode(entry_len), linesep))
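
    # An index line is "<headword>\t<offset>\t<length>" with both numbers in
    # the base64 encoding above, e.g. a 42-byte entry at offset 1000 for the
    # (hypothetical) headword "apple" becomes "apple\tPo\tq".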

    def dict_generate(self, file_index, file_data):
        file_index.seek(0)
        file_data.seek(0)
        # The dictd file format is fairly iffy on the subject of special
        # headwords: either dictd is buggy, or the manpage doesn't tell the
        # whole story about the format.
        # The upshot is that the order of these entries in the index *matters*.
        # Putting them at the beginning and in alphabetic order is, as far as
        # I can tell, ok. Some other orders completely and quietly break the
        # ability to look those headwords up.
        # -- problem encountered with 1.10.2, at 2007-08-05.
        file_data.write('\n')
        wn_url = self.wn_url
        conversion_datetime = datetime.datetime.now().strftime(self.datetime_fmt)
        wn_license = self.wn_license
        self.dict_entry_write(file_index, file_data, '00-database-info', '00-database-info\n%s\n' % (self.db_info_fmt % vars()))
        self.dict_entry_write(file_index, file_data, '00-database-long', '00-database-long\n%s\n' % self.desc_long)
        self.dict_entry_write(file_index, file_data, '00-database-short', '00-database-short\n%s\n' % self.desc_short)
        self.dict_entry_write(file_index, file_data, '00-database-url', '00-database-url\n%s\n' % self.wn_url)

        words = self.word_data.keys()
        words.sort()
        for word in words:
            for wi in self.word_data[word]:
                word_cs = word
                # Use the case-sensitivity information of the first entry of
                # the first synset that matches this word case-insensitively;
                # the for/else ladder breaks out of all three loops on the
                # first match.
                for synset in wi.synsets:
                    for ss_word in synset.words:
                        if ss_word.lower() == word_cs.lower():
                            word_cs = ss_word
                            break
                    else:
                        continue
                    break
                else:
                    continue
                break

            outstr = ''
            for wi in self.word_data[word]:
                outstr += wi.dict_str() + '\n'

            outstr = '%s%s%s' % (word_cs, wi.linesep, outstr)
            self.dict_entry_write(file_index, file_data, word_cs, outstr, wi.linesep)

        file_index.truncate()
        file_data.truncate()


if __name__ == '__main__':
    import optparse
    op = optparse.OptionParser(usage='usage: %prog [options] (<wn_index_file> <wn_data_file>)+')
    op.add_option('-i', '--outindex', dest='oi', default='wn.index', help='filename of index file to write to')
    op.add_option('-d', '--outdata', dest='od', default='wn.dict', help='filename of data file to write to')
    op.add_option('--wn_url', dest='wn_url', default='ftp://ftp.cogsci.princeton.edu/pub/wordnet/2.0', help='URL for wordnet sources')
    op.add_option('--db_desc_short', dest='desc_short', default=' WordNet (r) 2.1 (2005)', help='short dict DB description')
    op.add_option('--db_desc_long', dest='desc_long', default=' WordNet (r): A Lexical Database for English from the\n Cognitive Science Laboratory at Princeton University', help='long dict DB description')

    (options, args) = op.parse_args()

    wnd = WordnetDict(wn_url=options.wn_url, desc_short=options.desc_short, desc_long=options.desc_long)

    # Input files are consumed in (index, data) pairs, in argument order.
    for i in range(0, len(args), 2):
        print 'Opening index file %r...' % args[i]
        file_index = file(args[i])
        print 'Opening data file %r...' % args[i+1]
        file_data = file(args[i+1])
        print 'Parsing index file and data file...'
        wnd.wn_dict_add(file_index, file_data)

    print 'All input files parsed. Writing output to index file %r and data file %r.' % (options.oi, options.od)

    wnd.dict_generate(file(options.oi, 'w'), file(options.od, 'w'))
    print 'All done.'