{stdenv, fetchsvn, python, wordnet, writeScript}:

stdenv.mkDerivation rec {
  version = "542";
  name = "dict-db-wordnet-${version}";

  buildInputs = [python wordnet];
  convert = ./wordnet_structures.py;

  builder = writeScript "builder.sh" ''
    . ${stdenv}/setup
    ensureDir $out/share/dictd/
    cd $out/share/dictd
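
    # For every data.* file, derive the matching index.* name so that the
    # converter below receives its arguments as "indexfile datafile" pairs.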
    for i in ${wordnet}/dict/data.*; do
      DATA="$DATA `echo $i | sed -e s,data,index,` $i";
    done

    python ${convert} $DATA
    echo en_US.UTF-8 > locale
  '';

  meta = {
    description = "dictd-compatible version of WordNet";

    longDescription =
      '' WordNet® is a large lexical database of English. This package makes
         the WordNet data available to dictd and, by extension, to lookups
         with the dict command. '';

    homepage = http://wordnet.princeton.edu/;

    maintainers = [ stdenv.lib.maintainers.mornfall ];
    platforms = stdenv.lib.platforms.all;
  };
}

#!/usr/bin/env python
# Copyright 2007 Sebastian Hagen
# This file is part of wordnet_tools.

# wordnet_tools is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2
# as published by the Free Software Foundation

# wordnet_tools is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with wordnet_tools; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

# This program requires python >= 2.4.

# This program converts wordnet index/data file pairs into dict index/data
# files usable by dictd.
# This is basically a reimplementation of the wnfilter program by Rik Faith,
# which unfortunately doesn't work correctly for wordnet files in the newer
# formats. This version of wordnet_structures should parse wordnet 2.1 files
# correctly, and create output very similar to what wnfilter would have
# written.
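
# Example invocation (a sketch; the index.*/data.* names follow the standard
# WordNet distribution layout, and any number of index/data pairs may be given):
#   python wordnet_structures.py index.noun data.noun index.verb data.verb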

import datetime
from textwrap import TextWrapper

CAT_ADJECTIVE = 0
CAT_ADVERB = 1
CAT_NOUN = 2
CAT_VERB = 3

category_map = {
    'n': CAT_NOUN,
    'v': CAT_VERB,
    'a': CAT_ADJECTIVE,
    's': CAT_ADJECTIVE,  # adjective satellites are folded into plain adjectives
    'r': CAT_ADVERB
}


class WordIndex:
    def __init__(self, lemma, category, ptrs, synsets, tagsense_count):
        self.lemma = lemma
        self.category = category
        self.ptrs = ptrs
        self.synsets = synsets
        self.tagsense_count = tagsense_count

    @classmethod
    def build_from_line(cls, line_data, synset_map):
        # index.* line layout: lemma pos synset_cnt p_cnt [ptr_symbol...]
        # sense_cnt tagsense_cnt [synset_offset...]
        line_split = line_data.split()
        lemma = line_split[0]
        category = category_map[line_split[1]]
        synset_count = int(line_split[2], 10)
        ptr_count = int(line_split[3], 10)
        # The pointer symbols start at field 4; range(3, ...) would include
        # p_cnt itself and drop the last symbol.
        ptrs = [line_split[i] for i in range(4, 4 + ptr_count)]
        tagsense_count = int(line_split[5 + ptr_count], 10)
        synsets = [synset_map[int(line_split[i], 10)] for i in range(6 + ptr_count, 6 + ptr_count + synset_count)]
        return cls(lemma, category, ptrs, synsets, tagsense_count)

    @classmethod
    def build_from_file(cls, f, synset_map, rv_base=None):
        if rv_base is None:
            rv = {}
        else:
            rv = rv_base

        for line in f:
            if line.startswith('  '):
                continue
            wi = cls.build_from_line(line, synset_map)
            word = wi.lemma.lower()
            if word not in rv:
                rv[word] = []
            rv[word].append(wi)
        return rv

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, (self.lemma, self.category, self.ptrs, self.synsets, self.tagsense_count))


class WordIndexDictFormatter(WordIndex):
    category_map_rev = {
        CAT_NOUN: 'n',
        CAT_VERB: 'v',
        CAT_ADJECTIVE: 'adj',
        CAT_ADVERB: 'adv'
    }
    linesep = '\n'
    LINE_WIDTH_MAX = 68
    prefix_fmtf_line_first = '%5s 1: '
    prefix_fmtn_line_first = '         '   # nine spaces, the width of '%5s 1: '
    prefix_fmtf_line_nonfirst = '%5d: '
    prefix_fmtn_line_nonfirst = '       '  # seven spaces, the width of '%5d: '
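
    # With these prefixes an entry renders roughly as:
    #     n 1: first sense gloss, wrapped at LINE_WIDTH_MAX columns,
    #          continuation lines aligned under the gloss text
    #       2: second sense gloss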

    def dict_str(self):
        tw = TextWrapper(width=self.LINE_WIDTH_MAX,
                         initial_indent=(self.prefix_fmtf_line_first % self.category_map_rev[self.category]),
                         subsequent_indent=self.prefix_fmtn_line_first)

        lines = tw.wrap(self.synsets[0].dict_str())
        i = 2
        for synset in self.synsets[1:]:
            tw = TextWrapper(width=self.LINE_WIDTH_MAX,
                             initial_indent=(self.prefix_fmtf_line_nonfirst % i),
                             subsequent_indent=self.prefix_fmtn_line_nonfirst)
            lines.extend(tw.wrap(synset.dict_str()))
            i += 1
        return self.linesep.join(lines)


class Synset:
    def __init__(self, offset, ss_type, words, ptrs, gloss, frames=()):
        self.offset = offset
        self.type = ss_type
        self.words = words
        self.ptrs = ptrs
        self.gloss = gloss
        self.frames = frames
        self.comments = []

    @classmethod
    def build_from_line(cls, line_data):
        # data.* line layout: synset_offset lex_filenum ss_type w_cnt
        # (word lex_id)... p_cnt [ptr...] [frames...] | gloss
        line_split = line_data.split()
        synset_offset = int(line_split[0], 10)
        ss_type = category_map[line_split[2]]
        word_count = int(line_split[3], 16)   # w_cnt is hexadecimal
        words = [line_split[i] for i in range(4, 4 + word_count*2, 2)]
        ptr_count = int(line_split[4 + word_count*2], 10)
        ptrs = [(line_split[i], line_split[i+1], line_split[i+2], line_split[i+3]) for i in range(5 + word_count*2, 4 + word_count*2 + ptr_count*4, 4)]

        tok = line_split[5 + word_count*2 + ptr_count*4]
        base = 6 + word_count*2 + ptr_count*4
        if tok != '|':
            # Verb synsets carry a frame count followed by '+ f_num w_num' triples.
            frame_count = int(tok, 10)
            frames = [(int(line_split[i+1], 10), int(line_split[i+2], 16)) for i in range(base, base + frame_count*3, 3)]
            base += frame_count*3 + 1
        else:
            frames = []

        # After the reparse, field <base> is everything following the '|'
        # separator, i.e. the gloss; fewer than base+1 fields means no gloss.
        line_split2 = line_data.split(None, base)
        if len(line_split2) <= base:
            gloss = None
        else:
            gloss = line_split2[-1]

        return cls(synset_offset, ss_type, words, ptrs, gloss, frames)

    @classmethod
    def build_from_file(cls, f):
        rv = {}
        comments = []

        for line in f:
            if line.startswith('  '):
                # Header/license lines are indented and begin with a line
                # number; collect their text and skip anything else indented.
                line_s = line.lstrip().rstrip('\n')
                line_elements = line_s.split(None, 1)
                try:
                    int(line_elements[0])
                except ValueError:
                    continue
                if len(line_elements) == 1:
                    line_elements.append('')
                comments.append(line_elements[1])
                continue
            synset = cls.build_from_line(line.rstrip())
            rv[synset.offset] = synset

        return (rv, comments)

    def dict_str(self):
        rv = self.gloss
        if len(self.words) > 1:
            rv += ' [syn: %s]' % (', '.join([('{%s}' % word) for word in self.words]))
        return rv

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, (self.offset, self.type, self.words, self.ptrs, self.gloss, self.frames))


class WordnetDict:
    db_info_fmt = '''This file was converted from the original database on:
   %(conversion_datetime)s

The original data is available from:
   %(wn_url)s

The original data was distributed with the notice shown below. No
additional restrictions are claimed. Please redistribute this changed
version under the same conditions and restrictions that apply to the
original version.\n\n
%(wn_license)s'''

    datetime_fmt = '%Y-%m-%dT%H:%M:%S'
    base64_map = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'

    def __init__(self, wn_url, desc_short, desc_long):
        self.word_data = {}
        self.wn_url = wn_url
        self.desc_short = desc_short
        self.desc_long = desc_long
        self.wn_license = None

    def wn_dict_add(self, file_index, file_data):
        file_data.seek(0)
        file_index.seek(0)
        (synsets, license_lines) = Synset.build_from_file(file_data)
        WordIndexDictFormatter.build_from_file(file_index, synsets, self.word_data)
        if license_lines:
            self.wn_license = '\n'.join(license_lines) + '\n'

    @classmethod
    def base64_encode(cls, i):
        """Encode a non-negative integer into a dictd-compatible base64 string"""
        if i < 0:
            raise ValueError('Value %r for i is negative' % (i,))
        # Find the smallest digit count e such that i fits into e base-64 digits.
        r = 63
        e = 1
        while r < i:
            e += 1
            r = 64**e - 1

        rv = ''
        while e > 0:
            e -= 1
            d = i // 64**e   # floor division: identical to '/' on python 2 ints, but unambiguous
            rv += cls.base64_map[d]
            i = i % (64**e)
        return rv
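
    # A quick check of the encoding above (values follow directly from
    # base64_map): 0 -> 'A', 63 -> '/', 64 -> 'BA', 4095 -> '//'.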

    @classmethod
    def dict_entry_write(cls, file_index, file_data, key, entry, linesep='\n'):
        """Write a single dict entry for <key> to index and data files"""
        entry_start = file_data.tell()
        file_data.write(entry)
        entry_len = len(entry)
        file_index.write('%s\t%s\t%s%s' % (key, cls.base64_encode(entry_start),
                                           cls.base64_encode(entry_len), linesep))
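
    # Each index line has the form "<headword>\t<offset>\t<length>", with
    # offset and length into the data file in dictd base64; e.g. a headword
    # "foo" at offset 0 with length 3 would yield the line "foo\tA\tD\n".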

    def dict_generate(self, file_index, file_data):
        file_index.seek(0)
        file_data.seek(0)
        # The dictd file format is fairly iffy on the subject of special
        # headwords: either dictd is buggy, or the manpage doesn't tell the
        # whole story about the format.
        # The upshot is that the order of these entries in the index *matters*.
        # Putting them at the beginning and in alphabetic order is afaict ok.
        # Some other orders completely and quietly break the ability to look
        # those headwords up.
        # -- problem encountered with dictd 1.10.2, at 2007-08-05.
        file_data.write('\n')
        wn_url = self.wn_url
        conversion_datetime = datetime.datetime.now().strftime(self.datetime_fmt)
        wn_license = self.wn_license
        # db_info_fmt is filled from the local variables above via vars().
        self.dict_entry_write(file_index, file_data, '00-database-info', '00-database-info\n%s\n' % (self.db_info_fmt % vars()))
        self.dict_entry_write(file_index, file_data, '00-database-long', '00-database-long\n%s\n' % self.desc_long)
        self.dict_entry_write(file_index, file_data, '00-database-short', '00-database-short\n%s\n' % self.desc_short)
        self.dict_entry_write(file_index, file_data, '00-database-url', '00-database-url\n%s\n' % self.wn_url)

        words = self.word_data.keys()
        words.sort()
        for word in words:
            for wi in self.word_data[word]:
                word_cs = word
                # Use the case-sensitivity information of the first entry of
                # the first synset that matches this word case-insensitively.
                for synset in wi.synsets:
                    for ss_word in synset.words:
                        if ss_word.lower() == word_cs.lower():
                            word_cs = ss_word
                            break
                    else:
                        continue
                    break
                else:
                    continue
                break

            outstr = ''
            for wi in self.word_data[word]:
                outstr += wi.dict_str() + '\n'

            outstr = '%s%s%s' % (word_cs, wi.linesep, outstr)
            self.dict_entry_write(file_index, file_data, word_cs, outstr, wi.linesep)

        file_index.truncate()
        file_data.truncate()
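
# Minimal programmatic use, mirroring the command-line driver below
# (the file names here are hypothetical):
#   wnd = WordnetDict(wn_url='...', desc_short='...', desc_long='...')
#   wnd.wn_dict_add(open('index.noun'), open('data.noun'))
#   wnd.dict_generate(open('wn.index', 'w'), open('wn.dict', 'w'))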

if __name__ == '__main__':
    import optparse
    op = optparse.OptionParser(usage='usage: %prog [options] (<wn_index_file> <wn_data_file>)+')
    op.add_option('-i', '--outindex', dest='oi', default='wn.index', help='filename of index file to write to')
    op.add_option('-d', '--outdata', dest='od', default='wn.dict', help='filename of data file to write to')
    op.add_option('--wn_url', dest='wn_url', default='ftp://ftp.cogsci.princeton.edu/pub/wordnet/2.0', help='URL for wordnet sources')
    op.add_option('--db_desc_short', dest='desc_short', default=' WordNet (r) 2.1 (2005)', help='short dict DB description')
    op.add_option('--db_desc_long', dest='desc_long', default=' WordNet (r): A Lexical Database for English from the\n Cognitive Science Laboratory at Princeton University', help='long dict DB description')

    (options, args) = op.parse_args()

    wnd = WordnetDict(wn_url=options.wn_url, desc_short=options.desc_short, desc_long=options.desc_long)

    for i in range(0, len(args), 2):
        print 'Opening index file %r...' % args[i]
        file_index = file(args[i])
        print 'Opening data file %r...' % args[i+1]
        file_data = file(args[i+1])
        print 'Parsing index file and data file...'
        wnd.wn_dict_add(file_index, file_data)

    print 'All input files parsed. Writing output to index file %r and data file %r.' % (options.oi, options.od)

    wnd.dict_generate(file(options.oi, 'w'), file(options.od, 'w'))
    print 'All done.'