1#!/usr/bin/env python
2#Copyright 2007 Sebastian Hagen
3# This file is part of wordnet_tools.
4
5# wordnet_tools is free software; you can redistribute it and/or modify
6# it under the terms of the GNU General Public License version 2
7# as published by the Free Software Foundation
8
9# wordnet_tools is distributed in the hope that it will be useful,
10# but WITHOUT ANY WARRANTY; without even the implied warranty of
11# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12# GNU General Public License for more details.
13
14# You should have received a copy of the GNU General Public License
15# along with wordnet_tools; if not, write to the Free Software
16# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17
18# This program requires python >= 2.4.
19
20# This program converts wordnet index/data file pairs into dict index/data
21# files usable by dictd.
22# This is basically a reimplementation of the wnfilter program by Rik Faith,
23# which unfortunately doesn't work correctly for wordnet files in the newer
24# formats. This version of wordnet_structures whould parse wordnet 2.1 files
25# correctly, and create output very similar to what wnfilter would have
26# written.
27
28import datetime
29from textwrap import TextWrapper
30
31CAT_ADJECTIVE = 0
32CAT_ADVERB = 1
33CAT_NOUN = 2
34CAT_VERB = 3
35
36category_map = {
37 'n': CAT_NOUN,
38 'v': CAT_VERB,
39 'a': CAT_ADJECTIVE,
40 's': CAT_ADJECTIVE,
41 'r': CAT_ADVERB
42}
43
44
45class WordIndex:
46 def __init__(self, lemma, category, ptrs, synsets, tagsense_count):
47 self.lemma = lemma
48 self.category = category
49 self.ptrs = ptrs
50 self.synsets = synsets
51 self.tagsense_count = tagsense_count
52
53 @classmethod
54 def build_from_line(cls, line_data, synset_map):
55 line_split = line_data.split()
56 lemma = line_split[0]
57 category = category_map[line_split[1]]
58 synset_count = int(line_split[2],10)
59 ptr_count = int(line_split[3],10)
60 ptrs = [line_split[i] for i in range(3, 3+ptr_count)]
61 tagsense_count = int(line_split[5 + ptr_count],10)
62 synsets = [synset_map[int(line_split[i],10)] for i in range(6 + ptr_count, 6 + ptr_count + synset_count)]
63 return cls(lemma, category, ptrs, synsets, tagsense_count)
64
65 @classmethod
66 def build_from_file(cls, f, synset_map, rv_base=None):
67 if (rv_base is None):
68 rv = {}
69 else:
70 rv = rv_base
71
72 for line in f:
73 if (line.startswith(' ')):
74 continue
75 wi = cls.build_from_line(line, synset_map)
76 word = wi.lemma.lower()
77 if not (word in rv):
78 rv[word] = []
79 rv[word].append(wi)
80 return rv
81
82 def __repr__(self):
83 return '%s%s' % (self.__class__.__name__, (self.lemma, self.category, self.ptrs, self.synsets, self.tagsense_count))
84
85
86class WordIndexDictFormatter(WordIndex):
87 category_map_rev = {
88 CAT_NOUN: 'n',
89 CAT_VERB: 'v',
90 CAT_ADJECTIVE: 'adj',
91 CAT_ADVERB: 'adv'
92 }
93 linesep = '\n'
94 LINE_WIDTH_MAX = 68
95 prefix_fmtf_line_first = '%5s 1: '
96 prefix_fmtn_line_first = ' '
97 prefix_fmtf_line_nonfirst = '%5d: '
98 prefix_fmtn_line_nonfirst = ' '
99
100 def dict_str(self):
101 tw = TextWrapper(width=self.LINE_WIDTH_MAX,
102 initial_indent=(self.prefix_fmtf_line_first % self.category_map_rev[self.category]),
103 subsequent_indent=self.prefix_fmtn_line_first)
104
105 lines = (tw.wrap(self.synsets[0].dict_str()))
106 i = 2
107 for synset in self.synsets[1:]:
108 tw = TextWrapper(width=self.LINE_WIDTH_MAX,
109 initial_indent=(self.prefix_fmtf_line_nonfirst % i),
110 subsequent_indent=self.prefix_fmtn_line_nonfirst)
111 lines.extend(tw.wrap(synset.dict_str()))
112 i += 1
113 return self.linesep.join(lines)
114
115
116class Synset:
117 def __init__(self, offset, ss_type, words, ptrs, gloss, frames=()):
118 self.offset = offset
119 self.type = ss_type
120 self.words = words
121 self.ptrs = ptrs
122 self.gloss = gloss
123 self.frames = frames
124 self.comments = []
125
126 @classmethod
127 def build_from_line(cls, line_data):
128 line_split = line_data.split()
129 synset_offset = int(line_split[0],10)
130 ss_type = category_map[line_split[2]]
131 word_count = int(line_split[3],16)
132 words = [line_split[i] for i in range(4, 4 + word_count*2,2)]
133 ptr_count = int(line_split[4 + word_count*2],10)
134 ptrs = [(line_split[i], line_split[i+1], line_split[i+2], line_split[i+3]) for i in range(5 + word_count*2,4 + word_count*2 + ptr_count*4,4)]
135
136 tok = line_split[5 + word_count*2 + ptr_count*4]
137 base = 6 + word_count*2 + ptr_count*4
138 if (tok != '|'):
139 frame_count = int(tok, 10)
140 frames = [(int(line_split[i+1],10), int(line_split[i+2],16)) for i in range(base, base + frame_count*3, 3)]
141 base += frame_count*3 + 1
142 else:
143 frames = []
144
145 line_split2 = line_data.split(None, base)
146 if (len(line_split2) < base):
147 gloss = None
148 else:
149 gloss = line_split2[-1]
150
151 return cls(synset_offset, ss_type, words, ptrs, gloss, frames)
152
153 @classmethod
154 def build_from_file(cls, f):
155 rv = {}
156 comments = []
157
158 for line in f:
159 if (line.startswith(' ')):
160 line_s = line.lstrip().rstrip('\n')
161 line_elements = line_s.split(None,1)
162 try:
163 int(line_elements[0])
164 except ValueError:
165 continue
166 if (len(line_elements) == 1):
167 line_elements.append('')
168 comments.append(line_elements[1])
169 continue
170 synset = cls.build_from_line(line.rstrip())
171 rv[synset.offset] = synset
172
173 return (rv, comments)
174
175 def dict_str(self):
176 rv = self.gloss
177 if (len(self.words) > 1):
178 rv += ' [syn: %s]' % (', '.join([('{%s}' % word) for word in self.words]))
179 return rv
180
181 def __repr__(self):
182 return '%s%s' % (self.__class__.__name__, (self.offset, self.type, self.words, self.ptrs, self.gloss, self.frames))
183
184
185class WordnetDict:
186 db_info_fmt = '''This file was converted from the original database on:
187 %(conversion_datetime)s
188
189The original data is available from:
190 %(wn_url)s
191
192The original data was distributed with the notice shown below. No
193additional restrictions are claimed. Please redistribute this changed
194version under the same conditions and restriction that apply to the
195original version.\n\n
196%(wn_license)s'''
197
198 datetime_fmt = '%Y-%m-%dT%H:%M:%S'
199 base64_map = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
200
201 def __init__(self, wn_url, desc_short, desc_long):
202 self.word_data = {}
203 self.wn_url = wn_url
204 self.desc_short = desc_short
205 self.desc_long = desc_long
206 self.wn_license = None
207
208 def wn_dict_add(self, file_index, file_data):
209 file_data.seek(0)
210 file_index.seek(0)
211 (synsets, license_lines) = Synset.build_from_file(file_data)
212 WordIndexDictFormatter.build_from_file(file_index, synsets, self.word_data)
213 if (license_lines):
214 self.wn_license = '\n'.join(license_lines) + '\n'
215
216 @classmethod
217 def base64_encode(cls, i):
218 """Encode a non-negative integer into a dictd compatible base64 string"""
219 if (i < 0):
220 raise ValueError('Value %r for i is negative' % (i,))
221 r = 63
222 e = 1
223 while (r < i):
224 e += 1
225 r = 64**e - 1
226
227 rv = ''
228 while (e > 0):
229 e -= 1
230 d = (i / 64**e)
231 rv += cls.base64_map[d]
232 i = i % (64**e)
233 return rv
234
235 @classmethod
236 def dict_entry_write(cls, file_index, file_data, key, entry, linesep='\n'):
237 """Write a single dict entry for <key> to index and data files"""
238 entry_start = file_data.tell()
239 file_data.write(entry)
240 entry_len = len(entry)
241 file_index.write('%s\t%s\t%s%s' % (key, cls.base64_encode(entry_start),
242 cls.base64_encode(entry_len), linesep))
243
244 def dict_generate(self, file_index, file_data):
245 file_index.seek(0)
246 file_data.seek(0)
247 # The dictd file format is fairly iffy on the subject of special
248 # headwords: either dictd is buggy, or the manpage doesn't tell the whole
249 # story about the format.
250 # The upshot is that order of these entries in the index *matters*.
251 # Putting them at the beginning and in alphabetic order is afaict ok.
252 # Some other orders completely and quietly break the ability to look
253 # those headwords up.
254 # -- problem encountered with 1.10.2, at 2007-08-05.
255 file_data.write('\n')
256 wn_url = self.wn_url
257 conversion_datetime = datetime.datetime.now().strftime(self.datetime_fmt)
258 wn_license = self.wn_license
259 self.dict_entry_write(file_index, file_data, '00-database-info', '00-database-info\n%s\n' % (self.db_info_fmt % vars()))
260 self.dict_entry_write(file_index, file_data, '00-database-long', '00-database-long\n%s\n' % self.desc_long)
261 self.dict_entry_write(file_index, file_data, '00-database-short', '00-database-short\n%s\n' % self.desc_short)
262 self.dict_entry_write(file_index, file_data, '00-database-url', '00-database-url\n%s\n' % self.wn_url)
263
264
265 words = self.word_data.keys()
266 words.sort()
267 for word in words:
268 for wi in self.word_data[word]:
269 word_cs = word
270 # Use case-sensitivity information of first entry of first synset that
271 # matches this word case-insensitively
272 for synset in wi.synsets:
273 for ss_word in synset.words:
274 if (ss_word.lower() == word_cs.lower()):
275 word_cs = ss_word
276 break
277 else:
278 continue
279 break
280 else:
281 continue
282 break
283
284 outstr = ''
285 for wi in self.word_data[word]:
286 outstr += wi.dict_str() + '\n'
287
288 outstr = '%s%s%s' % (word_cs, wi.linesep, outstr)
289 self.dict_entry_write(file_index, file_data, word_cs, outstr, wi.linesep)
290
291 file_index.truncate()
292 file_data.truncate()
293
294
295if (__name__ == '__main__'):
296 import optparse
297 op = optparse.OptionParser(usage='usage: %prog [options] (<wn_index_file> <wn_data_file>)+')
298 op.add_option('-i', '--outindex', dest='oi', default='wn.index', help='filename of index file to write to')
299 op.add_option('-d', '--outdata', dest='od', default='wn.dict', help='filename of data file to write to')
300 op.add_option('--wn_url', dest='wn_url', default='ftp://ftp.cogsci.princeton.edu/pub/wordnet/2.0', help='URL for wordnet sources')
301 op.add_option('--db_desc_short', dest='desc_short', default=' WordNet (r) 2.1 (2005)', help='short dict DB description')
302 op.add_option('--db_desc_long', dest='desc_long', default=' WordNet (r): A Lexical Database for English from the\n Cognitive Science Laboratory at Princeton University', help='long dict DB description')
303
304 (options, args) = op.parse_args()
305
306 wnd = WordnetDict(wn_url=options.wn_url, desc_short=options.desc_short, desc_long=options.desc_long)
307
308 for i in range(0,len(args),2):
309 print 'Opening index file %r...' % args[i]
310 file_index = file(args[i])
311 print 'Opening data file %r...' % args[i+1]
312 file_data = file(args[i+1])
313 print 'Parsing index file and data file...'
314 wnd.wn_dict_add(file_index, file_data)
315
316 print 'All input files parsed. Writing output to index file %r and data file %r.' % (options.oi, options.od)
317
318 wnd.dict_generate(file(options.oi, 'w'),file(options.od, 'w'))
319 print 'All done.'