#!/usr/bin/env python
#Copyright 2007 Sebastian Hagen
# This file is part of wordnet_tools.

# wordnet_tools is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2
# as published by the Free Software Foundation

# wordnet_tools is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with wordnet_tools; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

# This program requires python >= 2.4.

# This program converts wordnet index/data file pairs into dict index/data
# files usable by dictd.
# This is basically a reimplementation of the wnfilter program by Rik Faith,
# which unfortunately doesn't work correctly for wordnet files in the newer
# formats. This version of wordnet_structures should parse wordnet 2.1 files
# correctly, and create output very similar to what wnfilter would have
# written.

import datetime
from textwrap import TextWrapper

CAT_ADJECTIVE = 0
CAT_ADVERB = 1
CAT_NOUN = 2
CAT_VERB = 3

category_map = {
   'n': CAT_NOUN,
   'v': CAT_VERB,
   'a': CAT_ADJECTIVE,
   's': CAT_ADJECTIVE,
   'r': CAT_ADVERB
}


class WordIndex:
   def __init__(self, lemma, category, ptrs, synsets, tagsense_count):
      self.lemma = lemma
      self.category = category
      self.ptrs = ptrs
      self.synsets = synsets
      self.tagsense_count = tagsense_count

   @classmethod
   def build_from_line(cls, line_data, synset_map):
      line_split = line_data.split()
      lemma = line_split[0]
      category = category_map[line_split[1]]
      synset_count = int(line_split[2],10)
      ptr_count = int(line_split[3],10)
      # Pointer symbols start at field 4; field 3 is the pointer count itself.
      ptrs = [line_split[i] for i in range(4, 4 + ptr_count)]
      tagsense_count = int(line_split[5 + ptr_count],10)
      synsets = [synset_map[int(line_split[i],10)] for i in range(6 + ptr_count, 6 + ptr_count + synset_count)]
      return cls(lemma, category, ptrs, synsets, tagsense_count)

   @classmethod
   def build_from_file(cls, f, synset_map, rv_base=None):
      if (rv_base is None):
         rv = {}
      else:
         rv = rv_base

      for line in f:
         # Skip license header lines, which start with whitespace.
         if (line.startswith(' ')):
            continue
         wi = cls.build_from_line(line, synset_map)
         word = wi.lemma.lower()
         if not (word in rv):
            rv[word] = []
         rv[word].append(wi)
      return rv

   def __repr__(self):
      return '%s%s' % (self.__class__.__name__, (self.lemma, self.category, self.ptrs, self.synsets, self.tagsense_count))
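
# For reference, WordIndex.build_from_line() above assumes the index line
# layout documented in wndb(5WN):
#   lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt
#   synset_offset [synset_offset...]
# A made-up example (not taken from any shipped index file): the line
#   'dog n 1 1 @ 1 1 02084071'
# parses to lemma 'dog', category CAT_NOUN, ptrs ['@'], tagsense_count 1,
# and the single synset stored under offset 2084071 in synset_map.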


class WordIndexDictFormatter(WordIndex):
   category_map_rev = {
      CAT_NOUN: 'n',
      CAT_VERB: 'v',
      CAT_ADJECTIVE: 'adj',
      CAT_ADVERB: 'adv'
   }
   linesep = '\n'
   LINE_WIDTH_MAX = 68
   prefix_fmtf_line_first = '%5s 1: '
   prefix_fmtn_line_first = '         '
   prefix_fmtf_line_nonfirst = '%5d: '
   prefix_fmtn_line_nonfirst = '       '

   def dict_str(self):
      tw = TextWrapper(width=self.LINE_WIDTH_MAX,
         initial_indent=(self.prefix_fmtf_line_first % self.category_map_rev[self.category]),
         subsequent_indent=self.prefix_fmtn_line_first)

      lines = (tw.wrap(self.synsets[0].dict_str()))
      i = 2
      for synset in self.synsets[1:]:
         tw = TextWrapper(width=self.LINE_WIDTH_MAX,
            initial_indent=(self.prefix_fmtf_line_nonfirst % i),
            subsequent_indent=self.prefix_fmtn_line_nonfirst)
         lines.extend(tw.wrap(synset.dict_str()))
         i += 1
      return self.linesep.join(lines)


class Synset:
   def __init__(self, offset, ss_type, words, ptrs, gloss, frames=()):
      self.offset = offset
      self.type = ss_type
      self.words = words
      self.ptrs = ptrs
      self.gloss = gloss
      self.frames = frames
      self.comments = []

   @classmethod
   def build_from_line(cls, line_data):
      line_split = line_data.split()
      synset_offset = int(line_split[0],10)
      ss_type = category_map[line_split[2]]
      # The word count field is hexadecimal.
      word_count = int(line_split[3],16)
      words = [line_split[i] for i in range(4, 4 + word_count*2,2)]
      ptr_count = int(line_split[4 + word_count*2],10)
      ptrs = [(line_split[i], line_split[i+1], line_split[i+2], line_split[i+3]) for i in range(5 + word_count*2,4 + word_count*2 + ptr_count*4,4)]

      tok = line_split[5 + word_count*2 + ptr_count*4]
      base = 6 + word_count*2 + ptr_count*4
      if (tok != '|'):
         frame_count = int(tok, 10)
         frames = [(int(line_split[i+1],10), int(line_split[i+2],16)) for i in range(base, base + frame_count*3, 3)]
         base += frame_count*3 + 1
      else:
         frames = []

      line_split2 = line_data.split(None, base)
      # With maxsplit == base, a gloss is present iff we get base+1 fields.
      if (len(line_split2) <= base):
         gloss = None
      else:
         gloss = line_split2[-1]

      return cls(synset_offset, ss_type, words, ptrs, gloss, frames)

   @classmethod
   def build_from_file(cls, f):
      rv = {}
      comments = []

      for line in f:
         if (line.startswith(' ')):
            line_s = line.lstrip().rstrip('\n')
            line_elements = line_s.split(None,1)
            try:
               int(line_elements[0])
            except ValueError:
               continue
            if (len(line_elements) == 1):
               line_elements.append('')
            comments.append(line_elements[1])
            continue
         synset = cls.build_from_line(line.rstrip())
         rv[synset.offset] = synset

      return (rv, comments)

   def dict_str(self):
      rv = self.gloss
      if (len(self.words) > 1):
         rv += ' [syn: %s]' % (', '.join([('{%s}' % word) for word in self.words]))
      return rv

   def __repr__(self):
      return '%s%s' % (self.__class__.__name__, (self.offset, self.type, self.words, self.ptrs, self.gloss, self.frames))
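
# For reference, Synset.build_from_line() above assumes the data line layout
# documented in wndb(5WN):
#   synset_offset lex_filenum ss_type w_cnt word lex_id [word lex_id...]
#   p_cnt [ptr...] [frames...] | gloss
# where w_cnt is hexadecimal, each ptr is the 4-tuple
# 'pointer_symbol synset_offset pos source/target', and frames (verbs only)
# take the form 'f_cnt + f_num w_num [+ f_num w_num...]' with w_num
# hexadecimal.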


class WordnetDict:
   db_info_fmt = '''This file was converted from the original database on:
   %(conversion_datetime)s

The original data is available from:
   %(wn_url)s

The original data was distributed with the notice shown below. No
additional restrictions are claimed. Please redistribute this changed
version under the same conditions and restrictions that apply to the
original version.\n\n
%(wn_license)s'''

   datetime_fmt = '%Y-%m-%dT%H:%M:%S'
   base64_map = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'

   def __init__(self, wn_url, desc_short, desc_long):
      self.word_data = {}
      self.wn_url = wn_url
      self.desc_short = desc_short
      self.desc_long = desc_long
      self.wn_license = None

   def wn_dict_add(self, file_index, file_data):
      file_data.seek(0)
      file_index.seek(0)
      (synsets, license_lines) = Synset.build_from_file(file_data)
      WordIndexDictFormatter.build_from_file(file_index, synsets, self.word_data)
      if (license_lines):
         self.wn_license = '\n'.join(license_lines) + '\n'

   @classmethod
   def base64_encode(cls, i):
      """Encode a non-negative integer into a dictd compatible base64 string"""
      if (i < 0):
         raise ValueError('Value %r for i is negative' % (i,))
      # Determine the number of base64 digits needed to represent i.
      r = 63
      e = 1
      while (r < i):
         e += 1
         r = 64**e - 1

      # Emit the digits, most significant first.
      rv = ''
      while (e > 0):
         e -= 1
         d = (i / 64**e)
         rv += cls.base64_map[d]
         i = i % (64**e)
      return rv
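
   # A few hand-checked reference values for base64_encode() ('A' encodes 0,
   # '/' encodes 63):
   #   base64_encode(0)    == 'A'
   #   base64_encode(63)   == '/'
   #   base64_encode(64)   == 'BA'
   #   base64_encode(4095) == '//'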

   @classmethod
   def dict_entry_write(cls, file_index, file_data, key, entry, linesep='\n'):
      """Write a single dict entry for <key> to index and data files"""
      entry_start = file_data.tell()
      file_data.write(entry)
      entry_len = len(entry)
      file_index.write('%s\t%s\t%s%s' % (key, cls.base64_encode(entry_start),
            cls.base64_encode(entry_len), linesep))

   def dict_generate(self, file_index, file_data):
      file_index.seek(0)
      file_data.seek(0)
      # The dictd file format is fairly iffy on the subject of special
      # headwords: either dictd is buggy, or the manpage doesn't tell the whole
      # story about the format.
      # The upshot is that order of these entries in the index *matters*.
      # Putting them at the beginning and in alphabetic order is afaict ok.
      # Some other orders completely and quietly break the ability to look
      # those headwords up.
      # -- problem encountered with 1.10.2, at 2007-08-05.
      file_data.write('\n')
      wn_url = self.wn_url
      conversion_datetime = datetime.datetime.now().strftime(self.datetime_fmt)
      wn_license = self.wn_license
      self.dict_entry_write(file_index, file_data, '00-database-info', '00-database-info\n%s\n' % (self.db_info_fmt % vars()))
      self.dict_entry_write(file_index, file_data, '00-database-long', '00-database-long\n%s\n' % self.desc_long)
      self.dict_entry_write(file_index, file_data, '00-database-short', '00-database-short\n%s\n' % self.desc_short)
      self.dict_entry_write(file_index, file_data, '00-database-url', '00-database-url\n%s\n' % self.wn_url)

      words = self.word_data.keys()
      words.sort()
      for word in words:
         for wi in self.word_data[word]:
            word_cs = word
            # Use case-sensitivity information of first entry of first synset
            # that matches this word case-insensitively
            for synset in wi.synsets:
               for ss_word in synset.words:
                  if (ss_word.lower() == word_cs.lower()):
                     word_cs = ss_word
                     break
               else:
                  continue
               break
            else:
               continue
            break

         outstr = ''
         for wi in self.word_data[word]:
            outstr += wi.dict_str() + '\n'

         outstr = '%s%s%s' % (word_cs, wi.linesep, outstr)
         self.dict_entry_write(file_index, file_data, word_cs, outstr, wi.linesep)

      file_index.truncate()
      file_data.truncate()


if (__name__ == '__main__'):
   import optparse
   op = optparse.OptionParser(usage='usage: %prog [options] (<wn_index_file> <wn_data_file>)+')
   op.add_option('-i', '--outindex', dest='oi', default='wn.index', help='filename of index file to write to')
   op.add_option('-d', '--outdata', dest='od', default='wn.dict', help='filename of data file to write to')
   op.add_option('--wn_url', dest='wn_url', default='ftp://ftp.cogsci.princeton.edu/pub/wordnet/2.0', help='URL for wordnet sources')
   op.add_option('--db_desc_short', dest='desc_short', default='     WordNet (r) 2.1 (2005)', help='short dict DB description')
   op.add_option('--db_desc_long', dest='desc_long', default='     WordNet (r): A Lexical Database for English from the\n     Cognitive Science Laboratory at Princeton University', help='long dict DB description')

   (options, args) = op.parse_args()

   wnd = WordnetDict(wn_url=options.wn_url, desc_short=options.desc_short, desc_long=options.desc_long)

   for i in range(0, len(args), 2):
      print 'Opening index file %r...' % args[i]
      file_index = file(args[i])
      print 'Opening data file %r...' % args[i+1]
      file_data = file(args[i+1])
      print 'Parsing index file and data file...'
      wnd.wn_dict_add(file_index, file_data)

   print 'All input files parsed. Writing output to index file %r and data file %r.' % (options.oi, options.od)

   wnd.dict_generate(file(options.oi, 'w'), file(options.od, 'w'))
   print 'All done.'
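
# Example invocation (the wordnet file paths below are illustrative; point
# them at wherever the index/data files actually live):
#   python wordnet_structures.py -i wn.index -d wn.dict \
#      /usr/share/wordnet/index.noun /usr/share/wordnet/data.noun \
#      /usr/share/wordnet/index.verb /usr/share/wordnet/data.verb
# Index/data files are consumed in pairs; each pair is merged into the same
# WordnetDict instance before dict_generate() writes the combined database.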