Clone of https://github.com/NixOS/nixpkgs.git (to stress-test knotserver)
#!/usr/bin/env python3
# Copyright 2007 Sebastian Hagen
# This file is part of wordnet_tools.

# wordnet_tools is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2
# as published by the Free Software Foundation.

# wordnet_tools is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with wordnet_tools; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

# This program requires Python >= 2.4; it also runs unmodified under Python 3.

# This program converts wordnet index/data file pairs into dict index/data
# files usable by dictd.
# This is basically a reimplementation of the wnfilter program by Rik Faith,
# which unfortunately doesn't work correctly for wordnet files in the newer
# formats. This version of wordnet_structures should parse wordnet 2.1 files
# correctly, and create output very similar to what wnfilter would have
# written.

import datetime
from textwrap import TextWrapper

CAT_ADJECTIVE = 0
CAT_ADVERB = 1
CAT_NOUN = 2
CAT_VERB = 3

category_map = {
    'n': CAT_NOUN,
    'v': CAT_VERB,
    'a': CAT_ADJECTIVE,
    's': CAT_ADJECTIVE,
    'r': CAT_ADVERB
}


class WordIndex:
    def __init__(self, lemma, category, ptrs, synsets, tagsense_count):
        self.lemma = lemma
        self.category = category
        self.ptrs = ptrs
        self.synsets = synsets
        self.tagsense_count = tagsense_count

    @classmethod
    def build_from_line(cls, line_data, synset_map):
        # Index line layout (see wndb(5WN)):
        #   lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt
        #   synset_offset [synset_offset...]
        line_split = line_data.split()
        lemma = line_split[0]
        category = category_map[line_split[1]]
        synset_count = int(line_split[2], 10)
        ptr_count = int(line_split[3], 10)
        # The pointer symbols start directly after the p_cnt field.
        ptrs = [line_split[i] for i in range(4, 4 + ptr_count)]
        tagsense_count = int(line_split[5 + ptr_count], 10)
        synsets = [synset_map[int(line_split[i], 10)]
                   for i in range(6 + ptr_count, 6 + ptr_count + synset_count)]
        return cls(lemma, category, ptrs, synsets, tagsense_count)

    @classmethod
    def build_from_file(cls, f, synset_map, rv_base=None):
        if (rv_base is None):
            rv = {}
        else:
            rv = rv_base

        for line in f:
            # License-header lines in wordnet files are indented.
            if (line.startswith('  ')):
                continue
            wi = cls.build_from_line(line, synset_map)
            word = wi.lemma.lower()
            if not (word in rv):
                rv[word] = []
            rv[word].append(wi)
        return rv

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, (self.lemma, self.category,
            self.ptrs, self.synsets, self.tagsense_count))
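
# A sketch of the index line layout this parser assumes (per wndb(5WN)); the
# sample line below is illustrative, not taken from a real database:
#
#     lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt synset_offset...
#     frob v 2 1 @ 2 1 00001740 00002098
#
# WordIndex.build_from_line would read this as lemma='frob', category=CAT_VERB,
# ptrs=['@'] and tagsense_count=1, and would resolve the two trailing offsets
# through synset_map.
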

class WordIndexDictFormatter(WordIndex):
    category_map_rev = {
        CAT_NOUN: 'n',
        CAT_VERB: 'v',
        CAT_ADJECTIVE: 'adj',
        CAT_ADVERB: 'adv'
    }
    linesep = '\n'
    LINE_WIDTH_MAX = 68
    # The continuation prefixes are all-space strings as wide as the rendered
    # first-line prefixes ('%5s 1: ' and '%5d: ').
    prefix_fmtf_line_first = '%5s 1: '
    prefix_fmtn_line_first = ' ' * 9
    prefix_fmtf_line_nonfirst = '%5d: '
    prefix_fmtn_line_nonfirst = ' ' * 7

    def dict_str(self):
        tw = TextWrapper(width=self.LINE_WIDTH_MAX,
            initial_indent=(self.prefix_fmtf_line_first % self.category_map_rev[self.category]),
            subsequent_indent=self.prefix_fmtn_line_first)

        lines = tw.wrap(self.synsets[0].dict_str())
        i = 2
        for synset in self.synsets[1:]:
            tw = TextWrapper(width=self.LINE_WIDTH_MAX,
                initial_indent=(self.prefix_fmtf_line_nonfirst % i),
                subsequent_indent=self.prefix_fmtn_line_nonfirst)
            lines.extend(tw.wrap(synset.dict_str()))
            i += 1
        return self.linesep.join(lines)


class Synset:
    def __init__(self, offset, ss_type, words, ptrs, gloss, frames=()):
        self.offset = offset
        self.type = ss_type
        self.words = words
        self.ptrs = ptrs
        self.gloss = gloss
        self.frames = frames
        self.comments = []

    @classmethod
    def build_from_line(cls, line_data):
        # Data line layout (see wndb(5WN)):
        #   synset_offset lex_filenum ss_type w_cnt word lex_id [word lex_id...]
        #   p_cnt [ptr...] [frames...] | gloss
        line_split = line_data.split()
        synset_offset = int(line_split[0], 10)
        ss_type = category_map[line_split[2]]
        word_count = int(line_split[3], 16)
        words = [line_split[i] for i in range(4, 4 + word_count*2, 2)]
        ptr_count = int(line_split[4 + word_count*2], 10)
        ptrs = [(line_split[i], line_split[i+1], line_split[i+2], line_split[i+3])
                for i in range(5 + word_count*2, 4 + word_count*2 + ptr_count*4, 4)]

        tok = line_split[5 + word_count*2 + ptr_count*4]
        base = 6 + word_count*2 + ptr_count*4
        if (tok != '|'):
            # Verb synsets carry frame data before the gloss separator.
            frame_count = int(tok, 10)
            frames = [(int(line_split[i+1], 10), int(line_split[i+2], 16))
                      for i in range(base, base + frame_count*3, 3)]
            base += frame_count*3 + 1
        else:
            frames = []

        line_split2 = line_data.split(None, base)
        # With maxsplit=base, the gloss (if any) ends up whole in the last
        # element; fewer than base+1 elements means there is no gloss.
        if (len(line_split2) <= base):
            gloss = None
        else:
            gloss = line_split2[-1]

        return cls(synset_offset, ss_type, words, ptrs, gloss, frames)

    @classmethod
    def build_from_file(cls, f):
        rv = {}
        comments = []

        for line in f:
            if (line.startswith('  ')):
                # Numbered license-header line; keep its text for later output.
                line_s = line.lstrip().rstrip('\n')
                line_elements = line_s.split(None, 1)
                try:
                    int(line_elements[0])
                except ValueError:
                    continue
                if (len(line_elements) == 1):
                    line_elements.append('')
                comments.append(line_elements[1])
                continue
            synset = cls.build_from_line(line.rstrip())
            rv[synset.offset] = synset

        return (rv, comments)

    def dict_str(self):
        rv = self.gloss
        if (len(self.words) > 1):
            rv += ' [syn: %s]' % (', '.join([('{%s}' % word) for word in self.words]))
        return rv

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, (self.offset, self.type,
            self.words, self.ptrs, self.gloss, self.frames))
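
# Data-file lines follow the wndb(5WN) layout; an illustrative (simplified)
# noun line:
#
#     00001740 03 n 01 entity 0 001 ~ 00001930 n 0000 | that which is perceived
#
# Note the mixed bases Synset.build_from_line has to deal with: w_cnt ('01')
# is hexadecimal, p_cnt ('001') is decimal, and everything after the '|'
# separator is the gloss text.
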

class WordnetDict:
    db_info_fmt = '''This file was converted from the original database on:
    %(conversion_datetime)s

The original data is available from:
    %(wn_url)s

The original data was distributed with the notice shown below. No
additional restrictions are claimed.  Please redistribute this changed
version under the same conditions and restrictions that apply to the
original version.\n\n
%(wn_license)s'''

    datetime_fmt = '%Y-%m-%dT%H:%M:%S'
    base64_map = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'

    def __init__(self, wn_url, desc_short, desc_long):
        self.word_data = {}
        self.wn_url = wn_url
        self.desc_short = desc_short
        self.desc_long = desc_long
        self.wn_license = None

    def wn_dict_add(self, file_index, file_data):
        file_data.seek(0)
        file_index.seek(0)
        (synsets, license_lines) = Synset.build_from_file(file_data)
        WordIndexDictFormatter.build_from_file(file_index, synsets, self.word_data)
        if (license_lines):
            self.wn_license = '\n'.join(license_lines) + '\n'

    @classmethod
    def base64_encode(cls, i):
        """Encode a non-negative integer into a dictd compatible base64 string."""
        if (i < 0):
            raise ValueError('Value %r for i is negative' % (i,))
        # Find the number of base64 digits needed to represent i.
        r = 63
        e = 1
        while (r < i):
            e += 1
            r = 64**e - 1

        # Emit digits from the most significant down; integer division avoids
        # float precision loss for very large offsets.
        rv = ''
        while (e > 0):
            e -= 1
            d = i // 64**e
            rv += cls.base64_map[d]
            i = i % (64**e)
        return rv

    @classmethod
    def dict_entry_write(cls, file_index, file_data, key, entry, linesep='\n'):
        """Write a single dict entry for <key> to index and data files."""
        entry_start = file_data.tell()
        file_data.write(entry)
        entry_len = len(entry)
        file_index.write('%s\t%s\t%s%s' % (key, cls.base64_encode(entry_start),
            cls.base64_encode(entry_len), linesep))
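
    # A worked sketch of what dict_entry_write emits (values illustrative):
    # with the base64_map above, base64_encode(0) == 'A', base64_encode(63)
    # == '/', and base64_encode(64) == 'BA'.  An entry of 70 bytes starting
    # at data-file offset 4000 would therefore produce the index line
    # 'lemma\t+g\tBG\n'.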

    def dict_generate(self, file_index, file_data):
        file_index.seek(0)
        file_data.seek(0)
        # The dictd file format is fairly iffy on the subject of special
        # headwords: either dictd is buggy, or the manpage doesn't tell the
        # whole story about the format.
        # The upshot is that order of these entries in the index *matters*.
        # Putting them at the beginning and in alphabetic order is afaict ok.
        # Some other orders completely and quietly break the ability to look
        # those headwords up.
        # -- problem encountered with 1.10.2, at 2007-08-05.
        file_data.write('\n')
        wn_url = self.wn_url
        conversion_datetime = datetime.datetime.now().strftime(self.datetime_fmt)
        wn_license = self.wn_license
        self.dict_entry_write(file_index, file_data, '00-database-info', '00-database-info\n%s\n' % (self.db_info_fmt % vars()))
        self.dict_entry_write(file_index, file_data, '00-database-long', '00-database-long\n%s\n' % self.desc_long)
        self.dict_entry_write(file_index, file_data, '00-database-short', '00-database-short\n%s\n' % self.desc_short)
        self.dict_entry_write(file_index, file_data, '00-database-url', '00-database-url\n%s\n' % self.wn_url)

        words = list(self.word_data.keys())
        words.sort()
        for word in words:
            for wi in self.word_data[word]:
                word_cs = word
                # Use case-sensitivity information of first entry of first
                # synset that matches this word case-insensitively
                for synset in wi.synsets:
                    for ss_word in synset.words:
                        if (ss_word.lower() == word_cs.lower()):
                            word_cs = ss_word
                            break
                    else:
                        continue
                    break
                else:
                    continue
                break

            outstr = ''
            for wi in self.word_data[word]:
                outstr += wi.dict_str() + '\n'

            outstr = '%s%s%s' % (word_cs, wi.linesep, outstr)
            self.dict_entry_write(file_index, file_data, word_cs, outstr, wi.linesep)

        file_index.truncate()
        file_data.truncate()


if (__name__ == '__main__'):
    import optparse
    op = optparse.OptionParser(usage='usage: %prog [options] (<wn_index_file> <wn_data_file>)+')
    op.add_option('-i', '--outindex', dest='oi', default='wn.index', help='filename of index file to write to')
    op.add_option('-d', '--outdata', dest='od', default='wn.dict', help='filename of data file to write to')
    op.add_option('--wn_url', dest='wn_url', default='ftp://ftp.cogsci.princeton.edu/pub/wordnet/2.0', help='URL for wordnet sources')
    op.add_option('--db_desc_short', dest='desc_short', default=' WordNet (r) 2.1 (2005)', help='short dict DB description')
    op.add_option('--db_desc_long', dest='desc_long', default=' WordNet (r): A Lexical Database for English from the\n Cognitive Science Laboratory at Princeton University', help='long dict DB description')

    (options, args) = op.parse_args()

    wnd = WordnetDict(wn_url=options.wn_url, desc_short=options.desc_short, desc_long=options.desc_long)

    for i in range(0, len(args), 2):
        print('Opening index file %r...' % args[i])
        file_index = open(args[i])
        print('Opening data file %r...' % args[i+1])
        file_data = open(args[i+1])
        print('Parsing index file and data file...')
        wnd.wn_dict_add(file_index, file_data)

    print('All input files parsed. Writing output to index file %r and data file %r.' % (options.oi, options.od))

    wnd.dict_generate(open(options.oi, 'w'), open(options.od, 'w'))
    print('All done.')
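
# Example invocation (file names are illustrative; any number of index/data
# pairs may be given, per the usage string above):
#
#     python3 wordnet_structures.py -i wn.index -d wn.dict \
#         index.noun data.noun index.verb data.verb
#
# The resulting wn.index / wn.dict pair can then be served by dictd.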