1{
2 lib,
3 newScope,
4 fetchFromGitHub,
5 unzip,
6 stdenvNoCC,
7}:
8let
9 base = {
10 version = "0-unstable-2024-07-29";
11 nativeBuildInputs = [ unzip ];
12 dontBuild = true;
13 dontFixup = true;
14 meta = with lib; {
15 description = "NLTK Data";
16 homepage = "https://github.com/nltk/nltk_data";
17 license = licenses.asl20;
18 platforms = platforms.all;
19 maintainers = with maintainers; [
20 bengsparks
21 happysalada
22 ];
23 };
24 };
25 makeNltkDataPackage =
26 {
27 pname,
28 location,
29 hash,
30 }:
31 let
32 src = fetchFromGitHub {
33 owner = "nltk";
34 repo = "nltk_data";
35 rev = "cfe82914f3c2d24363687f1db3b05e8b9f687e2b";
36 inherit hash;
37 sparseCheckout = [ "packages/${location}/${pname}.zip" ];
38 };
39 in
40 stdenvNoCC.mkDerivation (
41 base
42 // {
43 inherit pname src;
44 inherit (base) version;
45 installPhase = ''
46 runHook preInstall
47
48 mkdir -p $out
49 unzip ${src}/packages/${location}/${pname}.zip
50 mkdir -p $out/${location}
51 cp -R ${pname}/ $out/${location}
52
53 runHook postInstall
54 '';
55 }
56 );
57
58 makeChunker =
59 pname:
60 makeNltkDataPackage {
61 inherit pname;
62 location = "chunkers";
63 hash = "sha256-kemjqaCM9hlKAdMw8oVJnp62EAC9rMQ50dKg7wlAwEc=";
64 };
65
66 makeCorpus =
67 pname:
68 makeNltkDataPackage {
69 inherit pname;
70 location = "corpora";
71 hash = "sha256-8lMjW5YI8h6dHJ/83HVY2OYGDyKPpgkUAKPISiAKqqk=";
72 };
73
74 makeGrammar =
75 pname:
76 makeNltkDataPackage {
77 inherit pname;
78 location = "grammars";
79 hash = "sha256-pyLEcX3Azv8j1kCGvVYonuiNgVJxtWt7veU0S/yNbIM=";
80 };
81
82 makeHelp =
83 pname:
84 makeNltkDataPackage {
85 inherit pname;
86 location = "help";
87 hash = "sha256-97mYLNES5WujLF5gD8Ul4cJ6LqSzz+jDzclUsdBeHNE=";
88 };
89
90 makeMisc =
91 pname:
92 makeNltkDataPackage {
93 inherit pname;
94 location = "misc";
95 hash = "sha256-XtizfEsc8TYWqvvC/eSFdha2ClC5/ZiJM8nue0vXLb4=";
96 };
97
98 makeModel =
99 pname:
100 makeNltkDataPackage {
101 inherit pname;
102 location = "models";
103 hash = "sha256-iq3weEgCci6rgLW2j28F2eRLprJtInGXKe/awJPSVG4=";
104 };
105
106 makeTagger =
107 pname:
108 makeNltkDataPackage {
109 inherit pname;
110 location = "taggers";
111 hash = "sha256-tl3Cn2okhBkUtTXvAmFRx72Brez6iTGRdmFTwFmpk3M=";
112 };
113
114 makeTokenizer =
115 pname:
116 makeNltkDataPackage {
117 inherit pname;
118 location = "tokenizers";
119 hash = "sha256-OzMkruoYbFKqzuimOXIpE5lhHz8tmSqOFoLT+fjdTVg=";
120 };
121
122 makeStemmer =
123 pname:
124 makeNltkDataPackage {
125 inherit pname;
126 location = "stemmers";
127 hash = "sha256-mNefwOPVJGz9kXV3LV4DuV7FJpNir/Nwg4ujd0CogEk=";
128 };
129in
# Package scope exposing one derivation per NLTK data package.
#
# Each attribute name is the upstream package name with `_` and `.` replaced
# by `-`; the string passed to the maker is the upstream name itself, which
# makeNltkDataPackage uses as the zip basename and extracted directory name.
# The maker chosen for an attribute fixes its category directory and hash.
#
# `self` is not referenced by any attribute below.
lib.makeScope newScope (self: {
  ## Chunkers (packages/chunkers)
  maxent-ne-chunker = makeChunker "maxent_ne_chunker";
  maxent-ne-chunker-tab = makeChunker "maxent_ne_chunker_tab";

  ## Corpora (packages/corpora)
  abc = makeCorpus "abc";
  alpino = makeCorpus "alpino";
  bcp47 = makeCorpus "bcp47";
  biocreative-ppi = makeCorpus "biocreative_ppi";
  brown = makeCorpus "brown";
  brown-tei = makeCorpus "brown_tei";
  cess-cat = makeCorpus "cess_cat";
  cess-esp = makeCorpus "cess_esp";
  chat80 = makeCorpus "chat80";
  city-database = makeCorpus "city_database";
  cmudict = makeCorpus "cmudict";
  comparative-sentences = makeCorpus "comparative_sentences";
  comtrans = makeCorpus "comtrans";
  conll2000 = makeCorpus "conll2000";
  conll2002 = makeCorpus "conll2002";
  conll2007 = makeCorpus "conll2007";
  crubadan = makeCorpus "crubadan";
  dependency-treebank = makeCorpus "dependency_treebank";
  dolch = makeCorpus "dolch";
  europarl-raw = makeCorpus "europarl_raw";
  extended-omw = makeCorpus "extended_omw";
  floresta = makeCorpus "floresta";
  framenet-v15 = makeCorpus "framenet_v15";
  framenet-v17 = makeCorpus "framenet_v17";
  gazetteers = makeCorpus "gazetteers";
  genesis = makeCorpus "genesis";
  gutenberg = makeCorpus "gutenberg";
  ieer = makeCorpus "ieer";
  inaugural = makeCorpus "inaugural";
  indian = makeCorpus "indian";
  jeita = makeCorpus "jeita";
  kimmo = makeCorpus "kimmo";
  knbc = makeCorpus "knbc";
  lin-thesaurus = makeCorpus "lin_thesaurus";
  mac-morpho = makeCorpus "mac_morpho";
  machado = makeCorpus "machado";
  masc-tagged = makeCorpus "masc_tagged";
  movie-reviews = makeCorpus "movie_reviews";
  mte-teip5 = makeCorpus "mte_teip5";
  names = makeCorpus "names";
  # Upstream name contains dots; they become hyphens in the attribute name.
  nombank-1-0 = makeCorpus "nombank.1.0";
  nonbreaking-prefixes = makeCorpus "nonbreaking_prefixes";
  nps-chat = makeCorpus "nps_chat";
  omw = makeCorpus "omw";
  # Upstream name contains a dot; it becomes a hyphen in the attribute name.
  omw-1-4 = makeCorpus "omw-1.4";
  opinion-lexicon = makeCorpus "opinion_lexicon";
  panlex-swadesh = makeCorpus "panlex_swadesh";
  paradigms = makeCorpus "paradigms";
  pe08 = makeCorpus "pe08";
  pil = makeCorpus "pil";
  pl196x = makeCorpus "pl196x";
  ppattach = makeCorpus "ppattach";
  problem-reports = makeCorpus "problem_reports";
  product-reviews-1 = makeCorpus "product_reviews_1";
  product-reviews-2 = makeCorpus "product_reviews_2";
  propbank = makeCorpus "propbank";
  pros-cons = makeCorpus "pros_cons";
  ptb = makeCorpus "ptb";
  qc = makeCorpus "qc";
  reuters = makeCorpus "reuters";
  rte = makeCorpus "rte";
  semcor = makeCorpus "semcor";
  senseval = makeCorpus "senseval";
  sentence-polarity = makeCorpus "sentence_polarity";
  sentiwordnet = makeCorpus "sentiwordnet";
  shakespeare = makeCorpus "shakespeare";
  sinica-treebank = makeCorpus "sinica_treebank";
  smultron = makeCorpus "smultron";
  state-union = makeCorpus "state_union";
  stopwords = makeCorpus "stopwords";
  subjectivity = makeCorpus "subjectivity";
  swadesh = makeCorpus "swadesh";
  switchboard = makeCorpus "switchboard";
  timit = makeCorpus "timit";
  toolbox = makeCorpus "toolbox";
  treebank = makeCorpus "treebank";
  twitter-samples = makeCorpus "twitter_samples";
  udhr = makeCorpus "udhr";
  udhr2 = makeCorpus "udhr2";
  unicode-samples = makeCorpus "unicode_samples";
  universal-treebanks-v20 = makeCorpus "universal_treebanks_v20";
  verbnet = makeCorpus "verbnet";
  verbnet3 = makeCorpus "verbnet3";
  webtext = makeCorpus "webtext";
  wordnet = makeCorpus "wordnet";
  wordnet-ic = makeCorpus "wordnet_ic";
  wordnet2021 = makeCorpus "wordnet2021";
  wordnet2022 = makeCorpus "wordnet2022";
  wordnet31 = makeCorpus "wordnet31";
  words = makeCorpus "words";
  ycoe = makeCorpus "ycoe";

  ## Grammars (packages/grammars)
  basque-grammars = makeGrammar "basque_grammars";
  book-grammars = makeGrammar "book_grammars";
  large-grammars = makeGrammar "large_grammars";
  sample-grammars = makeGrammar "sample_grammars";
  spanish-grammars = makeGrammar "spanish_grammars";

  ## Help (packages/help)
  tagsets-json = makeHelp "tagsets_json";

  ## Misc (packages/misc)
  mwa-ppdb = makeMisc "mwa_ppdb";
  perluniprops = makeMisc "perluniprops";

  ## Models (packages/models)
  bllip-wsj-no-aux = makeModel "bllip_wsj_no_aux";
  moses-sample = makeModel "moses_sample";
  wmt15-eval = makeModel "wmt15_eval";
  word2vec-sample = makeModel "word2vec_sample";

  ## Taggers (packages/taggers)
  averaged-perceptron-tagger = makeTagger "averaged_perceptron_tagger";
  averaged-perceptron-tagger-eng = makeTagger "averaged_perceptron_tagger_eng";
  averaged-perceptron-tagger-ru = makeTagger "averaged_perceptron_tagger_ru";
  averaged-perceptron-tagger-rus = makeTagger "averaged_perceptron_tagger_rus";
  maxent-treebank-pos-tagger = makeTagger "maxent_treebank_pos_tagger";
  maxent-treebank-pos-tagger-tab = makeTagger "maxent_treebank_pos_tagger_tab";
  universal-tagset = makeTagger "universal_tagset";

  ## Tokenizers (packages/tokenizers)
  punkt = makeTokenizer "punkt";
  punkt-tab = makeTokenizer "punkt_tab";

  ## Stemmers (packages/stemmers)
  porter-test = makeStemmer "porter_test";
  rslp = makeStemmer "rslp";
  snowball-data = makeStemmer "snowball_data";
})