fork of indigo with slightly nicer lexgen

automod: copy Hive auto-labeling code from labeler package

Changed files
+593
automod
+3
automod/visual/doc.go
··· 1 + // Package visual provides automod helpers for visual content (image blobs). 2 + package visual 3 +
+147
automod/visual/hiveai.go
··· 1 + package visual 2 + 3 + import ( 4 + "log/slog" 5 + "bytes" 6 + "context" 7 + "encoding/json" 8 + "fmt" 9 + "io" 10 + "mime/multipart" 11 + "net/http" 12 + 13 + lexutil "github.com/bluesky-social/indigo/lex/util" 14 + "github.com/bluesky-social/indigo/util" 15 + 16 + "github.com/carlmjohnson/versioninfo" 17 + ) 18 + 19 + type HiveAILabeler struct { 20 + Client http.Client 21 + ApiToken string 22 + } 23 + 24 + // schema: https://docs.thehive.ai/reference/classification 25 + type HiveAIResp struct { 26 + Status []HiveAIResp_Status `json:"status"` 27 + } 28 + 29 + type HiveAIResp_Status struct { 30 + Response HiveAIResp_Response `json:"response"` 31 + } 32 + 33 + type HiveAIResp_Response struct { 34 + Output []HiveAIResp_Out `json:"output"` 35 + } 36 + 37 + type HiveAIResp_Out struct { 38 + Time float64 `json:"time"` 39 + Classes []HiveAIResp_Class `json:"classes"` 40 + } 41 + 42 + type HiveAIResp_Class struct { 43 + Class string `json:"class"` 44 + Score float64 `json:"score"` 45 + } 46 + 47 + func NewHiveAILabeler(token string) HiveAILabeler { 48 + return HiveAILabeler{ 49 + Client: *util.RobustHTTPClient(), 50 + ApiToken: token, 51 + } 52 + } 53 + 54 + func (resp *HiveAIResp) SummarizeLabels() []string { 55 + var labels []string 56 + 57 + for _, status := range resp.Status { 58 + for _, out := range status.Response.Output { 59 + for _, cls := range out.Classes { 60 + // TODO(bnewbold): lots more upstream tags could be included here. 
61 + // for example, "sexy" for not nude but still explicit/suggestive, 62 + // or lolicon (animated, not nude, "sugggestive" 63 + 64 + // sexual: https://docs.thehive.ai/docs/sexual-content 65 + // note: won't apply "nude" if "porn" already applied 66 + if cls.Class == "yes_sexual_activity" && cls.Score >= 0.90 { 67 + // NOTE: will include "hentai" 68 + labels = append(labels, "porn") 69 + } else if cls.Class == "animal_genitalia_and_human" && cls.Score >= 0.90 { 70 + labels = append(labels, "porn") 71 + } else if cls.Class == "yes_male_nudity" && cls.Score >= 0.90 { 72 + labels = append(labels, "nude") 73 + } else if cls.Class == "yes_female_nudity" && cls.Score >= 0.90 { 74 + labels = append(labels, "nude") 75 + } 76 + 77 + // gore and violence: https://docs.thehive.ai/docs/class-descriptions-violence-gore 78 + if cls.Class == "very_bloody" && cls.Score >= 0.90 { 79 + labels = append(labels, "gore") 80 + } 81 + if cls.Class == "human_corpse" && cls.Score >= 0.90 { 82 + labels = append(labels, "corpse") 83 + } 84 + if cls.Class == "yes_self_harm" && cls.Score >= 0.90 { 85 + labels = append(labels, "self-harm") 86 + } 87 + } 88 + } 89 + } 90 + 91 + return labels 92 + } 93 + 94 + func (hal *HiveAILabeler) LabelBlob(ctx context.Context, blob lexutil.LexBlob, blobBytes []byte) ([]string, error) { 95 + 96 + slog.Info("sending blob to thehive.ai", "cid", blob.Ref, "mimetype", blob.MimeType, "size", len(blobBytes)) 97 + 98 + // generic HTTP form file upload, then parse the response JSON 99 + body := &bytes.Buffer{} 100 + writer := multipart.NewWriter(body) 101 + part, err := writer.CreateFormFile("media", blob.Ref.String()) 102 + if err != nil { 103 + return nil, err 104 + } 105 + _, err = part.Write(blobBytes) 106 + if err != nil { 107 + return nil, err 108 + } 109 + err = writer.Close() 110 + if err != nil { 111 + return nil, err 112 + } 113 + 114 + req, err := http.NewRequest("POST", "https://api.thehive.ai/api/v2/task/sync", body) 115 + if err != nil { 116 + return 
nil, err 117 + } 118 + 119 + req.Header.Set("Authorization", fmt.Sprintf("Token %s", hal.ApiToken)) 120 + req.Header.Add("Content-Type", writer.FormDataContentType()) 121 + req.Header.Set("Accept", "application/json") 122 + req.Header.Set("User-Agent", "labelmaker/"+versioninfo.Short()) 123 + 124 + res, err := hal.Client.Do(req) 125 + if err != nil { 126 + return nil, fmt.Errorf("HiveAI request failed: %v", err) 127 + } 128 + defer res.Body.Close() 129 + if res.StatusCode != 200 { 130 + return nil, fmt.Errorf("HiveAI request failed statusCode=%d", res.StatusCode) 131 + } 132 + 133 + respBytes, err := io.ReadAll(res.Body) 134 + if err != nil { 135 + return nil, fmt.Errorf("failed to read HiveAI resp body: %v", err) 136 + } 137 + 138 + slog.Debug("HiveAI raw result", "cid", blob.Ref, "body", string(respBytes)) 139 + 140 + var respObj HiveAIResp 141 + if err := json.Unmarshal(respBytes, &respObj); err != nil { 142 + return nil, fmt.Errorf("failed to parse HiveAI resp JSON: %v", err) 143 + } 144 + respJson, _ := json.Marshal(respObj.Status[0].Response.Output[0]) 145 + slog.Info("HiveAI result", "cid", blob.Ref, "json", string(respJson)) 146 + return respObj.SummarizeLabels(), nil 147 + }
+42
automod/visual/hiveai_test.go
··· 1 + package visual 2 + 3 + import ( 4 + "encoding/json" 5 + "io" 6 + "os" 7 + "reflect" 8 + "testing" 9 + ) 10 + 11 + func TestHiveParse(t *testing.T) { 12 + file, err := os.Open("testdata/hiveai_resp_example.json") 13 + if err != nil { 14 + t.Fatal(err) 15 + } 16 + 17 + respBytes, err := io.ReadAll(file) 18 + if err != nil { 19 + t.Fatal(err) 20 + } 21 + 22 + var respObj HiveAIResp 23 + if err := json.Unmarshal(respBytes, &respObj); err != nil { 24 + t.Fatal(err) 25 + } 26 + 27 + classes := respObj.Status[0].Response.Output[0].Classes 28 + if len(classes) <= 10 { 29 + t.Fatal("didn't get expected class count") 30 + } 31 + for _, c := range classes { 32 + if c.Class == "" || c.Score == 0.0 { 33 + t.Fatal("got null/empty class in resp") 34 + } 35 + } 36 + 37 + labels := respObj.SummarizeLabels() 38 + expected := []string{"porn"} 39 + if !reflect.DeepEqual(labels, expected) { 40 + t.Fatal("didn't summarize to expected labels") 41 + } 42 + }
+401
automod/visual/testdata/hiveai_resp_example.json
··· 1 + { 2 + "id": "02122580-c37f-11ed-81d2-000000000000", 3 + "code": 200, 4 + "project_id": 12345, 5 + "user_id": 12345, 6 + "created_on": "2023-03-15T22:16:18.408Z", 7 + "status": [ 8 + { 9 + "status": { 10 + "code": "0", 11 + "message": "SUCCESS" 12 + }, 13 + "response": { 14 + "input": { 15 + "id": "02122580-c37f-11ed-81d2-000000000000", 16 + "charge": 0.003, 17 + "model": "mod55_dense", 18 + "model_version": 1, 19 + "model_type": "CATEGORIZATION", 20 + "created_on": "2023-03-15T22:16:18.136Z", 21 + "media": { 22 + "url": null, 23 + "filename": "bafkreiam7k6mvkyuoybq4ynhljvj5xa75sdbhjbolzjf5j2udx7vj5gnsy", 24 + "type": "PHOTO", 25 + "mime_type": "jpeg", 26 + "mimetype": "image/jpeg", 27 + "width": 800, 28 + "height": 800, 29 + "num_frames": 1, 30 + "duration": 0 31 + }, 32 + "user_id": 12345, 33 + "project_id": 12345, 34 + "config_version": 1, 35 + "config_tag": "default" 36 + }, 37 + "output": [ 38 + { 39 + "time": 0, 40 + "classes": [ 41 + { 42 + "class": "general_not_nsfw_not_suggestive", 43 + "score": 0.9998097218132356 44 + }, 45 + { 46 + "class": "general_nsfw", 47 + "score": 8.857344804177162e-05 48 + }, 49 + { 50 + "class": "general_suggestive", 51 + "score": 0.00010170473872266839 52 + }, 53 + { 54 + "class": "no_female_underwear", 55 + "score": 0.9999923079040384 56 + }, 57 + { 58 + "class": "yes_female_underwear", 59 + "score": 7.692095961599136e-06 60 + }, 61 + { 62 + "class": "no_male_underwear", 63 + "score": 0.9999984904867634 64 + }, 65 + { 66 + "class": "yes_male_underwear", 67 + "score": 1.5095132367094679e-06 68 + }, 69 + { 70 + "class": "no_sex_toy", 71 + "score": 0.9999970970762551 72 + }, 73 + { 74 + "class": "yes_sex_toy", 75 + "score": 2.9029237450490604e-06 76 + }, 77 + { 78 + "class": "no_female_nudity", 79 + "score": 0.9999739028909301 80 + }, 81 + { 82 + "class": "yes_female_nudity", 83 + "score": 2.60971090699536e-05 84 + }, 85 + { 86 + "class": "no_male_nudity", 87 + "score": 0.9999711373083747 88 + }, 89 + { 90 + "class": 
"yes_male_nudity", 91 + "score": 2.8862691625255323e-05 92 + }, 93 + { 94 + "class": "no_female_swimwear", 95 + "score": 0.9999917609899659 96 + }, 97 + { 98 + "class": "yes_female_swimwear", 99 + "score": 8.239010034025379e-06 100 + }, 101 + { 102 + "class": "no_male_shirtless", 103 + "score": 0.9999583350744331 104 + }, 105 + { 106 + "class": "yes_male_shirtless", 107 + "score": 4.166492556688088e-05 108 + }, 109 + { 110 + "class": "no_text", 111 + "score": 0.9958378716447616 112 + }, 113 + { 114 + "class": "text", 115 + "score": 0.0041621283552384265 116 + }, 117 + { 118 + "class": "animated", 119 + "score": 0.46755478950048235 120 + }, 121 + { 122 + "class": "hybrid", 123 + "score": 0.0011440363434524984 124 + }, 125 + { 126 + "class": "natural", 127 + "score": 0.5313011741560651 128 + }, 129 + { 130 + "class": "animated_gun", 131 + "score": 2.0713000782979496e-05 132 + }, 133 + { 134 + "class": "gun_in_hand", 135 + "score": 1.5844730446534659e-06 136 + }, 137 + { 138 + "class": "gun_not_in_hand", 139 + "score": 1.0338973818006654e-06 140 + }, 141 + { 142 + "class": "no_gun", 143 + "score": 0.9999766686287906 144 + }, 145 + { 146 + "class": "culinary_knife_in_hand", 147 + "score": 3.8063500083369785e-06 148 + }, 149 + { 150 + "class": "culinary_knife_not_in_hand", 151 + "score": 7.94057948996249e-07 152 + }, 153 + { 154 + "class": "knife_in_hand", 155 + "score": 4.5578955723278505e-07 156 + }, 157 + { 158 + "class": "knife_not_in_hand", 159 + "score": 3.842124714748908e-07 160 + }, 161 + { 162 + "class": "no_knife", 163 + "score": 0.999994559590014 164 + }, 165 + { 166 + "class": "a_little_bloody", 167 + "score": 2.1317745626539786e-07 168 + }, 169 + { 170 + "class": "no_blood", 171 + "score": 0.9999793341236429 172 + }, 173 + { 174 + "class": "other_blood", 175 + "score": 2.0322054269591763e-05 176 + }, 177 + { 178 + "class": "very_bloody", 179 + "score": 1.306446309561673e-07 180 + }, 181 + { 182 + "class": "no_pills", 183 + "score": 0.9999989592376954 184 + 
}, 185 + { 186 + "class": "yes_pills", 187 + "score": 1.0407623044588633e-06 188 + }, 189 + { 190 + "class": "no_smoking", 191 + "score": 0.9999939101969173 192 + }, 193 + { 194 + "class": "yes_smoking", 195 + "score": 6.089803082758281e-06 196 + }, 197 + { 198 + "class": "illicit_injectables", 199 + "score": 6.925695592003094e-07 200 + }, 201 + { 202 + "class": "medical_injectables", 203 + "score": 8.587808234452378e-07 204 + }, 205 + { 206 + "class": "no_injectables", 207 + "score": 0.9999984486496174 208 + }, 209 + { 210 + "class": "no_nazi", 211 + "score": 0.9999987449628097 212 + }, 213 + { 214 + "class": "yes_nazi", 215 + "score": 1.2550371902234279e-06 216 + }, 217 + { 218 + "class": "no_kkk", 219 + "score": 0.999999762417549 220 + }, 221 + { 222 + "class": "yes_kkk", 223 + "score": 2.3758245111050425e-07 224 + }, 225 + { 226 + "class": "no_middle_finger", 227 + "score": 0.9999881515231847 228 + }, 229 + { 230 + "class": "yes_middle_finger", 231 + "score": 1.184847681536747e-05 232 + }, 233 + { 234 + "class": "no_terrorist", 235 + "score": 0.9999998870793229 236 + }, 237 + { 238 + "class": "yes_terrorist", 239 + "score": 1.1292067715380635e-07 240 + }, 241 + { 242 + "class": "no_overlay_text", 243 + "score": 0.9996453363440359 244 + }, 245 + { 246 + "class": "yes_overlay_text", 247 + "score": 0.0003546636559640924 248 + }, 249 + { 250 + "class": "no_sexual_activity", 251 + "score": 0.9999563580374798 252 + }, 253 + { 254 + "class": "yes_sexual_activity", 255 + "score": 0.99, 256 + "realScore": 4.364196252012032e-05 257 + }, 258 + { 259 + "class": "hanging", 260 + "score": 3.6435135762510905e-07 261 + }, 262 + { 263 + "class": "no_hanging_no_noose", 264 + "score": 0.9999980779196416 265 + }, 266 + { 267 + "class": "noose", 268 + "score": 1.5577290007796094e-06 269 + }, 270 + { 271 + "class": "no_realistic_nsfw", 272 + "score": 0.9999944341007805 273 + }, 274 + { 275 + "class": "yes_realistic_nsfw", 276 + "score": 5.565899219571182e-06 277 + }, 278 + { 279 + 
"class": "animated_corpse", 280 + "score": 5.276802046755426e-07 281 + }, 282 + { 283 + "class": "human_corpse", 284 + "score": 2.5449360984211012e-08 285 + }, 286 + { 287 + "class": "no_corpse", 288 + "score": 0.9999994468704343 289 + }, 290 + { 291 + "class": "no_self_harm", 292 + "score": 0.9999994515625507 293 + }, 294 + { 295 + "class": "yes_self_harm", 296 + "score": 5.484374493605692e-07 297 + }, 298 + { 299 + "class": "no_drawing", 300 + "score": 0.9978276028816608 301 + }, 302 + { 303 + "class": "yes_drawing", 304 + "score": 0.0021723971183392485 305 + }, 306 + { 307 + "class": "no_emaciated_body", 308 + "score": 0.9999998146500432 309 + }, 310 + { 311 + "class": "yes_emaciated_body", 312 + "score": 1.853499568724518e-07 313 + }, 314 + { 315 + "class": "no_child_present", 316 + "score": 0.9999970498515446 317 + }, 318 + { 319 + "class": "yes_child_present", 320 + "score": 2.950148455380443e-06 321 + }, 322 + { 323 + "class": "no_sexual_intent", 324 + "score": 0.9999963861546292 325 + }, 326 + { 327 + "class": "yes_sexual_intent", 328 + "score": 3.613845370766111e-06 329 + }, 330 + { 331 + "class": "animal_genitalia_and_human", 332 + "score": 2.255472023465222e-08 333 + }, 334 + { 335 + "class": "animal_genitalia_only", 336 + "score": 4.6783185199931176e-07 337 + }, 338 + { 339 + "class": "animated_animal_genitalia", 340 + "score": 6.707857419436447e-07 341 + }, 342 + { 343 + "class": "no_animal_genitalia", 344 + "score": 0.9999988388276858 345 + }, 346 + { 347 + "class": "no_gambling", 348 + "score": 0.9999960939687145 349 + }, 350 + { 351 + "class": "yes_gambling", 352 + "score": 3.906031285604864e-06 353 + }, 354 + { 355 + "class": "no_undressed", 356 + "score": 0.99999923356218 357 + }, 358 + { 359 + "class": "yes_undressed", 360 + "score": 7.664378199789045e-07 361 + }, 362 + { 363 + "class": "no_confederate", 364 + "score": 0.9999925456900376 365 + }, 366 + { 367 + "class": "yes_confederate", 368 + "score": 7.454309962453175e-06 369 + }, 370 + { 371 + 
"class": "animated_alcohol", 372 + "score": 1.8109949948066074e-06 373 + }, 374 + { 375 + "class": "no_alcohol", 376 + "score": 0.9999916620957963 377 + }, 378 + { 379 + "class": "yes_alcohol", 380 + "score": 5.88781463445443e-06 381 + }, 382 + { 383 + "class": "yes_drinking_alcohol", 384 + "score": 6.390945746578106e-07 385 + }, 386 + { 387 + "class": "no_religious_icon", 388 + "score": 0.9999862158580689 389 + }, 390 + { 391 + "class": "yes_religious_icon", 392 + "score": 1.3784141931119298e-05 393 + } 394 + ] 395 + } 396 + ] 397 + } 398 + } 399 + ], 400 + "from_cache": false 401 + }