An ATProto Lexicon validator for Gleam.

feat: validate full ATProto blob structure with strict field checking

- Add is_valid_raw_cid function to formats module for blob CID validation
(validates base32 CIDv1 with raw multicodec and sha2-256 multihash, i.e. the bafkrei prefix)
- Validate that the $type field equals "blob"
- Validate ref object with $link containing raw CID
- Validate mimeType is non-empty
- Validate size is non-negative (zero allowed)
- Reject extra fields (strict mode per ATProto spec)

+6
CHANGELOG.md
··· 1 1 # Changelog 2 2 3 + ## 1.2.0 4 + 5 + ### Added 6 + 7 + - Validate full ATProto blob structure with stricter field checking 8 + 3 9 ## 1.1.0 4 10 5 11 ### Added
+1 -1
gleam.toml
··· 1 1 name = "honk" 2 - version = "1.1.0" 2 + version = "1.2.0" 3 3 description = "ATProtocol lexicon validator for Gleam" 4 4 internal_modules = ["honk/internal", "honk/internal/*"] 5 5 licences = ["Apache-2.0"]
+2 -2
manifest.toml
··· 6 6 { name = "filepath", version = "1.1.2", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "filepath", source = "hex", outer_checksum = "B06A9AF0BF10E51401D64B98E4B627F1D2E48C154967DA7AF4D0914780A6D40A" }, 7 7 { name = "gleam_json", version = "3.1.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_json", source = "hex", outer_checksum = "44FDAA8847BE8FC48CA7A1C089706BD54BADCC4C45B237A992EDDF9F2CDB2836" }, 8 8 { name = "gleam_regexp", version = "1.1.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_regexp", source = "hex", outer_checksum = "9C215C6CA84A5B35BB934A9B61A9A306EC743153BE2B0425A0D032E477B062A9" }, 9 - { name = "gleam_stdlib", version = "0.65.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "7C69C71D8C493AE11A5184828A77110EB05A7786EBF8B25B36A72F879C3EE107" }, 10 - { name = "gleam_time", version = "1.5.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_time", source = "hex", outer_checksum = "D560E672C7279C89908981E068DF07FD16D0C859DCA266F908B18F04DF0EB8E6" }, 9 + { name = "gleam_stdlib", version = "0.67.1", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "6CE3E4189A8B8EC2F73AB61A2FBDE49F159D6C9C61C49E3B3082E439F260D3D0" }, 10 + { name = "gleam_time", version = "1.6.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_time", source = "hex", outer_checksum = "0DF3834D20193F0A38D0EB21F0A78D48F2EC276C285969131B86DF8D4EF9E762" }, 11 11 { name = "gleeunit", version = "1.9.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "DA9553CE58B67924B3C631F96FE3370C49EB6D6DC6B384EC4862CC4AAA718F3C" }, 12 12 { name = "simplifile", version = "2.3.1", build_tools = ["gleam"], requirements = ["filepath", "gleam_stdlib"], otp_app = "simplifile", source = "hex", 
outer_checksum = "957E0E5B75927659F1D2A1B7B75D7B9BA96FAA8D0C53EA71C4AD9CD0C6B848F6" }, 13 13 ]
+9
src/honk/validation/formats.gleam
··· 217 217 } 218 218 } 219 219 220 + /// Validates CID format with raw multicodec (0x55) for blobs 221 + /// Base32 CIDv1 with raw multicodec starts with "bafkrei" 222 + pub fn is_valid_raw_cid(value: String) -> Bool { 223 + case is_valid_cid(value) { 224 + False -> False 225 + True -> string.starts_with(value, "bafkrei") 226 + } 227 + } 228 + 220 229 /// Validates BCP47 language tag 221 230 pub fn is_valid_language_tag(value: String) -> Bool { 222 231 // Lenient BCP47 validation (max 128 chars)
+93 -4
src/honk/validation/primitive/blob.gleam
··· 13 13 import honk/internal/constraints 14 14 import honk/internal/json_helpers 15 15 import honk/validation/context.{type ValidationContext} 16 + import honk/validation/formats 16 17 17 18 const allowed_fields = ["type", "accept", "maxSize", "description"] 19 + 20 + const allowed_data_fields = ["$type", "ref", "mimeType", "size"] 18 21 19 22 /// Validates blob schema definition 20 23 pub fn validate_schema( ··· 66 69 Error(errors.data_validation(def_name <> ": expected blob object")) 67 70 } 68 71 True -> { 69 - // Validate required mimeType field 72 + // Validate no extra fields (strict mode per atproto implementation) 73 + let keys = json_helpers.get_keys(data) 74 + use _ <- result.try(validate_no_extra_fields(def_name, keys)) 75 + 76 + // Validate $type field must be "blob" 77 + use _ <- result.try(case json_helpers.get_string(data, "$type") { 78 + Some("blob") -> Ok(Nil) 79 + Some(other) -> 80 + Error(errors.data_validation( 81 + def_name <> ": blob $type must be 'blob', got '" <> other <> "'", 82 + )) 83 + None -> 84 + Error(errors.data_validation( 85 + def_name <> ": blob missing required '$type' field", 86 + )) 87 + }) 88 + 89 + // Validate ref field with $link containing raw CID 90 + use _ <- result.try(validate_ref_field(data, def_name)) 91 + 92 + // Validate required mimeType field (non-empty) 70 93 use mime_type <- result.try( 71 94 case json_helpers.get_string(data, "mimeType") { 72 - Some(mt) -> Ok(mt) 95 + Some(mt) -> 96 + case string.is_empty(mt) { 97 + True -> 98 + Error(errors.data_validation( 99 + def_name <> ": blob mimeType cannot be empty", 100 + )) 101 + False -> Ok(mt) 102 + } 73 103 None -> 74 104 Error(errors.data_validation( 75 105 def_name <> ": blob missing required 'mimeType' field", ··· 77 107 }, 78 108 ) 79 109 80 - // Validate required size field 110 + // Validate required size field (non-negative integer) 81 111 use size <- result.try(case json_helpers.get_int(data, "size") { 82 - Some(s) -> Ok(s) 112 + Some(s) -> 113 + case s 
>= 0 { 114 + True -> Ok(s) 115 + False -> 116 + Error(errors.data_validation( 117 + def_name <> ": blob size must be non-negative", 118 + )) 119 + } 83 120 None -> 84 121 Error(errors.data_validation( 85 122 def_name <> ": blob missing or invalid 'size' field", ··· 111 148 None -> Ok(Nil) 112 149 } 113 150 } 151 + } 152 + } 153 + 154 + /// Validates that blob data has no extra fields 155 + fn validate_no_extra_fields( 156 + def_name: String, 157 + keys: List(String), 158 + ) -> Result(Nil, errors.ValidationError) { 159 + let extra_keys = 160 + list.filter(keys, fn(key) { !list.contains(allowed_data_fields, key) }) 161 + case extra_keys { 162 + [] -> Ok(Nil) 163 + [first, ..] -> 164 + Error(errors.data_validation( 165 + def_name <> ": blob has unexpected field '" <> first <> "'", 166 + )) 167 + } 168 + } 169 + 170 + /// Validates the ref field containing $link with raw CID 171 + fn validate_ref_field( 172 + data: Json, 173 + def_name: String, 174 + ) -> Result(Nil, errors.ValidationError) { 175 + case json_helpers.get_field(data, "ref") { 176 + Some(ref_json) -> 177 + case json_helpers.is_object(ref_json) { 178 + False -> 179 + Error(errors.data_validation( 180 + def_name <> ": blob ref must be an object", 181 + )) 182 + True -> 183 + case json_helpers.get_string(ref_json, "$link") { 184 + Some(cid) -> 185 + case formats.is_valid_raw_cid(cid) { 186 + True -> Ok(Nil) 187 + False -> 188 + Error(errors.data_validation( 189 + def_name 190 + <> ": blob ref.$link must be a valid CID with raw multicodec (bafkrei prefix)", 191 + )) 192 + } 193 + None -> 194 + Error(errors.data_validation( 195 + def_name <> ": blob ref must have $link field", 196 + )) 197 + } 198 + } 199 + None -> 200 + Error(errors.data_validation( 201 + def_name <> ": blob missing required 'ref' field", 202 + )) 114 203 } 115 204 } 116 205
+336 -2
test/blob_validator_test.gleam
··· 90 90 91 91 let data = 92 92 json.object([ 93 + #("$type", json.string("blob")), 94 + #( 95 + "ref", 96 + json.object([ 97 + #( 98 + "$link", 99 + json.string( 100 + "bafkreigh2akiscaildcqabsyg3dfr6chu3fgpregiymsck7e7aqa4s52zy", 101 + ), 102 + ), 103 + ]), 104 + ), 93 105 #("mimeType", json.string("image/jpeg")), 94 106 #("size", json.int(50_000)), 95 107 ]) ··· 109 121 110 122 let data = 111 123 json.object([ 124 + #("$type", json.string("blob")), 125 + #( 126 + "ref", 127 + json.object([ 128 + #( 129 + "$link", 130 + json.string( 131 + "bafkreigh2akiscaildcqabsyg3dfr6chu3fgpregiymsck7e7aqa4s52zy", 132 + ), 133 + ), 134 + ]), 135 + ), 112 136 #("mimeType", json.string("video/mp4")), 113 137 #("size", json.int(50_000)), 114 138 ]) ··· 128 152 129 153 let data = 130 154 json.object([ 155 + #("$type", json.string("blob")), 156 + #( 157 + "ref", 158 + json.object([ 159 + #( 160 + "$link", 161 + json.string( 162 + "bafkreigh2akiscaildcqabsyg3dfr6chu3fgpregiymsck7e7aqa4s52zy", 163 + ), 164 + ), 165 + ]), 166 + ), 131 167 #("mimeType", json.string("image/jpeg")), 132 168 #("size", json.int(50_000)), 133 169 ]) ··· 141 177 pub fn missing_mime_type_test() { 142 178 let schema = json.object([#("type", json.string("blob"))]) 143 179 144 - let data = json.object([#("size", json.int(50_000))]) 180 + let data = 181 + json.object([ 182 + #("$type", json.string("blob")), 183 + #( 184 + "ref", 185 + json.object([ 186 + #( 187 + "$link", 188 + json.string( 189 + "bafkreigh2akiscaildcqabsyg3dfr6chu3fgpregiymsck7e7aqa4s52zy", 190 + ), 191 + ), 192 + ]), 193 + ), 194 + #("size", json.int(50_000)), 195 + ]) 145 196 146 197 let assert Ok(ctx) = context.builder() |> context.build 147 198 let result = blob.validate_data(data, schema, ctx) ··· 152 203 pub fn missing_size_test() { 153 204 let schema = json.object([#("type", json.string("blob"))]) 154 205 155 - let data = json.object([#("mimeType", json.string("image/jpeg"))]) 206 + let data = 207 + json.object([ 208 + #("$type", 
json.string("blob")), 209 + #( 210 + "ref", 211 + json.object([ 212 + #( 213 + "$link", 214 + json.string( 215 + "bafkreigh2akiscaildcqabsyg3dfr6chu3fgpregiymsck7e7aqa4s52zy", 216 + ), 217 + ), 218 + ]), 219 + ), 220 + #("mimeType", json.string("image/jpeg")), 221 + ]) 222 + 223 + let assert Ok(ctx) = context.builder() |> context.build 224 + let result = blob.validate_data(data, schema, ctx) 225 + result |> should.be_error 226 + } 227 + 228 + // ========== FULL BLOB STRUCTURE TESTS ========== 229 + 230 + // Test valid full blob structure 231 + pub fn valid_full_blob_structure_test() { 232 + let schema = json.object([#("type", json.string("blob"))]) 233 + 234 + let data = 235 + json.object([ 236 + #("$type", json.string("blob")), 237 + #( 238 + "ref", 239 + json.object([ 240 + #( 241 + "$link", 242 + json.string( 243 + "bafkreigh2akiscaildcqabsyg3dfr6chu3fgpregiymsck7e7aqa4s52zy", 244 + ), 245 + ), 246 + ]), 247 + ), 248 + #("mimeType", json.string("image/jpeg")), 249 + #("size", json.int(50_000)), 250 + ]) 251 + 252 + let assert Ok(ctx) = context.builder() |> context.build 253 + let result = blob.validate_data(data, schema, ctx) 254 + result |> should.be_ok 255 + } 256 + 257 + // Test missing $type field 258 + pub fn missing_type_field_test() { 259 + let schema = json.object([#("type", json.string("blob"))]) 260 + 261 + let data = 262 + json.object([ 263 + #( 264 + "ref", 265 + json.object([ 266 + #( 267 + "$link", 268 + json.string( 269 + "bafkreigh2akiscaildcqabsyg3dfr6chu3fgpregiymsck7e7aqa4s52zy", 270 + ), 271 + ), 272 + ]), 273 + ), 274 + #("mimeType", json.string("image/jpeg")), 275 + #("size", json.int(50_000)), 276 + ]) 277 + 278 + let assert Ok(ctx) = context.builder() |> context.build 279 + let result = blob.validate_data(data, schema, ctx) 280 + result |> should.be_error 281 + } 282 + 283 + // Test wrong $type value 284 + pub fn wrong_type_value_test() { 285 + let schema = json.object([#("type", json.string("blob"))]) 286 + 287 + let data = 288 + 
json.object([ 289 + #("$type", json.string("notblob")), 290 + #( 291 + "ref", 292 + json.object([ 293 + #( 294 + "$link", 295 + json.string( 296 + "bafkreigh2akiscaildcqabsyg3dfr6chu3fgpregiymsck7e7aqa4s52zy", 297 + ), 298 + ), 299 + ]), 300 + ), 301 + #("mimeType", json.string("image/jpeg")), 302 + #("size", json.int(50_000)), 303 + ]) 304 + 305 + let assert Ok(ctx) = context.builder() |> context.build 306 + let result = blob.validate_data(data, schema, ctx) 307 + result |> should.be_error 308 + } 309 + 310 + // Test missing ref field 311 + pub fn missing_ref_field_test() { 312 + let schema = json.object([#("type", json.string("blob"))]) 313 + 314 + let data = 315 + json.object([ 316 + #("$type", json.string("blob")), 317 + #("mimeType", json.string("image/jpeg")), 318 + #("size", json.int(50_000)), 319 + ]) 320 + 321 + let assert Ok(ctx) = context.builder() |> context.build 322 + let result = blob.validate_data(data, schema, ctx) 323 + result |> should.be_error 324 + } 325 + 326 + // Test ref without $link 327 + pub fn ref_missing_link_test() { 328 + let schema = json.object([#("type", json.string("blob"))]) 329 + 330 + let data = 331 + json.object([ 332 + #("$type", json.string("blob")), 333 + #("ref", json.object([#("cid", json.string("bafkrei..."))])), 334 + #("mimeType", json.string("image/jpeg")), 335 + #("size", json.int(50_000)), 336 + ]) 337 + 338 + let assert Ok(ctx) = context.builder() |> context.build 339 + let result = blob.validate_data(data, schema, ctx) 340 + result |> should.be_error 341 + } 342 + 343 + // Test ref with invalid CID 344 + pub fn ref_invalid_cid_test() { 345 + let schema = json.object([#("type", json.string("blob"))]) 346 + 347 + let data = 348 + json.object([ 349 + #("$type", json.string("blob")), 350 + #("ref", json.object([#("$link", json.string("not-a-valid-cid"))])), 351 + #("mimeType", json.string("image/jpeg")), 352 + #("size", json.int(50_000)), 353 + ]) 354 + 355 + let assert Ok(ctx) = context.builder() |> context.build 356 
+ let result = blob.validate_data(data, schema, ctx) 357 + result |> should.be_error 358 + } 359 + 360 + // Test ref with dag-cbor CID (should fail - blobs need raw multicodec) 361 + pub fn ref_dag_cbor_cid_test() { 362 + let schema = json.object([#("type", json.string("blob"))]) 363 + 364 + let data = 365 + json.object([ 366 + #("$type", json.string("blob")), 367 + #( 368 + "ref", 369 + json.object([ 370 + #( 371 + "$link", 372 + json.string( 373 + "bafyreidfayvfuwqa7qlnopdjiqrxzs6blmoeu4rujcjtnci5beludirz2a", 374 + ), 375 + ), 376 + ]), 377 + ), 378 + #("mimeType", json.string("image/jpeg")), 379 + #("size", json.int(50_000)), 380 + ]) 381 + 382 + let assert Ok(ctx) = context.builder() |> context.build 383 + let result = blob.validate_data(data, schema, ctx) 384 + result |> should.be_error 385 + } 386 + 387 + // Test empty mimeType rejected 388 + pub fn empty_mime_type_test() { 389 + let schema = json.object([#("type", json.string("blob"))]) 390 + 391 + let data = 392 + json.object([ 393 + #("$type", json.string("blob")), 394 + #( 395 + "ref", 396 + json.object([ 397 + #( 398 + "$link", 399 + json.string( 400 + "bafkreigh2akiscaildcqabsyg3dfr6chu3fgpregiymsck7e7aqa4s52zy", 401 + ), 402 + ), 403 + ]), 404 + ), 405 + #("mimeType", json.string("")), 406 + #("size", json.int(50_000)), 407 + ]) 408 + 409 + let assert Ok(ctx) = context.builder() |> context.build 410 + let result = blob.validate_data(data, schema, ctx) 411 + result |> should.be_error 412 + } 413 + 414 + // Test size zero is allowed (per atproto implementation) 415 + pub fn size_zero_allowed_test() { 416 + let schema = json.object([#("type", json.string("blob"))]) 417 + 418 + let data = 419 + json.object([ 420 + #("$type", json.string("blob")), 421 + #( 422 + "ref", 423 + json.object([ 424 + #( 425 + "$link", 426 + json.string( 427 + "bafkreigh2akiscaildcqabsyg3dfr6chu3fgpregiymsck7e7aqa4s52zy", 428 + ), 429 + ), 430 + ]), 431 + ), 432 + #("mimeType", json.string("image/jpeg")), 433 + #("size", 
json.int(0)), 434 + ]) 435 + 436 + let assert Ok(ctx) = context.builder() |> context.build 437 + let result = blob.validate_data(data, schema, ctx) 438 + result |> should.be_ok 439 + } 440 + 441 + // Test negative size rejected 442 + pub fn negative_size_test() { 443 + let schema = json.object([#("type", json.string("blob"))]) 444 + 445 + let data = 446 + json.object([ 447 + #("$type", json.string("blob")), 448 + #( 449 + "ref", 450 + json.object([ 451 + #( 452 + "$link", 453 + json.string( 454 + "bafkreigh2akiscaildcqabsyg3dfr6chu3fgpregiymsck7e7aqa4s52zy", 455 + ), 456 + ), 457 + ]), 458 + ), 459 + #("mimeType", json.string("image/jpeg")), 460 + #("size", json.int(-100)), 461 + ]) 462 + 463 + let assert Ok(ctx) = context.builder() |> context.build 464 + let result = blob.validate_data(data, schema, ctx) 465 + result |> should.be_error 466 + } 467 + 468 + // Test extra fields are rejected (strict mode per atproto implementation) 469 + pub fn extra_fields_rejected_test() { 470 + let schema = json.object([#("type", json.string("blob"))]) 471 + 472 + let data = 473 + json.object([ 474 + #("$type", json.string("blob")), 475 + #( 476 + "ref", 477 + json.object([ 478 + #( 479 + "$link", 480 + json.string( 481 + "bafkreigh2akiscaildcqabsyg3dfr6chu3fgpregiymsck7e7aqa4s52zy", 482 + ), 483 + ), 484 + ]), 485 + ), 486 + #("mimeType", json.string("image/jpeg")), 487 + #("size", json.int(50_000)), 488 + #("extraField", json.string("not allowed")), 489 + ]) 156 490 157 491 let assert Ok(ctx) = context.builder() |> context.build 158 492 let result = blob.validate_data(data, schema, ctx)
+30
test/format_validator_test.gleam
··· 256 256 formats.is_valid_cid("") |> should.be_false 257 257 } 258 258 259 + // ========== RAW CID TESTS ========== 260 + 261 + // Test valid raw CID (bafkrei prefix = CIDv1 + raw multicodec 0x55) 262 + pub fn valid_raw_cid_test() { 263 + formats.is_valid_raw_cid( 264 + "bafkreigh2akiscaildcqabsyg3dfr6chu3fgpregiymsck7e7aqa4s52zy", 265 + ) 266 + |> should.be_true 267 + } 268 + 269 + // Test dag-cbor CID rejected (bafyrei prefix = CIDv1 + dag-cbor multicodec 0x71) 270 + pub fn invalid_raw_cid_dag_cbor_test() { 271 + formats.is_valid_raw_cid( 272 + "bafyreidfayvfuwqa7qlnopdjiqrxzs6blmoeu4rujcjtnci5beludirz2a", 273 + ) 274 + |> should.be_false 275 + } 276 + 277 + // Test CIDv0 rejected for raw CID 278 + pub fn invalid_raw_cid_v0_test() { 279 + formats.is_valid_raw_cid("QmbWqxBEKC3P8tqsKc98xmWNzrzDtRLMiMPL8wBuTGsMnR") 280 + |> should.be_false 281 + } 282 + 283 + // Test invalid CID rejected 284 + pub fn invalid_raw_cid_garbage_test() { 285 + formats.is_valid_raw_cid("not-a-cid") 286 + |> should.be_false 287 + } 288 + 259 289 // ========== LANGUAGE TESTS ========== 260 290 261 291 pub fn language_valid_test() {