rebuild data model: add data/lexicon module scaffolding (custom types, JSON serializer, format validators, exception hierarchy) plus architecture docs

+205
ARCHITECTURE_OVERVIEW.md
··· 1 + # ATProto Data 和 Lexicon 模块架构总览 2 + 3 + ## 项目概述 4 + 5 + 本项目为 ATProto (Authenticated Transfer Protocol) 提供 Python 实现,专注于数据模型和 Lexicon 定义的处理。基于现有的 URI 模块架构模式,提供类型安全的数据验证、序列化和 Lexicon 解析功能。 6 + 7 + ## 整体架构设计 8 + 9 + ### 1. 系统架构图 10 + 11 + ```mermaid 12 + graph TB 13 + subgraph "ATProto 核心模块" 14 + URI[URI 模块] 15 + Data[Data 模块] 16 + Lexicon[Lexicon 模块] 17 + end 18 + 19 + subgraph "外部依赖" 20 + Pydantic[Pydantic] 21 + CBOR[cbor2] 22 + CID[py-cid] 23 + end 24 + 25 + subgraph "数据流" 26 + LexiconJSON[Lexicon JSON 文件] 27 + RawData[原始数据] 28 + end 29 + 30 + LexiconJSON --> Lexicon 31 + Lexicon --> Data 32 + RawData --> Data 33 + Data --> Serialized[序列化数据] 34 + 35 + URI --> Data 36 + URI --> Lexicon 37 + 38 + Pydantic --> Data 39 + Pydantic --> Lexicon 40 + CBOR --> Data 41 + CID --> Data 42 + ``` 43 + 44 + ### 2. 模块职责划分 45 + 46 + #### 2.1 Data 模块 (`src/atpasser/data`) 47 + - **数据序列化**: JSON 和 DAG-CBOR 格式的序列化/反序列化 48 + - **数据验证**: 类型验证、格式验证、约束验证 49 + - **特殊类型处理**: CID 链接、Blob 引用、日期时间格式等 50 + - **错误处理**: 详细的验证错误和序列化错误 51 + 52 + #### 2.2 Lexicon 模块 (`src/atpasser/lexicon`) 53 + - **定义解析**: 解析 Lexicon JSON 定义文件 54 + - **模型生成**: 动态生成 Pydantic 模型类 55 + - **引用解析**: 处理跨定义引用和联合类型 56 + - **注册管理**: 模型注册表和缓存管理 57 + - **兼容性验证**: 前向和后向兼容性检查 58 + 59 + ### 3. 核心功能特性 60 + 61 + #### 3.1 类型安全 62 + - 基于 Pydantic 的强类型系统 63 + - 运行时类型验证 64 + - 自动类型转换和规范化 65 + 66 + #### 3.2 格式支持 67 + - **JSON**: 符合 ATProto JSON 编码规范 68 + - **DAG-CBOR**: 支持规范的 DAG-CBOR 编码 69 + - **混合格式**: 支持两种格式间的转换 70 + 71 + #### 3.3 验证系统 72 + - 语法验证 (基础数据类型) 73 + - 语义验证 (业务规则和约束) 74 + - 格式验证 (字符串格式如 datetime、uri、did 等) 75 + - 引用验证 (CID、blob、跨定义引用) 76 + 77 + ### 4. 
集成架构 78 + 79 + #### 4.1 与现有 URI 模块的集成 80 + 81 + ```python 82 + # 示例:URI 与 Data 模块的集成 83 + from atpasser.uri import URI, NSID 84 + from atpasser.data import ATProtoSerializer 85 + from atpasser.lexicon import LexiconRegistry 86 + 87 + # 解析 URI 88 + uri = URI("at://example.com/com.example.blog.post/123") 89 + 90 + # 根据 NSID 获取对应的数据模型 91 + model_class = LexiconRegistry.get_model(uri.collection.nsid) 92 + 93 + # 使用 Data 模块处理数据 94 + serializer = ATProtoSerializer() 95 + data = serializer.from_json(raw_data, model_class) 96 + ``` 97 + 98 + #### 4.2 数据流架构 99 + 100 + ``` 101 + 原始数据 → Data 模块验证 → Lexicon 模型转换 → 序列化输出 102 + Lexicon JSON → Lexicon 模块解析 → 生成 Pydantic 模型 → 注册到注册表 103 + ``` 104 + 105 + ### 5. 错误处理架构 106 + 107 + #### 5.1 统一的错误体系 108 + 109 + ```python 110 + class ATProtoError(Exception): 111 + """基础错误类""" 112 + pass 113 + 114 + class DataError(ATProtoError): 115 + """数据相关错误""" 116 + pass 117 + 118 + class LexiconError(ATProtoError): 119 + """Lexicon 相关错误""" 120 + pass 121 + 122 + class URIError(ATProtoError): 123 + """URI 相关错误""" 124 + pass 125 + ``` 126 + 127 + #### 5.2 错误诊断 128 + - **字段级错误定位**: 精确到具体字段的路径信息 129 + - **上下文信息**: 包含验证时的输入数据和期望格式 130 + - **建议修复**: 提供具体的修复建议 131 + 132 + ### 6. 性能优化策略 133 + 134 + #### 6.1 缓存机制 135 + - **模型缓存**: 缓存已解析的 Lexicon 模型 136 + - **序列化缓存**: 缓存序列化结果 137 + - **引用解析缓存**: 缓存跨定义引用解析结果 138 + 139 + #### 6.2 懒加载 140 + - 按需解析 Lexicon 定义 141 + - 延迟模型生成直到实际使用 142 + - 动态导入依赖模块 143 + 144 + ### 7. 扩展性设计 145 + 146 + #### 7.1 插件系统 147 + - 支持自定义类型处理器 148 + - 支持自定义验证规则 149 + - 支持自定义序列化格式 150 + 151 + #### 7.2 中间件支持 152 + - 预处理钩子 (数据清洗、转换) 153 + - 后处理钩子 (日志记录、监控) 154 + - 验证钩子 (自定义验证逻辑) 155 + 156 + ### 8. 
实施路线图 157 + 158 + #### 阶段 1: 基础实现 (2-3 周) 159 + - 实现 Data 模块基础类型和 JSON 序列化 160 + - 实现 Lexicon 模块基础解析器 161 + - 建立基本的错误处理系统 162 + 163 + #### 阶段 2: 完整功能 (3-4 周) 164 + - 添加 CBOR 序列化支持 165 + - 实现完整的验证系统 166 + - 添加引用解析和联合类型支持 167 + 168 + #### 阶段 3: 优化增强 (2 周) 169 + - 实现缓存和性能优化 170 + - 添加高级格式验证 171 + - 完善错误处理和诊断信息 172 + 173 + #### 阶段 4: 测试部署 (1-2 周) 174 + - 编写完整的测试套件 175 + - 性能测试和优化 176 + - 文档编写和示例代码 177 + 178 + ### 9. 依赖管理 179 + 180 + #### 9.1 核心依赖 181 + - `pydantic >=2.11.9`: 数据验证和模型定义 182 + - `cbor2 >=5.7.0`: CBOR 序列化支持 183 + - `py-cid >=0.3.0`: CID 处理支持 184 + 185 + #### 9.2 可选依赖 186 + - `jsonpath-ng >=1.7.0`: JSONPath 支持 187 + - `langcodes >=3.5.0`: 语言代码验证 188 + 189 + ### 10. 质量保证 190 + 191 + #### 10.1 测试策略 192 + - **单元测试**: 覆盖所有核心功能 193 + - **集成测试**: 测试模块间集成 194 + - **兼容性测试**: 确保与规范兼容 195 + - **性能测试**: 验证性能指标 196 + 197 + #### 10.2 代码质量 198 + - 类型注解覆盖率达到 100% 199 + - 测试覆盖率超过 90% 200 + - 遵循 PEP 8 编码规范 201 + - 详细的文档和示例 202 + 203 + ## 总结 204 + 205 + 本架构设计提供了一个完整、可扩展的 ATProto 数据处理解决方案,充分利用了 Python 的类型系统和现有生态,同时保持了与 ATProto 规范的完全兼容性。模块化的设计使得各个组件可以独立开发和测试,同时也便于未来的扩展和维护。
+119
examples/basic_usage.py
"""Basic usage examples for ATProto data and lexicon modules."""

import json
from atpasser.data import serializer, CIDLink, DateTimeString
from atpasser.lexicon import parser, registry


def demonstrate_data_serialization():
    """Round-trip a sample record through the global JSON serializer."""
    print("=== Data Serialization Demo ===")

    # A record mixing plain values with an ATProto-specific CID link.
    post = {
        "title": "Hello ATProto",
        "content": "This is a test post",
        "createdAt": "2024-01-15T10:30:00.000Z",
        "tags": ["atproto", "test", "demo"],
        "cidLink": CIDLink(
            "bafyreidfayvfuwqa7qlnopdjiqrxzs6blmoeu4rujcjtnci5beludirz2a"
        ),
    }

    # Encode to ATProto JSON, then decode it straight back.
    encoded = serializer.to_json(post, indent=2)
    print("JSON Output:")
    print(encoded)

    round_tripped = serializer.from_json(encoded)
    print("\nDeserialized:")
    print(round_tripped)

    print()


def demonstrate_lexicon_parsing():
    """Parse an inline Lexicon document and validate data against it."""
    print("=== Lexicon Parsing Demo ===")

    # Minimal inline Lexicon describing a blog-post record.
    lexicon_doc = {
        "lexicon": 1,
        "id": "com.example.blog.post",
        "description": "A simple blog post record",
        "defs": {
            "main": {
                "type": "record",
                "key": "literal:post",
                "record": {
                    "type": "object",
                    "properties": {
                        "title": {"type": "string", "maxLength": 300},
                        "content": {"type": "string"},
                        "createdAt": {"type": "string", "format": "datetime"},
                        "tags": {
                            "type": "array",
                            "items": {"type": "string"},
                            "maxLength": 10,
                        },
                    },
                    "required": ["title", "content", "createdAt"],
                },
            }
        },
    }

    try:
        # Register the definition, then look up the generated model by NSID.
        parser.parse_and_register(lexicon_doc)
        print("Lexicon parsed and registered successfully!")

        model_cls = registry.get_model("com.example.blog.post")
        if model_cls:
            print(f"Generated model: {model_cls.__name__}")

            # Validate a concrete record through the generated model.
            payload = {
                "title": "Test Post",
                "content": "This is a test post content",
                "createdAt": "2024-01-15T10:30:00.000Z",
                "tags": ["test", "demo"],
            }
            record = model_cls(**payload)
            print(f"Validated post: {record.model_dump()}")

    except Exception as e:
        print(f"Error: {e}")

    print()


def demonstrate_custom_types():
    """Exercise DateTimeString construction on good and bad input."""
    print("=== Custom Type Validation Demo ===")

    try:
        stamp = DateTimeString("2024-01-15T10:30:00.000Z")
        print(f"Valid datetime: {stamp}")
    except Exception as e:
        print(f"DateTime validation error: {e}")

    # NOTE(review): str subclasses do not validate in their constructor
    # (validation lives in DateTimeString.validate), so this except branch
    # may never trigger -- confirm against atpasser.data.types.
    try:
        bogus = DateTimeString("invalid-date")
        print(f"Invalid datetime: {bogus}")
    except Exception as e:
        print(f"DateTime validation caught: {e}")

    print()


if __name__ == "__main__":
    demonstrate_data_serialization()
    demonstrate_lexicon_parsing()
    demonstrate_custom_types()
    print("Demo completed!")
+11
src/atpasser/__init__.py
··· 1 + """ATProto Python implementation - Tools for Authenticated Transfer Protocol.""" 2 + 3 + from . import uri 4 + from . import data 5 + from . import lexicon 6 + 7 + __all__ = ["uri", "data", "lexicon"] 8 + 9 + __version__ = "0.1.0" 10 + __author__ = "diaowinner" 11 + __email__ = "diaowinner@qq.com"
+215
src/atpasser/data/ARCHITECTURE.md
··· 1 + # ATProto 数据模型模块架构设计 2 + 3 + ## 概述 4 + 5 + 本模块负责实现 ATProto 数据模型的序列化、反序列化和验证功能,支持 JSON 和 DAG-CBOR 两种格式的数据编码。 6 + 7 + ## 核心架构设计 8 + 9 + ### 1. 基础类型系统 10 + 11 + #### 1.1 基础类型映射 12 + 13 + ```python 14 + # 基础类型映射 15 + DATA_MODEL_TYPE_MAPPING = { 16 + "null": NoneType, 17 + "boolean": bool, 18 + "integer": int, 19 + "string": str, 20 + "bytes": bytes, 21 + "cid-link": CIDLink, 22 + "blob": BlobRef, 23 + "array": list, 24 + "object": dict 25 + } 26 + ``` 27 + 28 + #### 1.2 自定义字段类型 29 + 30 + - **CIDLink**: 处理 CID 链接,支持二进制和字符串表示 31 + - **BlobRef**: 处理 blob 引用,支持新旧格式兼容 32 + - **DateTimeString**: RFC 3339 日期时间格式验证 33 + - **LanguageTag**: BCP 47 语言标签验证 34 + 35 + ### 2. 序列化器架构 36 + 37 + #### 2.1 序列化器层级结构 38 + 39 + ``` 40 + ATProtoSerializer 41 + ├── JSONSerializer 42 + │ ├── Normalizer 43 + │ └── Denormalizer 44 + └── CBORSerializer 45 + ├── DAGCBOREncoder 46 + └── DAGCBORDecoder 47 + ``` 48 + 49 + #### 2.2 序列化流程 50 + 51 + 1. **数据验证**: 使用 Pydantic 模型验证数据 52 + 2. **格式转换**: 特殊类型转换(CID、bytes 等) 53 + 3. **编码**: 根据目标格式进行编码 54 + 4. **规范化**: 确保输出符合 ATProto 规范 55 + 56 + ### 3. 验证系统 57 + 58 + #### 3.1 验证层级 59 + 60 + 1. **语法验证**: 基础数据类型验证 61 + 2. **格式验证**: 字符串格式验证(datetime、uri、did 等) 62 + 3. **约束验证**: 长度、范围、枚举等约束验证 63 + 4. **引用验证**: CID 和 blob 引用有效性验证 64 + 65 + #### 3.2 自定义验证器 66 + 67 + ```python 68 + class DataModelValidator: 69 + def validate_cid(self, value: str) -> bool: 70 + """验证 CID 格式""" 71 + pass 72 + 73 + def validate_datetime(self, value: str) -> bool: 74 + """验证 RFC 3339 datetime 格式""" 75 + pass 76 + 77 + def validate_did(self, value: str) -> bool: 78 + """验证 DID 格式""" 79 + pass 80 + 81 + def validate_handle(self, value: str) -> bool: 82 + """验证 handle 格式""" 83 + pass 84 + 85 + def validate_nsid(self, value: str) -> bool: 86 + """验证 NSID 格式""" 87 + pass 88 + ``` 89 + 90 + ### 4. 
特殊类型处理 91 + 92 + #### 4.1 CID 链接处理 93 + 94 + ```python 95 + class CIDLink: 96 + """处理 CID 链接类型""" 97 + 98 + def __init__(self, cid: Union[str, bytes]): 99 + self.cid = cid 100 + 101 + def to_json(self) -> dict: 102 + """序列化为 JSON 格式: {"$link": "cid-string"}""" 103 + return {"$link": str(self.cid)} 104 + 105 + def to_cbor(self) -> bytes: 106 + """序列化为 DAG-CBOR 格式""" 107 + pass 108 + ``` 109 + 110 + #### 4.2 Blob 引用处理 111 + 112 + ```python 113 + class BlobRef: 114 + """处理 blob 引用,支持新旧格式""" 115 + 116 + def __init__(self, ref: CIDLink, mime_type: str, size: int): 117 + self.ref = ref 118 + self.mime_type = mime_type 119 + self.size = size 120 + 121 + def to_json(self) -> dict: 122 + """序列化为 JSON 格式""" 123 + return { 124 + "$type": "blob", 125 + "ref": self.ref.to_json(), 126 + "mimeType": self.mime_type, 127 + "size": self.size 128 + } 129 + 130 + @classmethod 131 + def from_legacy(cls, data: dict): 132 + """从旧格式解析""" 133 + pass 134 + ``` 135 + 136 + ### 5. 错误处理系统 137 + 138 + #### 5.1 错误类型体系 139 + 140 + ```python 141 + class DataModelError(Exception): 142 + """基础数据模型错误""" 143 + pass 144 + 145 + class SerializationError(DataModelError): 146 + """序列化错误""" 147 + pass 148 + 149 + class ValidationError(DataModelError): 150 + """验证错误""" 151 + pass 152 + 153 + class FormatError(DataModelError): 154 + """格式错误""" 155 + pass 156 + ``` 157 + 158 + #### 5.2 错误消息格式 159 + 160 + - **详细路径信息**: 包含字段路径 161 + - **期望值描述**: 明确的期望格式说明 162 + - **上下文信息**: 验证时的上下文数据 163 + 164 + ### 6. 模块文件结构 165 + 166 + ``` 167 + src/atpasser/data/ 168 + ├── __init__.py # 模块导出 169 + ├── ARCHITECTURE.md # 架构文档 170 + ├── types.py # 基础类型定义 171 + ├── serializer.py # 序列化器实现 172 + ├── validator.py # 验证器实现 173 + ├── exceptions.py # 异常定义 174 + ├── cid.py # CID 链接处理 175 + ├── blob.py # Blob 引用处理 176 + └── formats.py # 格式验证器 177 + ``` 178 + 179 + ### 7. 
依赖关系 180 + 181 + - **内部依赖**: `src/atpasser/uri` (NSID、DID、Handle 验证) 182 + - **外部依赖**: 183 + - `pydantic`: 数据验证 184 + - `cbor2`: CBOR 序列化 185 + - `py-cid`: CID 处理 186 + 187 + ## 实现策略 188 + 189 + ### 1. 渐进式实现 190 + 191 + 1. **阶段一**: 实现基础类型和 JSON 序列化 192 + 2. **阶段二**: 添加 CBOR 序列化和验证器 193 + 3. **阶段三**: 实现高级格式验证 194 + 4. **阶段四**: 性能优化和内存管理 195 + 196 + ### 2. 测试策略 197 + 198 + - **单元测试**: 测试各个组件功能 199 + - **集成测试**: 测试端到端数据流 200 + - **兼容性测试**: 确保与现有实现兼容 201 + - **性能测试**: 验证序列化性能 202 + 203 + ### 3. 扩展性考虑 204 + 205 + - **插件系统**: 支持自定义格式验证 206 + - **中间件**: 支持预处理和后处理钩子 207 + - **缓存**: 序列化结果缓存优化 208 + 209 + ## 优势 210 + 211 + 1. **类型安全**: 基于 Pydantic 的强类型系统 212 + 2. **性能**: 优化的序列化实现 213 + 3. **兼容性**: 支持新旧格式兼容 214 + 4. **可扩展**: 模块化设计支持未来扩展 215 + 5. **错误友好**: 详细的错误消息和诊断信息
+47
src/atpasser/data/__init__.py
··· 1 + """ATProto data model module for serialization and validation.""" 2 + 3 + from .exceptions import ( 4 + DataModelError, 5 + SerializationError, 6 + ValidationError, 7 + FormatError, 8 + CIDError, 9 + BlobError, 10 + ) 11 + 12 + from .types import ( 13 + CIDLink, 14 + DateTimeString, 15 + LanguageTag, 16 + ATUri, 17 + DIDString, 18 + HandleString, 19 + NSIDString, 20 + ) 21 + 22 + from .formats import format_validator, FormatValidator 23 + from .serializer import ATProtoSerializer, serializer 24 + 25 + __all__ = [ 26 + # Exceptions 27 + "DataModelError", 28 + "SerializationError", 29 + "ValidationError", 30 + "FormatError", 31 + "CIDError", 32 + "BlobError", 33 + # Types 34 + "CIDLink", 35 + "DateTimeString", 36 + "LanguageTag", 37 + "ATUri", 38 + "DIDString", 39 + "HandleString", 40 + "NSIDString", 41 + # Validators 42 + "format_validator", 43 + "FormatValidator", 44 + # Serializers 45 + "ATProtoSerializer", 46 + "serializer", 47 + ]
+87
src/atpasser/data/exceptions.py
"""Exceptions for ATProto data model module."""

from typing import Optional


class DataModelError(Exception):
    """Base exception for ATProto data model errors.

    Attributes:
        message: Human-readable error description; also ``str(exc)``.
        details: Optional assembled context string (field paths, offending
            values, ...), or ``None`` when there is no extra context.
    """

    def __init__(self, message: str, details: Optional[str] = None):
        self.message = message
        self.details = details
        super().__init__(message)


class SerializationError(DataModelError):
    """Raised when serialization or deserialization fails."""

    def __init__(self, message: str, details: Optional[str] = None):
        super().__init__(f"Serialization error: {message}", details)


class ValidationError(DataModelError):
    """Raised when data validation fails.

    Args:
        message: Description of the failure.
        field_path: Dotted path of the offending field, if known.
        expected: Description of the expected value or format.
        actual: Description of the value actually seen.
    """

    def __init__(
        self,
        message: str,
        field_path: Optional[str] = None,
        expected: Optional[str] = None,
        actual: Optional[str] = None,
    ):
        # snake_case attributes are canonical; the camelCase names are kept
        # as backward-compatible aliases for existing callers.
        self.field_path = field_path
        self.fieldPath = field_path
        self.expected = expected
        self.actual = actual

        # Compare against None (not truthiness) so that empty strings --
        # often exactly the value that failed validation -- still appear
        # in the diagnostics.
        details = []
        if field_path is not None:
            details.append(f"Field: {field_path}")
        if expected is not None:
            details.append(f"Expected: {expected}")
        if actual is not None:
            details.append(f"Actual: {actual}")

        super().__init__(
            f"Validation error: {message}", "; ".join(details) if details else None
        )


class FormatError(DataModelError):
    """Raised when string format validation fails.

    Args:
        message: Description of the failure.
        format_type: Name of the format being checked (e.g. ``"datetime"``).
        value: The offending input value.
    """

    def __init__(
        self,
        message: str,
        format_type: Optional[str] = None,
        value: Optional[str] = None,
    ):
        self.format_type = format_type
        self.formatType = format_type  # backward-compatible alias
        self.value = value

        details = []
        if format_type is not None:
            details.append(f"Format: {format_type}")
        if value is not None:  # keep empty-string values visible
            details.append(f"Value: {value}")

        super().__init__(
            f"Format error: {message}", "; ".join(details) if details else None
        )


class CIDError(DataModelError):
    """Raised when CID processing fails."""

    def __init__(self, message: str, cid: Optional[str] = None):
        self.cid = cid
        super().__init__(
            f"CID error: {message}", f"CID: {cid}" if cid is not None else None
        )


class BlobError(DataModelError):
    """Raised when blob processing fails."""

    def __init__(self, message: str, blob_ref: Optional[str] = None):
        self.blob_ref = blob_ref
        self.blobRef = blob_ref  # backward-compatible alias
        super().__init__(
            f"Blob error: {message}",
            f"Blob ref: {blob_ref}" if blob_ref is not None else None,
        )
+190
src/atpasser/data/formats.py
"""Format validators for ATProto data model."""

import re
from typing import Any, Optional
from .exceptions import FormatError


class FormatValidator:
    """Validates string formats according to ATProto specifications.

    Every ``validate_*`` method returns the input value unchanged when it is
    valid and raises :class:`FormatError` otherwise, so the methods work both
    as checks and as pass-through normalizers.
    """

    # Patterns are compiled once at class-definition time instead of on
    # every call.
    _DATETIME_RE = re.compile(
        r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})$"
    )
    _DID_RE = re.compile(r"^did:[a-z]+:[a-zA-Z0-9._:%-]*[a-zA-Z0-9._-]$")
    _DNS_LABEL_RE = re.compile(r"^[a-z0-9-]+$")
    _NSID_NAME_RE = re.compile(r"^[a-zA-Z0-9]+$")
    _URI_RE = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.-]*:.*$")
    _CID_RE = re.compile(r"^[a-zA-Z0-9]+$")
    _LANG_RE = re.compile(r"^[a-zA-Z]{1,8}(?:-[a-zA-Z0-9]{1,8})*$")

    @staticmethod
    def validate_datetime(value: str) -> str:
        """Validate RFC 3339 datetime format.

        Raises:
            FormatError: If *value* is not a valid RFC 3339 timestamp.
        """
        if not FormatValidator._DATETIME_RE.match(value):
            raise FormatError("Invalid RFC 3339 datetime format", "datetime", value)

        # Semantic checks beyond the regex (the regex accepts e.g. month 13).
        try:
            date_part, _ = value.split("T", 1)
            year, month, day = map(int, date_part.split("-"))

            if not (1 <= month <= 12):
                raise FormatError("Month must be between 01 and 12", "datetime", value)
            if not (1 <= day <= 31):
                raise FormatError("Day must be between 01 and 31", "datetime", value)
            if year < 0:  # defensive: the regex already forbids a sign here
                raise FormatError("Year must be positive", "datetime", value)

        except ValueError:
            # int() / unpacking failures; chain suppressed -- the FormatError
            # already carries the offending value.
            raise FormatError("Invalid datetime structure", "datetime", value) from None

        return value

    @staticmethod
    def validate_did(value: str) -> str:
        """Validate DID format (``did:<method>:<identifier>``)."""
        if not FormatValidator._DID_RE.match(value):
            raise FormatError("Invalid DID format", "did", value)

        if len(value) > 2048:
            raise FormatError("DID too long", "did", value)

        return value

    @staticmethod
    def validate_handle(value: str) -> str:
        """Validate handle format (a DNS-style dotted hostname)."""
        if len(value) > 253:
            raise FormatError("Handle too long", "handle", value)

        labels = value.lower().split(".")
        if len(labels) < 2:
            raise FormatError("Handle must contain at least one dot", "handle", value)

        for i, label in enumerate(labels):
            if not (1 <= len(label) <= 63):
                raise FormatError(
                    f"Label {i+1} length must be 1-63 characters", "handle", value
                )

            if not FormatValidator._DNS_LABEL_RE.match(label):
                raise FormatError(
                    f"Label {i+1} contains invalid characters", "handle", value
                )

            if label.startswith("-") or label.endswith("-"):
                raise FormatError(
                    f"Label {i+1} cannot start or end with hyphen", "handle", value
                )

        if labels[-1][0].isdigit():
            raise FormatError("TLD cannot start with digit", "handle", value)

        return value

    @staticmethod
    def validate_nsid(value: str) -> str:
        """Validate NSID format (reversed domain authority + name segment)."""
        if len(value) > 317:
            raise FormatError("NSID too long", "nsid", value)

        if not all(ord(c) < 128 for c in value):
            raise FormatError("NSID must contain only ASCII characters", "nsid", value)

        if value.startswith(".") or value.endswith("."):
            raise FormatError("NSID cannot start or end with dot", "nsid", value)

        segments = value.split(".")
        if len(segments) < 3:
            raise FormatError("NSID must have at least 3 segments", "nsid", value)

        # Validate domain authority segments (everything but the last).
        for i, segment in enumerate(segments[:-1]):
            if not (1 <= len(segment) <= 63):
                raise FormatError(
                    f"Domain segment {i+1} length must be 1-63", "nsid", value
                )

            if not FormatValidator._DNS_LABEL_RE.match(segment):
                raise FormatError(
                    f"Domain segment {i+1} contains invalid chars", "nsid", value
                )

            if segment.startswith("-") or segment.endswith("-"):
                raise FormatError(
                    f"Domain segment {i+1} cannot start/end with hyphen", "nsid", value
                )

        # Validate the trailing name segment.
        name = segments[-1]
        if not (1 <= len(name) <= 63):
            raise FormatError("Name segment length must be 1-63", "nsid", value)

        if not FormatValidator._NSID_NAME_RE.match(name):
            raise FormatError("Name segment contains invalid characters", "nsid", value)

        if name[0].isdigit():
            raise FormatError("Name segment cannot start with digit", "nsid", value)

        return value

    @staticmethod
    def validate_uri(value: str) -> str:
        """Validate generic URI format (scheme + rest, max 8 KB)."""
        if len(value) > 8192:  # 8 KB limit
            raise FormatError("URI too long", "uri", value)

        if not FormatValidator._URI_RE.match(value):
            raise FormatError("Invalid URI format", "uri", value)

        return value

    @staticmethod
    def validate_cid(value: str) -> str:
        """Validate CID format (simplified alphanumeric check)."""
        if not FormatValidator._CID_RE.match(value):
            raise FormatError("Invalid CID format", "cid", value)

        return value

    @staticmethod
    def validate_at_identifier(value: str) -> str:
        """Validate at-identifier format: a DID or, failing that, a handle."""
        try:
            return FormatValidator.validate_did(value)
        except FormatError:
            try:
                return FormatValidator.validate_handle(value)
            except FormatError:
                # Suppress the chain: neither intermediate error is the cause.
                raise FormatError(
                    "Invalid at-identifier (not a DID or handle)",
                    "at-identifier",
                    value,
                ) from None

    @staticmethod
    def validate_at_uri(value: str) -> str:
        """Validate at-uri format (scheme prefix only for now)."""
        if not value.startswith("at://"):
            raise FormatError("AT URI must start with 'at://'", "at-uri", value)

        # Additional validation can be added here
        return value

    @staticmethod
    def validate_language(value: str) -> str:
        """Validate BCP 47 language tag format (pattern check only)."""
        if not FormatValidator._LANG_RE.match(value):
            raise FormatError("Invalid language tag format", "language", value)

        return value


# Global validator instance
format_validator = FormatValidator()
+125
src/atpasser/data/serializer.py
"""Serializer for ATProto data model formats."""

import json
import base64
from typing import Any, Dict, Type, Union, Optional
from pydantic import BaseModel
from .exceptions import SerializationError, ValidationError
from .types import CIDLink


class ATProtoSerializer:
    """Serializer for ATProto JSON and CBOR formats.

    JSON output follows ATProto conventions (``$link`` / ``$bytes`` wrappers).
    The CBOR methods are placeholders that currently round-trip through JSON
    rather than emitting real DAG-CBOR.
    """

    def __init__(self):
        self.json_encoder = JSONEncoder()
        self.json_decoder = JSONDecoder()

    def to_json(self, obj: Any, indent: Optional[int] = None) -> str:
        """Serialize *obj* (or a pydantic model) to an ATProto JSON string.

        Raises:
            SerializationError: If encoding fails.
        """
        try:
            if isinstance(obj, BaseModel):
                obj = obj.model_dump(mode="json")

            serialized = self.json_encoder.encode(obj)
            return json.dumps(serialized, indent=indent, ensure_ascii=False)
        except SerializationError:
            raise  # already specific -- don't double-wrap the message
        except Exception as e:
            raise SerializationError(f"JSON serialization failed: {str(e)}") from e

    def from_json(
        self, data: Union[str, bytes, dict], model: Optional[Type[BaseModel]] = None
    ) -> Any:
        """Deserialize ATProto JSON, optionally validating with *model*.

        Raises:
            SerializationError: If decoding (or model validation) fails.
        """
        try:
            if isinstance(data, (str, bytes)):
                data = json.loads(data)

            decoded = self.json_decoder.decode(data)

            if model and issubclass(model, BaseModel):
                return model.model_validate(decoded)
            return decoded
        except SerializationError:
            raise  # e.g. _decode_bytes failures -- keep the original message
        except Exception as e:
            raise SerializationError(f"JSON deserialization failed: {str(e)}") from e

    def to_cbor(self, obj: Any) -> bytes:
        """Serialize to DAG-CBOR (placeholder: UTF-8 JSON bytes for now)."""
        try:
            # Placeholder -- a real DAG-CBOR encoder (cbor2) goes here.
            json_str = self.to_json(obj)
            return json_str.encode("utf-8")
        except SerializationError:
            raise
        except Exception as e:
            raise SerializationError(f"CBOR serialization failed: {str(e)}") from e

    def from_cbor(self, data: bytes, model: Optional[Type[BaseModel]] = None) -> Any:
        """Deserialize from DAG-CBOR (placeholder: UTF-8 JSON bytes)."""
        try:
            # Placeholder -- a real DAG-CBOR decoder (cbor2) goes here.
            json_str = data.decode("utf-8")
            return self.from_json(json_str, model)
        except SerializationError:
            raise
        except Exception as e:
            raise SerializationError(f"CBOR deserialization failed: {str(e)}") from e


class JSONEncoder:
    """Encodes Python objects to ATProto JSON format."""

    def encode(self, obj: Any) -> Any:
        """Recursively encode *obj*, wrapping CID links and raw bytes."""
        if isinstance(obj, dict):
            return {k: self.encode(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [self.encode(item) for item in obj]
        elif isinstance(obj, CIDLink):
            return obj.to_json()
        elif isinstance(obj, bytes):
            return self._encode_bytes(obj)
        else:
            return obj

    def _encode_bytes(self, data: bytes) -> Dict[str, str]:
        """Encode raw bytes as the ATProto ``{"$bytes": base64}`` wrapper."""
        return {"$bytes": base64.b64encode(data).decode("ascii")}


class JSONDecoder:
    """Decodes ATProto JSON format to Python objects."""

    def decode(self, obj: Any) -> Any:
        """Recursively decode ATProto JSON into Python objects."""
        if isinstance(obj, dict):
            return self._decode_object(obj)
        elif isinstance(obj, list):
            return [self.decode(item) for item in obj]
        else:
            return obj

    def _decode_object(self, obj: Dict[str, Any]) -> Any:
        """Decode a JSON object, unwrapping ``$link`` / ``$bytes`` sentinels."""
        if len(obj) == 1:
            key = next(iter(obj.keys()))
            value = obj[key]

            if key == "$link" and isinstance(value, str):
                return CIDLink(value)
            elif key == "$bytes" and isinstance(value, str):
                return self._decode_bytes(value)
            elif key == "$type" and value == "blob":
                # NOTE(review): real blob objects carry $type plus ref/
                # mimeType/size keys, so this single-key branch never matches
                # them -- blobs currently fall through to the generic
                # recursion below. Confirm where the blob-specific decoder
                # should hook in.
                return obj

        # Regular object - decode values recursively.
        return {k: self.decode(v) for k, v in obj.items()}

    def _decode_bytes(self, value: str) -> bytes:
        """Decode the ATProto ``$bytes`` base64 payload."""
        try:
            return base64.b64decode(value)
        except Exception as e:
            raise SerializationError(f"Invalid base64 encoding: {str(e)}") from e


# Global serializer instance
serializer = ATProtoSerializer()
+179
src/atpasser/data/types.py
"""Base types for ATProto data model."""

from typing import Any, Union, Optional, TypeVar
from datetime import datetime
import re
import base64
from pydantic import BaseModel, Field, validator
from .exceptions import ValidationError, FormatError

# NOTE(review): T, BaseModel, Field and validator are imported but unused in
# this module -- candidates for cleanup once the module stabilizes.
T = TypeVar("T")


class CIDLink:
    """Represents a CID link in ATProto data model.

    Wraps either a ready-made CID string or raw bytes.
    NOTE(review): defining __eq__ without __hash__ makes instances
    unhashable (they cannot be dict keys or set members) -- confirm that
    is intended.
    """

    def __init__(self, cid: Union[str, bytes]):
        if isinstance(cid, bytes):
            # Convert bytes to string representation
            # This is a simplified implementation
            # NOTE(review): this does NOT produce a real CID -- it is a
            # placeholder encoding; replace with py-cid when wired up.
            self.cid = f"bafy{base64.b64encode(cid).decode()[:44]}"
        else:
            self.cid = cid

    def __str__(self) -> str:
        return self.cid

    def __eq__(self, other: Any) -> bool:
        # Equal to another CIDLink with the same CID, or to the bare CID
        # string itself; anything else compares unequal.
        if isinstance(other, CIDLink):
            return self.cid == other.cid
        elif isinstance(other, str):
            return self.cid == other
        return False

    def to_json(self) -> dict:
        """Convert to JSON representation: {"$link": "<cid>"}."""
        return {"$link": self.cid}

    @classmethod
    def from_json(cls, data: dict) -> "CIDLink":
        """Create from JSON representation ({"$link": "<cid>"}).

        Raises:
            ValidationError: If *data* is not a dict with a "$link" key.
        """
        if not isinstance(data, dict) or "$link" not in data:
            raise ValidationError(
                "Invalid CID link format", expected="{'$link': 'cid_string'}"
            )
        return cls(data["$link"])


class DateTimeString(str):
    """RFC 3339 datetime string with validation.

    Validation runs through :meth:`validate`; constructing the class
    directly (``DateTimeString("x")``) performs no checks, like any
    ``str`` subclass.
    """

    @classmethod
    def __get_validators__(cls):
        # NOTE(review): __get_validators__ is the pydantic v1 hook; pydantic
        # v2 (which this project targets) uses __get_pydantic_core_schema__
        # instead -- confirm these types are actually validated when used
        # inside v2 models.
        yield cls.validate

    @classmethod
    def validate(cls, v: Any) -> "DateTimeString":
        # Returns v wrapped as DateTimeString, or raises
        # ValidationError/FormatError.
        if not isinstance(v, str):
            raise ValidationError("Must be a string", actual=type(v).__name__)

        # RFC 3339 pattern validation
        pattern = (
            r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})$"
        )
        if not re.match(pattern, v):
            raise FormatError("Invalid RFC 3339 datetime format", "datetime", v)

        # Additional semantic validation
        try:
            # Try to parse to ensure it's a valid datetime
            datetime.fromisoformat(v.replace("Z", "+00:00").replace("z", "+00:00"))
        except ValueError:
            raise FormatError("Invalid datetime value", "datetime", v)

        return cls(v)


class LanguageTag(str):
    """BCP 47 language tag with validation.

    Same caveats as DateTimeString: the constructor does not validate and
    __get_validators__ is the pydantic v1 hook.
    """

    @classmethod
    def __get_validators__(cls):
        yield cls.validate

    @classmethod
    def validate(cls, v: Any) -> "LanguageTag":
        if not isinstance(v, str):
            raise ValidationError("Must be a string", actual=type(v).__name__)

        # Basic BCP 47 pattern validation
        pattern = r"^[a-zA-Z]{1,8}(?:-[a-zA-Z0-9]{1,8})*$"
        if not re.match(pattern, v):
            raise FormatError("Invalid BCP 47 language tag format", "language", v)

        return cls(v)


class ATUri(str):
    """AT Protocol URI with validation (scheme-prefix check only).

    Same caveats as DateTimeString regarding constructor and pydantic hook.
    """

    @classmethod
    def __get_validators__(cls):
        yield cls.validate

    @classmethod
    def validate(cls, v: Any) -> "ATUri":
        if not isinstance(v, str):
            raise ValidationError("Must be a string", actual=type(v).__name__)

        # Basic AT URI validation
        if not v.startswith("at://"):
            raise FormatError("AT URI must start with 'at://'", "at-uri", v)

        # Additional validation can be added here
        return cls(v)


class DIDString(str):
    """DID string with validation.

    Same caveats as DateTimeString regarding constructor and pydantic hook.
    """

    @classmethod
    def __get_validators__(cls):
        yield cls.validate

    @classmethod
    def validate(cls, v: Any) -> "DIDString":
        if not isinstance(v, str):
            raise ValidationError("Must be a string", actual=type(v).__name__)

        # Basic DID format validation
        pattern = r"^did:[a-z]+:[a-zA-Z0-9._:%-]*[a-zA-Z0-9._-]$"
        if not re.match(pattern, v):
            raise FormatError("Invalid DID format", "did", v)

        return cls(v)


class HandleString(str):
    """Handle string with validation (length and dot-count checks only;
    full label validation lives in formats.FormatValidator.validate_handle).

    Same caveats as DateTimeString regarding constructor and pydantic hook.
    """

    @classmethod
    def __get_validators__(cls):
        yield cls.validate

    @classmethod
    def validate(cls, v: Any) -> "HandleString":
        if not isinstance(v, str):
            raise ValidationError("Must be a string", actual=type(v).__name__)

        # Basic handle validation
        if len(v) > 253:
            raise FormatError("Handle too long", "handle", v)

        labels = v.lower().split(".")
        if len(labels) < 2:
            raise FormatError("Handle must contain at least one dot", "handle", v)

        return cls(v)


class NSIDString(str):
    """NSID string with validation (length and ASCII checks only; full
    segment validation lives in formats.FormatValidator.validate_nsid).

    Same caveats as DateTimeString regarding constructor and pydantic hook.
    """

    @classmethod
    def __get_validators__(cls):
        yield cls.validate

    @classmethod
    def validate(cls, v: Any) -> "NSIDString":
        if not isinstance(v, str):
            raise ValidationError("Must be a string", actual=type(v).__name__)

        # Basic NSID validation
        if len(v) > 317:
            raise FormatError("NSID too long", "nsid", v)

        if not all(ord(c) < 128 for c in v):
            raise FormatError("NSID must contain only ASCII characters", "nsid", v)

        return cls(v)
+263
src/atpasser/lexicon/ARCHITECTURE.md
··· 1 + # ATProto Lexicon 模块架构设计 2 + 3 + ## 概述 4 + 5 + 本模块负责解析、验证和管理 ATProto Lexicon 定义文件,将 JSON Schema 转换为可执行的 Pydantic 模型,并提供类型安全的接口。 6 + 7 + ## 核心架构设计 8 + 9 + ### 1. Lexicon 解析系统 10 + 11 + #### 1.1 解析器层级结构 12 + 13 + ``` 14 + LexiconParser 15 + ├── DefinitionParser 16 + │ ├── PrimaryDefinitionParser 17 + │ │ ├── RecordParser 18 + │ │ ├── QueryParser 19 + │ │ ├── ProcedureParser 20 + │ │ └── SubscriptionParser 21 + │ └── FieldDefinitionParser 22 + │ ├── SimpleTypeParser 23 + │ ├── CompoundTypeParser 24 + │ └── MetaTypeParser 25 + └── Validator 26 + ├── SchemaValidator 27 + └── CrossReferenceValidator 28 + ``` 29 + 30 + #### 1.2 解析流程 31 + 32 + 1. **加载 Lexicon JSON**: 读取并验证 Lexicon 文件结构 33 + 2. **解析定义**: 根据类型分发到相应的解析器 34 + 3. **构建模型**: 生成对应的 Pydantic 模型类 35 + 4. **验证引用**: 检查跨定义引用的有效性 36 + 5. **注册模型**: 将模型注册到全局注册表 37 + 38 + ### 2. 类型映射系统 39 + 40 + #### 2.1 Lexicon 类型到 Python 类型映射 41 + 42 + ```python 43 + LEXICON_TYPE_MAPPING = { 44 + "null": None, 45 + "boolean": bool, 46 + "integer": int, 47 + "string": str, 48 + "bytes": bytes, 49 + "cid-link": "CIDLink", 50 + "blob": "BlobRef", 51 + "array": list, 52 + "object": dict, 53 + "params": dict, 54 + "token": "LexiconToken", 55 + "ref": "LexiconRef", 56 + "union": "LexiconUnion", 57 + "unknown": Any, 58 + "record": "RecordModel", 59 + "query": "QueryModel", 60 + "procedure": "ProcedureModel", 61 + "subscription": "SubscriptionModel" 62 + } 63 + ``` 64 + 65 + #### 2.2 自定义类型处理器 66 + 67 + - **LexiconRef**: 处理跨定义引用解析 68 + - **LexiconUnion**: 处理联合类型验证 69 + - **LexiconToken**: 处理符号化值 70 + - **RecordModel**: 记录类型基类 71 + - **QueryModel**: 查询类型基类 72 + 73 + ### 3. 
模型生成系统 74 + 75 + #### 3.1 动态模型生成 76 + 77 + ```python 78 + class ModelGenerator: 79 + """动态生成 Pydantic 模型""" 80 + 81 + def generate_record_model(self, definition: dict) -> Type[BaseModel]: 82 + """生成记录模型""" 83 + pass 84 + 85 + def generate_query_model(self, definition: dict) -> Type[BaseModel]: 86 + """生成查询模型""" 87 + pass 88 + 89 + def generate_field_validator(self, field_def: dict) -> Callable: 90 + """生成字段验证器""" 91 + pass 92 + ``` 93 + 94 + #### 3.2 约束处理 95 + 96 + ```python 97 + class ConstraintProcessor: 98 + """处理字段约束""" 99 + 100 + def process_integer_constraints(self, field_def: dict) -> dict: 101 + """处理整数约束 (min, max, enum)""" 102 + pass 103 + 104 + def process_string_constraints(self, field_def: dict) -> dict: 105 + """处理字符串约束 (format, length, enum)""" 106 + pass 107 + 108 + def process_array_constraints(self, field_def: dict) -> dict: 109 + """处理数组约束 (minLength, maxLength)""" 110 + pass 111 + ``` 112 + 113 + ### 4. 注册表和缓存机制 114 + 115 + #### 4.1 模型注册表 116 + 117 + ```python 118 + class LexiconRegistry: 119 + """Lexicon 模型注册表""" 120 + 121 + def __init__(self): 122 + self._models: Dict[str, Type[BaseModel]] = {} 123 + self._definitions: Dict[str, dict] = {} 124 + self._ref_cache: Dict[str, Type[BaseModel]] = {} 125 + 126 + def register(self, nsid: str, model: Type[BaseModel], definition: dict): 127 + """注册 Lexicon 模型""" 128 + pass 129 + 130 + def get_model(self, nsid: str) -> Optional[Type[BaseModel]]: 131 + """获取已注册的模型""" 132 + pass 133 + 134 + def resolve_ref(self, ref: str) -> Optional[Type[BaseModel]]: 135 + """解析引用到具体模型""" 136 + pass 137 + 138 + def clear_cache(self): 139 + """清空缓存""" 140 + pass 141 + ``` 142 + 143 + #### 4.2 缓存策略 144 + 145 + - **内存缓存**: 缓存已解析的模型定义 146 + - **文件缓存**: 缓存序列化结果以提高性能 147 + - **LRU 策略**: 使用最近最少使用算法管理缓存 148 + 149 + ### 5. 验证系统 150 + 151 + #### 5.1 验证层级 152 + 153 + 1. **语法验证**: JSON Schema 结构验证 154 + 2. **语义验证**: 类型约束和业务规则验证 155 + 3. **引用验证**: 跨定义引用有效性验证 156 + 4. 
**兼容性验证**: 前向和后向兼容性检查 157 + 158 + #### 5.2 自定义验证器 159 + 160 + ```python 161 + class LexiconValidator: 162 + """Lexicon 定义验证器""" 163 + 164 + def validate_definition(self, definition: dict) -> bool: 165 + """验证 Lexicon 定义完整性""" 166 + pass 167 + 168 + def validate_refs(self, definition: dict) -> List[str]: 169 + """验证所有引用的有效性""" 170 + pass 171 + 172 + def validate_compatibility(self, old_def: dict, new_def: dict) -> bool: 173 + """验证版本兼容性""" 174 + pass 175 + ``` 176 + 177 + ### 6. 错误处理系统 178 + 179 + #### 6.1 错误类型体系 180 + 181 + ```python 182 + class LexiconError(Exception): 183 + """基础 Lexicon 错误""" 184 + pass 185 + 186 + class ParseError(LexiconError): 187 + """解析错误""" 188 + pass 189 + 190 + class ValidationError(LexiconError): 191 + """验证错误""" 192 + pass 193 + 194 + class ResolutionError(LexiconError): 195 + """引用解析错误""" 196 + pass 197 + 198 + class GenerationError(LexiconError): 199 + """模型生成错误""" 200 + pass 201 + ``` 202 + 203 + #### 6.2 诊断信息 204 + 205 + - **详细错误消息**: 包含具体的字段路径和期望值 206 + - **上下文信息**: 提供验证时的上下文信息 207 + - **建议修复**: 提供可能的修复建议 208 + 209 + ### 7. 模块文件结构 210 + 211 + ``` 212 + src/atpasser/lexicon/ 213 + ├── __init__.py # 模块导出 214 + ├── ARCHITECTURE.md # 架构文档 215 + ├── parser.py # 主解析器 216 + ├── generator.py # 模型生成器 217 + ├── registry.py # 注册表实现 218 + ├── validator.py # 验证器实现 219 + ├── types.py # 类型定义 220 + ├── exceptions.py # 异常定义 221 + ├── constraints.py # 约束处理器 222 + └── utils.py # 工具函数 223 + ``` 224 + 225 + ### 8. 依赖关系 226 + 227 + - **内部依赖**: 228 + - `src/atpasser/data` (数据序列化和验证) 229 + - `src/atpasser/uri` (NSID 验证和处理) 230 + - **外部依赖**: 231 + - `pydantic`: 模型生成和验证 232 + - `jsonpath-ng`: JSONPath 支持 233 + - `cbor2`: CBOR 序列化支持 234 + 235 + ## 实现策略 236 + 237 + ### 1. 渐进式实现 238 + 239 + 1. **阶段一**: 实现基础解析器和简单类型映射 240 + 2. **阶段二**: 添加复杂类型和引用解析 241 + 3. **阶段三**: 实现模型生成和注册表 242 + 4. **阶段四**: 添加高级验证和错误处理 243 + 244 + ### 2. 
测试策略 245 + 246 + - **单元测试**: 测试各个解析器组件 247 + - **集成测试**: 测试端到端 Lexicon 解析流程 248 + - **兼容性测试**: 确保与现有 Lexicon 文件兼容 249 + - **性能测试**: 验证解析和模型生成性能 250 + 251 + ### 3. 扩展性考虑 252 + 253 + - **插件系统**: 支持自定义类型解析器 254 + - **中间件**: 支持预处理和后处理钩子 255 + - **监控**: 集成性能监控和日志记录 256 + 257 + ## 优势 258 + 259 + 1. **类型安全**: 利用 Pydantic 的强类型系统 260 + 2. **性能**: 优化的解析和缓存机制 261 + 3. **可扩展**: 模块化设计支持未来扩展 262 + 4. **兼容性**: 保持与 ATProto Lexicon 规范完全兼容 263 + 5. **开发者友好**: 提供清晰的错误消息和文档
+71
src/atpasser/lexicon/__init__.py
··· 1 + """ATProto Lexicon module for parsing and managing schema definitions.""" 2 + 3 + from .exceptions import ( 4 + LexiconError, 5 + ParseError, 6 + ValidationError, 7 + ResolutionError, 8 + GenerationError, 9 + CompatibilityError, 10 + ) 11 + 12 + from .types import ( 13 + LexiconType, 14 + LexiconDefinition, 15 + IntegerConstraints, 16 + StringConstraints, 17 + ArrayConstraints, 18 + ObjectConstraints, 19 + BlobConstraints, 20 + ParamsConstraints, 21 + RefDefinition, 22 + UnionDefinition, 23 + RecordDefinition, 24 + QueryDefinition, 25 + ProcedureDefinition, 26 + SubscriptionDefinition, 27 + LexiconDocument, 28 + ErrorDefinition, 29 + LexiconSchema, 30 + PropertyMap, 31 + DefinitionMap, 32 + ) 33 + 34 + from .registry import LexiconRegistry, registry 35 + from .parser import LexiconParser, parser 36 + 37 + __all__ = [ 38 + # Exceptions 39 + "LexiconError", 40 + "ParseError", 41 + "ValidationError", 42 + "ResolutionError", 43 + "GenerationError", 44 + "CompatibilityError", 45 + # Types 46 + "LexiconType", 47 + "LexiconDefinition", 48 + "IntegerConstraints", 49 + "StringConstraints", 50 + "ArrayConstraints", 51 + "ObjectConstraints", 52 + "BlobConstraints", 53 + "ParamsConstraints", 54 + "RefDefinition", 55 + "UnionDefinition", 56 + "RecordDefinition", 57 + "QueryDefinition", 58 + "ProcedureDefinition", 59 + "SubscriptionDefinition", 60 + "LexiconDocument", 61 + "ErrorDefinition", 62 + "LexiconSchema", 63 + "PropertyMap", 64 + "DefinitionMap", 65 + # Registry 66 + "LexiconRegistry", 67 + "registry", 68 + # Parser 69 + "LexiconParser", 70 + "parser", 71 + ]
+125
src/atpasser/lexicon/exceptions.py
"""Exceptions for ATProto Lexicon module."""

from typing import Optional


class LexiconError(Exception):
    """Base exception for Lexicon errors.

    Attributes:
        message: Human-readable error message.
        details: Optional "; "-joined context string built by subclasses.
    """

    def __init__(self, message: str, details: Optional[str] = None):
        self.message = message
        self.details = details
        super().__init__(message)


class ParseError(LexiconError):
    """Raised when Lexicon parsing fails."""

    def __init__(
        self, message: str, nsid: Optional[str] = None, definition: Optional[str] = None
    ):
        self.nsid = nsid
        self.definition = definition

        details = []
        if nsid:
            details.append(f"NSID: {nsid}")
        if definition:
            details.append(f"Definition: {definition}")

        super().__init__(
            f"Parse error: {message}", "; ".join(details) if details else None
        )


class ValidationError(LexiconError):
    """Raised when Lexicon validation fails."""

    def __init__(
        self,
        message: str,
        nsid: Optional[str] = None,
        field: Optional[str] = None,
        expected: Optional[str] = None,
    ):
        self.nsid = nsid
        self.field = field
        self.expected = expected

        details = []
        if nsid:
            details.append(f"NSID: {nsid}")
        if field:
            details.append(f"Field: {field}")
        if expected:
            details.append(f"Expected: {expected}")

        super().__init__(
            f"Validation error: {message}", "; ".join(details) if details else None
        )


class ResolutionError(LexiconError):
    """Raised when reference resolution fails."""

    def __init__(
        self, message: str, ref: Optional[str] = None, context: Optional[str] = None
    ):
        self.ref = ref
        self.context = context

        details = []
        if ref:
            details.append(f"Reference: {ref}")
        if context:
            details.append(f"Context: {context}")

        super().__init__(
            f"Resolution error: {message}", "; ".join(details) if details else None
        )


class GenerationError(LexiconError):
    """Raised when model generation fails."""

    def __init__(
        self,
        message: str,
        nsid: Optional[str] = None,
        definition_type: Optional[str] = None,
    ):
        self.nsid = nsid
        # snake_case is canonical (matches ParseError/ValidationError);
        # the camelCase name is kept as a deprecated alias for existing callers.
        self.definition_type = definition_type
        self.definitionType = definition_type

        details = []
        if nsid:
            details.append(f"NSID: {nsid}")
        if definition_type:
            details.append(f"Type: {definition_type}")

        super().__init__(
            f"Generation error: {message}", "; ".join(details) if details else None
        )


class CompatibilityError(LexiconError):
    """Raised when compatibility checks fail."""

    def __init__(
        self,
        message: str,
        old_nsid: Optional[str] = None,
        new_nsid: Optional[str] = None,
    ):
        # snake_case is canonical; camelCase kept as deprecated aliases.
        self.old_nsid = old_nsid
        self.new_nsid = new_nsid
        self.oldNsid = old_nsid
        self.newNsid = new_nsid

        details = []
        if old_nsid:
            details.append(f"Old NSID: {old_nsid}")
        if new_nsid:
            details.append(f"New NSID: {new_nsid}")

        super().__init__(
            f"Compatibility error: {message}", "; ".join(details) if details else None
        )
+208
src/atpasser/lexicon/parser.py
"""Parser for ATProto Lexicon definitions."""

import json
from typing import Dict, Any, Optional, Type, Union
from pydantic import BaseModel, create_model
from .exceptions import ParseError, ValidationError
from .types import LexiconDocument, LexiconType
from .registry import registry


class LexiconParser:
    """Parser for ATProto Lexicon JSON definitions."""

    def __init__(self):
        self.validators = LexiconValidator()

    def parse_document(self, json_data: Union[str, dict]) -> LexiconDocument:
        """Parse a Lexicon JSON document.

        Accepts a JSON string or an already-decoded dict, validates the
        structural and semantic rules, and returns a typed document.

        Raises:
            ParseError: If decoding or model construction fails.
            ValidationError: If structural or semantic validation fails.
        """
        try:
            data = json.loads(json_data) if isinstance(json_data, str) else json_data

            # Structural checks first, so failures name the offending field
            # instead of surfacing as opaque pydantic errors.
            self.validators.validate_document_structure(data)

            document = LexiconDocument.model_validate(data)

            self.validators.validate_document_semantics(document)

            return document

        except (ParseError, ValidationError):
            raise
        except Exception as exc:
            # Chain the cause so the original traceback is preserved.
            raise ParseError(f"Failed to parse Lexicon document: {str(exc)}") from exc

    def parse_and_register(self, json_data: Union[str, dict]) -> None:
        """Parse a Lexicon document, register it, and register generated models.

        Raises:
            ParseError: If parsing fails or any definition's model cannot
                be generated.
        """
        document = self.parse_document(json_data)
        registry.register_lexicon(document)

        # Generate and register models for all definitions.
        generator = ModelGenerator()
        for def_name, def_data in document.defs.items():
            try:
                model = generator.generate_model(document.id, def_name, def_data)
                registry.register_model(document.id, model, def_name)
            except Exception as exc:
                raise ParseError(
                    f"Failed to generate model for {def_name}: {str(exc)}",
                    document.id,
                    def_name,
                ) from exc


class LexiconValidator:
    """Validator for Lexicon documents."""

    def validate_document_structure(self, data: Dict[str, Any]) -> None:
        """Validate basic document structure.

        Raises:
            ValidationError: On a missing required field, an empty or
                non-dict ``defs`` map, or an unsupported lexicon version.
        """
        for field in ("lexicon", "id", "defs"):
            if field not in data:
                raise ValidationError(f"Missing required field: {field}")

        if not isinstance(data["defs"], dict) or not data["defs"]:
            raise ValidationError("defs must be a non-empty dictionary")

        if data["lexicon"] != 1:
            raise ValidationError("lexicon version must be 1")

    def validate_document_semantics(self, document: LexiconDocument) -> None:
        """Validate semantic rules for a Lexicon document.

        Raises:
            ValidationError: If more than one primary definition is present.
        """
        primary_types = {
            LexiconType.RECORD,
            LexiconType.QUERY,
            LexiconType.PROCEDURE,
            LexiconType.SUBSCRIPTION,
        }

        # NOTE: the spec recommends naming the primary definition "main";
        # other names are tolerated here (warning-level, not an error).
        primary_defs = [
            (def_name, def_data.get("type"))
            for def_name, def_data in document.defs.items()
            if def_data.get("type") in primary_types
        ]

        # Only one primary type allowed per document.
        if len(primary_defs) > 1:
            raise ValidationError(
                f"Multiple primary types found: {[name for name, _ in primary_defs]}",
                document.id,
            )


class ModelGenerator:
    """Generates Pydantic models from Lexicon definitions."""

    def generate_model(
        self, nsid: str, def_name: str, definition: Dict[str, Any]
    ) -> Type[BaseModel]:
        """Generate a Pydantic model from a Lexicon definition.

        Dispatches on the definition's ``type``; raw strings compare equal
        to LexiconType members because LexiconType subclasses str.
        """
        def_type = definition.get("type")

        if def_type == LexiconType.RECORD:
            return self._generate_record_model(nsid, def_name, definition)
        elif def_type == LexiconType.OBJECT:
            return self._generate_object_model(nsid, def_name, definition)
        elif def_type in [
            LexiconType.QUERY,
            LexiconType.PROCEDURE,
            LexiconType.SUBSCRIPTION,
        ]:
            return self._generate_primary_model(nsid, def_name, definition)
        else:
            # For simple types, create a basic model.
            return self._generate_simple_model(nsid, def_name, definition)

    def _generate_record_model(
        self, nsid: str, def_name: str, definition: Dict[str, Any]
    ) -> Type[BaseModel]:
        """Generate a model for record type (delegates to its object schema)."""
        record_schema = definition.get("record", {})
        return self._generate_object_model(nsid, def_name, record_schema)

    def _generate_object_model(
        self, nsid: str, def_name: str, definition: Dict[str, Any]
    ) -> Type[BaseModel]:
        """Generate a model for object type."""
        properties = definition.get("properties", {})
        required = definition.get("required", [])

        field_definitions = {}
        for prop_name, prop_schema in properties.items():
            field_type = self._get_field_type(prop_schema)
            if prop_name in required:
                field_definitions[prop_name] = (field_type, ...)
            else:
                # Optional fields default to None, so the annotation must
                # admit None as well.
                field_definitions[prop_name] = (Optional[field_type], None)

        model_name = self._get_model_name(nsid, def_name)
        return create_model(model_name, **field_definitions)

    def _generate_primary_model(
        self, nsid: str, def_name: str, definition: Dict[str, Any]
    ) -> Type[BaseModel]:
        """Generate a model for primary types (query, procedure, subscription)."""
        # For now, create a basic model - specific handling can be added later.
        return self._generate_simple_model(nsid, def_name, definition)

    def _generate_simple_model(
        self, nsid: str, def_name: str, definition: Dict[str, Any]
    ) -> Type[BaseModel]:
        """Generate a single-field wrapper model for basic types."""
        field_type = self._get_field_type(definition)
        model_name = self._get_model_name(nsid, def_name)
        return create_model(model_name, value=(field_type, ...))

    def _get_field_type(self, schema: Dict[str, Any]) -> Any:
        """Get the Python type for a schema definition."""
        schema_type = schema.get("type")

        type_mapping = {
            LexiconType.NULL: type(None),
            LexiconType.BOOLEAN: bool,
            LexiconType.INTEGER: int,
            LexiconType.STRING: str,
            LexiconType.BYTES: bytes,
            LexiconType.ARRAY: list,
            LexiconType.OBJECT: dict,
        }

        if schema_type and schema_type in type_mapping:
            return type_mapping[schema_type]

        if schema_type == LexiconType.REF:
            ref = schema.get("ref")
            if ref:
                # NOTE(review): resolve_ref raises ResolutionError for refs
                # not yet registered, so forward references across documents
                # require registration order to be topological — confirm.
                return registry.resolve_ref(ref)

        # Default to Any for complex types (union, blob, cid-link, ...).
        return Any

    def _get_model_name(self, nsid: str, def_name: str) -> str:
        """Generate a model class name from NSID and definition name.

        NOTE(review): NSID segments containing hyphens produce names that
        are not valid Python identifiers; create_model accepts them, but
        the classes cannot be referenced by name in source code.
        """
        # Convert NSID to PascalCase.
        parts = nsid.split(".")
        name_parts = [part.capitalize() for part in parts]

        # Add definition name unless it is the implicit "main".
        if def_name != "main":
            name_parts.append(def_name.capitalize())

        return "".join(name_parts)


# Global parser instance
parser = LexiconParser()
+114
src/atpasser/lexicon/registry.py
"""Registry for managing Lexicon definitions and generated models."""

from typing import Dict, Optional, Type, Any
from pydantic import BaseModel
from .exceptions import ResolutionError
from .types import LexiconDocument


class LexiconRegistry:
    """Registry for storing and resolving Lexicon definitions and models.

    Model keys take the form "<nsid>#<definition>"; a bare NSID refers to
    its "main" definition.
    """

    def __init__(self):
        self._definitions: Dict[str, LexiconDocument] = {}
        self._models: Dict[str, Type[BaseModel]] = {}
        self._ref_cache: Dict[str, Type[BaseModel]] = {}

    def register_lexicon(self, document: LexiconDocument) -> None:
        """Register a Lexicon document.

        Raises:
            ValueError: If a document with the same NSID is already registered.
        """
        nsid = document.id
        if nsid in self._definitions:
            raise ValueError(f"Lexicon with NSID {nsid} is already registered")

        self._definitions[nsid] = document

        # Drop any stale models/refs previously cached under this NSID.
        self._clear_cache_for_nsid(nsid)

    def get_lexicon(self, nsid: str) -> Optional[LexiconDocument]:
        """Get a registered Lexicon document by NSID, or None."""
        return self._definitions.get(nsid)

    def register_model(
        self, nsid: str, model: Type[BaseModel], definition_name: Optional[str] = None
    ) -> None:
        """Register a generated model for a Lexicon definition."""
        key = self._get_model_key(nsid, definition_name)
        self._models[key] = model

        # Also cache for quick reference resolution.
        if definition_name and definition_name != "main":
            ref_key = f"{nsid}#{definition_name}"
            self._ref_cache[ref_key] = model

    def get_model(
        self, nsid: str, definition_name: Optional[str] = None
    ) -> Optional[Type[BaseModel]]:
        """Get a registered model by NSID and optional definition name."""
        key = self._get_model_key(nsid, definition_name)
        return self._models.get(key)

    def resolve_ref(self, ref: str) -> Type[BaseModel]:
        """Resolve a reference ("nsid" or "nsid#def") to a model.

        Raises:
            ResolutionError: If the referenced model is not registered.
        """
        if ref in self._ref_cache:
            return self._ref_cache[ref]

        # A bare NSID implies the "main" definition.
        if "#" in ref:
            nsid, definition_name = ref.split("#", 1)
        else:
            nsid, definition_name = ref, "main"

        model = self.get_model(nsid, definition_name)
        if model is None:
            raise ResolutionError(f"Reference not found: {ref}", ref)

        # Cache for future use.
        self._ref_cache[ref] = model
        return model

    def has_lexicon(self, nsid: str) -> bool:
        """Check if a Lexicon is registered."""
        return nsid in self._definitions

    def has_model(self, nsid: str, definition_name: Optional[str] = None) -> bool:
        """Check if a model is registered."""
        key = self._get_model_key(nsid, definition_name)
        return key in self._models

    def clear_cache(self) -> None:
        """Clear all cached models and references."""
        self._models.clear()
        self._ref_cache.clear()

    def _get_model_key(self, nsid: str, definition_name: Optional[str]) -> str:
        """Get the internal key for model storage (defaults to '#main')."""
        if definition_name:
            return f"{nsid}#{definition_name}"
        return f"{nsid}#main"

    def _clear_cache_for_nsid(self, nsid: str) -> None:
        """Clear model and ref-cache entries belonging to *nsid* only.

        Matches the exact NSID or "<nsid>#<def>" keys; a plain
        ``startswith(nsid)`` would also wipe entries for NSIDs that merely
        share the prefix (e.g. clearing "a.b.foo" must not touch
        "a.b.foobar#x").
        """
        prefix = f"{nsid}#"

        # Clear models (always keyed as "<nsid>#<def>").
        for key in [k for k in self._models if k.startswith(prefix)]:
            del self._models[key]

        # Clear ref cache (may be keyed bare or with "#<def>").
        for key in [k for k in self._ref_cache if k == nsid or k.startswith(prefix)]:
            del self._ref_cache[key]

    def list_lexicons(self) -> Dict[str, LexiconDocument]:
        """List all registered Lexicon documents (shallow copy)."""
        return self._definitions.copy()

    def list_models(self) -> Dict[str, Type[BaseModel]]:
        """List all registered models (shallow copy)."""
        return self._models.copy()


# Global registry instance
registry = LexiconRegistry()
+155
src/atpasser/lexicon/types.py
"""Type definitions for ATProto Lexicon module."""

from typing import Dict, List, Optional, Union, Any, Type
from enum import Enum
from pydantic import BaseModel, Field


class LexiconType(str, Enum):
    """All definition types recognized by the Lexicon schema language.

    Subclassing str lets members compare equal to the raw type strings
    found in Lexicon JSON documents.
    """

    NULL = "null"
    BOOLEAN = "boolean"
    INTEGER = "integer"
    STRING = "string"
    BYTES = "bytes"
    CID_LINK = "cid-link"
    BLOB = "blob"
    ARRAY = "array"
    OBJECT = "object"
    PARAMS = "params"
    TOKEN = "token"
    REF = "ref"
    UNION = "union"
    UNKNOWN = "unknown"
    RECORD = "record"
    QUERY = "query"
    PROCEDURE = "procedure"
    SUBSCRIPTION = "subscription"


class LexiconDefinition(BaseModel):
    """Common base for typed Lexicon definitions."""

    type: LexiconType
    description: Optional[str] = None


class IntegerConstraints(BaseModel):
    """Validation constraints applicable to integer fields."""

    minimum: Optional[int] = None
    maximum: Optional[int] = None
    enum: Optional[List[int]] = None
    default: Optional[int] = None
    const: Optional[int] = None


class StringConstraints(BaseModel):
    """Validation constraints applicable to string fields."""

    format: Optional[str] = None
    maxLength: Optional[int] = None
    minLength: Optional[int] = None
    maxGraphemes: Optional[int] = None
    minGraphemes: Optional[int] = None
    knownValues: Optional[List[str]] = None
    enum: Optional[List[str]] = None
    default: Optional[str] = None
    const: Optional[str] = None


class ArrayConstraints(BaseModel):
    """Validation constraints applicable to array fields."""

    # Schema definition applied to every array element.
    items: Dict[str, Any]
    minLength: Optional[int] = None
    maxLength: Optional[int] = None


class ObjectConstraints(BaseModel):
    """Validation constraints applicable to object fields."""

    # Maps each property name to its schema definition.
    properties: Dict[str, Dict[str, Any]]
    required: Optional[List[str]] = None
    nullable: Optional[List[str]] = None


class BlobConstraints(BaseModel):
    """Validation constraints applicable to blob fields."""

    # Accepted MIME types.
    accept: Optional[List[str]] = None
    # Maximum size in bytes.
    maxSize: Optional[int] = None


class ParamsConstraints(BaseModel):
    """Validation constraints applicable to params fields."""

    properties: Dict[str, Dict[str, Any]]
    required: Optional[List[str]] = None


class RefDefinition(BaseModel):
    """A reference to another schema definition."""

    ref: str


class UnionDefinition(BaseModel):
    """A union of several referenced schemas."""

    # References the union may resolve to.
    refs: List[str]
    # When True, only the listed refs are permitted.
    closed: Optional[bool] = False


class RecordDefinition(LexiconDefinition):
    """A record definition (primary type)."""

    # Record key scheme (e.g. "tid").
    key: str
    # Object schema describing the record body.
    record: Dict[str, Any]


class QueryDefinition(LexiconDefinition):
    """A query definition (primary type)."""

    parameters: Optional[Dict[str, Any]] = None  # Params schema
    output: Optional[Dict[str, Any]] = None  # Output schema


class ProcedureDefinition(LexiconDefinition):
    """A procedure definition (primary type)."""

    parameters: Optional[Dict[str, Any]] = None  # Params schema
    input: Optional[Dict[str, Any]] = None  # Input schema
    output: Optional[Dict[str, Any]] = None  # Output schema
    errors: Optional[List[Dict[str, Any]]] = None  # Error definitions


class SubscriptionDefinition(LexiconDefinition):
    """A subscription definition (primary type)."""

    parameters: Optional[Dict[str, Any]] = None  # Params schema
    message: Optional[Dict[str, Any]] = None  # Message schema
    errors: Optional[List[Dict[str, Any]]] = None  # Error definitions


class LexiconDocument(BaseModel):
    """A complete Lexicon document as loaded from JSON."""

    lexicon: int  # Lexicon version (always 1)
    id: str  # NSID of the Lexicon
    description: Optional[str] = None
    defs: Dict[str, Dict[str, Any]]  # Map of definition names to schemas


class ErrorDefinition(BaseModel):
    """A named error declared by a procedure or subscription."""

    name: str
    description: Optional[str] = None


# Type aliases for convenience
LexiconSchema = Dict[str, Any]
PropertyMap = Dict[str, LexiconSchema]
DefinitionMap = Dict[str, Union[LexiconDefinition, Dict[str, Any]]]