tech.transparencia.document.item
Schema Diff
+11 -0
Compatibility Analysis
Backward Compatible
Backward compatible. 2 non-breaking changes.
Non-Breaking Changes (2)
- AddedVertex { vertex_id: "tech.transparencia.document.item#retrieval.blob" }
- AddedEdge { src: "tech.transparencia.document.item#retrieval", tgt: "tech.transparencia.document.item#retrieval.blob", kind: "prop", name: Some("blob") }
Migration Guidance
Added Elements
AddedVertex { vertex_id: "tech.transparencia.document.item#retrieval.blob" }
Additional Notes
- Non-breaking: AddedEdge { src: "tech.transparencia.document.item#retrieval", tgt: "tech.transparencia.document.item#retrieval.blob", kind: "prop", name: Some("blob") }
1
1
{
2
2
"id": "tech.transparencia.document.item",
3
3
"defs": {
4
4
"main": {
5
5
"key": "tid",
6
6
"type": "record",
7
7
"record": {
8
8
"type": "object",
9
9
"required": [
10
10
"title",
11
11
"documentType",
12
12
"source",
13
13
"retrieval",
14
14
"publishedAt",
15
15
"createdAt"
16
16
],
17
17
"properties": {
18
18
"title": {
19
19
"type": "string",
20
20
"maxLength": 4096,
21
21
"description": "Official or source-provided title of the document.",
22
22
"maxGraphemes": 1024
23
23
},
24
24
"source": {
25
25
"ref": "com.atproto.repo.strongRef",
26
26
"type": "ref",
27
27
"description": "Strong reference to the tech.transparencia.document.source record for the publisher or repository (e.g., DOF, UNFCCC). Identifies which source this document came from."
28
28
},
29
29
"topics": {
30
30
"type": "array",
31
31
"items": {
32
32
"type": "string",
33
33
"maxLength": 512,
34
34
"maxGraphemes": 128
35
35
},
36
36
"maxLength": 30,
37
37
"description": "Free-form topics, tags, or source categories attached to the document."
38
38
},
39
39
"country": {
40
40
"type": "string",
41
41
"maxLength": 2,
42
42
"description": "Primary country connected to the document, as an ISO 3166-1 alpha-2 code (e.g., 'MX', 'BR', 'US'). Omit for international documents."
43
43
},
44
44
"domains": {
45
45
"type": "array",
46
46
"items": {
47
47
"type": "string",
48
48
"maxLength": 128,
49
49
"knownValues": [
50
50
"government",
51
51
"politics",
52
52
"law",
53
53
"justice",
54
54
"environment",
55
55
"climate",
56
56
"education",
57
57
"health",
58
58
"budget",
59
59
"procurement",
60
60
"economy",
61
61
"finance",
62
62
"labor",
63
63
"energy",
64
64
"infrastructure",
65
65
"security",
66
66
"science-technology",
67
67
"society",
68
68
"human-rights",
69
69
"other"
70
70
],
71
71
"maxGraphemes": 64
72
72
},
73
73
"maxLength": 20,
74
74
"description": "Broad public-interest domains covered by the document. Open set; consumers should tolerate unknown values."
75
75
},
76
76
"issuedAt": {
77
77
"type": "string",
78
78
"format": "datetime",
79
79
"description": "When the issuing authority signed, issued, adopted, or approved the document, if different from publication time."
80
80
},
81
81
"language": {
82
82
"type": "string",
83
83
"format": "language",
84
84
"description": "Primary language of the document content (BCP-47, e.g., 'es-MX', 'en', 'pt-BR')."
85
85
},
86
86
"subtitle": {
87
87
"type": "string",
88
88
"maxLength": 4096,
89
89
"description": "Optional subtitle, section heading, or secondary title.",
90
90
"maxGraphemes": 1024
91
91
},
92
92
"createdAt": {
93
93
"type": "string",
94
94
"format": "datetime",
95
95
"description": "When this AT Protocol record was created."
96
96
},
97
97
"retrieval": {
98
98
"ref": "#retrieval",
99
99
"type": "ref",
100
100
"description": "Per-document retrieval metadata: canonical URLs, MIME type, checksums, file size, and access status of the specific retrieved representation, plus an optional archived copy of the document bytes (blob)."
101
101
},
102
102
"updatedAt": {
103
103
"type": "string",
104
104
"format": "datetime",
105
105
"description": "When this record was last materially updated."
106
106
},
107
107
"description": {
108
108
"type": "string",
109
109
"maxLength": 10000,
110
110
"description": "Short source-provided description or human-readable abstract. AI summaries should be stored in enrichment records.",
111
111
"maxGraphemes": 2000
112
112
},
113
113
"effectiveAt": {
114
114
"type": "string",
115
115
"format": "datetime",
116
116
"description": "When the document's legal or administrative effects begin, if applicable and explicitly known."
117
117
},
118
118
"identifiers": {
119
119
"type": "array",
120
120
"items": {
121
121
"ref": "#identifier",
122
122
"type": "ref"
123
123
},
124
124
"maxLength": 50,
125
125
"description": "External identifiers such as DOF IDs, UNFCCC symbols, file numbers, docket numbers, ISBNs, or local archival IDs. For content hashes use retrieval.sha256; for URLs use retrieval.url."
126
126
},
127
127
"publishedAt": {
128
128
"type": "string",
129
129
"format": "datetime",
130
130
"description": "When the document was published by the source. Use midnight UTC when only a calendar date is available."
131
131
},
132
132
"documentType": {
133
133
"type": "string",
134
134
"maxLength": 128,
135
135
"description": "Machine-readable document category. Open set; known values cover common official and institutional documents.",
136
136
"knownValues": [
137
137
"official-publication",
138
138
"official-gazette-issue",
139
139
"official-gazette-entry",
140
140
"law",
141
141
"decree",
142
142
"agreement",
143
143
"notice",
144
144
"regulation",
145
145
"standard",
146
146
"report",
147
147
"audit-report",
148
148
"budget-document",
149
149
"contract",
150
150
"procurement-document",
151
151
"court-ruling",
152
152
"legislative-bill",
153
153
"legislative-opinion",
154
154
"treaty",
155
155
"submission",
156
156
"technical-paper",
157
157
"environmental-impact-document",
158
158
"education-policy-document",
159
159
"dataset-documentation",
160
160
"meeting-minutes",
161
161
"resolution",
162
162
"other"
163
163
]
164
164
},
165
165
"jurisdiction": {
166
166
"type": "string",
167
167
"maxLength": 256,
168
168
"description": "Legal or administrative jurisdiction covered by the document (e.g., 'federal', 'state', 'municipal', 'international').",
169
169
"knownValues": [
170
170
"local",
171
171
"municipal",
172
172
"state",
173
173
"federal",
174
174
"national",
175
175
"regional",
176
176
"international",
177
177
"supranational",
178
178
"unknown"
179
179
],
180
180
"maxGraphemes": 64
181
181
},
182
182
"issuingBodies": {
183
183
"type": "array",
184
184
"items": {
185
185
"ref": "tech.transparencia.defs#organization",
186
186
"type": "ref"
187
187
},
188
188
"maxLength": 20,
189
189
"description": "Organizations, public bodies, institutions, or authorities responsible for issuing, publishing, filing, or adopting the document. Uses the shared tech.transparencia.defs#organization type. Conventional role values include 'publisher', 'issuer', 'author', 'adopter', 'filer', 'regulator', 'court', 'legislature', 'repository'."
190
190
}
191
191
}
192
192
},
193
193
"description": "Core document metadata for official and institutional documents. Stores identity, provenance, and public context, but not full text, sections, chunks, AI analysis, or ingestion pipeline state."
194
194
},
195
195
"retrieval": {
196
196
"type": "object",
197
197
"required": [
198
198
"url",
199
199
"retrievedAt"
200
200
],
201
201
"properties": {
202
202
"url": {
203
203
"type": "string",
204
204
"format": "uri",
205
205
"description": "URL where this document was found or retrieved."
206
206
},
207
+
"blob": {
208
+
"type": "blob",
209
+
"accept": [
210
+
"application/pdf",
211
+
"text/html",
212
+
"text/plain",
213
+
"application/json"
214
+
],
215
+
"maxSize": 10000000,
216
+
"description": "Optional binary attachment preserving the actual document bytes on this PDS. Typical contents: the source PDF, an HTML snapshot, the extracted plain text used by the enrichment pipeline, or the original upstream JSON payload (e.g., a SIDOF response). The other retrieval fields (url, pdfUrl, sha256) still describe the original public source; this blob is the archived copy. Max 10 MB (10,000,000 bytes)."
217
+
},
207
218
"pdfUrl": {
208
219
"type": "string",
209
220
"format": "uri",
210
221
"description": "PDF or downloadable document URL, if available."
211
222
},
212
223
"sha256": {
213
224
"type": "string",
214
225
"maxLength": 64,
215
226
"description": "SHA-256 checksum of the retrieved file or canonical source payload, if available."
216
227
},
217
228
"htmlUrl": {
218
229
"type": "string",
219
230
"format": "uri",
220
231
"description": "HTML landing page or web version of the document, if available."
221
232
},
222
233
"license": {
223
234
"type": "string",
224
235
"maxLength": 512,
225
236
"description": "Per-document license override, if the document is licensed differently from the source-level default.",
226
237
"maxGraphemes": 128
227
238
},
228
239
"fileName": {
229
240
"type": "string",
230
241
"maxLength": 1024,
231
242
"description": "Original or normalized file name, if applicable.",
232
243
"maxGraphemes": 256
233
244
},
234
245
"mimeType": {
235
246
"type": "string",
236
247
"maxLength": 128,
237
248
"description": "MIME type of the retrieved representation (e.g., 'text/html', 'application/pdf')."
238
249
},
239
250
"sourceId": {
240
251
"type": "string",
241
252
"maxLength": 512,
242
253
"description": "Source-system identifier for deduplication, if provided by the upstream source."
243
254
},
244
255
"sizeBytes": {
245
256
"type": "integer",
246
257
"minimum": 0,
247
258
"description": "Size of the retrieved file or canonical representation in bytes."
248
259
},
249
260
"accessType": {
250
261
"type": "string",
251
262
"maxLength": 64,
252
263
"description": "Access status of the source at retrieval time. Use 'previously-public' for documents that were once publicly accessible but have since been withdrawn or removed by the source.",
253
264
"knownValues": [
254
265
"public",
255
266
"restricted",
256
267
"paywalled",
257
268
"previously-public",
258
269
"unknown"
259
270
]
260
271
},
261
272
"retrievedAt": {
262
273
"type": "string",
263
274
"format": "datetime",
264
275
"description": "When the source was retrieved by the pipeline."
265
276
},
266
277
"canonicalUrl": {
267
278
"type": "string",
268
279
"format": "uri",
269
280
"description": "Canonical, normalized, or preferred public URL for the document."
270
281
}
271
282
},
272
283
"description": "Per-document retrieval metadata for a single retrieved representation. Publisher-level metadata (name, base URL, license) lives on the tech.transparencia.document.source record referenced by 'source'."
273
284
},
274
285
"identifier": {
275
286
"type": "object",
276
287
"required": [
277
288
"type",
278
289
"value"
279
290
],
280
291
"properties": {
281
292
"url": {
282
293
"type": "string",
283
294
"format": "uri",
284
295
"description": "Optional URL where this identifier can be resolved or verified."
285
296
},
286
297
"type": {
287
298
"type": "string",
288
299
"maxLength": 128,
289
300
"description": "Identifier type or namespace.",
290
301
"knownValues": [
291
302
"dof_id",
292
303
"dof_publication_id",
293
304
"unfccc_symbol",
294
305
"official_file_number",
295
306
"docket_number",
296
307
"case_number",
297
308
"law_number",
298
309
"isbn",
299
310
"issn",
300
311
"doi",
301
312
"other"
302
313
]
303
314
},
304
315
"value": {
305
316
"type": "string",
306
317
"maxLength": 1024,
307
318
"description": "Identifier value.",
308
319
"maxGraphemes": 256
309
320
}
310
321
},
311
322
"description": "External identifier assigned to a document by a source system, authority, archive, or standard. For content hashes use retrieval.sha256; for URLs use retrieval.url."
312
323
}
313
324
},
314
325
"$type": "com.atproto.lexicon.schema",
315
326
"lexicon": 1,
316
327
"description": "A canonical public-interest document record. Represents one official publication, report, filing, act, submission, or other source document before structural parsing or AI enrichment."
317
328
}