{
"id": "tech.transparencia.document.item",
"defs": {
"main": {
"key": "tid",
"type": "record",
"record": {
"type": "object",
"required": [
"title",
"documentType",
"source",
"retrieval",
"publishedAt",
"createdAt"
],
"properties": {
"title": {
"type": "string",
"maxLength": 4096,
"description": "Official or source-provided title of the document.",
"maxGraphemes": 1024
},
"source": {
"ref": "com.atproto.repo.strongRef",
"type": "ref",
"description": "Strong reference to the tech.transparencia.document.source record for the publisher or repository (e.g., DOF, UNFCCC). Identifies which source this document came from."
},
"topics": {
"type": "array",
"items": {
"type": "string",
"maxLength": 512,
"maxGraphemes": 128
},
"maxLength": 30,
"description": "Free-form topics, tags, or source categories attached to the document."
},
"country": {
"type": "string",
"maxLength": 2,
"description": "Primary country connected to the document, as an ISO 3166-1 alpha-2 code (e.g., 'MX', 'BR', 'US'). Omit for international documents."
},
"domains": {
"type": "array",
"items": {
"type": "string",
"maxLength": 128,
"knownValues": [
"government",
"politics",
"law",
"justice",
"environment",
"climate",
"education",
"health",
"budget",
"procurement",
"economy",
"finance",
"labor",
"energy",
"infrastructure",
"security",
"science-technology",
"society",
"human-rights",
"other"
],
"maxGraphemes": 64
},
"maxLength": 20,
"description": "Broad public-interest domains covered by the document. Open set; consumers should tolerate unknown values."
},
"issuedAt": {
"type": "string",
"format": "datetime",
"description": "When the issuing authority signed, issued, adopted, or approved the document, if different from publication time."
},
"language": {
"type": "string",
"format": "language",
"description": "Primary language of the document content (BCP-47, e.g., 'es-MX', 'en', 'pt-BR')."
},
"subtitle": {
"type": "string",
"maxLength": 4096,
"description": "Optional subtitle, section heading, or secondary title.",
"maxGraphemes": 1024
},
"createdAt": {
"type": "string",
"format": "datetime",
"description": "When this AT Protocol record was created."
},
"retrieval": {
"ref": "#retrieval",
"type": "ref",
"description": "Per-document retrieval metadata: canonical URLs, MIME type, checksums, file size, and access status of the specific retrieved representation."
},
"updatedAt": {
"type": "string",
"format": "datetime",
"description": "When this record was last materially updated."
},
"description": {
"type": "string",
"maxLength": 10000,
"description": "Short source-provided description or human-readable abstract. AI summaries should be stored in enrichment records.",
"maxGraphemes": 2000
},
"effectiveAt": {
"type": "string",
"format": "datetime",
"description": "When the document's legal or administrative effects begin, if applicable and explicitly known."
},
"identifiers": {
"type": "array",
"items": {
"ref": "#identifier",
"type": "ref"
},
"maxLength": 50,
"description": "External identifiers such as DOF IDs, UNFCCC symbols, file numbers, docket numbers, ISBNs, or local archival IDs. For content hashes use retrieval.sha256; for URLs use retrieval.url."
},
"publishedAt": {
"type": "string",
"format": "datetime",
"description": "When the document was published by the source. Use midnight UTC when only a calendar date is available."
},
"documentType": {
"type": "string",
"maxLength": 128,
"description": "Machine-readable document category. Open set; known values cover common official and institutional documents.",
"knownValues": [
"official-publication",
"official-gazette-issue",
"official-gazette-entry",
"law",
"decree",
"agreement",
"notice",
"regulation",
"standard",
"report",
"audit-report",
"budget-document",
"contract",
"procurement-document",
"court-ruling",
"legislative-bill",
"legislative-opinion",
"treaty",
"submission",
"technical-paper",
"environmental-impact-document",
"education-policy-document",
"dataset-documentation",
"meeting-minutes",
"resolution",
"other"
]
},
"jurisdiction": {
"type": "string",
"maxLength": 256,
"description": "Legal or administrative jurisdiction covered by the document (e.g., 'federal', 'state', 'municipal', 'international').",
"knownValues": [
"local",
"municipal",
"state",
"federal",
"national",
"regional",
"international",
"supranational",
"unknown"
],
"maxGraphemes": 64
},
"issuingBodies": {
"type": "array",
"items": {
"ref": "tech.transparencia.defs#organization",
"type": "ref"
},
"maxLength": 20,
"description": "Organizations, public bodies, institutions, or authorities responsible for issuing, publishing, filing, or adopting the document. Uses the shared tech.transparencia.defs#organization type. Conventional role values include 'publisher', 'issuer', 'author', 'adopter', 'filer', 'regulator', 'court', 'legislature', 'repository'."
}
}
},
"description": "Core document metadata for official and institutional documents. Stores identity, provenance, and public context, but not full text, sections, chunks, AI analysis, or ingestion pipeline state."
},
"retrieval": {
"type": "object",
"required": [
"url",
"retrievedAt"
],
"properties": {
"url": {
"type": "string",
"format": "uri",
"description": "URL where this document was found or retrieved."
},
"blob": {
"type": "blob",
"accept": [
"application/pdf",
"text/html",
"text/plain",
"application/json"
],
"maxSize": 10000000,
"description": "Optional binary attachment preserving the actual document bytes on this PDS. Typical contents: the source PDF, an HTML snapshot, the extracted plain text used by the enrichment pipeline, or the original upstream JSON payload (e.g., a SIDOF response). The other retrieval fields (url, pdfUrl, sha256) still reference the original public source — this blob is the archived copy. Max 10 MB."
},
"pdfUrl": {
"type": "string",
"format": "uri",
"description": "PDF or downloadable document URL, if available."
},
"sha256": {
"type": "string",
"maxLength": 64,
"description": "SHA-256 checksum of the retrieved file or canonical source payload, if available."
},
"htmlUrl": {
"type": "string",
"format": "uri",
"description": "HTML landing page or web version of the document, if available."
},
"license": {
"type": "string",
"maxLength": 512,
"description": "Per-document license override, if the document is licensed differently from the source-level default.",
"maxGraphemes": 128
},
"fileName": {
"type": "string",
"maxLength": 1024,
"description": "Original or normalized file name, if applicable.",
"maxGraphemes": 256
},
"mimeType": {
"type": "string",
"maxLength": 128,
"description": "MIME type of the retrieved representation (e.g., 'text/html', 'application/pdf')."
},
"sourceId": {
"type": "string",
"maxLength": 512,
"description": "Source-system identifier for deduplication, if provided by the upstream source."
},
"sizeBytes": {
"type": "integer",
"minimum": 0,
"description": "Size of the retrieved file or canonical representation in bytes."
},
"accessType": {
"type": "string",
"maxLength": 64,
"description": "Access status of the source at retrieval time. Use 'previously-public' for documents that were once publicly accessible but have since been withdrawn or removed by the source.",
"knownValues": [
"public",
"restricted",
"paywalled",
"previously-public",
"unknown"
]
},
"retrievedAt": {
"type": "string",
"format": "datetime",
"description": "When the source was retrieved by the pipeline."
},
"canonicalUrl": {
"type": "string",
"format": "uri",
"description": "Canonical, normalized, or preferred public URL for the document."
}
},
"description": "Per-document retrieval metadata for a single retrieved representation. Publisher-level metadata (name, base URL, license) lives on the tech.transparencia.document.source record referenced by 'source'."
},
"identifier": {
"type": "object",
"required": [
"type",
"value"
],
"properties": {
"url": {
"type": "string",
"format": "uri",
"description": "Optional URL where this identifier can be resolved or verified."
},
"type": {
"type": "string",
"maxLength": 128,
"description": "Identifier type or namespace.",
"knownValues": [
"dof_id",
"dof_publication_id",
"unfccc_symbol",
"official_file_number",
"docket_number",
"case_number",
"law_number",
"isbn",
"issn",
"doi",
"other"
]
},
"value": {
"type": "string",
"maxLength": 1024,
"description": "Identifier value.",
"maxGraphemes": 256
}
},
"description": "External identifier assigned to a document by a source system, authority, archive, or standard. For content hashes use retrieval.sha256; for URLs use retrieval.url."
}
},
"$type": "com.atproto.lexicon.schema",
"lexicon": 1,
"description": "A canonical public-interest document record. Represents one official publication, report, filing, act, submission, or other source document before structural parsing or AI enrichment."
}