{
"id": "science.alt.dataset.entry",
"defs": {
"main": {
"key": "tid",
"type": "record",
"record": {
"type": "object",
"required": [
"name",
"schemaRef",
"storage",
"createdAt"
],
"properties": {
"name": {
"type": "string",
"maxLength": 200,
"description": "Human-readable dataset name"
},
"size": {
"ref": "#datasetSize",
"type": "ref",
"description": "Dataset size information (optional)"
},
"tags": {
"type": "array",
"items": {
"type": "string",
"maxLength": 150
},
"maxLength": 30,
"description": "Searchable tags for dataset discovery. Aligns with Schema.org keywords property."
},
"license": {
"type": "string",
"maxLength": 200,
"description": "License identifier or URL. SPDX identifiers recommended (e.g., MIT, Apache-2.0, CC-BY-4.0) or full SPDX URLs (e.g., http://spdx.org/licenses/MIT). Aligns with Schema.org license property."
},
"storage": {
"refs": [
"science.alt.dataset.storageHttp",
"science.alt.dataset.storageS3",
"science.alt.dataset.storageBlobs"
],
"type": "union",
"description": "Storage location for dataset files (WebDataset tar archives)"
},
"metadata": {
"type": "bytes",
"maxLength": 100000,
"description": "Msgpack-encoded metadata dict for arbitrary extended key-value pairs. Use this for additional metadata beyond the core top-level fields (license, tags, size). Top-level fields are preferred for discoverable/searchable metadata."
},
"createdAt": {
"type": "string",
"format": "datetime",
"description": "Timestamp when this dataset entry was created"
},
"schemaRef": {
"type": "string",
"format": "at-uri",
"maxLength": 500,
"description": "AT-URI reference to the schema record for this dataset's samples"
},
"description": {
"type": "string",
"maxLength": 5000,
"description": "Human-readable description of the dataset"
},
"contentMetadata": {
"type": "unknown",
"description": "Dataset-level content metadata (e.g., instrument settings, acquisition parameters). Structure is validated against the schema referenced by metadataSchemaRef when present. Stored as an open JSON object."
},
"metadataSchemaRef": {
"type": "string",
"format": "at-uri",
"maxLength": 500,
"description": "Optional AT-URI reference to a schema record defining the structure of this dataset's content metadata. When present, contentMetadata is validated against this schema at write time."
}
}
},
"description": "Index entry for a WebDataset-backed dataset with references to storage location and sample schema"
},
"datasetSize": {
"type": "object",
"properties": {
"bytes": {
"type": "integer",
"minimum": 0,
"description": "Total size in bytes"
},
"shards": {
"type": "integer",
"minimum": 1,
"description": "Number of WebDataset shards"
},
"samples": {
"type": "integer",
"minimum": 0,
"description": "Total number of samples in the dataset"
}
},
"description": "Information about dataset size"
},
"shardChecksum": {
"type": "object",
"required": [
"algorithm",
"digest"
],
"properties": {
"digest": {
"type": "string",
"maxLength": 128,
"description": "Hex-encoded hash digest"
},
"algorithm": {
"type": "string",
"maxLength": 20,
"description": "Hash algorithm identifier (e.g., 'sha256', 'blake3')"
}
},
"description": "Content hash for shard integrity verification. Algorithm is flexible to allow SHA-256, BLAKE3, or other hash functions."
}
},
"$type": "com.atproto.lexicon.schema",
"lexicon": 1
}