swh.storage.cassandra.model module
Classes representing tables in the Cassandra database.
They are very close to classes found in swh.model.model, but most of
them are subtly different:
- Large objects are split into other classes (e.g. RevisionRow has no
‘parents’ field, because parents are stored in a different table,
represented by RevisionParentRow) 
- They have a “cols” field, which returns the list of column names
of the table 
- They only use types that map directly to Cassandra’s schema (i.e. no enums) 
Therefore, this model doesn’t reuse swh.model.model, except for types
that can be mapped to UDTs (Person and TimestampWithTimezone).
Fields may carry the dataclasses metadata keys fk,
if the existence of a corresponding row in a different table is almost guaranteed
(up to loaders not crashing and eventual consistency settling down), and
points_to, if they are a Merkle-DAG link to another object (which is more likely
to be missing).
This is used by swh.storage.cassandra.diagram.dot_diagram().
- 
swh.storage.cassandra.model.MAGIC_NULL_PK = b'<null>'
- NULLs (or all-empty blobs) are not allowed in primary keys; instead we use a
special value that can’t possibly be a valid hash. 
- 
swh.storage.cassandra.model.content_index_table_name(algo: str, skipped_content: bool) → str[source]
- Given an algorithm name, returns the name of one of the ‘content_by_*’
and ‘skipped_content_by_*’ tables that serve as index for the ‘content’
and ‘skipped_content’ tables based on this algorithm’s hashes.
- For now it is a simple substitution, but future versions may append a version
number to it, if needed for schema updates. 
- 
class swh.storage.cassandra.model.BaseRow[source]
- Bases: - object
 - 
- 
TABLE: ClassVar[str]
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]]
 - 
- 
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ()
 - 
- 
classmethod denullify_clustering_key(ck: Tuple) → Tuple[source]
- If this class has Optional fields used as clustering keys, this replaces
such values in the given clustering key so that it is suitable for sorting purposes. 
 - 
- 
classmethod from_dict(d: Dict[str, Any]) → T[source]
 - 
- 
classmethod cols() → List[str][source]
 - 
- 
to_dict() → Dict[str, Any][source]
 
- 
class swh.storage.cassandra.model.MigrationRow(id: str, dependencies: set[str], min_read_version: str, status: str)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'migration'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)
 - 
- 
id: str
 - 
- 
dependencies: set[str]
 - 
- 
min_read_version: str
 - 
- 
status: str
- one of: pending, running, completed
 
 
- 
class swh.storage.cassandra.model.ContentRow(sha1: bytes, sha1_git: bytes, sha256: bytes, blake2s256: bytes, length: int, ctime: datetime.datetime | None, status: str)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'content'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('sha256',)
 - 
- 
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('sha1', 'sha1_git', 'blake2s256')
 - 
- 
sha1: bytes
 - 
- 
sha1_git: bytes
 - 
- 
sha256: bytes
 - 
- 
blake2s256: bytes
 - 
- 
length: int
 - 
- 
ctime: datetime | None
- creation time, i.e. time of (first) injection into the storage 
 - 
- 
status: str
 
- 
class swh.storage.cassandra.model.SkippedContentRow(sha1: bytes | None, sha1_git: bytes | None, sha256: bytes | None, blake2s256: bytes | None, length: int | None, ctime: datetime.datetime | None, status: str, reason: str, origin: str)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'skipped_content'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('sha1', 'sha1_git', 'sha256', 'blake2s256')
 - 
- 
sha1: bytes | None
 - 
- 
sha1_git: bytes | None
 - 
- 
sha256: bytes | None
 - 
- 
blake2s256: bytes | None
 - 
- 
length: int | None
 - 
- 
ctime: datetime | None
- creation time, i.e. time of (first) injection into the storage 
 - 
- 
status: str
 - 
- 
reason: str
 - 
- 
origin: str
 - 
- 
classmethod denullify_clustering_key(ck: Tuple) → Tuple[source]
- If this class has Optional fields used as clustering keys, this replaces
such values in the given clustering key so that it is suitable for sorting purposes. 
 - 
- 
classmethod from_dict(d: Dict[str, Any]) → SkippedContentRow[source]
 
- 
class swh.storage.cassandra.model.DirectoryRow(id: bytes, raw_manifest: bytes | None)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'directory'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)
 - 
- 
id: bytes
 - 
- 
raw_manifest: bytes | None
- NULL if the object can be rebuilt from (sorted) entries 
 
- 
class swh.storage.cassandra.model.DirectoryEntryRow(directory_id: bytes, name: bytes, target: bytes, perms: int, type: str)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'directory_entry'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('directory_id',)
 - 
- 
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('name',)
 - 
- 
directory_id: bytes
 - 
- 
name: bytes
- path name, relative to containing dir 
 - 
- 
target: bytes
 - 
- 
perms: int
- unix-like permissions 
 - 
- 
type: str
- target type 
 
- 
class swh.storage.cassandra.model.RevisionRow(id: bytes, date: swh.model.model.TimestampWithTimezone | None, committer_date: swh.model.model.TimestampWithTimezone | None, type: str, directory: bytes, message: bytes, author: swh.model.model.Person, committer: swh.model.model.Person, synthetic: bool, metadata: str, extra_headers: dict, raw_manifest: bytes | None)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'revision'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)
 - 
- 
id: bytes
 - 
- 
date: TimestampWithTimezone | None
 - 
- 
committer_date: TimestampWithTimezone | None
 - 
- 
type: str
 - 
- 
directory: bytes
- source code “root” directory 
 - 
- 
message: bytes
 - 
- 
author: Person
 - 
- 
committer: Person
 - 
- 
synthetic: bool
- true iff revision has been created by Software Heritage 
 - 
- 
metadata: str
- extra metadata as JSON (tarball checksums, etc.) 
 - 
- 
extra_headers: dict
- extra commit information as (tuple(key, value), …) 
 - 
- 
raw_manifest: bytes | None
- NULL if the object can be rebuilt from other cells and revision_parent. 
 
- 
class swh.storage.cassandra.model.RevisionParentRow(id: bytes, parent_rank: int, parent_id: bytes)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'revision_parent'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)
 - 
- 
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('parent_rank',)
 - 
- 
id: bytes
 - 
- 
parent_rank: int
- parent position in merge commits, 0-based 
 - 
- 
parent_id: bytes
 
- 
class swh.storage.cassandra.model.ReleaseRow(id: bytes, target_type: str, target: bytes, date: swh.model.model.TimestampWithTimezone, name: bytes, message: bytes, author: swh.model.model.Person, synthetic: bool, raw_manifest: bytes | None)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'release'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)
 - 
- 
id: bytes
 - 
- 
target_type: str
 - 
- 
target: bytes
 - 
- 
date: TimestampWithTimezone
 - 
- 
name: bytes
 - 
- 
message: bytes
 - 
- 
author: Person
 - 
- 
synthetic: bool
- true iff release has been created by Software Heritage 
 - 
- 
raw_manifest: bytes | None
- NULL if the object can be rebuilt from other cells 
 
- 
class swh.storage.cassandra.model.SnapshotRow(id: bytes)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'snapshot'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)
 - 
- 
id: bytes
 
- 
class swh.storage.cassandra.model.SnapshotBranchRow(snapshot_id: bytes, name: bytes, target_type: str | None, target: bytes | None)[source]
- Bases: - BaseRow
 - For a given snapshot_id, branches are sorted by their name,
allowing easy pagination. - 
- 
TABLE: ClassVar[str] = 'snapshot_branch'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('snapshot_id',)
 - 
- 
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('name',)
 - 
- 
snapshot_id: bytes
 - 
- 
name: bytes
 - 
- 
target_type: str | None
 - 
- 
target: bytes | None
 
- 
class swh.storage.cassandra.model.OriginVisitRow(origin: str, visit: int, date: datetime.datetime, type: str)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'origin_visit'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('origin',)
 - 
- 
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('visit',)
 - 
- 
origin: str
 - 
- 
visit: int
 - 
- 
date: datetime
 - 
- 
type: str
 
- 
class swh.storage.cassandra.model.OriginVisitStatusRow(origin: str, visit: int, date: datetime.datetime, type: str, status: str, metadata: str, snapshot: bytes)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'origin_visit_status'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('origin',)
 - 
- 
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('visit', 'date')
 - 
- 
origin: str
 - 
- 
visit: int
 - 
- 
date: datetime
 - 
- 
type: str
 - 
- 
status: str
 - 
- 
metadata: str
 - 
- 
snapshot: bytes
 - 
- 
classmethod from_dict(d: Dict[str, Any]) → T[source]
 
- 
class swh.storage.cassandra.model.OriginRow(sha1: bytes, url: str, next_visit_id: int)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'origin'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('sha1',)
 - 
- 
sha1: bytes
 - 
- 
url: str
 - 
- 
next_visit_id: int
- We need integer visit ids for compatibility with the pgsql
storage, so we’re using lightweight transactions with this trick:
https://stackoverflow.com/a/29391877/539465 
 
- 
class swh.storage.cassandra.model.MetadataAuthorityRow(url: str, type: str)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'metadata_authority'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('url',)
 - 
- 
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('type',)
 - 
- 
url: str
 - 
- 
type: str
 
- 
class swh.storage.cassandra.model.MetadataFetcherRow(name: str, version: str)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'metadata_fetcher'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('name',)
 - 
- 
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('version',)
 - 
- 
name: str
 - 
- 
version: str
 
- 
class swh.storage.cassandra.model.RawExtrinsicMetadataRow(id: bytes, type: str, target: str, authority_type: str, authority_url: str, discovery_date: datetime, fetcher_name: str, fetcher_version: str, format: str, metadata: bytes, origin: str | None, visit: int | None, snapshot: str | None, release: str | None, revision: str | None, path: bytes | None, directory: str | None)[source]
- Bases: - BaseRow
 - An explanation is in order for the primary key:
- Intuitively, the primary key should only be ‘id’, because two metadata
entries are the same iff the id is the same; and ‘id’ is used for
deduplication.
- However, we also want to query by
(target, authority_type, authority_url, discovery_date).
The naive solution to this would be an extra table, to use as index;
but it means 1. extra code to keep them in sync, 2. overhead when writing,
3. overhead + random reads (instead of linear) when reading.
- Therefore, we use a single table for both, by adding the columns
we want to query with before the id.
It solves both a) the query/order issues and b) the uniqueness issue because: - 
- adding the id at the end of the primary key does not change the rows’ order:
for two different rows, id1 != id2, so
(target1, …, date1) < (target2, …, date2)
<=> (target1, …, date1, id1) < (target2, …, date2, id2) 
- the id is a hash of all the columns, so:
rows are the same
<=> id1 == id2
<=> (target1, …, date1, id1) == (target2, …, date2, id2) 
 - 
- 
TABLE: ClassVar[str] = 'raw_extrinsic_metadata'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('target',)
 - 
- 
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('authority_type', 'authority_url', 'discovery_date', 'id')
 - 
- 
id: bytes
 - 
- 
type: str
 - 
- 
target: str
 - 
- 
authority_type: str
 - 
- 
authority_url: str
 - 
- 
discovery_date: datetime
 - 
- 
fetcher_name: str
 - 
- 
fetcher_version: str
 - 
- 
format: str
 - 
- 
metadata: bytes
 - 
- 
origin: str | None
 - 
- 
visit: int | None
 - 
- 
snapshot: str | None
 - 
- 
release: str | None
 - 
- 
revision: str | None
 - 
- 
path: bytes | None
 - 
- 
directory: str | None
 
- 
class swh.storage.cassandra.model.RawExtrinsicMetadataByIdRow(id: bytes, target: str, authority_type: str, authority_url: str)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'raw_extrinsic_metadata_by_id'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)
 - 
- 
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ()
 - 
- 
id: bytes
 - 
- 
target: str
 - 
- 
authority_type: str
 - 
- 
authority_url: str
 
- 
class swh.storage.cassandra.model.ObjectCountRow(partition_key: int, object_type: str, count: int)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'object_count'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('partition_key',)
 - 
- 
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('object_type',)
 - 
- 
partition_key: int
 - 
- 
object_type: str
 - 
- 
count: int
 
- 
class swh.storage.cassandra.model.ExtIDRow(extid_type: str, extid: bytes, extid_version: int, target_type: str, target: bytes)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'extid'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('extid_type', 'extid')
 - 
- 
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('extid_version', 'target_type', 'target')
 - 
- 
extid_type: str
 - 
- 
extid: bytes
 - 
- 
extid_version: int
 - 
- 
target_type: str
 - 
- 
target: bytes
 
- 
class swh.storage.cassandra.model.ExtIDByTargetRow(target_type: str, target: bytes, target_token: int)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'extid_by_target'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('target_type', 'target')
 - 
- 
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('target_token',)
 - 
- 
target_type: str
 - 
- 
target: bytes
 - 
- 
target_token: int
- value of token(pk) on the “primary” table 
 
- 
class swh.storage.cassandra.model.ObjectReferenceRow(target_type: str, target: bytes, source_type: str, source: bytes)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'object_references'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('target_type', 'target')
 - 
- 
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('source_type', 'source')
 - 
- 
target_type: str
 - 
- 
target: bytes
 - 
- 
source_type: str
 - 
- 
source: bytes
 
- 
class swh.storage.cassandra.model.ObjectReferencesTableRow(pk: int, name: str, year: int, week: int, start: cassandra.util.Date, end: cassandra.util.Date)[source]
- Bases: - BaseRow
 - 
- 
TABLE: ClassVar[str] = 'object_references_table'
 - 
- 
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('pk',)
 - 
- 
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('name',)
 - 
- 
pk: int
- always zero, puts everything in the same Cassandra partition for faster querying 
 - 
- 
name: str
 - 
- 
year: int
- ISO year. 
 - 
- 
week: int
- ISO week. 
 - 
- 
start: Date
 - 
- 
end: Date