3. Data Model

v4.0-beta

        erDiagram
%% Single edge collection connecting all entities.
%% Every relationship is routed through EDGE; the quoted label on each line
%% lists the edge_type values that may link the two collections.
%% Cardinality markers use Mermaid's documented sides: "}o" (zero or more)
%% only on the left of the line, "o{" (zero or more) only on the right.
    ATTESTATION }o--|| EDGE : "attests_name, attests_geometry, attests_timespan"
    ATTESTATION }o--|| EDGE : "typed_by, sourced_by, relates_to, meta_attestation"
    EDGE }o--|| ATTESTATION : "subject_of, meta_attestation"
    THING }o--|| EDGE : "subject_of"
    THING ||--o{ EDGE : "relates_to"
    AUTHORITY }o--|| EDGE : "part_of"
    AUTHORITY ||--o{ EDGE : "typed_by, sourced_by, part_of"
    EDGE }o--|| NAME : "attests_name"
    EDGE }o--|| GEOMETRY : "attests_geometry"
    EDGE }o--|| TIMESPAN : "attests_timespan"


%% Single unified edge collection
%% One EDGE document per relationship: _from and _to name the two endpoint
%% documents (in any collection), edge_type selects the relationship kind,
%% and meta_type is filled in only for meta_attestation edges.
    EDGE {
        string _key PK
        string _from "any collection/xyz"
        string _to "any collection/abc"
        string edge_type "subject_of, attests_name, attests_geometry, attests_timespan, relates_to, meta_attestation, typed_by, sourced_by, part_of"
        string meta_type "bundles, contradicts, supersedes, challenges, supports (for meta_attestations only)"
        json properties "flexible storage for edge-specific attributes"
        timestamp created
    }


%% Core entity collections (document collections - vertices/nodes)
%% THING: the entity being described. primary_name and representative_point
%% are denormalized copies (from the name attestations and the geometry
%% respectively) stored directly on the document.
    THING {
        string _key PK "ArangoDB key"
        string _id "things/xyz"
        text description
        string thing_type "location, historical_entity, collection, period, route, itinerary, network"
        string primary_name "denormalized from highest-certainty name attestation"
        point representative_point "denormalized from geometry for spatial indexing"
        timestamp created
        timestamp modified
    }
    %% NAME: one name string with its language and script codes, name type(s),
    %% IPA and romanized forms, and the required embedding used for phonetic search.
    NAME {
        string _key PK
        string _id "names/xyz"
        string name
        string language "ISO 639-3"
        string script "ISO 15924"
        array name_type "toponym, chrononym, ethnonym, odonym, hydronym"
        string ipa "International Phonetic Alphabet"
        string romanized "romanized/transliterated form"
        string transliteration_system "e.g., Pinyin, BGN/PCGN, ISO 259"
        vector embedding "REQUIRED: 256-dimensional vector for phonetic search"
    }
    %% TIMESPAN: a fuzzy interval described by four bigint bounds (earliest and
    %% latest start, earliest and latest stop), plus a label, a precision
    %% descriptor and a PeriodO reference.
    TIMESPAN {
        string _key PK
        string _id "timespans/xyz"
        bigint start_earliest "Unix timestamp (milliseconds) or geological time"
        bigint start_latest "Unix timestamp (milliseconds)"
        bigint stop_earliest "Unix timestamp (milliseconds)"
        bigint stop_latest "Unix timestamp (milliseconds) or future sentinel"
        string label "human-readable period name"
        string precision "year, decade, century, era, geological_period"
        integer precision_value "numeric precision in years"
        string periodo_id "PeriodO URI for standard period definitions"
    }
    %% ATTESTATION: a document that carries only claim metadata (ordering,
    %% network connection details, certainty, notes, timestamps); it declares
    %% no fields that reference other entities.
    ATTESTATION {
        string _key PK
        string _id "attestations/xyz"
        integer sequence "for ordered sequences in routes/itineraries (nullable)"
        json connection_metadata "for networks: trade goods, flow direction, route_type"
        float certainty "0.0 to 1.0 (nullable if unknown)"
        text certainty_note "explanation of certainty assessment"
        text notes "additional context or commentary"
        timestamp created
        timestamp modified
    }
    %% GEOMETRY: a GeoJSON geometry plus helper fields for spatial queries
    %% (representative point, convex hull, bounding box) and its precision
    %% and source coordinate reference system.
    GEOMETRY {
        string _key PK
        string _id "geometries/xyz"
        geometry geom "GeoJSON: Point, MultiPoint, LineString, MultiLineString, Polygon, MultiPolygon"
        point representative_point "single point for spatial indexing and distance queries"
        geometry hull "convex hull for quick spatial filters"
        array bbox "[min_lon, min_lat, max_lon, max_lat]"
        array precision "e.g., [exact, approximate, uncertain, historical_approximate]"
        array precision_km "uncertainty radius in km (can be multiple if heterogeneous)"
        string source_crs "EPSG:4326 or historical/custom CRS identifier"
    }
%% Authority collection (single table inheritance)
%% authority_type is the discriminator. The field groups below are labelled
%% by the authority type they belong to; each field's note states which
%% type(s) it applies to.
    AUTHORITY {
        string _key PK
        string _id "authorities/xyz"
        string authority_type "dataset, source, relation_type, period, classification"
        text description "general description applicable to all types"
        timestamp created
        timestamp modified
        %% Dataset fields
        string title "for datasets: dataset name"
        string version "for datasets: version identifier"
        string publisher "for datasets: publishing institution"
        string license "for datasets: CC-BY, CC0, etc"
        string doi "for datasets: persistent identifier doi:10.83427/whg-dataset-123"
        %% Source fields
        string citation "for sources: bibliographic citation"
        array source_type "for sources: manuscript, inscription, archaeological, published, etc"
        string record_id "for sources: identifier in original source/dataset"
        %% Relation type fields
        string label "for relation_types, periods, classifications: machine-readable identifier"
        string inverse "for relation_types: inverse relation label"
        array domain "for relation_types: valid subject entity types"
        array range "for relation_types: valid object entity types"
        %% Period fields (from PeriodO)
        bigint start_earliest "for periods: temporal bounds"
        bigint start_latest "for periods: temporal bounds"
        bigint stop_earliest "for periods: temporal bounds"
        bigint stop_latest "for periods: temporal bounds"
        %% Classification fields
        string classification_system "for classifications: geonames_fclasses, aat_getty, custom"
        string classification_code "for classifications: A.ADM1, P.PPLA, H.STM, S.ARCH, etc"
        string classification_label "for classifications: human-readable name"
        %% Common fields
        string uri "for all types: external URI (PeriodO, source URL, dataset landing page, authority gazetteer)"
    }
    

Fig. 3.1 Entity–relationship diagram for the WHG v4 data model.

Note

Single EDGE Collection: The v4 model uses a unified edge collection with an edge_type field to distinguish between different relationship types (“subject_of”, “attests_name”, “attests_geometry”, “attests_timespan”, “relates_to”, “meta_attestation”, “typed_by”, “sourced_by”, “part_of”). This edge collection is separate from the attestations collection—attestations are nodes (documents) that serve as junction points, while edges connect them to other entities. Using a single edge collection provides four key benefits:

  1. Simplified schema management - One collection to maintain rather than separate collections for each relationship type

  2. Flexible relationship vocabulary - New edge types can be added without schema changes

  3. Efficient graph traversal - Graph algorithms can traverse all relationships uniformly

  4. Reduced operational overhead - Fewer indexes, backups, and permissions to manage

AUTHORITY Collection (Single Table Inheritance): Reference data (datasets, sources, relation_types, periods, classifications) is unified in a single AUTHORITY collection using an authority_type discriminator field. This provides two key efficiencies:

  1. Reduced operational overhead - Managing one collection is simpler than maintaining five separate small collections (fewer indexes, backups, permissions to manage)

  2. Eliminated redundancy - Source metadata is stored once and referenced by multiple attestations through edges, rather than duplicated across millions of attestations that cite the same sources

ATTESTATIONS Collection (Document Collection): Attestations are stored as documents in a standard document collection, not as edges. Each attestation is a node in the graph containing only metadata (certainty, notes, sequence, connection_metadata, timestamps). All relationships (subject_of, attests_name, attests_geometry, etc.) are expressed through edges in the EDGE collection that connect attestation nodes to other entities. This architecture enables attestations to serve as junction points that bundle multiple claims together while maintaining clean separation between entities and relationships.

For Thing-to-Thing relationships using edge_type: "relates_to", the edge references an AUTHORITY document where authority_type: "relation_type" to specify the semantic nature of the relationship (e.g., “capital_of”, “successor_to”). This keeps the core model stable while allowing the vocabulary of historical relationships to grow organically as new use cases emerge.