Skip to main content

hashiverse_lib/tools/
buckets.rs

1//! # Hierarchical time buckets for post storage and discovery
2//!
3//! Posts on hashiverse aren't stored by post id — they are stored inside *buckets* keyed
4//! by `(bucket_type, base_id, duration, bucket_time_millis)`, and every bucket deterministically
5//! hashes down to a single [`Id`] known as its `location_id`. That location_id is what the
6//! Kademlia DHT uses to decide which servers are responsible for holding the bucket.
7//!
8//! ## Bucket types
9//!
10//! [`BucketType`] picks what the bucket is *about*:
11//! - `User` — posts authored by a specific client id
12//! - `Hashtag` — posts tagged with a given hashtag id
13//! - `Mention` — posts that mention a given client id
14//! - `ReplyToPost` — replies to a specific parent post
15//! - `Sequel` — long-running threads that continue from an earlier post
16//!
17//! ## Duration hierarchy
18//!
19//! [`BUCKET_DURATIONS`] is a coarse-to-fine ladder (year → month → week → day → 6h → 1h →
20//! 15m → 5m → 1m) chosen so that each level fans out ~4–6x into the next. When a bucket
21//! overflows at one granularity the client recurses into the finer granularity; the fan-out
22//! factor keeps the worst case from exploding into thousands of tiny buckets.
23//!
24//! Sequel buckets start at year granularity (cheaper discoverability for long-lived threads);
25//! all other types start at month so that a single location_id isn't responsible for a full
26//! year of heavy activity — see [`bucket_durations_for_type`].
27//!
28//! ## Types
29//!
30//! - [`Bucket`] — a duration-only description of a bucket.
31//! - [`BucketLocation`] — the full `(bucket_type, base_id, duration, bucket_time_millis,
32//!   location_id)` tuple with deterministic hashing, self-verification via
33//!   [`BucketLocation::validate`], and a versioned tilde-delimited serialisation for use in
34//!   HTML attributes and URLs.
35
36use crate::tools::hashing;
37use crate::tools::time::{DurationMillis, MILLIS_IN_DAY, MILLIS_IN_HOUR, MILLIS_IN_MINUTE, MILLIS_IN_MONTH, MILLIS_IN_WEEK, MILLIS_IN_YEAR, TimeMillis};
38use crate::tools::types::Id;
39use derive_more::Display;
40use serde::{Deserialize, Serialize};
41
/// A duration-only description of a bucket: just the granularity of its time
/// window, without the `(bucket_type, base_id, time)` coordinates that pin
/// down a concrete [`BucketLocation`].
pub struct Bucket {
    // Width of the bucket's time window — typically one of [`BUCKET_DURATIONS`].
    pub duration: DurationMillis,
}
45
46#[repr(u8)]
47#[derive(Serialize, Deserialize, Debug, PartialEq, Clone, Copy, Display)]
48pub enum BucketType {
49    User = 0,
50    Hashtag = 1,
51    Mention = 2,
52    ReplyToPost = 3,
53    Sequel = 4,
54}
55
/// Fully-resolved coordinates of a bucket plus the `location_id` that is
/// deterministically derived from them (see [`BucketLocation::new`]).
/// The `location_id` is what the DHT uses to place the bucket on servers.
#[derive(Serialize, Deserialize, Debug, PartialEq, Clone, Display)]
#[display(fmt = "[ location_id: {}, bucket_type: {}, base_id: {}, duration: {}, bucket_time_millis: {}]", location_id, bucket_type, base_id, duration, bucket_time_millis)]
pub struct BucketLocation {
    // What the bucket indexes by (user, hashtag, mention, reply, sequel).
    pub bucket_type: BucketType,
    // The id the bucket is keyed on (client id, hashtag id, post id, …, per bucket_type).
    pub base_id: Id,
    pub duration: DurationMillis, // The bucket granularity
    pub bucket_time_millis: TimeMillis,    // The timestamp at the beginning of the bucket (i.e. rounded down by duration)
    pub location_id: Id,          // The resulting location_id
}
65
66impl BucketLocation {
67    pub fn round_down_to_bucket_start(timestamp: TimeMillis, duration: DurationMillis) -> TimeMillis {
68        TimeMillis((timestamp.0 / duration.0) * duration.0)
69    }
70
71    pub fn new(bucket_type: BucketType, base_id: Id, duration: DurationMillis, timestamp: TimeMillis) -> anyhow::Result<Self> {
72        let bucket_time_millis = Self::round_down_to_bucket_start(timestamp, duration);
73
74        let duration_be = duration.encode_be();
75        let bucket_time_millis_be = bucket_time_millis.encode_be();
76
77        let hash = hashing::hash_multiple(&[&[bucket_type as u8], base_id.as_ref(), duration_be.as_ref(), bucket_time_millis_be.as_ref()]);
78        let location_id = Id::from_hash(hash)?;
79
80        Ok(BucketLocation { bucket_type, base_id, duration, bucket_time_millis, location_id })
81    }
82    pub fn get_hash_for_signing(&self) -> crate::tools::types::Hash {
83        let bucket_type_byte = [self.bucket_type as u8];
84        let duration_be = self.duration.encode_be();
85        let bucket_time_be = self.bucket_time_millis.encode_be();
86        hashing::hash_multiple(&[&bucket_type_byte, self.base_id.as_ref(), duration_be.as_ref(), bucket_time_be.as_ref()])
87    }
88
89    pub fn validate(&self) -> anyhow::Result<()> {
90        let other = Self::new(self.bucket_type, self.base_id, self.duration, self.bucket_time_millis)?;
91        if self.location_id != other.location_id {
92            anyhow::bail!("BucketLocation validation failed");
93        }
94        Ok(())
95    }
96
97    /// Serialise to a versioned, tilde-delimited string safe for use as an HTML attribute value or URL segment.
98    /// Format: `{version}~{bucket_type}~{base_id_hex}~{duration}~{bucket_time_millis}`
99    /// Example: `1~User~aabb...cc~1D~20240115.123045.000`
100    /// `location_id` is omitted — it is recomputed from the other fields on deserialisation.
101    /// Future versions may append additional tilde-separated fields; readers must ignore unknown fields.
102    pub fn to_html_attr(&self) -> String {
103        format!("1~{}~{}~{}~{}", self.bucket_type, self.base_id.to_hex_str(), self.duration, self.bucket_time_millis)
104    }
105
106    /// Deserialise from the format produced by [`BucketLocation::to_html_attr`].
107    pub fn from_html_attr(s: &str) -> anyhow::Result<Self> {
108        let parts: Vec<&str> = s.splitn(6, '~').collect();
109        anyhow::ensure!(parts.len() >= 5, "Invalid BucketLocation attr: expected at least 5 tilde-separated parts");
110        anyhow::ensure!(parts[0] == "1", "Unsupported BucketLocation attr version: {}", parts[0]);
111
112        let bucket_type = match parts[1] {
113            "User" => BucketType::User,
114            "Hashtag" => BucketType::Hashtag,
115            "Mention" => BucketType::Mention,
116            "ReplyToPost" => BucketType::ReplyToPost,
117            "Sequel" => BucketType::Sequel,
118            other => anyhow::bail!("Unknown BucketType: {}", other),
119        };
120        let base_id = Id::from_hex_str(parts[2])?;
121        let duration = DurationMillis::parse(parts[3])?;
122        let bucket_time_millis = TimeMillis::parse(parts[4])?;
123
124        Self::new(bucket_type, base_id, duration, bucket_time_millis)
125    }
126}
127pub fn generate_bucket_location(bucket_type: BucketType, base_id: Id, bucket_duration: DurationMillis, time_millis: TimeMillis) -> anyhow::Result<BucketLocation> {
128        BucketLocation::new(bucket_type, base_id, bucket_duration, time_millis)
129    }
130
/// These are the durations of the hierarchical buckets that posts will collect in.
/// They are spaced such that there are always roughly 4-6 "more granular" buckets per "less granular" bucket.
/// This ensures that no overflowed bucket would result in many tiny buckets needing to be scanned at the next level.
/// Also make sure that the "less granular" bucket is a round multiple of its "more granular" successor
/// (verified by `ensure_bucket_duration_multiples_test`).
pub const BUCKET_DURATIONS: [DurationMillis; 9] = [MILLIS_IN_YEAR, MILLIS_IN_MONTH, MILLIS_IN_WEEK, MILLIS_IN_DAY, MILLIS_IN_HOUR.const_mul(6), MILLIS_IN_HOUR, MILLIS_IN_MINUTE.const_mul(15), MILLIS_IN_MINUTE.const_mul(5), MILLIS_IN_MINUTE];
136
137/// Returns the appropriate bucket duration slice for a given bucket type.
138/// Sequel buckets start at year granularity for cheaper discoverability.
139/// All other bucket types start at month granularity so that a single location_id (set of servers) is not responsible for a year of potentially high activity.
140pub fn bucket_durations_for_type(bucket_type: BucketType) -> &'static [DurationMillis] {
141    match bucket_type {
142        BucketType::Sequel => &BUCKET_DURATIONS,       // starts at year
143        _ => &BUCKET_DURATIONS[1..],                    // starts at month
144    }
145}
146
147
#[cfg(test)]
pub mod tests {
    use crate::tools::buckets::{bucket_durations_for_type, BucketLocation, BucketType, BUCKET_DURATIONS};
    use crate::tools::time::{TimeMillis, MILLIS_IN_DAY, MILLIS_IN_MONTH, MILLIS_IN_YEAR};
    use crate::tools::types::Id;

    // None of these tests await anything, so they are plain `#[test]`s rather
    // than `#[tokio::test] async` — no async runtime needs to be spun up.

    /// Each duration must be an exact multiple of the next finer one so that
    /// finer buckets tile a coarser bucket without overlap or remainder.
    #[test]
    fn ensure_bucket_duration_multiples_test() -> anyhow::Result<()> {
        for pair in BUCKET_DURATIONS.windows(2) {
            assert_eq!(0, pair[0].0 % pair[1].0, "{} is not a multiple of {}", pair[0].0, pair[1].0);
        }
        Ok(())
    }

    /// `to_html_attr` → `from_html_attr` must reproduce the original location exactly.
    #[test]
    fn bucket_location_html_attr_round_trip() -> anyhow::Result<()> {
        let base_id = Id::random();
        let original = BucketLocation::new(BucketType::Hashtag, base_id, MILLIS_IN_DAY, TimeMillis(1_700_000_000_000))?;
        let attr = original.to_html_attr();
        let restored = BucketLocation::from_html_attr(&attr)?;
        assert_eq!(original, restored);
        // Sanity: attr starts with version prefix
        assert!(attr.starts_with("1~"));
        Ok(())
    }

    /// Same round trip for the Sequel variant (year-granularity entry point).
    #[test]
    fn bucket_location_sequel_html_attr_round_trip() -> anyhow::Result<()> {
        let base_id = Id::random();
        let original = BucketLocation::new(BucketType::Sequel, base_id, MILLIS_IN_YEAR, TimeMillis(1_700_000_000_000))?;
        let attr = original.to_html_attr();
        let restored = BucketLocation::from_html_attr(&attr)?;
        assert_eq!(original, restored);
        assert!(attr.contains("Sequel"));
        Ok(())
    }

    /// Sequel buckets get the full ladder (year-first); everything else drops
    /// the year level and starts at month.
    #[test]
    fn bucket_durations_for_type_sequel_starts_at_year() -> anyhow::Result<()> {
        let sequel_durations = bucket_durations_for_type(BucketType::Sequel);
        let user_durations = bucket_durations_for_type(BucketType::User);
        assert_eq!(sequel_durations[0], MILLIS_IN_YEAR);
        assert_eq!(user_durations[0], MILLIS_IN_MONTH);
        assert_eq!(sequel_durations.len(), user_durations.len() + 1);
        Ok(())
    }
}