Skip to main content

hashiverse_lib/tools/
url_preview.rs

1//! # Open Graph / link-preview extraction
2//!
3//! Parses the `<head>` of an HTML page and extracts the fields needed to render a link
4//! preview card in a post: title, description, image, and canonical URL.
5//!
6//! Open Graph (`og:title`, `og:description`, `og:image`, `og:url`) is preferred; the
7//! extractor falls back to the page's `<title>` element and `<meta name="description">`
8//! when OG isn't present. The [`UrlPreviewData`] struct is what callers hand up to the
9//! protocol layer — servers fetch the target URL under an RPC budget gated by
10//! `POW_MINIMUM_PER_URL_FETCH` and return this struct back to the client so the preview
11//! card can render without every client individually fetching (and thus leaking its IP to)
12//! the target site.
13
14use scraper::{Html, Selector};
15
/// The fields needed to render a link-preview card for a URL.
///
/// All fields are plain strings; [`extract_url_preview`] leaves a field empty when the page
/// does not supply a value for it.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct UrlPreviewData {
    /// Preview title: `og:title`, or the text of the `<title>` element as a fallback.
    pub title: String,
    /// Preview description: `og:description`, or `<meta name="description">` as a fallback.
    pub description: String,
    /// Preview image URL, taken from `og:image`.
    pub image_url: String,
    /// Canonical URL of the page, taken from `og:url`.
    pub canonical_url: String,
}
22
23pub fn extract_url_preview(html: &str) -> UrlPreviewData {
24    let document = Html::parse_document(html);
25
26    let og_title = select_meta_content(&document, "meta[property='og:title']");
27    let og_description = select_meta_content(&document, "meta[property='og:description']");
28    let og_image = select_meta_content(&document, "meta[property='og:image']");
29    let og_url = select_meta_content(&document, "meta[property='og:url']");
30
31    let title = og_title.or_else(|| select_title(&document)).unwrap_or_default();
32    let description = og_description
33        .or_else(|| select_meta_content(&document, "meta[name='description']"))
34        .unwrap_or_default();
35
36    UrlPreviewData {
37        title,
38        description,
39        image_url: og_image.unwrap_or_default(),
40        canonical_url: og_url.unwrap_or_default(),
41    }
42}
43
44fn select_meta_content(document: &Html, selector_str: &str) -> Option<String> {
45    let selector = Selector::parse(selector_str).ok()?;
46    document.select(&selector).next()?.value().attr("content").map(|s| s.to_string())
47}
48
49fn select_title(document: &Html) -> Option<String> {
50    let selector = Selector::parse("title").ok()?;
51    Some(document.select(&selector).next()?.text().collect::<String>())
52}
53
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks all four preview fields against the expected values.
    fn assert_preview(
        preview: &UrlPreviewData,
        title: &str,
        description: &str,
        image_url: &str,
        canonical_url: &str,
    ) {
        assert_eq!(preview.title, title);
        assert_eq!(preview.description, description);
        assert_eq!(preview.image_url, image_url);
        assert_eq!(preview.canonical_url, canonical_url);
    }

    /// Open Graph tags supply every field, even though a `<title>` element is also present.
    #[test]
    fn test_extract_url_preview_with_og_tags() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head>
                <meta property="og:title" content="OG Title" />
                <meta property="og:description" content="OG Description" />
                <meta property="og:image" content="https://example.com/og.png" />
                <meta property="og:url" content="https://example.com/canonical" />
                <title>Page Title</title>
            </head>
            <body></body>
            </html>
        "#;

        let preview = extract_url_preview(html);
        assert_preview(
            &preview,
            "OG Title",
            "OG Description",
            "https://example.com/og.png",
            "https://example.com/canonical",
        );
    }

    /// Without Open Graph tags, the title and description come from `<title>` and
    /// `<meta name="description">`; the image and canonical URL stay empty.
    #[test]
    fn test_extract_url_preview_fallback_to_title_and_meta_description() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head>
                <title>Fallback Title</title>
                <meta name="description" content="Fallback Description" />
            </head>
            <body></body>
            </html>
        "#;

        let preview = extract_url_preview(html);
        assert_preview(&preview, "Fallback Title", "Fallback Description", "", "");
    }

    /// Empty input produces a preview whose fields are all empty strings.
    #[test]
    fn test_extract_url_preview_empty_html() {
        let preview = extract_url_preview("");
        assert_preview(&preview, "", "", "", "");
    }

    /// `og:title` takes precedence over `<title>` even when it appears later in the document.
    #[test]
    fn test_extract_url_preview_og_overrides_title() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head>
                <title>Page Title</title>
                <meta property="og:title" content="OG Title" />
            </head>
            <body></body>
            </html>
        "#;

        let preview = extract_url_preview(html);
        assert_eq!(preview.title, "OG Title");
    }

    // Not compiled for wasm32 targets.
    #[cfg(not(target_arch = "wasm32"))]
    mod bolero_fuzz {
        use super::*;

        /// Runs the extractor over bolero-supplied byte slices. The result is discarded and
        /// nothing is asserted, so this only exercises the call for panics.
        #[test]
        fn fuzz_extract_url_preview() {
            bolero::check!().for_each(|bytes: &[u8]| {
                // The extractor takes `&str`, so inputs that are not valid UTF-8 are skipped.
                if let Ok(html) = std::str::from_utf8(bytes) {
                    let _ = extract_url_preview(html);
                }
            });
        }
    }
}