hashiverse_lib/tools/
url_preview.rs1use scraper::{Html, Selector};
15
/// Link-preview metadata extracted from an HTML page.
///
/// Every field is a plain `String`; a value that could not be found in the
/// page is represented by the empty string, which is also what
/// [`Default::default`] produces for each field.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct UrlPreviewData {
    /// Title to show for the link.
    pub title: String,
    /// Short summary to show for the link.
    pub description: String,
    /// URL of the preview image, exactly as it appears in the page.
    pub image_url: String,
    /// URL the page declares for itself, exactly as it appears in the page.
    pub canonical_url: String,
}
22
23pub fn extract_url_preview(html: &str) -> UrlPreviewData {
24 let document = Html::parse_document(html);
25
26 let og_title = select_meta_content(&document, "meta[property='og:title']");
27 let og_description = select_meta_content(&document, "meta[property='og:description']");
28 let og_image = select_meta_content(&document, "meta[property='og:image']");
29 let og_url = select_meta_content(&document, "meta[property='og:url']");
30
31 let title = og_title.or_else(|| select_title(&document)).unwrap_or_default();
32 let description = og_description
33 .or_else(|| select_meta_content(&document, "meta[name='description']"))
34 .unwrap_or_default();
35
36 UrlPreviewData {
37 title,
38 description,
39 image_url: og_image.unwrap_or_default(),
40 canonical_url: og_url.unwrap_or_default(),
41 }
42}
43
44fn select_meta_content(document: &Html, selector_str: &str) -> Option<String> {
45 let selector = Selector::parse(selector_str).ok()?;
46 document.select(&selector).next()?.value().attr("content").map(|s| s.to_string())
47}
48
49fn select_title(document: &Html) -> Option<String> {
50 let selector = Selector::parse("title").ok()?;
51 Some(document.select(&selector).next()?.text().collect::<String>())
52}
53
#[cfg(test)]
mod tests {
    use super::*;

    /// Extracts a preview from `html` and checks all four fields in one go.
    fn assert_preview(
        html: &str,
        title: &str,
        description: &str,
        image_url: &str,
        canonical_url: &str,
    ) {
        let preview = extract_url_preview(html);
        assert_eq!(preview.title, title);
        assert_eq!(preview.description, description);
        assert_eq!(preview.image_url, image_url);
        assert_eq!(preview.canonical_url, canonical_url);
    }

    // A page with a full set of Open Graph tags: every field comes from them.
    #[test]
    fn test_extract_url_preview_with_og_tags() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head>
            <meta property="og:title" content="OG Title" />
            <meta property="og:description" content="OG Description" />
            <meta property="og:image" content="https://example.com/og.png" />
            <meta property="og:url" content="https://example.com/canonical" />
            <title>Page Title</title>
            </head>
            <body></body>
            </html>
        "#;

        assert_preview(
            html,
            "OG Title",
            "OG Description",
            "https://example.com/og.png",
            "https://example.com/canonical",
        );
    }

    // No Open Graph tags: title and description fall back, URLs stay empty.
    #[test]
    fn test_extract_url_preview_fallback_to_title_and_meta_description() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head>
            <title>Fallback Title</title>
            <meta name="description" content="Fallback Description" />
            </head>
            <body></body>
            </html>
        "#;

        assert_preview(html, "Fallback Title", "Fallback Description", "", "");
    }

    // Empty input must not panic and yields an all-empty preview.
    #[test]
    fn test_extract_url_preview_empty_html() {
        assert_preview("", "", "", "", "");
    }

    // `og:title` wins over `<title>` even when it appears later in the head.
    #[test]
    fn test_extract_url_preview_og_overrides_title() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head>
            <title>Page Title</title>
            <meta property="og:title" content="OG Title" />
            </head>
            <body></body>
            </html>
        "#;

        let preview = extract_url_preview(html);
        assert_eq!(preview.title, "OG Title");
    }

    // Fuzzing is skipped on wasm32 targets.
    #[cfg(not(target_arch = "wasm32"))]
    mod bolero_fuzz {
        use super::*;

        // Feeds arbitrary bytes to the extractor; only valid UTF-8 inputs are
        // forwarded, and the result is discarded because the property under
        // test is simply "does not panic".
        #[test]
        fn fuzz_extract_url_preview() {
            bolero::check!().for_each(|bytes: &[u8]| {
                if let Ok(markup) = std::str::from_utf8(bytes) {
                    let _ = extract_url_preview(markup);
                }
            });
        }
    }
}