use anyhow::{anyhow, Result}; use scraper::{Html, Selector}; #[derive(Debug, Clone)] pub enum ContentBlock { Title(String), Subtitle(String), Paragraph(String), Image { url: String, alt: String }, Quote(String), Heading(String), ListItem(String), } #[derive(Debug)] pub struct Article { pub title: String, pub content: Vec, pub source_url: String, } pub fn parse_article(html: &str, source_url: &str) -> Result
{ let document = Html::parse_document(html); let title = extract_title(&document); let content = extract_content(&document, source_url); if content.is_empty() { return Err(anyhow!("Nao foi possivel extrair conteudo do artigo")); } Ok(Article { title, content, source_url: source_url.to_string(), }) } fn extract_title(document: &Html) -> String { let selectors = [ // Sites BR ".content-head__title", ".titulo-materia", ".materia-titulo", ".post-title", ".entry-title", ".article-title", ".headline", // Sites internacionais "h1[data-testid='headline']", "h1.article-headline", "h1.post-headline", "h1.story-title", // Genéricos "article h1", ".headline h1", "header h1", "h1", "title", ]; for sel_str in selectors { if let Ok(selector) = Selector::parse(sel_str) { if let Some(element) = document.select(&selector).next() { let text: String = element.text().collect::>().join(" "); let text = text.trim().to_string(); if !text.is_empty() && text.len() > 5 { return text; } } } } String::from("Sem titulo") } fn extract_content(document: &Html, base_url: &str) -> Vec { let mut content = Vec::new(); // Seletores para o container principal do artigo // Ordem importa: mais específicos primeiro let article_selectors = [ // Globo/O Globo/G1 ".mc-article-body", ".mc-body", ".content-text__container", ".materia-conteudo", ".post-content", // Folha de SP ".c-news__body", ".news__content", // Estadão ".news-body", ".content-body", // UOL ".text", ".corpo-texto", // Valor Econômico ".article-text", // Exame ".single-content", // Veja ".article-content", // Medium/Freedium/Scribe.rip "article section", "article", ".main-content", ".postArticle-content", // NY Times "[data-testid='article-body']", ".StoryBodyCompanionColumn", ".story-body-supplemental", // Washington Post ".article-body", ".teaser-content", // The Guardian ".article-body-commercial-selector", ".content__article-body", // Wall Street Journal ".article-content", ".wsj-snippet-body", // Bloomberg ".body-content", // Financial Times ".article__content-body", // Forbes ".article-body", ".vestibule", // Reuters ".article-body__content", // BBC "[data-component='text-block']", ".story-body__inner", // CNN ".article__content", ".zn-body__paragraph", // Wired ".body__inner-container", // The Atlantic ".article-body", // Economist ".article__body", // Archive.is wrapper "#CONTENT", // Genéricos (fallback) "article", "[role='main']", "[role='article']", "main", ".entry-content", ".story-body", ".article-body", ".content", "#content", "#article-body", ".post", "body", ]; let mut article_html: Option = None; for sel_str in article_selectors { if let Ok(selector) = Selector::parse(sel_str) { if let Some(element) = document.select(&selector).next() { // Verifica se tem conteúdo útil let text_len: usize = element.text().collect::().len(); if text_len > 200 { article_html = Some(element); break; } } } } let article = match article_html { Some(a) => a, None => return content, }; // Extrai título se presente if let Ok(h1_sel) = Selector::parse("h1") { if let Some(h1) = article.select(&h1_sel).next() { let text: String = h1.text().collect::>().join(" "); let text = clean_text(&text); if !text.is_empty() && text.len() > 5 { content.push(ContentBlock::Title(text)); } } } // Extrai subtítulo/lead let subtitle_selectors = [ ".content-head__subtitle", ".subtitulo", ".lead", ".excerpt", ".article-summary", ".deck", "h2.subtitle", ]; for sel_str in subtitle_selectors { if let Ok(selector) = Selector::parse(sel_str) { if let Some(element) = article.select(&selector).next() { let text: String = element.text().collect::>().join(" "); let text = clean_text(&text); if !text.is_empty() && text.len() > 10 { content.push(ContentBlock::Subtitle(text)); break; } } } } // Seletor abrangente para conteúdo let content_selector = "p, h2, h3, h4, h5, h6, blockquote, img, li, figure img, \ .content-text, .paragraph, .text-paragraph, \ .content-intertitle h2, .intertitle, \ [data-component='text-block'], \ .paywall"; if let Ok(p_sel) = Selector::parse(content_selector) { for element in article.select(&p_sel) { let tag_name = element.value().name(); let class_attr = element.value().attr("class").unwrap_or(""); // Ignora elementos de navegação, ads, etc if should_skip_element(class_attr) { continue; } match tag_name { "p" => { let text: String = element.text().collect::>().join(" "); let text = clean_text(&text); if is_valid_paragraph(&text) { content.push(ContentBlock::Paragraph(text)); } } "div" if class_attr.contains("text") || class_attr.contains("paragraph") => { let text: String = element.text().collect::>().join(" "); let text = clean_text(&text); if is_valid_paragraph(&text) { content.push(ContentBlock::Paragraph(text)); } } "h2" | "h3" | "h4" | "h5" | "h6" => { let text: String = element.text().collect::>().join(" "); let text = clean_text(&text); if !text.is_empty() && text.len() > 2 { content.push(ContentBlock::Heading(text)); } } "blockquote" => { let text: String = element.text().collect::>().join(" "); let text = clean_text(&text); if !text.is_empty() { content.push(ContentBlock::Quote(text)); } } "li" => { let text: String = element.text().collect::>().join(" "); let text = clean_text(&text); if !text.is_empty() && text.len() > 5 { content.push(ContentBlock::ListItem(text)); } } "img" | "figure" => { if let Some(src) = element.value().attr("src") .or_else(|| element.value().attr("data-src")) .or_else(|| element.value().attr("data-lazy-src")) { let url = resolve_url(src, base_url); let alt = element.value().attr("alt").unwrap_or("").to_string(); if is_valid_image_url(&url) { content.push(ContentBlock::Image { url, alt }); } } } _ => {} } } } // Fallback 1: tenta pegar todos os

do documento if content.len() <= 2 { if let Ok(p_sel) = Selector::parse("p") { for element in document.select(&p_sel) { let class_attr = element.value().attr("class").unwrap_or(""); if should_skip_element(class_attr) { continue; } let text: String = element.text().collect::>().join(" "); let text = clean_text(&text); if is_valid_paragraph(&text) { content.push(ContentBlock::Paragraph(text)); } } } } // Fallback 2: tenta divs com texto if content.len() <= 2 { let div_selectors = "div.text, div.paragraph, div.content-text, \ section p, article p, .prose p, .body p"; if let Ok(div_sel) = Selector::parse(div_selectors) { for element in document.select(&div_sel) { let text: String = element.text().collect::>().join(" "); let text = clean_text(&text); if is_valid_paragraph(&text) { content.push(ContentBlock::Paragraph(text)); } } } } // Fallback 3: extrai texto de qualquer elemento com conteúdo substancial if content.len() <= 2 { extract_text_fallback(document, &mut content); } // Remove duplicatas consecutivas content.dedup_by(|a, b| { match (a, b) { (ContentBlock::Paragraph(t1), ContentBlock::Paragraph(t2)) => t1 == t2, (ContentBlock::Heading(t1), ContentBlock::Heading(t2)) => t1 == t2, _ => false, } }); content } fn should_skip_element(class_attr: &str) -> bool { let skip_classes = [ "nav", "menu", "footer", "header", "sidebar", "comment", "share", "social", "related", "recommend", "advertisement", "banner", "promo", "newsletter", "subscribe", "login", "signup", "paywall-msg", "paywall-banner", "author-bio", "tags", "breadcrumb", "pagination", "copyright", ]; let lower = class_attr.to_lowercase(); for skip in skip_classes { // Check for word boundary match to avoid false positives like "leading" matching "ad" for class in lower.split_whitespace() { if class == skip || class.starts_with(&format!("{}-", skip)) || class.ends_with(&format!("-{}", skip)) { return true; } } } // Also check for specific ad-related patterns if lower.split_whitespace().any(|c| c == "ad" || c == "ads" || c.starts_with("ad-") || c.starts_with("ads-")) { return true; } false } fn is_valid_paragraph(text: &str) -> bool { if text.is_empty() || text.len() < 15 { return false; } // Ignora textos que parecem ser metadata let invalid_starts = [ "compartilh", "compart", "publicado", "atualizado", "por ", "foto:", "imagem:", "crédito", "leia mais", "veja também", "saiba mais", "continue lendo", "assine", "cadastre", "©", "copyright", "todos os direitos", ]; let lower = text.to_lowercase(); for start in invalid_starts { if lower.starts_with(start) { return false; } } // Deve ter pelo menos algumas palavras text.split_whitespace().count() >= 4 } fn is_valid_image_url(url: &str) -> bool { !url.contains("data:") && !url.contains("base64") && !url.contains("pixel") && !url.contains("tracking") && !url.contains("1x1") && !url.contains("spacer") && !url.contains("blank") && !url.contains("logo") && !url.contains("icon") && (url.contains(".jpg") || url.contains(".jpeg") || url.contains(".png") || url.contains(".webp") || url.contains(".gif") || url.contains("image")) } fn extract_text_fallback(document: &Html, content: &mut Vec) { // Última tentativa: pega blocos de texto grandes if let Ok(selector) = Selector::parse("div, section, article") { let mut texts: Vec = Vec::new(); for element in document.select(&selector) { let class_attr = element.value().attr("class").unwrap_or(""); if should_skip_element(class_attr) { continue; } // Pega apenas texto direto (não de filhos) let text: String = element .text() .collect::>() .join(" "); let text = clean_text(&text); if text.len() > 100 && !texts.contains(&text) { // Divide em parágrafos por pontuação for part in text.split(". ") { let part = part.trim(); if part.len() > 50 { let para = if part.ends_with('.') { part.to_string() } else { format!("{}.", part) }; if !texts.contains(¶) { texts.push(para.clone()); content.push(ContentBlock::Paragraph(para)); } } } } } } } fn clean_text(text: &str) -> String { text.split_whitespace() .collect::>() .join(" ") .trim() .to_string() } fn resolve_url(src: &str, base_url: &str) -> String { if src.starts_with("http://") || src.starts_with("https://") { return src.to_string(); } if src.starts_with("//") { return format!("https:{}", src); } if let Some(pos) = base_url.find("://") { let after_protocol = &base_url[pos + 3..]; if let Some(slash_pos) = after_protocol.find('/') { let base = &base_url[..pos + 3 + slash_pos]; if src.starts_with('/') { return format!("{}{}", base, src); } else { return format!("{}/{}", base, src); } } } format!("{}/{}", base_url.trim_end_matches('/'), src.trim_start_matches('/')) }