RIP/src/parser.rs
2025-12-30 10:04:34 -03:00

477 lines
15 KiB
Rust

use anyhow::{anyhow, Result};
use scraper::{Html, Selector};
/// A single unit of extracted article content.
///
/// `extract_content` produces these in the order the matching elements are
/// visited in the document.
#[derive(Debug, Clone)]
pub enum ContentBlock {
/// Headline text taken from the first `<h1>` found inside the article container.
Title(String),
/// Subtitle / lead text taken from the first matching subtitle selector.
Subtitle(String),
/// Body paragraph text (from `<p>`, text-like `<div>`s, or the fallbacks).
Paragraph(String),
/// An image: `url` is the source resolved against the page URL, `alt` is the
/// `alt` attribute (empty when the attribute is missing).
Image { url: String, alt: String },
/// Text of a `<blockquote>` element.
Quote(String),
/// Text of an `<h2>`–`<h6>` element.
Heading(String),
/// Text of an `<li>` element.
ListItem(String),
}
/// An article extracted from an HTML page by `parse_article`.
#[derive(Debug)]
pub struct Article {
/// Headline text; `parse_article` fills it with the result of `extract_title`.
pub title: String,
/// Extracted content blocks; `parse_article` guarantees this is non-empty.
pub content: Vec<ContentBlock>,
/// The `source_url` argument given to `parse_article`, stored as an owned string.
pub source_url: String,
}
/// Parses raw HTML into an [`Article`].
///
/// `source_url` is the address the page was fetched from; it is stored in the
/// result and passed to `extract_content` as the base for resolving image URLs.
///
/// # Errors
///
/// Returns an error when no content block could be extracted from `html`.
pub fn parse_article(html: &str, source_url: &str) -> Result<Article> {
    let document = Html::parse_document(html);

    // Without any body content there is nothing worth returning.
    let blocks = extract_content(&document, source_url);
    if blocks.is_empty() {
        return Err(anyhow!("Nao foi possivel extrair conteudo do artigo"));
    }

    Ok(Article {
        title: extract_title(&document),
        content: blocks,
        source_url: source_url.to_owned(),
    })
}
/// Returns the article headline found in `document`.
///
/// Selectors are tried from the most site-specific to the most generic; for
/// each selector only the first matching element is inspected. The first
/// candidate whose whitespace-normalised text is longer than 5 bytes wins.
/// When nothing qualifies, the placeholder `"Sem titulo"` is returned.
fn extract_title(document: &Html) -> String {
    const TITLE_SELECTORS: &[&str] = &[
        // Brazilian sites
        ".content-head__title",
        ".titulo-materia",
        ".materia-titulo",
        ".post-title",
        ".entry-title",
        ".article-title",
        ".headline",
        // International sites
        "h1[data-testid='headline']",
        "h1.article-headline",
        "h1.post-headline",
        "h1.story-title",
        // Generic fallbacks
        "article h1",
        ".headline h1",
        "header h1",
        "h1",
        "title",
    ];

    for sel_str in TITLE_SELECTORS.iter().copied() {
        let selector = match Selector::parse(sel_str) {
            Ok(selector) => selector,
            Err(_) => continue,
        };
        if let Some(element) = document.select(&selector).next() {
            // Collapse every whitespace run (including newlines and indentation
            // coming from nested markup) into a single space, so the title is a
            // clean single line like the text blocks produced elsewhere.
            let text = element
                .text()
                .flat_map(|chunk| chunk.split_whitespace())
                .collect::<Vec<_>>()
                .join(" ");
            if text.len() > 5 {
                return text;
            }
        }
    }

    String::from("Sem titulo")
}
fn extract_content(document: &Html, base_url: &str) -> Vec<ContentBlock> {
let mut content = Vec::new();
// Seletores para o container principal do artigo
// Ordem importa: mais específicos primeiro
let article_selectors = [
// Globo/O Globo/G1
".mc-article-body",
".mc-body",
".content-text__container",
".materia-conteudo",
".post-content",
// Folha de SP
".c-news__body",
".news__content",
// Estadão
".news-body",
".content-body",
// UOL
".text",
".corpo-texto",
// Valor Econômico
".article-text",
// Exame
".single-content",
// Veja
".article-content",
// Medium/Freedium/Scribe.rip
"article section",
"article",
".main-content",
".postArticle-content",
// NY Times
"[data-testid='article-body']",
".StoryBodyCompanionColumn",
".story-body-supplemental",
// Washington Post
".article-body",
".teaser-content",
// The Guardian
".article-body-commercial-selector",
".content__article-body",
// Wall Street Journal
".article-content",
".wsj-snippet-body",
// Bloomberg
".body-content",
// Financial Times
".article__content-body",
// Forbes
".article-body",
".vestibule",
// Reuters
".article-body__content",
// BBC
"[data-component='text-block']",
".story-body__inner",
// CNN
".article__content",
".zn-body__paragraph",
// Wired
".body__inner-container",
// The Atlantic
".article-body",
// Economist
".article__body",
// Archive.is wrapper
"#CONTENT",
// Genéricos (fallback)
"article",
"[role='main']",
"[role='article']",
"main",
".entry-content",
".story-body",
".article-body",
".content",
"#content",
"#article-body",
".post",
"body",
];
let mut article_html: Option<scraper::ElementRef> = None;
for sel_str in article_selectors {
if let Ok(selector) = Selector::parse(sel_str) {
if let Some(element) = document.select(&selector).next() {
// Verifica se tem conteúdo útil
let text_len: usize = element.text().collect::<String>().len();
if text_len > 200 {
article_html = Some(element);
break;
}
}
}
}
let article = match article_html {
Some(a) => a,
None => return content,
};
// Extrai título se presente
if let Ok(h1_sel) = Selector::parse("h1") {
if let Some(h1) = article.select(&h1_sel).next() {
let text: String = h1.text().collect::<Vec<_>>().join(" ");
let text = clean_text(&text);
if !text.is_empty() && text.len() > 5 {
content.push(ContentBlock::Title(text));
}
}
}
// Extrai subtítulo/lead
let subtitle_selectors = [
".content-head__subtitle",
".subtitulo",
".lead",
".excerpt",
".article-summary",
".deck",
"h2.subtitle",
];
for sel_str in subtitle_selectors {
if let Ok(selector) = Selector::parse(sel_str) {
if let Some(element) = article.select(&selector).next() {
let text: String = element.text().collect::<Vec<_>>().join(" ");
let text = clean_text(&text);
if !text.is_empty() && text.len() > 10 {
content.push(ContentBlock::Subtitle(text));
break;
}
}
}
}
// Seletor abrangente para conteúdo
let content_selector = "p, h2, h3, h4, h5, h6, blockquote, img, li, figure img, \
.content-text, .paragraph, .text-paragraph, \
.content-intertitle h2, .intertitle, \
[data-component='text-block'], \
.paywall";
if let Ok(p_sel) = Selector::parse(content_selector) {
for element in article.select(&p_sel) {
let tag_name = element.value().name();
let class_attr = element.value().attr("class").unwrap_or("");
// Ignora elementos de navegação, ads, etc
if should_skip_element(class_attr) {
continue;
}
match tag_name {
"p" => {
let text: String = element.text().collect::<Vec<_>>().join(" ");
let text = clean_text(&text);
if is_valid_paragraph(&text) {
content.push(ContentBlock::Paragraph(text));
}
}
"div" if class_attr.contains("text") || class_attr.contains("paragraph") => {
let text: String = element.text().collect::<Vec<_>>().join(" ");
let text = clean_text(&text);
if is_valid_paragraph(&text) {
content.push(ContentBlock::Paragraph(text));
}
}
"h2" | "h3" | "h4" | "h5" | "h6" => {
let text: String = element.text().collect::<Vec<_>>().join(" ");
let text = clean_text(&text);
if !text.is_empty() && text.len() > 2 {
content.push(ContentBlock::Heading(text));
}
}
"blockquote" => {
let text: String = element.text().collect::<Vec<_>>().join(" ");
let text = clean_text(&text);
if !text.is_empty() {
content.push(ContentBlock::Quote(text));
}
}
"li" => {
let text: String = element.text().collect::<Vec<_>>().join(" ");
let text = clean_text(&text);
if !text.is_empty() && text.len() > 5 {
content.push(ContentBlock::ListItem(text));
}
}
"img" | "figure" => {
if let Some(src) = element.value().attr("src")
.or_else(|| element.value().attr("data-src"))
.or_else(|| element.value().attr("data-lazy-src"))
{
let url = resolve_url(src, base_url);
let alt = element.value().attr("alt").unwrap_or("").to_string();
if is_valid_image_url(&url) {
content.push(ContentBlock::Image { url, alt });
}
}
}
_ => {}
}
}
}
// Fallback 1: tenta pegar todos os <p> do documento
if content.len() <= 2 {
if let Ok(p_sel) = Selector::parse("p") {
for element in document.select(&p_sel) {
let class_attr = element.value().attr("class").unwrap_or("");
if should_skip_element(class_attr) {
continue;
}
let text: String = element.text().collect::<Vec<_>>().join(" ");
let text = clean_text(&text);
if is_valid_paragraph(&text) {
content.push(ContentBlock::Paragraph(text));
}
}
}
}
// Fallback 2: tenta divs com texto
if content.len() <= 2 {
let div_selectors = "div.text, div.paragraph, div.content-text, \
section p, article p, .prose p, .body p";
if let Ok(div_sel) = Selector::parse(div_selectors) {
for element in document.select(&div_sel) {
let text: String = element.text().collect::<Vec<_>>().join(" ");
let text = clean_text(&text);
if is_valid_paragraph(&text) {
content.push(ContentBlock::Paragraph(text));
}
}
}
}
// Fallback 3: extrai texto de qualquer elemento com conteúdo substancial
if content.len() <= 2 {
extract_text_fallback(document, &mut content);
}
// Remove duplicatas consecutivas
content.dedup_by(|a, b| {
match (a, b) {
(ContentBlock::Paragraph(t1), ContentBlock::Paragraph(t2)) => t1 == t2,
(ContentBlock::Heading(t1), ContentBlock::Heading(t2)) => t1 == t2,
_ => false,
}
});
content
}
/// Decides whether an element should be ignored based on its `class` attribute.
///
/// The attribute is lower-cased and split on whitespace. An element is skipped
/// when any single class name
/// - equals a boilerplate keyword, starts with `keyword-` or ends with `-keyword`, or
/// - is `ad` / `ads` or starts with `ad-` / `ads-`.
///
/// Keywords are matched on hyphen boundaries only, so `leading` is not
/// mistaken for an ad and `navigation` is not mistaken for `nav`.
fn should_skip_element(class_attr: &str) -> bool {
    const BOILERPLATE_KEYWORDS: &[&str] = &[
        "nav", "menu", "footer", "header", "sidebar", "comment",
        "share", "social", "related", "recommend", "advertisement",
        "banner", "promo", "newsletter", "subscribe", "login",
        "signup", "paywall-msg", "paywall-banner", "author-bio",
        "tags", "breadcrumb", "pagination", "copyright",
    ];

    let lowered = class_attr.to_lowercase();

    lowered.split_whitespace().any(|class_name| {
        let looks_like_ad = class_name == "ad"
            || class_name == "ads"
            || class_name.starts_with("ad-")
            || class_name.starts_with("ads-");

        looks_like_ad
            || BOILERPLATE_KEYWORDS.iter().any(|keyword| {
                // Exact match, or `keyword-…`.
                let leads = class_name
                    .strip_prefix(*keyword)
                    .map_or(false, |rest| rest.is_empty() || rest.starts_with('-'));
                // `…-keyword`.
                let trails = class_name
                    .strip_suffix(*keyword)
                    .map_or(false, |rest| rest.ends_with('-'));
                leads || trails
            })
    })
}
/// Heuristic filter telling real body paragraphs apart from page metadata.
///
/// A text is accepted when it
/// - is at least 15 bytes long (`str::len`, so accented letters count twice),
/// - does not start (case-insensitively) with a known metadata / call-to-action
///   prefix such as a byline, a publication date or a "read more" link, and
/// - contains at least four whitespace-separated words.
fn is_valid_paragraph(text: &str) -> bool {
    const METADATA_PREFIXES: &[&str] = &[
        "compartilh", "compart", "publicado", "atualizado", "por ",
        "foto:", "imagem:", "crédito", "leia mais", "veja também",
        "saiba mais", "continue lendo", "assine", "cadastre",
        "©", "copyright", "todos os direitos",
    ];

    if text.len() < 15 {
        return false;
    }

    let lowered = text.to_lowercase();
    let looks_like_metadata = METADATA_PREFIXES
        .iter()
        .any(|prefix| lowered.starts_with(*prefix));
    if looks_like_metadata {
        return false;
    }

    // A fourth word exists exactly when there are at least four words.
    text.split_whitespace().nth(3).is_some()
}
/// Heuristic check that `url` points at a real content image.
///
/// The URL is rejected when it contains any marker typical of inline data,
/// tracking pixels, spacers, logos or icons; otherwise it is accepted when it
/// contains a known image extension or the word `image`.
///
/// Matching is ASCII case-insensitive, so `photo.JPG` is accepted and
/// `Logo.png` is rejected just like their lower-case spellings.
fn is_valid_image_url(url: &str) -> bool {
    const REJECTED_MARKERS: &[&str] = &[
        "data:", "base64", "pixel", "tracking", "1x1", "spacer", "blank", "logo", "icon",
    ];
    const ACCEPTED_MARKERS: &[&str] = &[".jpg", ".jpeg", ".png", ".webp", ".gif", "image"];

    let lowered = url.to_ascii_lowercase();

    if REJECTED_MARKERS.iter().any(|marker| lowered.contains(*marker)) {
        return false;
    }
    ACCEPTED_MARKERS.iter().any(|marker| lowered.contains(*marker))
}
/// Last-resort extraction: harvests sentences from large text containers.
///
/// Every `div`, `section` and `article` not rejected by `should_skip_element`
/// is visited. The full text of the element (including the text of all its
/// descendants) is normalised; when it is longer than 100 bytes it is split on
/// `". "` and each piece longer than 50 bytes is appended to `content` as a
/// `Paragraph`, terminated with a period.
///
/// Nested containers repeat the text of their children, so pieces already
/// emitted are remembered in a set and skipped.
fn extract_text_fallback(document: &Html, content: &mut Vec<ContentBlock>) {
    let selector = match Selector::parse("div, section, article") {
        Ok(selector) => selector,
        Err(_) => return,
    };

    // Hash set instead of a vector: the membership test runs once per sentence
    // of every (possibly nested) container.
    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();

    for element in document.select(&selector) {
        let class_attr = element.value().attr("class").unwrap_or("");
        if should_skip_element(class_attr) {
            continue;
        }

        let text = clean_text(&element.text().collect::<Vec<_>>().join(" "));
        if text.len() <= 100 || seen.contains(&text) {
            continue;
        }

        // Split into sentence-sized paragraphs.
        for part in text.split(". ") {
            let part = part.trim();
            if part.len() <= 50 {
                continue;
            }
            let para = if part.ends_with('.') {
                part.to_string()
            } else {
                format!("{}.", part)
            };
            // `insert` returns false when the paragraph was already emitted.
            if seen.insert(para.clone()) {
                content.push(ContentBlock::Paragraph(para));
            }
        }
    }
}
/// Normalises whitespace: leading and trailing whitespace is removed and every
/// internal run of whitespace (spaces, tabs, newlines, …) becomes one space.
fn clean_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !cleaned.is_empty() {
            cleaned.push(' ');
        }
        cleaned.push_str(word);
    }
    cleaned
}
/// Resolves the reference `src` found on the page at `base_url` into an
/// absolute URL.
///
/// - `http://` / `https://` references are returned unchanged.
/// - Protocol-relative references (`//host/…`) are given the `https` scheme.
/// - Root-relative references (`/path`) are appended to the origin of `base_url`.
/// - Other references are resolved against the directory of the page, with
///   `.` and `..` segments normalised, as a browser would do.
///
/// The query string and fragment of `base_url` never leak into the result.
/// When `base_url` has no `scheme://` prefix the two strings are simply joined
/// with a single `/`.
fn resolve_url(src: &str, base_url: &str) -> String {
    /// Marks the start of a query string or fragment.
    fn is_suffix_start(c: char) -> bool {
        c == '?' || c == '#'
    }

    if src.starts_with("http://") || src.starts_with("https://") {
        return src.to_string();
    }
    if src.starts_with("//") {
        return format!("https:{}", src);
    }

    // Only scheme, authority and path of the base take part in resolution.
    let base = match base_url.find(is_suffix_start) {
        Some(end) => &base_url[..end],
        None => base_url,
    };

    let (origin, base_path) = match base.find("://") {
        Some(scheme_end) => {
            let authority_start = scheme_end + 3;
            match base[authority_start..].find('/') {
                Some(slash) => base.split_at(authority_start + slash),
                // No path at all: the page lives at the root.
                None => (base, "/"),
            }
        }
        None => {
            return format!(
                "{}/{}",
                base_url.trim_end_matches('/'),
                src.trim_start_matches('/')
            );
        }
    };

    if src.starts_with('/') {
        return format!("{}{}", origin, src);
    }

    // Keep the reference's own query / fragment out of path normalisation.
    let (src_path, src_suffix) = match src.find(is_suffix_start) {
        Some(idx) => src.split_at(idx),
        None => (src, ""),
    };

    // Directory of the page: everything up to and including the last '/'.
    let dir = &base_path[..base_path.rfind('/').map_or(0, |idx| idx + 1)];

    let mut segments: Vec<&str> = dir.split('/').filter(|s| !s.is_empty()).collect();
    for segment in src_path.split('/') {
        match segment {
            "" | "." => {}
            ".." => {
                segments.pop();
            }
            name => segments.push(name),
        }
    }

    // A reference ending in '/', '.' or '..' names a directory.
    let ends_in_directory = matches!(
        src_path.rsplit('/').next(),
        Some("") | Some(".") | Some("..")
    );

    let mut resolved = String::from(origin);
    for segment in &segments {
        resolved.push('/');
        resolved.push_str(segment);
    }
    if ends_in_directory || segments.is_empty() {
        resolved.push('/');
    }
    resolved.push_str(src_suffix);
    resolved
}