use anyhow::{anyhow, Result};
use scraper::{Html, Selector};

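/// A single semantic block extracted from an article page.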
#[derive(Debug, Clone)]
pub enum ContentBlock {
    Title(String),
    Subtitle(String),
    Paragraph(String),
    Image { url: String, alt: String },
    Quote(String),
    Heading(String),
    ListItem(String),
}

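/// A parsed article: title, ordered content blocks, and the URL it came from.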
#[derive(Debug)]
pub struct Article {
    pub title: String,
    pub content: Vec<ContentBlock>,
    pub source_url: String,
}

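/// Parses raw article HTML into an `Article`, returning an error when no
/// usable content can be extracted. See the test module at the bottom of
/// this file for a minimal usage sketch.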
pub fn parse_article(html: &str, source_url: &str) -> Result<Article> {
    let document = Html::parse_document(html);

    let title = extract_title(&document);
    let content = extract_content(&document, source_url);

    if content.is_empty() {
        return Err(anyhow!("Could not extract any content from the article"));
    }

    Ok(Article {
        title,
        content,
        source_url: source_url.to_string(),
    })
}

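/// Returns the first non-trivial title found by walking site-specific and
/// then generic selectors, falling back to a placeholder string.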
fn extract_title(document: &Html) -> String {
    let selectors = [
        // Brazilian sites
        ".content-head__title",
        ".titulo-materia",
        ".materia-titulo",
        ".post-title",
        ".entry-title",
        ".article-title",
        ".headline",
        // International sites
        "h1[data-testid='headline']",
        "h1.article-headline",
        "h1.post-headline",
        "h1.story-title",
        // Generic
        "article h1",
        ".headline h1",
        "header h1",
        "h1",
        "title",
    ];

    for sel_str in selectors {
        if let Ok(selector) = Selector::parse(sel_str) {
            if let Some(element) = document.select(&selector).next() {
                let text: String = element.text().collect::<Vec<_>>().join(" ");
                let text = text.trim().to_string();
                if !text.is_empty() && text.len() > 5 {
                    return text;
                }
            }
        }
    }

    String::from("Untitled")
}

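/// Extracts the ordered content blocks from the article body, trying
/// site-specific containers first and progressively broader fallbacks.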
fn extract_content(document: &Html, base_url: &str) -> Vec<ContentBlock> {
    let mut content = Vec::new();

    // Selectors for the main article container.
    // Order matters: more specific selectors come first.
    let article_selectors = [
        // Globo/O Globo/G1
        ".mc-article-body",
        ".mc-body",
        ".content-text__container",
        ".materia-conteudo",
        ".post-content",
        // Folha de SP
        ".c-news__body",
        ".news__content",
        // Estadão
        ".news-body",
        ".content-body",
        // UOL
        ".text",
        ".corpo-texto",
        // Valor Econômico
        ".article-text",
        // Exame
        ".single-content",
        // Veja
        ".article-content",
        // Medium/Freedium/Scribe.rip
        "article section",
        "article",
        ".main-content",
        ".postArticle-content",
        // NY Times
        "[data-testid='article-body']",
        ".StoryBodyCompanionColumn",
        ".story-body-supplemental",
        // Washington Post
        ".article-body",
        ".teaser-content",
        // The Guardian
        ".article-body-commercial-selector",
        ".content__article-body",
        // Wall Street Journal
        ".article-content",
        ".wsj-snippet-body",
        // Bloomberg
        ".body-content",
        // Financial Times
        ".article__content-body",
        // Forbes
        ".article-body",
        ".vestibule",
        // Reuters
        ".article-body__content",
        // BBC
        "[data-component='text-block']",
        ".story-body__inner",
        // CNN
        ".article__content",
        ".zn-body__paragraph",
        // Wired
        ".body__inner-container",
        // The Atlantic
        ".article-body",
        // Economist
        ".article__body",
        // Archive.is wrapper
        "#CONTENT",
        // Generic fallbacks
        "article",
        "[role='main']",
        "[role='article']",
        "main",
        ".entry-content",
        ".story-body",
        ".article-body",
        ".content",
        "#content",
        "#article-body",
        ".post",
        "body",
    ];

    let mut article_html: Option<scraper::ElementRef> = None;

    for sel_str in article_selectors {
        if let Ok(selector) = Selector::parse(sel_str) {
            if let Some(element) = document.select(&selector).next() {
                // Only accept containers that hold a useful amount of text.
                let text_len: usize = element.text().collect::<String>().len();
                if text_len > 200 {
                    article_html = Some(element);
                    break;
                }
            }
        }
    }

    let article = match article_html {
        Some(a) => a,
        None => return content,
    };

    // Extract the in-article title, if present.
    if let Ok(h1_sel) = Selector::parse("h1") {
        if let Some(h1) = article.select(&h1_sel).next() {
            let text: String = h1.text().collect::<Vec<_>>().join(" ");
            let text = clean_text(&text);
            if !text.is_empty() && text.len() > 5 {
                content.push(ContentBlock::Title(text));
            }
        }
    }

    // Extract the subtitle/lead.
    let subtitle_selectors = [
        ".content-head__subtitle",
        ".subtitulo",
        ".lead",
        ".excerpt",
        ".article-summary",
        ".deck",
        "h2.subtitle",
    ];

    for sel_str in subtitle_selectors {
        if let Ok(selector) = Selector::parse(sel_str) {
            if let Some(element) = article.select(&selector).next() {
                let text: String = element.text().collect::<Vec<_>>().join(" ");
                let text = clean_text(&text);
                if !text.is_empty() && text.len() > 10 {
                    content.push(ContentBlock::Subtitle(text));
                    break;
                }
            }
        }
    }

    // Broad selector covering the body content elements.
    let content_selector = "p, h2, h3, h4, h5, h6, blockquote, img, li, figure img, \
        .content-text, .paragraph, .text-paragraph, \
        .content-intertitle h2, .intertitle, \
        [data-component='text-block'], \
        .paywall";

    if let Ok(p_sel) = Selector::parse(content_selector) {
        for element in article.select(&p_sel) {
            let tag_name = element.value().name();
            let class_attr = element.value().attr("class").unwrap_or("");

            // Skip navigation, ads, and other non-content elements.
            if should_skip_element(class_attr) {
                continue;
            }

            match tag_name {
                "p" => {
                    let text: String = element.text().collect::<Vec<_>>().join(" ");
                    let text = clean_text(&text);
                    if is_valid_paragraph(&text) {
                        content.push(ContentBlock::Paragraph(text));
                    }
                }
                "div" if class_attr.contains("text") || class_attr.contains("paragraph") => {
                    let text: String = element.text().collect::<Vec<_>>().join(" ");
                    let text = clean_text(&text);
                    if is_valid_paragraph(&text) {
                        content.push(ContentBlock::Paragraph(text));
                    }
                }
                "h2" | "h3" | "h4" | "h5" | "h6" => {
                    let text: String = element.text().collect::<Vec<_>>().join(" ");
                    let text = clean_text(&text);
                    if !text.is_empty() && text.len() > 2 {
                        content.push(ContentBlock::Heading(text));
                    }
                }
                "blockquote" => {
                    let text: String = element.text().collect::<Vec<_>>().join(" ");
                    let text = clean_text(&text);
                    if !text.is_empty() {
                        content.push(ContentBlock::Quote(text));
                    }
                }
                "li" => {
                    let text: String = element.text().collect::<Vec<_>>().join(" ");
                    let text = clean_text(&text);
                    if !text.is_empty() && text.len() > 5 {
                        content.push(ContentBlock::ListItem(text));
                    }
                }
                "img" | "figure" => {
                    if let Some(src) = element.value().attr("src")
                        .or_else(|| element.value().attr("data-src"))
                        .or_else(|| element.value().attr("data-lazy-src"))
                    {
                        let url = resolve_url(src, base_url);
                        let alt = element.value().attr("alt").unwrap_or("").to_string();
                        if is_valid_image_url(&url) {
                            content.push(ContentBlock::Image { url, alt });
                        }
                    }
                }
                _ => {}
            }
        }
    }

    // Fallback 1: try every <p> in the document.
    if content.len() <= 2 {
        if let Ok(p_sel) = Selector::parse("p") {
            for element in document.select(&p_sel) {
                let class_attr = element.value().attr("class").unwrap_or("");
                if should_skip_element(class_attr) {
                    continue;
                }

                let text: String = element.text().collect::<Vec<_>>().join(" ");
                let text = clean_text(&text);
                if is_valid_paragraph(&text) {
                    content.push(ContentBlock::Paragraph(text));
                }
            }
        }
    }

    // Fallback 2: try divs that carry text.
    if content.len() <= 2 {
        let div_selectors = "div.text, div.paragraph, div.content-text, \
            section p, article p, .prose p, .body p";

        if let Ok(div_sel) = Selector::parse(div_selectors) {
            for element in document.select(&div_sel) {
                let text: String = element.text().collect::<Vec<_>>().join(" ");
                let text = clean_text(&text);
                if is_valid_paragraph(&text) {
                    content.push(ContentBlock::Paragraph(text));
                }
            }
        }
    }

    // Fallback 3: extract text from any element with substantial content.
    if content.len() <= 2 {
        extract_text_fallback(document, &mut content);
    }

    // Remove consecutive duplicates.
    content.dedup_by(|a, b| {
        match (a, b) {
            (ContentBlock::Paragraph(t1), ContentBlock::Paragraph(t2)) => t1 == t2,
            (ContentBlock::Heading(t1), ContentBlock::Heading(t2)) => t1 == t2,
            _ => false,
        }
    });

    content
}

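/// Returns true when an element's class list marks it as page chrome
/// (navigation, ads, sharing widgets, and similar) rather than article text.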
fn should_skip_element(class_attr: &str) -> bool {
    let skip_classes = [
        "nav", "menu", "footer", "header", "sidebar", "comment",
        "share", "social", "related", "recommend", "advertisement",
        "banner", "promo", "newsletter", "subscribe", "login",
        "signup", "paywall-msg", "paywall-banner", "author-bio",
        "tags", "breadcrumb", "pagination", "copyright",
    ];

    let lower = class_attr.to_lowercase();

    for skip in skip_classes {
        // Match on word boundaries to avoid false positives like "leading" matching "ad".
        for class in lower.split_whitespace() {
            if class == skip || class.starts_with(&format!("{}-", skip)) || class.ends_with(&format!("-{}", skip)) {
                return true;
            }
        }
    }

    // Also check for specific ad-related class patterns.
    if lower.split_whitespace().any(|c| c == "ad" || c == "ads" || c.starts_with("ad-") || c.starts_with("ads-")) {
        return true;
    }

    false
}

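/// Heuristic filter: rejects very short strings and text that looks like
/// bylines, captions, or boilerplate rather than body copy.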
fn is_valid_paragraph(text: &str) -> bool {
    if text.is_empty() || text.len() < 15 {
        return false;
    }

    // Skip text that looks like metadata rather than article prose.
    let invalid_starts = [
        "compartilh", "compart", "publicado", "atualizado", "por ",
        "foto:", "imagem:", "crédito", "leia mais", "veja também",
        "saiba mais", "continue lendo", "assine", "cadastre",
        "©", "copyright", "todos os direitos",
    ];

    let lower = text.to_lowercase();
    for start in invalid_starts {
        if lower.starts_with(start) {
            return false;
        }
    }

    // Require at least a few words.
    text.split_whitespace().count() >= 4
}

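/// Filters out tracking pixels, spacers, logos, and data URIs, keeping only
/// URLs that look like real content images.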
fn is_valid_image_url(url: &str) -> bool {
    !url.contains("data:")
        && !url.contains("base64")
        && !url.contains("pixel")
        && !url.contains("tracking")
        && !url.contains("1x1")
        && !url.contains("spacer")
        && !url.contains("blank")
        && !url.contains("logo")
        && !url.contains("icon")
        && (url.contains(".jpg")
            || url.contains(".jpeg")
            || url.contains(".png")
            || url.contains(".webp")
            || url.contains(".gif")
            || url.contains("image"))
}

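/// Last-resort extraction: scans generic containers for large runs of text
/// and splits them into paragraph-sized blocks.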
fn extract_text_fallback(document: &Html, content: &mut Vec<ContentBlock>) {
    // Last resort: grab large blocks of text from generic containers.
    if let Ok(selector) = Selector::parse("div, section, article") {
        let mut texts: Vec<String> = Vec::new();

        for element in document.select(&selector) {
            let class_attr = element.value().attr("class").unwrap_or("");
            if should_skip_element(class_attr) {
                continue;
            }

            // Collect the element's text (this includes text from descendants).
            let text: String = element
                .text()
                .collect::<Vec<_>>()
                .join(" ");

            let text = clean_text(&text);

            if text.len() > 100 && !texts.contains(&text) {
                // Split the block into paragraphs on sentence boundaries.
                for part in text.split(". ") {
                    let part = part.trim();
                    if part.len() > 50 {
                        let para = if part.ends_with('.') {
                            part.to_string()
                        } else {
                            format!("{}.", part)
                        };
                        if !texts.contains(&para) {
                            texts.push(para.clone());
                            content.push(ContentBlock::Paragraph(para));
                        }
                    }
                }
            }
        }
    }
}

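/// Collapses runs of whitespace (including newlines) into single spaces.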
fn clean_text(text: &str) -> String {
    text.split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
        .trim()
        .to_string()
}

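/// Resolves a possibly relative image `src` against the article's URL,
/// handling absolute, protocol-relative, and root-relative forms.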
fn resolve_url(src: &str, base_url: &str) -> String {
    if src.starts_with("http://") || src.starts_with("https://") {
        return src.to_string();
    }

    if src.starts_with("//") {
        return format!("https:{}", src);
    }

    if let Some(pos) = base_url.find("://") {
        let after_protocol = &base_url[pos + 3..];
        if let Some(slash_pos) = after_protocol.find('/') {
            let base = &base_url[..pos + 3 + slash_pos];
            if src.starts_with('/') {
                return format!("{}{}", base, src);
            } else {
                return format!("{}/{}", base, src);
            }
        }
    }

    format!("{}/{}", base_url.trim_end_matches('/'), src.trim_start_matches('/'))
}
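
// A minimal smoke-test sketch showing how `parse_article` is meant to be
// called. The HTML fixture below is illustrative only (it is not taken from
// any real site) and exercises just the generic `article` selector path.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parses_a_simple_article() {
        let html = r#"
            <html>
              <head><title>Fallback title</title></head>
              <body>
                <article>
                  <h1>Example headline for testing</h1>
                  <p>This first paragraph has more than enough words to pass the validity checks.</p>
                  <p>This second paragraph also has more than enough words to pass the validity checks.</p>
                  <p>A third paragraph keeps the extracted container comfortably above the minimum length threshold.</p>
                </article>
              </body>
            </html>
        "#;

        let article = parse_article(html, "https://example.com/news/post")
            .expect("a simple article should parse");

        assert_eq!(article.title, "Example headline for testing");
        assert!(article
            .content
            .iter()
            .any(|block| matches!(block, ContentBlock::Paragraph(_))));
    }
}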