>().join(" ");
let text = clean_text(&text);
if !text.is_empty() && text.len() > 5 {
content.push(ContentBlock::ListItem(text));
}
}
"img" | "figure" => {
if let Some(src) = element.value().attr("src")
.or_else(|| element.value().attr("data-src"))
.or_else(|| element.value().attr("data-lazy-src"))
{
let url = resolve_url(src, base_url);
let alt = element.value().attr("alt").unwrap_or("").to_string();
if is_valid_image_url(&url) {
content.push(ContentBlock::Image { url, alt });
}
}
}
_ => {}
}
}
}
// Fallback 1: try to grab every <p> element in the document
if content.len() <= 2 {
if let Ok(p_sel) = Selector::parse("p") {
for element in document.select(&p_sel) {
let class_attr = element.value().attr("class").unwrap_or("");
if should_skip_element(class_attr) {
continue;
}
let text: String = element.text().collect::>().join(" ");
let text = clean_text(&text);
if is_valid_paragraph(&text) {
content.push(ContentBlock::Paragraph(text));
}
}
}
}
// Fallback 2: tenta divs com texto
if content.len() <= 2 {
let div_selectors = "div.text, div.paragraph, div.content-text, \
section p, article p, .prose p, .body p";
if let Ok(div_sel) = Selector::parse(div_selectors) {
for element in document.select(&div_sel) {
let text: String = element.text().collect::>().join(" ");
let text = clean_text(&text);
if is_valid_paragraph(&text) {
content.push(ContentBlock::Paragraph(text));
}
}
}
}
// Fallback 3: extrai texto de qualquer elemento com conteúdo substancial
if content.len() <= 2 {
extract_text_fallback(document, &mut content);
}
// Remove duplicatas consecutivas
content.dedup_by(|a, b| {
match (a, b) {
(ContentBlock::Paragraph(t1), ContentBlock::Paragraph(t2)) => t1 == t2,
(ContentBlock::Heading(t1), ContentBlock::Heading(t2)) => t1 == t2,
_ => false,
}
});
content
}
/// Returns `true` when a `class` attribute marks an element as page chrome
/// (navigation, ads, sharing widgets, ...) rather than article content.
///
/// The attribute is lower-cased and split on whitespace. A class token
/// triggers a skip when it equals one of the keywords, starts with
/// `<keyword>-`, or ends with `-<keyword>`. Matching whole tokens (instead of
/// substrings) keeps classes such as `leading` or `headline` from being
/// mistaken for `ad` / `header`.
fn should_skip_element(class_attr: &str) -> bool {
    const SKIP_KEYWORDS: &[&str] = &[
        "nav", "menu", "footer", "header", "sidebar", "comment",
        "share", "social", "related", "recommend", "advertisement",
        "banner", "promo", "newsletter", "subscribe", "login",
        "signup", "paywall-msg", "paywall-banner", "author-bio",
        "tags", "breadcrumb", "pagination", "copyright",
    ];

    let lowered = class_attr.to_lowercase();
    lowered.split_whitespace().any(|token| {
        // Ad markers are matched only as an exact token or a token prefix
        // (`ad`, `ads`, `ad-…`, `ads-…`); unlike the keywords above, a
        // `…-ad` suffix does not count.
        let is_ad_token = token == "ad"
            || token == "ads"
            || token.starts_with("ad-")
            || token.starts_with("ads-");

        is_ad_token
            || SKIP_KEYWORDS.iter().any(|&keyword| {
                token == keyword
                    // `<keyword>-…`
                    || token
                        .strip_prefix(keyword)
                        .map_or(false, |rest| rest.starts_with('-'))
                    // `…-<keyword>`
                    || token
                        .strip_suffix(keyword)
                        .map_or(false, |rest| rest.ends_with('-'))
            })
    })
}
/// Decides whether `text` looks like a real article paragraph.
///
/// A paragraph is accepted when it:
/// * is at least 15 bytes long (`str::len`, so multi-byte characters count
///   more than once),
/// * does not start (case-insensitively) with one of the known metadata /
///   call-to-action prefixes (byline, photo credit, "read more", copyright…),
/// * contains at least four whitespace-separated words.
fn is_valid_paragraph(text: &str) -> bool {
    const MIN_BYTES: usize = 15;
    const MIN_WORDS: usize = 4;
    // Prefixes that mark metadata or boilerplate rather than body text.
    const METADATA_PREFIXES: &[&str] = &[
        "compartilh", "compart", "publicado", "atualizado", "por ",
        "foto:", "imagem:", "crédito", "leia mais", "veja também",
        "saiba mais", "continue lendo", "assine", "cadastre",
        "©", "copyright", "todos os direitos",
    ];

    // An empty string is covered by the length check as well.
    if text.len() < MIN_BYTES {
        return false;
    }

    let lowered = text.to_lowercase();
    let looks_like_metadata = METADATA_PREFIXES
        .iter()
        .any(|&prefix| lowered.starts_with(prefix));
    if looks_like_metadata {
        return false;
    }

    text.split_whitespace().count() >= MIN_WORDS
}
/// Heuristically decides whether `url` points at a content image worth
/// keeping.
///
/// The URL is rejected when it contains any marker of an inline, tracking or
/// decorative asset (`data:`, `base64`, `pixel`, `tracking`, `1x1`, `spacer`,
/// `blank`, `logo`, `icon`). Otherwise it is accepted when it contains a
/// known image extension or the word `image`.
///
/// Matching is ASCII case-insensitive, so `photo.JPG` is recognised as an
/// image and `Logo.png` is still filtered out.
fn is_valid_image_url(url: &str) -> bool {
    // Substrings that identify inline data, tracking pixels and site chrome.
    const BLOCKED_MARKERS: &[&str] = &[
        "data:", "base64", "pixel", "tracking", "1x1", "spacer", "blank", "logo", "icon",
    ];
    // Substrings that suggest the URL serves an image.
    const IMAGE_MARKERS: &[&str] = &[".jpg", ".jpeg", ".png", ".webp", ".gif", "image"];

    let lowered = url.to_ascii_lowercase();
    let contains_any = |markers: &[&str]| markers.iter().any(|&marker| lowered.contains(marker));

    !contains_any(BLOCKED_MARKERS) && contains_any(IMAGE_MARKERS)
}
fn extract_text_fallback(document: &Html, content: &mut Vec) {
// Última tentativa: pega blocos de texto grandes
if let Ok(selector) = Selector::parse("div, section, article") {
let mut texts: Vec = Vec::new();
for element in document.select(&selector) {
let class_attr = element.value().attr("class").unwrap_or("");
if should_skip_element(class_attr) {
continue;
}
// Pega apenas texto direto (não de filhos)
let text: String = element
.text()
.collect::>()
.join(" ");
let text = clean_text(&text);
if text.len() > 100 && !texts.contains(&text) {
// Divide em parágrafos por pontuação
for part in text.split(". ") {
let part = part.trim();
if part.len() > 50 {
let para = if part.ends_with('.') {
part.to_string()
} else {
format!("{}.", part)
};
if !texts.contains(¶) {
texts.push(para.clone());
content.push(ContentBlock::Paragraph(para));
}
}
}
}
}
}
}
/// Normalises whitespace in `text`: every run of whitespace (spaces, tabs,
/// newlines) becomes a single space, and leading/trailing whitespace is
/// removed.
///
/// `split_whitespace` never yields empty pieces, so the joined result has no
/// leading or trailing space and needs no extra trimming.
fn clean_text(text: &str) -> String {
    text.split_whitespace().collect::<Vec<_>>().join(" ")
}
/// Resolves an attribute value `src` against the page URL `base_url` and
/// returns an absolute URL string.
///
/// Rules, in order:
/// * surrounding whitespace in `src` is ignored;
/// * `http://`, `https://` and `data:` values are returned unchanged;
/// * protocol-relative values (`//host/...`) are given the `https:` scheme;
/// * if `base_url` has no `scheme://` part, the two strings are simply joined
///   with a single `/`;
/// * root-relative values (`/path`) are appended to the base's
///   `scheme://host`;
/// * any other value is resolved against the *directory* of the base
///   document, with `.` and `..` segments collapsed.
///
/// The query string and fragment of `base_url` never leak into the result.
/// This is a simplified resolver for image/link attributes, not a complete
/// RFC 3986 implementation.
fn resolve_url(src: &str, base_url: &str) -> String {
    fn starts_query_or_fragment(c: char) -> bool {
        c == '?' || c == '#'
    }

    let src = src.trim();
    if src.starts_with("http://") || src.starts_with("https://") || src.starts_with("data:") {
        return src.to_string();
    }
    if src.starts_with("//") {
        return format!("https:{}", src);
    }

    let authority_start = match base_url.find("://") {
        Some(pos) => pos + 3,
        // No scheme to anchor on: fall back to a plain join.
        None => {
            return format!(
                "{}/{}",
                base_url.trim_end_matches('/'),
                src.trim_start_matches('/')
            );
        }
    };

    // Cut the base's query/fragment so a '/' inside them is not mistaken for
    // the start of the path.
    let base_end = base_url[authority_start..]
        .find(starts_query_or_fragment)
        .map_or(base_url.len(), |i| authority_start + i);
    let base = &base_url[..base_end];

    // `origin` is "scheme://host[:port]"; `base_path` always starts with '/'.
    let (origin, base_path) = match base[authority_start..].find('/') {
        Some(i) => base.split_at(authority_start + i),
        None => (base, "/"),
    };

    if src.starts_with('/') {
        return format!("{}{}", origin, src);
    }

    // Only the path part of `src` takes part in dot-segment removal; its own
    // query/fragment is re-attached verbatim.
    let suffix_at = src.find(starts_query_or_fragment).unwrap_or(src.len());
    let (rel_path, suffix) = src.split_at(suffix_at);

    let merged = if rel_path.is_empty() {
        base_path.to_string()
    } else {
        // Directory of the base document, including its trailing '/'.
        let dir = &base_path[..=base_path.rfind('/').unwrap_or(0)];
        format!("{}{}", dir, rel_path)
    };

    // Collapse "." and ".." segments; ".." at the root is ignored.
    let mut segments: Vec<&str> = Vec::new();
    let mut parts = merged.split('/').skip(1).peekable();
    while let Some(segment) = parts.next() {
        let is_last = parts.peek().is_none();
        match segment {
            "." => {}
            ".." => {
                segments.pop();
            }
            _ => {
                segments.push(segment);
                continue;
            }
        }
        // A trailing "." or ".." denotes a directory: keep the final '/'.
        if is_last {
            segments.push("");
        }
    }

    format!("{}/{}{}", origin, segments.join("/"), suffix)
}