use anyhow::{anyhow, Result};
use scraper::{Html, Selector};

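/// A single semantic block extracted from an article page.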
#[derive(Debug, Clone)]
pub enum ContentBlock {
    Title(String),
    Subtitle(String),
    Paragraph(String),
    Image { url: String, alt: String },
    Quote(String),
    Heading(String),
    ListItem(String),
}

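/// A parsed article: title, ordered content blocks, and the URL it came from.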
#[derive(Debug)]
pub struct Article {
    pub title: String,
    pub content: Vec<ContentBlock>,
    pub source_url: String,
}

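/// Parses raw article HTML into an `Article`, returning an error when no
/// usable content can be extracted. See the test module at the bottom of
/// this file for a minimal usage sketch.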
pub fn parse_article(html: &str, source_url: &str) -> Result<Article> {
    let document = Html::parse_document(html);

    let title = extract_title(&document);
    let content = extract_content(&document, source_url);

    if content.is_empty() {
        return Err(anyhow!("Could not extract any content from the article"));
    }

    Ok(Article {
        title,
        content,
        source_url: source_url.to_string(),
    })
}

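/// Returns the first non-trivial title found by walking site-specific and
/// then generic selectors, falling back to a placeholder string.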
fn extract_title(document: &Html) -> String {
    let selectors = [
        // Brazilian sites
        ".content-head__title",
        ".titulo-materia",
        ".materia-titulo",
        ".post-title",
        ".entry-title",
        ".article-title",
        ".headline",
        // International sites
        "h1[data-testid='headline']",
        "h1.article-headline",
        "h1.post-headline",
        "h1.story-title",
        // Generic
        "article h1",
        ".headline h1",
        "header h1",
        "h1",
        "title",
    ];

    for sel_str in selectors {
        if let Ok(selector) = Selector::parse(sel_str) {
            if let Some(element) = document.select(&selector).next() {
                let text: String = element.text().collect::<Vec<_>>().join(" ");
                let text = text.trim().to_string();
                if !text.is_empty() && text.len() > 5 {
                    return text;
                }
            }
        }
    }

    String::from("Untitled")
}

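/// Extracts the ordered content blocks from the article body, trying
/// site-specific containers first and progressively broader fallbacks.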
fn extract_content(document: &Html, base_url: &str) -> Vec<ContentBlock> {
    let mut content = Vec::new();

    // Selectors for the main article container.
    // Order matters: more specific selectors come first.
    let article_selectors = [
        // Globo/O Globo/G1
        ".mc-article-body",
        ".mc-body",
        ".content-text__container",
        ".materia-conteudo",
        ".post-content",
        // Folha de SP
        ".c-news__body",
        ".news__content",
        // Estadão
        ".news-body",
        ".content-body",
        // UOL
        ".text",
        ".corpo-texto",
        // Valor Econômico
        ".article-text",
        // Exame
        ".single-content",
        // Veja
        ".article-content",
        // Medium/Freedium/Scribe.rip
        "article section",
        "article",
        ".main-content",
        ".postArticle-content",
        // NY Times
        "[data-testid='article-body']",
        ".StoryBodyCompanionColumn",
        ".story-body-supplemental",
        // Washington Post
        ".article-body",
        ".teaser-content",
        // The Guardian
        ".article-body-commercial-selector",
        ".content__article-body",
        // Wall Street Journal
        ".article-content",
        ".wsj-snippet-body",
        // Bloomberg
        ".body-content",
        // Financial Times
        ".article__content-body",
        // Forbes
        ".article-body",
        ".vestibule",
        // Reuters
        ".article-body__content",
        // BBC
        "[data-component='text-block']",
        ".story-body__inner",
        // CNN
        ".article__content",
        ".zn-body__paragraph",
        // Wired
        ".body__inner-container",
        // The Atlantic
        ".article-body",
        // Economist
        ".article__body",
        // Archive.is wrapper
        "#CONTENT",
        // Generic fallbacks
        "article",
        "[role='main']",
        "[role='article']",
        "main",
        ".entry-content",
        ".story-body",
        ".article-body",
        ".content",
        "#content",
        "#article-body",
        ".post",
        "body",
    ];

    let mut article_html: Option<scraper::ElementRef> = None;

    for sel_str in article_selectors {
        if let Ok(selector) = Selector::parse(sel_str) {
            if let Some(element) = document.select(&selector).next() {
                // Only accept containers that hold a useful amount of text.
                let text_len: usize = element.text().collect::<String>().len();
                if text_len > 200 {
                    article_html = Some(element);
                    break;
                }
            }
        }
    }

    let article = match article_html {
        Some(a) => a,
        None => return content,
    };

    // Extract the in-article title, if present.
    if let Ok(h1_sel) = Selector::parse("h1") {
        if let Some(h1) = article.select(&h1_sel).next() {
            let text: String = h1.text().collect::<Vec<_>>().join(" ");
            let text = clean_text(&text);
            if !text.is_empty() && text.len() > 5 {
                content.push(ContentBlock::Title(text));
            }
        }
    }

    // Extract the subtitle/lead.
    let subtitle_selectors = [
        ".content-head__subtitle",
        ".subtitulo",
        ".lead",
        ".excerpt",
        ".article-summary",
        ".deck",
        "h2.subtitle",
    ];

    for sel_str in subtitle_selectors {
        if let Ok(selector) = Selector::parse(sel_str) {
            if let Some(element) = article.select(&selector).next() {
                let text: String = element.text().collect::<Vec<_>>().join(" ");
                let text = clean_text(&text);
                if !text.is_empty() && text.len() > 10 {
                    content.push(ContentBlock::Subtitle(text));
                    break;
                }
            }
        }
    }

    // Broad selector covering the body content elements.
    let content_selector = "p, h2, h3, h4, h5, h6, blockquote, img, li, figure img, \
        .content-text, .paragraph, .text-paragraph, \
        .content-intertitle h2, .intertitle, \
        [data-component='text-block'], \
        .paywall";

    if let Ok(p_sel) = Selector::parse(content_selector) {
        for element in article.select(&p_sel) {
            let tag_name = element.value().name();
            let class_attr = element.value().attr("class").unwrap_or("");

            // Skip navigation, ads, and other non-content elements.
            if should_skip_element(class_attr) {
                continue;
            }

            match tag_name {
                "p" => {
                    let text: String = element.text().collect::<Vec<_>>().join(" ");
                    let text = clean_text(&text);
                    if is_valid_paragraph(&text) {
                        content.push(ContentBlock::Paragraph(text));
                    }
                }
                "div" if class_attr.contains("text") || class_attr.contains("paragraph") => {
                    let text: String = element.text().collect::<Vec<_>>().join(" ");
                    let text = clean_text(&text);
                    if is_valid_paragraph(&text) {
                        content.push(ContentBlock::Paragraph(text));
                    }
                }
                "h2" | "h3" | "h4" | "h5" | "h6" => {
                    let text: String = element.text().collect::<Vec<_>>().join(" ");
                    let text = clean_text(&text);
                    if !text.is_empty() && text.len() > 2 {
                        content.push(ContentBlock::Heading(text));
                    }
                }
                "blockquote" => {
                    let text: String = element.text().collect::<Vec<_>>().join(" ");
                    let text = clean_text(&text);
                    if !text.is_empty() {
                        content.push(ContentBlock::Quote(text));
                    }
                }
                "li" => {
                    let text: String = element.text().collect::<Vec<_>>().join(" ");
                    let text = clean_text(&text);
                    if !text.is_empty() && text.len() > 5 {
                        content.push(ContentBlock::ListItem(text));
                    }
                }
                "img" | "figure" => {
                    if let Some(src) = element.value().attr("src")
                        .or_else(|| element.value().attr("data-src"))
                        .or_else(|| element.value().attr("data-lazy-src"))
                    {
                        let url = resolve_url(src, base_url);
                        let alt = element.value().attr("alt").unwrap_or("").to_string();
                        if is_valid_image_url(&url) {
                            content.push(ContentBlock::Image { url, alt });
                        }
                    }
                }
                _ => {}
            }
        }
    }

    // Fallback 1: try every <p> in the document.
    if content.len() <= 2 {
        if let Ok(p_sel) = Selector::parse("p") {
            for element in document.select(&p_sel) {
                let class_attr = element.value().attr("class").unwrap_or("");
                if should_skip_element(class_attr) {
                    continue;
                }

                let text: String = element.text().collect::<Vec<_>>().join(" ");
                let text = clean_text(&text);
                if is_valid_paragraph(&text) {
                    content.push(ContentBlock::Paragraph(text));
                }
            }
        }
    }

    // Fallback 2: try divs that carry text.
    if content.len() <= 2 {
        let div_selectors = "div.text, div.paragraph, div.content-text, \
            section p, article p, .prose p, .body p";

        if let Ok(div_sel) = Selector::parse(div_selectors) {
            for element in document.select(&div_sel) {
                let text: String = element.text().collect::<Vec<_>>().join(" ");
                let text = clean_text(&text);
                if is_valid_paragraph(&text) {
                    content.push(ContentBlock::Paragraph(text));
                }
            }
        }
    }

    // Fallback 3: extract text from any element with substantial content.
    if content.len() <= 2 {
        extract_text_fallback(document, &mut content);
    }

    // Remove consecutive duplicates.
    content.dedup_by(|a, b| {
        match (a, b) {
            (ContentBlock::Paragraph(t1), ContentBlock::Paragraph(t2)) => t1 == t2,
            (ContentBlock::Heading(t1), ContentBlock::Heading(t2)) => t1 == t2,
            _ => false,
        }
    });

    content
}

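/// Returns true when an element's class list marks it as page chrome
/// (navigation, ads, sharing widgets, and similar) rather than article text.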
fn should_skip_element(class_attr: &str) -> bool {
    let skip_classes = [
        "nav", "menu", "footer", "header", "sidebar", "comment",
        "share", "social", "related", "recommend", "advertisement",
        "banner", "promo", "newsletter", "subscribe", "login",
        "signup", "paywall-msg", "paywall-banner", "author-bio",
        "tags", "breadcrumb", "pagination", "copyright",
    ];

    let lower = class_attr.to_lowercase();

    for skip in skip_classes {
        // Match on word boundaries to avoid false positives like "leading" matching "ad".
        for class in lower.split_whitespace() {
            if class == skip || class.starts_with(&format!("{}-", skip)) || class.ends_with(&format!("-{}", skip)) {
                return true;
            }
        }
    }

    // Also check for specific ad-related class patterns.
    if lower.split_whitespace().any(|c| c == "ad" || c == "ads" || c.starts_with("ad-") || c.starts_with("ads-")) {
        return true;
    }

    false
}

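/// Heuristic filter: rejects very short strings and text that looks like
/// bylines, captions, or boilerplate rather than body copy.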
fn is_valid_paragraph(text: &str) -> bool {
    if text.is_empty() || text.len() < 15 {
        return false;
    }

    // Skip text that looks like metadata rather than article prose.
    let invalid_starts = [
        "compartilh", "compart", "publicado", "atualizado", "por ",
        "foto:", "imagem:", "crédito", "leia mais", "veja também",
        "saiba mais", "continue lendo", "assine", "cadastre",
        "©", "copyright", "todos os direitos",
    ];

    let lower = text.to_lowercase();
    for start in invalid_starts {
        if lower.starts_with(start) {
            return false;
        }
    }

    // Require at least a few words.
    text.split_whitespace().count() >= 4
}

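/// Filters out tracking pixels, spacers, logos, and data URIs, keeping only
/// URLs that look like real content images.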
fn is_valid_image_url(url: &str) -> bool {
    !url.contains("data:")
        && !url.contains("base64")
        && !url.contains("pixel")
        && !url.contains("tracking")
        && !url.contains("1x1")
        && !url.contains("spacer")
        && !url.contains("blank")
        && !url.contains("logo")
        && !url.contains("icon")
        && (url.contains(".jpg")
            || url.contains(".jpeg")
            || url.contains(".png")
            || url.contains(".webp")
            || url.contains(".gif")
            || url.contains("image"))
}

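/// Last-resort extraction: scans generic containers for large runs of text
/// and splits them into paragraph-sized blocks.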
fn extract_text_fallback(document: &Html, content: &mut Vec<ContentBlock>) {
    // Last resort: grab large blocks of text from generic containers.
    if let Ok(selector) = Selector::parse("div, section, article") {
        let mut texts: Vec<String> = Vec::new();

        for element in document.select(&selector) {
            let class_attr = element.value().attr("class").unwrap_or("");
            if should_skip_element(class_attr) {
                continue;
            }

            // Collect the element's text (this includes text from descendants).
            let text: String = element
                .text()
                .collect::<Vec<_>>()
                .join(" ");

            let text = clean_text(&text);

            if text.len() > 100 && !texts.contains(&text) {
                // Split the block into paragraphs on sentence boundaries.
                for part in text.split(". ") {
                    let part = part.trim();
                    if part.len() > 50 {
                        let para = if part.ends_with('.') {
                            part.to_string()
                        } else {
                            format!("{}.", part)
                        };
                        if !texts.contains(&para) {
                            texts.push(para.clone());
                            content.push(ContentBlock::Paragraph(para));
                        }
                    }
                }
            }
        }
    }
}

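/// Collapses runs of whitespace (including newlines) into single spaces.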
fn clean_text(text: &str) -> String {
    text.split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
        .trim()
        .to_string()
}

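/// Resolves a possibly relative image `src` against the article's URL,
/// handling absolute, protocol-relative, and root-relative forms.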
fn resolve_url(src: &str, base_url: &str) -> String {
    if src.starts_with("http://") || src.starts_with("https://") {
        return src.to_string();
    }

    if src.starts_with("//") {
        return format!("https:{}", src);
    }

    if let Some(pos) = base_url.find("://") {
        let after_protocol = &base_url[pos + 3..];
        if let Some(slash_pos) = after_protocol.find('/') {
            let base = &base_url[..pos + 3 + slash_pos];
            if src.starts_with('/') {
                return format!("{}{}", base, src);
            } else {
                return format!("{}/{}", base, src);
            }
        }
    }

    format!("{}/{}", base_url.trim_end_matches('/'), src.trim_start_matches('/'))
}
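
// A minimal smoke-test sketch showing how `parse_article` is meant to be
// called. The HTML fixture below is illustrative only (it is not taken from
// any real site) and exercises just the generic `article` selector path.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parses_a_simple_article() {
        let html = r#"
            <html>
              <head><title>Fallback title</title></head>
              <body>
                <article>
                  <h1>Example headline for testing</h1>
                  <p>This first paragraph has more than enough words to pass the validity checks.</p>
                  <p>This second paragraph also has more than enough words to pass the validity checks.</p>
                  <p>A third paragraph keeps the extracted container comfortably above the minimum length threshold.</p>
                </article>
              </body>
            </html>
        "#;

        let article = parse_article(html, "https://example.com/news/post")
            .expect("a simple article should parse");

        assert_eq!(article.title, "Example headline for testing");
        assert!(article
            .content
            .iter()
            .any(|block| matches!(block, ContentBlock::Paragraph(_))));
    }
}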