first commit

This commit is contained in:
andre.andresilva 2025-12-30 10:04:34 -03:00
parent edc398fe78
commit 97b30fbfc3
10 changed files with 5039 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/target

3348
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

20
Cargo.toml Normal file
View File

@ -0,0 +1,20 @@
[package]
name = "rip"
version = "0.1.0"
edition = "2021"
description = "Read In Peace - TUI para ler artigos sem paywall"
[dependencies]
ratatui = { version = "0.29", features = ["all-widgets"] }
crossterm = { version = "0.28", features = ["event-stream"] }
reqwest = { version = "0.12", features = ["blocking", "cookies", "json"] }
scraper = "0.21"
tokio = { version = "1", features = ["full"] }
ratatui-image = { version = "3", features = ["crossterm"] }
image = "0.25"
anyhow = "1.0"
urlencoding = "2.1"
textwrap = "0.16"
unicode-width = "0.2"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"

133
README.md Normal file
View File

@ -0,0 +1,133 @@
# rip - Read In Peace
TUI (Terminal User Interface) para ler artigos sem paywall diretamente no terminal.
## Funcionalidades
- Remove paywall de diversos sites de notícias
- Interface de leitura no terminal com navegação estilo vim
- Tradução automática para Português (Brasil)
- Suporte a sites brasileiros e internacionais
## Sites Suportados
### Brasileiros
- O Globo, G1
- Folha de São Paulo
- Estadão
- UOL
- Valor Econômico
- Exame
- Veja
### Internacionais
- Medium (via Freedium/Scribe.rip)
- New York Times
- Washington Post
- Wall Street Journal
- Financial Times
- Bloomberg
- The Atlantic
- Wired
- The Guardian
- BBC, CNN, Reuters
- E muitos outros...
## Instalação
### Requisitos
- Rust 1.70+ ([instalar](https://rustup.rs))
### Via Cargo (recomendado)
```bash
# Clone o repositório
git clone https://github.com/seu-usuario/rip.git
cd rip
# Instala globalmente
cargo install --path .
```
### Configurar PATH
Adicione o cargo bin ao seu PATH (necessário apenas uma vez):
```bash
# Para bash
echo 'export PATH="$HOME/.cargo/bin:$PATH"' >> ~/.bashrc
source ~/.bashrc
# Para zsh
echo 'export PATH="$HOME/.cargo/bin:$PATH"' >> ~/.zshrc
source ~/.zshrc
```
Depois de instalado, rode `rip` de qualquer diretório.
### Build manual
```bash
# Clone o repositório
git clone https://github.com/seu-usuario/rip.git
cd rip
# Build release
cargo build --release
# O binário estará em:
./target/release/rip
```
## Uso
```bash
rip
```
### Teclas de Atalho
| Tecla | Ação |
|-------|------|
| `Enter` | Buscar artigo (modo input) |
| `j` / `↓` | Rolar para baixo |
| `k` / `↑` | Rolar para cima |
| `Space` / `PageDown` | Página abaixo |
| `PageUp` | Página acima |
| `g` / `Home` | Ir para o início |
| `G` / `End` | Ir para o final |
| `t` | Traduzir para Português |
| `i` / `/` | Modo input (nova URL) |
| `Esc` | Voltar para leitura |
| `q` | Sair |
| `Ctrl+C` | Sair |
## Como Funciona
O rip utiliza múltiplas estratégias para contornar paywalls:
1. **Proxies de leitura**: Scribe.rip, LibMedium (md.vern.cc) e Freedium para artigos do Medium
2. **User-Agent spoofing**: Simula Googlebot e Facebook Bot
3. **Cache**: Google Cache
4. **Arquivos**: Archive.is, Archive.org
5. **Serviços**: 12ft.io, RemovePaywall
O sistema tenta automaticamente cada estratégia até encontrar o conteúdo completo.
## Exemplos
```bash
# Inicie o rip
rip
# Cole a URL no campo de input e pressione Enter
# Exemplo: https://medium.com/@user/artigo-interessante
# Use j/k para navegar
# Pressione 't' para traduzir
# Pressione 'q' para sair
```
## Licença
MIT

165
src/app.rs Normal file
View File

@ -0,0 +1,165 @@
use crate::fetcher;
use crate::parser::{Article, ContentBlock};
use crate::translator;
/// The interaction state the TUI is currently in.
///
/// The event loop dispatches key presses according to this value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Mode {
    /// The URL field has focus; typed characters are appended to it.
    Input,
    /// An article is shown; keys scroll it or trigger actions.
    Reading,
    /// An article fetch is in progress.
    Loading,
    /// The current article is being translated.
    Translating,
}
/// Mutable application state shared by the event loop and the renderer.
pub struct App {
    /// Current interaction mode.
    pub mode: Mode,
    /// Text typed into the URL field.
    pub input: String,
    /// The parsed article, once a fetch has succeeded.
    pub article: Option<Article>,
    /// Current scroll position of the reader view.
    pub scroll: usize,
    /// Description of the most recent fetch/parse failure, if any.
    pub error: Option<String>,
    /// Human-readable status message.
    pub status: String,
    /// Upper bound used by the scroll helpers.
    /// NOTE(review): nothing in this file assigns it besides `new`; presumably
    /// the renderer updates it — confirm.
    pub content_height: usize,
    /// Whether the current article has already been translated.
    pub is_translated: bool,
}
impl App {
pub fn new() -> Self {
Self {
mode: Mode::Input,
input: String::new(),
article: None,
scroll: 0,
error: None,
status: String::from("Digite a URL e pressione Enter"),
content_height: 0,
is_translated: false,
}
}
pub fn fetch_article(&mut self) {
self.mode = Mode::Loading;
self.status = String::from("Buscando artigo...");
self.error = None;
self.is_translated = false;
let url = self.input.clone();
match fetcher::fetch_without_paywall(&url) {
Ok(html) => {
match crate::parser::parse_article(&html, &url) {
Ok(article) => {
self.article = Some(article);
self.scroll = 0;
self.mode = Mode::Reading;
self.status = format!("Artigo carregado - {}", url);
}
Err(e) => {
self.error = Some(format!("Erro ao parsear: {}", e));
self.mode = Mode::Input;
self.status = String::from("Erro ao processar artigo");
}
}
}
Err(e) => {
self.error = Some(format!("Erro ao buscar: {}", e));
self.mode = Mode::Input;
self.status = String::from("Erro ao buscar artigo");
}
}
}
pub fn translate_article(&mut self) {
if self.is_translated {
self.status = String::from("Artigo ja traduzido!");
return;
}
if let Some(ref mut article) = self.article {
self.mode = Mode::Translating;
self.status = String::from("Traduzindo artigo para Portugues...");
// Traduz o titulo
if let Ok(translated_title) = translator::translate_to_portuguese(&article.title) {
article.title = translated_title;
}
// Traduz cada bloco de conteudo
let mut translated_content = Vec::new();
let total = article.content.len();
for (i, block) in article.content.iter().enumerate() {
self.status = format!("Traduzindo... {}/{}", i + 1, total);
let translated_block = match block {
ContentBlock::Title(text) => {
match translator::translate_to_portuguese(text) {
Ok(t) => ContentBlock::Title(t),
Err(_) => block.clone(),
}
}
ContentBlock::Subtitle(text) => {
match translator::translate_to_portuguese(text) {
Ok(t) => ContentBlock::Subtitle(t),
Err(_) => block.clone(),
}
}
ContentBlock::Paragraph(text) => {
match translator::translate_to_portuguese(text) {
Ok(t) => ContentBlock::Paragraph(t),
Err(_) => block.clone(),
}
}
ContentBlock::Heading(text) => {
match translator::translate_to_portuguese(text) {
Ok(t) => ContentBlock::Heading(t),
Err(_) => block.clone(),
}
}
ContentBlock::Quote(text) => {
match translator::translate_to_portuguese(text) {
Ok(t) => ContentBlock::Quote(t),
Err(_) => block.clone(),
}
}
ContentBlock::ListItem(text) => {
match translator::translate_to_portuguese(text) {
Ok(t) => ContentBlock::ListItem(t),
Err(_) => block.clone(),
}
}
ContentBlock::Image { url, alt } => {
let translated_alt = translator::translate_to_portuguese(alt)
.unwrap_or_else(|_| alt.clone());
ContentBlock::Image {
url: url.clone(),
alt: translated_alt,
}
}
};
translated_content.push(translated_block);
}
article.content = translated_content;
self.is_translated = true;
self.mode = Mode::Reading;
self.status = String::from("Artigo traduzido para Portugues!");
self.scroll = 0;
}
}
pub fn scroll_down(&mut self) {
if self.content_height > 0 {
self.scroll = self.scroll.saturating_add(1);
}
}
pub fn scroll_up(&mut self) {
self.scroll = self.scroll.saturating_sub(1);
}
pub fn scroll_to_end(&mut self) {
if self.content_height > 0 {
self.scroll = self.content_height.saturating_sub(1);
}
}
}

436
src/fetcher.rs Normal file
View File

@ -0,0 +1,436 @@
use anyhow::{anyhow, Result};
use reqwest::blocking::Client;
use reqwest::header::{HeaderMap, HeaderValue, ACCEPT, ACCEPT_LANGUAGE, USER_AGENT, REFERER};
use std::time::Duration;
/// Builds a blocking HTTP client that presents itself as desktop Chrome on Linux.
///
/// The client sends browser-like `Accept` / `Accept-Language` headers and a
/// Google `Referer`, keeps cookies between requests, follows at most 10
/// redirects and times out after 30 seconds.
///
/// # Errors
///
/// Returns an error if the underlying client cannot be constructed.
fn build_browser_client() -> Result<Client> {
    let mut headers = HeaderMap::new();
    headers.insert(
        USER_AGENT,
        HeaderValue::from_static(
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        ),
    );
    headers.insert(
        ACCEPT,
        HeaderValue::from_static("text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"),
    );
    headers.insert(ACCEPT_LANGUAGE, HeaderValue::from_static("pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7"));
    headers.insert(REFERER, HeaderValue::from_static("https://www.google.com/"));

    // TLS certificate validation is deliberately left at the library default
    // (enabled): the fetched HTML is parsed and shown to the user, so accepting
    // invalid certificates would let anyone on the network path inject content.
    Client::builder()
        .default_headers(headers)
        .timeout(Duration::from_secs(30))
        .cookie_store(true)
        .redirect(reqwest::redirect::Policy::limited(10))
        .build()
        .map_err(|e| anyhow!("Erro ao criar cliente: {}", e))
}
/// Builds a blocking HTTP client whose `User-Agent` identifies it as Googlebot.
///
/// Cookies are kept between requests and each request times out after
/// 30 seconds.
///
/// # Errors
///
/// Returns an error if the underlying client cannot be constructed.
fn build_googlebot_client() -> Result<Client> {
    const GOOGLEBOT_UA: &str =
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
    const ACCEPTED_TYPES: &str =
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";

    let mut default_headers = HeaderMap::new();
    default_headers.insert(USER_AGENT, HeaderValue::from_static(GOOGLEBOT_UA));
    default_headers.insert(ACCEPT, HeaderValue::from_static(ACCEPTED_TYPES));

    let built = Client::builder()
        .default_headers(default_headers)
        .timeout(Duration::from_secs(30))
        .cookie_store(true)
        .build();

    match built {
        Ok(client) => Ok(client),
        Err(err) => Err(anyhow!("Erro ao criar cliente: {}", err)),
    }
}
/// Builds a blocking HTTP client whose `User-Agent` identifies it as the
/// Facebook link-preview crawler.
///
/// Cookies are kept between requests and each request times out after
/// 30 seconds.
///
/// # Errors
///
/// Returns an error if the underlying client cannot be constructed.
fn build_facebook_bot_client() -> Result<Client> {
    const FACEBOOK_UA: &str =
        "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)";
    const ACCEPTED_TYPES: &str =
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";

    let mut default_headers = HeaderMap::new();
    default_headers.insert(USER_AGENT, HeaderValue::from_static(FACEBOOK_UA));
    default_headers.insert(ACCEPT, HeaderValue::from_static(ACCEPTED_TYPES));

    let built = Client::builder()
        .default_headers(default_headers)
        .timeout(Duration::from_secs(30))
        .cookie_store(true)
        .build();

    match built {
        Ok(client) => Ok(client),
        Err(err) => Err(anyhow!("Erro ao criar cliente: {}", err)),
    }
}
/// Downloads the HTML of `url`, trying each strategy chosen for that URL in
/// order.
///
/// The first valid response longer than 5000 bytes is returned immediately.
/// If no response is that long, the longest valid one seen is returned.
///
/// # Errors
///
/// When no strategy produced valid content, returns the error of the last
/// failed attempt (or a generic error if there were no attempts).
pub fn fetch_without_paywall(url: &str) -> Result<String> {
    let lowered = url.to_lowercase();
    let mut failure = anyhow!("Nenhuma estrategia funcionou");
    let mut best: Option<String> = None;

    for strategy in get_strategies_for_url(&lowered, url) {
        let html = match try_strategy(strategy, url) {
            Ok(html) => html,
            Err(err) => {
                failure = err;
                continue;
            }
        };

        if !is_valid_content(&html) {
            failure = anyhow!("Conteudo bloqueado ou invalido");
            continue;
        }

        let long_enough = html.len() > 5000;
        let previous_len = best.as_ref().map_or(0, |kept| kept.len());

        // Keep whichever valid response carries the most content.
        if html.len() > previous_len {
            best = Some(html);
        }

        // A long response is considered good enough; stop trying.
        if long_enough {
            break;
        }
    }

    best.ok_or(failure)
}
/// Chooses the ordered list of fetch strategies for a URL.
///
/// Matching is done on the URL's host (the exact domain or any subdomain of
/// it), not on a substring of the whole URL, so that e.g. `microsoft.com` is
/// not mistaken for `ft.com` and a path that merely mentions `medium.com` does
/// not select the Medium proxies.
///
/// * `url_lower` - the URL already converted to lowercase.
/// * `_url` - the URL as typed; currently unused.
fn get_strategies_for_url(url_lower: &str, _url: &str) -> Vec<Strategy> {
    /// Extracts the host from a URL, dropping scheme, credentials, port and path.
    fn host_of(url: &str) -> &str {
        let rest = url.split_once("://").map_or(url, |(_, rest)| rest);
        let authority = rest
            .split(|c: char| matches!(c, '/' | '?' | '#'))
            .next()
            .unwrap_or(rest);
        let host = authority
            .rsplit_once('@')
            .map_or(authority, |(_, host)| host);
        host.split(':').next().unwrap_or(host)
    }

    /// True when `host` is `domain` itself or one of its subdomains.
    fn on_domain(host: &str, domain: &str) -> bool {
        host == domain
            || host
                .strip_suffix(domain)
                .map_or(false, |prefix| prefix.ends_with('.'))
    }

    let host = host_of(url_lower);
    let hosted_on = |domains: &[&str]| domains.iter().any(|domain| on_domain(host, domain));

    // Medium and publications hosted on it.
    if hosted_on(&[
        "medium.com",
        "towardsdatascience.com",
        "levelup.gitconnected.com",
        "betterprogramming.pub",
    ]) {
        return vec![
            Strategy::ScribeRip,
            Strategy::LibMedium,
            Strategy::Freedium,
            Strategy::ArchiveIs,
            Strategy::ArchiveOrg,
        ];
    }

    // Brazilian sites (`uol.com.br` also covers `folha.uol.com.br`).
    if hosted_on(&[
        "globo.com",
        "uol.com.br",
        "estadao.com.br",
        "valor.com.br",
        "exame.com",
        "veja.abril.com.br",
    ]) {
        return vec![
            Strategy::DirectBrowser,
            Strategy::GoogleCache,
            Strategy::DirectGooglebot,
            Strategy::DirectFacebookBot,
            Strategy::ArchiveIs,
            Strategy::ArchiveOrg,
        ];
    }

    // NY Times
    if hosted_on(&["nytimes.com"]) {
        return vec![
            Strategy::ArchiveIs,
            Strategy::ArchiveOrg,
            Strategy::GoogleCache,
            Strategy::DirectGooglebot,
            Strategy::TwelveFt,
        ];
    }

    // Washington Post
    if hosted_on(&["washingtonpost.com"]) {
        return vec![
            Strategy::ArchiveIs,
            Strategy::ArchiveOrg,
            Strategy::DirectGooglebot,
            Strategy::GoogleCache,
        ];
    }

    // Wall Street Journal
    if hosted_on(&["wsj.com"]) {
        return vec![
            Strategy::ArchiveIs,
            Strategy::ArchiveOrg,
            Strategy::GoogleCache,
            Strategy::DirectFacebookBot,
        ];
    }

    // Financial Times
    if hosted_on(&["ft.com"]) {
        return vec![
            Strategy::ArchiveIs,
            Strategy::ArchiveOrg,
            Strategy::DirectGooglebot,
        ];
    }

    // Bloomberg
    if hosted_on(&["bloomberg.com"]) {
        return vec![
            Strategy::ArchiveIs,
            Strategy::ArchiveOrg,
            Strategy::DirectGooglebot,
            Strategy::GoogleCache,
        ];
    }

    // The Atlantic
    if hosted_on(&["theatlantic.com"]) {
        return vec![
            Strategy::ArchiveIs,
            Strategy::DirectGooglebot,
            Strategy::GoogleCache,
        ];
    }

    // Wired
    if hosted_on(&["wired.com"]) {
        return vec![
            Strategy::ArchiveIs,
            Strategy::DirectGooglebot,
            Strategy::TwelveFt,
        ];
    }

    // Generic fallback for every other site.
    vec![
        Strategy::DirectBrowser,
        Strategy::DirectGooglebot,
        Strategy::DirectFacebookBot,
        Strategy::ArchiveIs,
        Strategy::GoogleCache,
        Strategy::ArchiveOrg,
        Strategy::TwelveFt,
        Strategy::RemovePaywall,
    ]
}
/// Heuristically decides whether `html` is a real article page rather than an
/// error, challenge or block page.
///
/// A page is rejected when:
/// * it is shorter than 1000 bytes;
/// * it is shorter than 5000 bytes and mentions any known block/error phrase
///   (on longer pages such a phrase is assumed to be an incidental mention);
/// * it contains fewer than three `<p>` elements.
///
/// Matching is case-insensitive.
fn is_valid_content(html: &str) -> bool {
    const MIN_LEN: usize = 1000;
    const TRUSTED_LEN: usize = 5000;
    const MIN_PARAGRAPHS: usize = 3;

    // Stored in lowercase because they are compared against lowercased HTML.
    const BLOCKED_INDICATORS: &[&str] = &[
        "just a moment...",
        "enable javascript and cookies",
        "challenge-platform",
        "cf-browser-verification",
        "checking your browser",
        "ddos protection by",
        "attention required!",
        "access denied",
        "please enable js",
        "you need to enable javascript",
        "este site requer javascript",
        "captcha",
        "robot",
        "blocked",
        "acesso negado",
        "403 forbidden",
        "404 not found",
        "500 internal server error",
        "502 bad gateway",
        "503 service unavailable",
        "opppps..",
        "something went wrong",
        "error loading",
        "page not found",
    ];

    /// Counts opening `<p>` tags, ignoring other tags that merely start with
    /// the letter p (`<path>`, `<pre>`, `<picture>`, ...).
    fn count_paragraph_tags(html_lower: &str) -> usize {
        let bytes = html_lower.as_bytes();
        html_lower
            .match_indices("<p")
            .filter(|(idx, _)| {
                matches!(
                    bytes.get(idx + 2),
                    Some(b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r')
                )
            })
            .count()
    }

    if html.len() < MIN_LEN {
        return false;
    }

    let html_lower = html.to_lowercase();

    if html.len() < TRUSTED_LEN
        && BLOCKED_INDICATORS
            .iter()
            .any(|indicator| html_lower.contains(*indicator))
    {
        return false;
    }

    count_paragraph_tags(&html_lower) >= MIN_PARAGRAPHS
}
/// A way of obtaining the HTML of an article.
#[derive(Clone, Copy, Debug)]
enum Strategy {
    /// Request the URL directly with the browser-like client.
    DirectBrowser,
    /// Request the URL directly with the Googlebot client.
    DirectGooglebot,
    /// Request the URL directly with the Facebook crawler client.
    DirectFacebookBot,
    /// Request the page through `webcache.googleusercontent.com`.
    GoogleCache,
    /// Request the page through `archive.is/latest/`.
    ArchiveIs,
    /// Request the page through `web.archive.org`.
    ArchiveOrg,
    /// Request the page through `12ft.io`.
    TwelveFt,
    /// Request the page through `freedium.cfd`.
    Freedium,
    /// Request the article path on `scribe.rip`.
    ScribeRip,
    /// Request the article path on `md.vern.cc`.
    LibMedium,
    /// Request the page through `www.removepaywall.com`.
    RemovePaywall,
}
fn try_strategy(strategy: Strategy, url: &str) -> Result<String> {
match strategy {
Strategy::DirectBrowser => {
let client = build_browser_client()?;
let response = client.get(url).send()?;
if response.status().is_success() {
Ok(response.text()?)
} else {
Err(anyhow!("Status: {}", response.status()))
}
}
Strategy::DirectGooglebot => {
let client = build_googlebot_client()?;
let response = client.get(url).send()?;
if response.status().is_success() {
Ok(response.text()?)
} else {
Err(anyhow!("Status: {}", response.status()))
}
}
Strategy::DirectFacebookBot => {
let client = build_facebook_bot_client()?;
let response = client.get(url).send()?;
if response.status().is_success() {
Ok(response.text()?)
} else {
Err(anyhow!("Status: {}", response.status()))
}
}
Strategy::GoogleCache => {
let client = build_browser_client()?;
let cache_url = format!(
"https://webcache.googleusercontent.com/search?q=cache:{}",
urlencoding::encode(url)
);
let response = client.get(&cache_url).send()?;
if response.status().is_success() {
Ok(response.text()?)
} else {
Err(anyhow!("Google Cache nao disponivel"))
}
}
Strategy::ArchiveIs => {
let client = build_browser_client()?;
let archive_url = format!("https://archive.is/latest/{}", url);
let response = client
.get(&archive_url)
.timeout(Duration::from_secs(45))
.send()?;
if response.status().is_success() {
Ok(response.text()?)
} else {
Err(anyhow!("Archive.is nao disponivel: {}", response.status()))
}
}
Strategy::ArchiveOrg => {
let client = build_browser_client()?;
// Tenta a versão mais recente do Wayback Machine
let archive_url = format!(
"https://web.archive.org/web/2024/{}",
url
);
let response = client.get(&archive_url).send()?;
if response.status().is_success() {
Ok(response.text()?)
} else {
Err(anyhow!("Archive.org nao disponivel"))
}
}
Strategy::TwelveFt => {
let client = build_browser_client()?;
let twelve_url = format!("https://12ft.io/{}", url);
let response = client.get(&twelve_url).send()?;
if response.status().is_success() {
Ok(response.text()?)
} else {
Err(anyhow!("12ft.io nao disponivel: {}", response.status()))
}
}
Strategy::Freedium => {
let client = build_browser_client()?;
let freedium_url = format!("https://freedium.cfd/{}", url);
let response = client.get(&freedium_url).send()?;
if response.status().is_success() {
Ok(response.text()?)
} else {
Err(anyhow!("Freedium nao disponivel: {}", response.status()))
}
}
Strategy::ScribeRip => {
let client = build_browser_client()?;
let path = extract_medium_path(url);
let scribe_url = format!("https://scribe.rip{}", path);
let response = client.get(&scribe_url).send()?;
if response.status().is_success() {
Ok(response.text()?)
} else {
Err(anyhow!("Scribe.rip nao disponivel: {}", response.status()))
}
}
Strategy::LibMedium => {
let client = build_browser_client()?;
let path = extract_medium_path(url);
let libmedium_url = format!("https://md.vern.cc{}", path);
let response = client.get(&libmedium_url).send()?;
if response.status().is_success() {
Ok(response.text()?)
} else {
Err(anyhow!("LibMedium nao disponivel: {}", response.status()))
}
}
Strategy::RemovePaywall => {
let client = build_browser_client()?;
let remove_url = format!(
"https://www.removepaywall.com/search?url={}",
urlencoding::encode(url)
);
let response = client.get(&remove_url).send()?;
if response.status().is_success() {
Ok(response.text()?)
} else {
Err(anyhow!("RemovePaywall nao disponivel"))
}
}
}
}
/// Returns the part of `url` that follows the host (path, query and fragment),
/// always starting with `/`.
///
/// The result is appended to a mirror's origin, so it must never contain the
/// scheme or host. A URL without a path yields `"/"`, and URLs typed without a
/// scheme are handled as well.
fn extract_medium_path(url: &str) -> String {
    let without_scheme = url.split_once("://").map_or(url, |(_, rest)| rest);

    match without_scheme.find(|c: char| matches!(c, '/' | '?' | '#')) {
        // Regular case: the host is followed by a path.
        Some(pos) if without_scheme[pos..].starts_with('/') => without_scheme[pos..].to_string(),
        // The host is followed directly by a query or fragment.
        Some(pos) => format!("/{}", &without_scheme[pos..]),
        // Bare host.
        None => String::from("/"),
    }
}

107
src/main.rs Normal file
View File

@ -0,0 +1,107 @@
mod app;
mod fetcher;
mod parser;
mod translator;
mod ui;
use anyhow::Result;
use app::App;
use crossterm::{
event::{self, DisableMouseCapture, EnableMouseCapture, Event, KeyCode, KeyModifiers},
execute,
terminal::{disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen},
};
use ratatui::{backend::CrosstermBackend, Terminal};
use std::io;
fn main() -> Result<()> {
enable_raw_mode()?;
let mut stdout = io::stdout();
execute!(stdout, EnterAlternateScreen, EnableMouseCapture)?;
let backend = CrosstermBackend::new(stdout);
let mut terminal = Terminal::new(backend)?;
let mut app = App::new();
let res = run_app(&mut terminal, &mut app);
disable_raw_mode()?;
execute!(
terminal.backend_mut(),
LeaveAlternateScreen,
DisableMouseCapture
)?;
terminal.show_cursor()?;
if let Err(err) = res {
eprintln!("Error: {err:?}");
}
Ok(())
}
fn run_app(terminal: &mut Terminal<CrosstermBackend<io::Stdout>>, app: &mut App) -> Result<()> {
loop {
terminal.draw(|f| ui::draw(f, app))?;
if let Event::Key(key) = event::read()? {
if key.code == KeyCode::Char('c') && key.modifiers.contains(KeyModifiers::CONTROL) {
return Ok(());
}
match app.mode {
app::Mode::Input => match key.code {
KeyCode::Enter => {
if !app.input.is_empty() {
app.fetch_article();
}
}
KeyCode::Char(c) => {
app.input.push(c);
}
KeyCode::Backspace => {
app.input.pop();
}
KeyCode::Esc => {
if app.article.is_some() {
app.mode = app::Mode::Reading;
}
}
_ => {}
},
app::Mode::Reading => match key.code {
KeyCode::Char('q') => return Ok(()),
KeyCode::Char('i') | KeyCode::Char('/') => {
app.mode = app::Mode::Input;
}
KeyCode::Char('t') => {
app.translate_article();
}
KeyCode::Down | KeyCode::Char('j') => {
app.scroll_down();
}
KeyCode::Up | KeyCode::Char('k') => {
app.scroll_up();
}
KeyCode::PageDown | KeyCode::Char(' ') => {
for _ in 0..10 {
app.scroll_down();
}
}
KeyCode::PageUp => {
for _ in 0..10 {
app.scroll_up();
}
}
KeyCode::Home | KeyCode::Char('g') => {
app.scroll = 0;
}
KeyCode::End | KeyCode::Char('G') => {
app.scroll_to_end();
}
_ => {}
},
app::Mode::Loading | app::Mode::Translating => {}
}
}
}
}

476
src/parser.rs Normal file
View File

@ -0,0 +1,476 @@
use anyhow::{anyhow, Result};
use scraper::{Html, Selector};
/// One piece of extracted article content, in document order.
#[derive(Debug, Clone)]
pub enum ContentBlock {
    /// Text of the first `<h1>` found inside the article container.
    Title(String),
    /// Subtitle / lead text.
    Subtitle(String),
    /// Body paragraph.
    Paragraph(String),
    /// Image with its resolved URL and its `alt` text (empty when absent).
    Image { url: String, alt: String },
    /// Text of a `<blockquote>`.
    Quote(String),
    /// Text of an `<h2>`–`<h6>` heading.
    Heading(String),
    /// Text of an `<li>` element.
    ListItem(String),
}
/// A parsed article.
#[derive(Debug)]
pub struct Article {
    /// Article title, or a placeholder when none could be found.
    pub title: String,
    /// Extracted content blocks; never empty for an article returned by `parse_article`.
    pub content: Vec<ContentBlock>,
    /// The URL passed to `parse_article`.
    pub source_url: String,
}
/// Parses `html` into an [`Article`].
///
/// `source_url` is stored on the article and used to resolve relative image
/// URLs.
///
/// # Errors
///
/// Returns an error when no content block could be extracted.
pub fn parse_article(html: &str, source_url: &str) -> Result<Article> {
    let dom = Html::parse_document(html);
    let blocks = extract_content(&dom, source_url);

    if blocks.is_empty() {
        Err(anyhow!("Nao foi possivel extrair conteudo do artigo"))
    } else {
        Ok(Article {
            title: extract_title(&dom),
            content: blocks,
            source_url: source_url.to_owned(),
        })
    }
}
fn extract_title(document: &Html) -> String {
let selectors = [
// Sites BR
".content-head__title",
".titulo-materia",
".materia-titulo",
".post-title",
".entry-title",
".article-title",
".headline",
// Sites internacionais
"h1[data-testid='headline']",
"h1.article-headline",
"h1.post-headline",
"h1.story-title",
// Genéricos
"article h1",
".headline h1",
"header h1",
"h1",
"title",
];
for sel_str in selectors {
if let Ok(selector) = Selector::parse(sel_str) {
if let Some(element) = document.select(&selector).next() {
let text: String = element.text().collect::<Vec<_>>().join(" ");
let text = text.trim().to_string();
if !text.is_empty() && text.len() > 5 {
return text;
}
}
}
}
String::from("Sem titulo")
}
/// Extracts the article body from `document` as an ordered list of blocks.
///
/// Steps:
/// 1. pick the article container: the first selector whose first match holds
///    more than 200 bytes of text;
/// 2. emit the container's first `<h1>` as a title and the first matching
///    subtitle/lead element;
/// 3. walk paragraphs, headings, quotes, list items and images inside the
///    container;
/// 4. if that produced two blocks or fewer, apply progressively broader
///    fallbacks over the whole document;
/// 5. drop consecutive duplicate paragraphs/headings.
///
/// `base_url` is used to resolve relative image URLs. Returns an empty vector
/// when no container qualifies.
fn extract_content(document: &Html, base_url: &str) -> Vec<ContentBlock> {
    let mut content = Vec::new();
    // Selectors for the main article container.
    // Order matters: most specific first.
    // NOTE(review): `.article-content`, `.article-body` and `article` appear
    // more than once; a later duplicate can never win because it selects the
    // same first element as the earlier occurrence.
    let article_selectors = [
        // Globo/O Globo/G1
        ".mc-article-body",
        ".mc-body",
        ".content-text__container",
        ".materia-conteudo",
        ".post-content",
        // Folha de SP
        ".c-news__body",
        ".news__content",
        // Estadão
        ".news-body",
        ".content-body",
        // UOL
        ".text",
        ".corpo-texto",
        // Valor Econômico
        ".article-text",
        // Exame
        ".single-content",
        // Veja
        ".article-content",
        // Medium/Freedium/Scribe.rip
        "article section",
        "article",
        ".main-content",
        ".postArticle-content",
        // NY Times
        "[data-testid='article-body']",
        ".StoryBodyCompanionColumn",
        ".story-body-supplemental",
        // Washington Post
        ".article-body",
        ".teaser-content",
        // The Guardian
        ".article-body-commercial-selector",
        ".content__article-body",
        // Wall Street Journal
        ".article-content",
        ".wsj-snippet-body",
        // Bloomberg
        ".body-content",
        // Financial Times
        ".article__content-body",
        // Forbes
        ".article-body",
        ".vestibule",
        // Reuters
        ".article-body__content",
        // BBC
        "[data-component='text-block']",
        ".story-body__inner",
        // CNN
        ".article__content",
        ".zn-body__paragraph",
        // Wired
        ".body__inner-container",
        // The Atlantic
        ".article-body",
        // Economist
        ".article__body",
        // Archive.is wrapper
        "#CONTENT",
        // Generic (fallback)
        "article",
        "[role='main']",
        "[role='article']",
        "main",
        ".entry-content",
        ".story-body",
        ".article-body",
        ".content",
        "#content",
        "#article-body",
        ".post",
        "body",
    ];
    let mut article_html: Option<scraper::ElementRef> = None;
    for sel_str in article_selectors {
        if let Ok(selector) = Selector::parse(sel_str) {
            // Only the first element matching each selector is considered.
            if let Some(element) = document.select(&selector).next() {
                // Require a minimum amount of text (length in bytes).
                let text_len: usize = element.text().collect::<String>().len();
                if text_len > 200 {
                    article_html = Some(element);
                    break;
                }
            }
        }
    }
    let article = match article_html {
        Some(a) => a,
        None => return content,
    };
    // Emit the title when the container has an <h1>.
    if let Ok(h1_sel) = Selector::parse("h1") {
        if let Some(h1) = article.select(&h1_sel).next() {
            let text: String = h1.text().collect::<Vec<_>>().join(" ");
            let text = clean_text(&text);
            if !text.is_empty() && text.len() > 5 {
                content.push(ContentBlock::Title(text));
            }
        }
    }
    // Emit the subtitle/lead: first selector whose first match is long enough.
    let subtitle_selectors = [
        ".content-head__subtitle",
        ".subtitulo",
        ".lead",
        ".excerpt",
        ".article-summary",
        ".deck",
        "h2.subtitle",
    ];
    for sel_str in subtitle_selectors {
        if let Ok(selector) = Selector::parse(sel_str) {
            if let Some(element) = article.select(&selector).next() {
                let text: String = element.text().collect::<Vec<_>>().join(" ");
                let text = clean_text(&text);
                if !text.is_empty() && text.len() > 10 {
                    content.push(ContentBlock::Subtitle(text));
                    break;
                }
            }
        }
    }
    // Broad selector covering every kind of content element.
    let content_selector = "p, h2, h3, h4, h5, h6, blockquote, img, li, figure img, \
.content-text, .paragraph, .text-paragraph, \
.content-intertitle h2, .intertitle, \
[data-component='text-block'], \
.paywall";
    if let Ok(p_sel) = Selector::parse(content_selector) {
        for element in article.select(&p_sel) {
            let tag_name = element.value().name();
            let class_attr = element.value().attr("class").unwrap_or("");
            // Skip navigation, ads and similar boilerplate.
            // Only the element's own class attribute is inspected.
            if should_skip_element(class_attr) {
                continue;
            }
            match tag_name {
                "p" => {
                    let text: String = element.text().collect::<Vec<_>>().join(" ");
                    let text = clean_text(&text);
                    if is_valid_paragraph(&text) {
                        content.push(ContentBlock::Paragraph(text));
                    }
                }
                // Class-selected <div>s are treated as paragraphs.
                "div" if class_attr.contains("text") || class_attr.contains("paragraph") => {
                    let text: String = element.text().collect::<Vec<_>>().join(" ");
                    let text = clean_text(&text);
                    if is_valid_paragraph(&text) {
                        content.push(ContentBlock::Paragraph(text));
                    }
                }
                "h2" | "h3" | "h4" | "h5" | "h6" => {
                    let text: String = element.text().collect::<Vec<_>>().join(" ");
                    let text = clean_text(&text);
                    if !text.is_empty() && text.len() > 2 {
                        content.push(ContentBlock::Heading(text));
                    }
                }
                "blockquote" => {
                    let text: String = element.text().collect::<Vec<_>>().join(" ");
                    let text = clean_text(&text);
                    if !text.is_empty() {
                        content.push(ContentBlock::Quote(text));
                    }
                }
                "li" => {
                    let text: String = element.text().collect::<Vec<_>>().join(" ");
                    let text = clean_text(&text);
                    if !text.is_empty() && text.len() > 5 {
                        content.push(ContentBlock::ListItem(text));
                    }
                }
                // NOTE(review): the selector lists `img` and `figure img` but
                // not a bare `figure`, so a <figure> element would only reach
                // this arm through one of the class selectors.
                "img" | "figure" => {
                    // Lazy-loaded images keep the real URL in a data attribute.
                    if let Some(src) = element.value().attr("src")
                        .or_else(|| element.value().attr("data-src"))
                        .or_else(|| element.value().attr("data-lazy-src"))
                    {
                        let url = resolve_url(src, base_url);
                        let alt = element.value().attr("alt").unwrap_or("").to_string();
                        if is_valid_image_url(&url) {
                            content.push(ContentBlock::Image { url, alt });
                        }
                    }
                }
                _ => {}
            }
        }
    }
    // Fallback 1: take every <p> in the whole document.
    if content.len() <= 2 {
        if let Ok(p_sel) = Selector::parse("p") {
            for element in document.select(&p_sel) {
                let class_attr = element.value().attr("class").unwrap_or("");
                if should_skip_element(class_attr) {
                    continue;
                }
                let text: String = element.text().collect::<Vec<_>>().join(" ");
                let text = clean_text(&text);
                if is_valid_paragraph(&text) {
                    content.push(ContentBlock::Paragraph(text));
                }
            }
        }
    }
    // Fallback 2: text-bearing <div>s and nested paragraphs.
    if content.len() <= 2 {
        let div_selectors = "div.text, div.paragraph, div.content-text, \
section p, article p, .prose p, .body p";
        if let Ok(div_sel) = Selector::parse(div_selectors) {
            for element in document.select(&div_sel) {
                let text: String = element.text().collect::<Vec<_>>().join(" ");
                let text = clean_text(&text);
                if is_valid_paragraph(&text) {
                    content.push(ContentBlock::Paragraph(text));
                }
            }
        }
    }
    // Fallback 3: pull text out of any element with substantial content.
    if content.len() <= 2 {
        extract_text_fallback(document, &mut content);
    }
    // Remove consecutive duplicates (paragraphs and headings only).
    content.dedup_by(|a, b| {
        match (a, b) {
            (ContentBlock::Paragraph(t1), ContentBlock::Paragraph(t2)) => t1 == t2,
            (ContentBlock::Heading(t1), ContentBlock::Heading(t2)) => t1 == t2,
            _ => false,
        }
    });
    content
}
/// Tells whether an element's `class` attribute marks it as page boilerplate
/// (navigation, ads, sharing widgets, ...) rather than article content.
///
/// Each class name is compared case-insensitively against a list of words. A
/// class matches a word when it equals it, starts with `word-` or ends with
/// `-word`; matching on these boundaries keeps e.g. `navigation` from matching
/// `nav`. Ad classes are `ad`, `ads`, or anything starting with `ad-`/`ads-`.
fn should_skip_element(class_attr: &str) -> bool {
    const SKIP_WORDS: &[&str] = &[
        "nav", "menu", "footer", "header", "sidebar", "comment",
        "share", "social", "related", "recommend", "advertisement",
        "banner", "promo", "newsletter", "subscribe", "login",
        "signup", "paywall-msg", "paywall-banner", "author-bio",
        "tags", "breadcrumb", "pagination", "copyright",
    ];

    let lowered = class_attr.to_lowercase();

    lowered.split_whitespace().any(|class| {
        let is_boilerplate = SKIP_WORDS.iter().any(|word| {
            class == *word
                || class
                    .strip_prefix(*word)
                    .map_or(false, |rest| rest.starts_with('-'))
                || class
                    .strip_suffix(*word)
                    .map_or(false, |rest| rest.ends_with('-'))
        });

        let is_advert = class == "ad"
            || class == "ads"
            || class.starts_with("ad-")
            || class.starts_with("ads-");

        is_boilerplate || is_advert
    })
}
/// Decides whether `text` looks like a real body paragraph.
///
/// Rejects text shorter than 15 bytes, text with fewer than four words, and
/// text that starts (case-insensitively) with a phrase typical of metadata
/// such as share prompts, bylines, photo credits or copyright notices.
fn is_valid_paragraph(text: &str) -> bool {
    const METADATA_PREFIXES: &[&str] = &[
        "compartilh", "compart", "publicado", "atualizado", "por ",
        "foto:", "imagem:", "crédito", "leia mais", "veja também",
        "saiba mais", "continue lendo", "assine", "cadastre",
        "©", "copyright", "todos os direitos",
    ];

    if text.len() < 15 {
        return false;
    }

    let lowered = text.to_lowercase();
    let looks_like_metadata = METADATA_PREFIXES
        .iter()
        .any(|prefix| lowered.starts_with(*prefix));

    // Must also contain at least a few words.
    !looks_like_metadata && text.split_whitespace().count() >= 4
}
/// Decides whether `url` plausibly points at a content image.
///
/// The URL must contain none of the markers typical of inline data, tracking
/// pixels, spacers, logos or icons, and must contain either a known image
/// extension or the word `image`. All checks are case-sensitive substring
/// tests.
fn is_valid_image_url(url: &str) -> bool {
    const REJECTED_MARKERS: [&str; 9] = [
        "data:", "base64", "pixel", "tracking", "1x1", "spacer", "blank", "logo", "icon",
    ];
    const IMAGE_MARKERS: [&str; 6] = [".jpg", ".jpeg", ".png", ".webp", ".gif", "image"];

    if REJECTED_MARKERS.iter().any(|marker| url.contains(*marker)) {
        return false;
    }

    IMAGE_MARKERS.iter().any(|marker| url.contains(*marker))
}
/// Last-resort extraction: splits the text of large `<div>`, `<section>` and
/// `<article>` elements into sentence-like paragraphs and appends them to
/// `content`.
///
/// The text of an element includes the text of all its descendants, so nested
/// containers yield the same sentences repeatedly; a set of already emitted
/// paragraphs keeps each one from being added more than once.
fn extract_text_fallback(document: &Html, content: &mut Vec<ContentBlock>) {
    if let Ok(selector) = Selector::parse("div, section, article") {
        let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();

        for element in document.select(&selector) {
            let class_attr = element.value().attr("class").unwrap_or("");
            if should_skip_element(class_attr) {
                continue;
            }

            let text = clean_text(&element.text().collect::<Vec<_>>().join(" "));
            if text.len() <= 100 || seen.contains(&text) {
                continue;
            }

            // Split into paragraphs at sentence boundaries.
            for part in text.split(". ") {
                let part = part.trim();
                if part.len() <= 50 {
                    continue;
                }

                // The split removes the full stop from all but the last part.
                let para = if part.ends_with('.') {
                    part.to_string()
                } else {
                    format!("{}.", part)
                };

                // `insert` returns false when the paragraph was already emitted.
                if seen.insert(para.clone()) {
                    content.push(ContentBlock::Paragraph(para));
                }
            }
        }
    }
}
/// Collapses every run of whitespace in `text` into a single space and removes
/// leading and trailing whitespace.
fn clean_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !cleaned.is_empty() {
            cleaned.push(' ');
        }
        cleaned.push_str(word);
    }
    cleaned
}
/// Turns an image `src` into an absolute URL.
///
/// * `http://` / `https://` sources are returned unchanged.
/// * Protocol-relative sources (`//host/...`) get an `https:` prefix.
/// * Anything else is appended to the origin (`scheme://host`) of `base_url`;
///   when `base_url` has no path after the host, it is appended to `base_url`
///   itself.
fn resolve_url(src: &str, base_url: &str) -> String {
    let is_absolute = ["http://", "https://"]
        .iter()
        .any(|scheme| src.starts_with(*scheme));
    if is_absolute {
        return src.to_owned();
    }

    if src.starts_with("//") {
        return ["https:", src].concat();
    }

    // `scheme://host`, available only when the base URL has a path component.
    let origin = base_url.find("://").and_then(|scheme_end| {
        let host_start = scheme_end + 3;
        base_url[host_start..]
            .find('/')
            .map(|path_start| &base_url[..host_start + path_start])
    });

    match origin {
        Some(origin) if src.starts_with('/') => format!("{}{}", origin, src),
        Some(origin) => format!("{}/{}", origin, src),
        None => format!(
            "{}/{}",
            base_url.trim_end_matches('/'),
            src.trim_start_matches('/')
        ),
    }
}

98
src/translator.rs Normal file
View File

@ -0,0 +1,98 @@
use anyhow::{anyhow, Result};
use reqwest::blocking::Client;
use serde::Deserialize;
use std::time::Duration;
/// JSON body returned by a LibreTranslate `/translate` endpoint; only the
/// field used by this module is declared.
#[derive(Debug, Deserialize)]
struct LibreTranslateResponse {
    /// The translated text (`translatedText` in the JSON payload).
    #[serde(rename = "translatedText")]
    translated_text: String,
}
/// Translates `text` into Portuguese.
///
/// Blank input yields an empty string without any network request. Google
/// Translate is tried first; if it fails, the LibreTranslate instances are
/// tried.
///
/// # Errors
///
/// Returns the LibreTranslate error when both backends fail.
pub fn translate_to_portuguese(text: &str) -> Result<String> {
    if text.trim().is_empty() {
        return Ok(String::new());
    }

    translate_google(text).or_else(|_| translate_libre(text))
}
/// Translates `text` to Brazilian Portuguese through the
/// `translate.googleapis.com` endpoint, letting it detect the source language.
///
/// # Errors
///
/// Returns an error when the request fails, the status is not a success, the
/// body is not valid JSON, or no translated text is found in the response.
fn translate_google(text: &str) -> Result<String> {
    let http = Client::builder()
        .timeout(Duration::from_secs(30))
        .build()?;

    let endpoint = format!(
        "https://translate.googleapis.com/translate_a/single?client=gtx&sl=auto&tl=pt-BR&dt=t&q={}",
        urlencoding::encode(text)
    );

    let response = http
        .get(&endpoint)
        .header("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36")
        .send()?;

    let status = response.status();
    if !status.is_success() {
        return Err(anyhow!("Google Translate erro: {}", status));
    }

    let payload: serde_json::Value = serde_json::from_str(&response.text()?)?;

    // The first element of the outer array is read as a list of segments, and
    // the first element of each segment is taken as translated text.
    let translated: String = payload
        .get(0)
        .and_then(|segments| segments.as_array())
        .into_iter()
        .flatten()
        .filter_map(|segment| segment.get(0).and_then(|piece| piece.as_str()))
        .collect();

    if translated.is_empty() {
        Err(anyhow!("Resposta vazia do Google Translate"))
    } else {
        Ok(translated)
    }
}
/// Translates `text` to Portuguese using the first LibreTranslate instance
/// that answers successfully.
///
/// Instances are tried in order. A failed request, a non-success status or an
/// undecodable body simply moves on to the next instance (best effort).
///
/// # Errors
///
/// Fails when the HTTP client cannot be built or when no instance produced a
/// usable answer.
fn translate_libre(text: &str) -> Result<String> {
    const ENDPOINTS: [&str; 3] = [
        "https://libretranslate.com/translate",
        "https://translate.argosopentech.com/translate",
        "https://translate.terraprint.co/translate",
    ];

    let http = Client::builder()
        .timeout(Duration::from_secs(30))
        .build()?;

    // Same request body for every instance, so build it once.
    let payload = serde_json::json!({
        "q": text,
        "source": "auto",
        "target": "pt",
        "format": "text"
    });

    for endpoint in ENDPOINTS {
        let Ok(reply) = http
            .post(endpoint)
            .header("Content-Type", "application/json")
            .json(&payload)
            .send()
        else {
            continue;
        };
        if !reply.status().is_success() {
            continue;
        }
        if let Ok(parsed) = reply.json::<LibreTranslateResponse>() {
            return Ok(parsed.translated_text);
        }
    }

    Err(anyhow!("Nenhum servidor de traducao disponivel"))
}

255
src/ui.rs Normal file
View File

@ -0,0 +1,255 @@
use crate::app::{App, Mode};
use crate::parser::ContentBlock;
use ratatui::{
layout::{Constraint, Direction, Layout, Rect},
style::{Color, Modifier, Style},
text::{Line, Span, Text},
widgets::{Block, Borders, Paragraph, Wrap},
Frame,
};
use textwrap::wrap;
/// Draws one full frame of the UI.
///
/// The screen is split vertically into three regions: a 3-row URL input box
/// on top, the content pane taking the remaining space (at least 1 row), and
/// a 3-row status bar at the bottom.
pub fn draw(frame: &mut Frame, app: &mut App) {
    let layout = Layout::default()
        .direction(Direction::Vertical)
        .constraints([
            Constraint::Length(3),
            Constraint::Min(1),
            Constraint::Length(3),
        ]);
    let regions = layout.split(frame.area());
    let (input_area, content_area, status_area) = (regions[0], regions[1], regions[2]);

    draw_input(frame, app, input_area);
    draw_content(frame, app, content_area);
    draw_status(frame, app, status_area);
}
/// Renders the bordered URL input box and, while in [`Mode::Input`], places
/// the terminal cursor right after the typed text.
///
/// Text and border are highlighted (yellow / cyan) in input mode and gray
/// otherwise.
fn draw_input(frame: &mut Frame, app: &App, area: Rect) {
    let editing = app.mode == Mode::Input;
    let text_style = Style::default().fg(if editing { Color::Yellow } else { Color::Gray });
    let border_style = Style::default().fg(if editing { Color::Cyan } else { Color::Gray });

    let input = Paragraph::new(app.input.as_str())
        .style(text_style)
        .block(
            Block::default()
                .borders(Borders::ALL)
                .title(" URL ")
                .border_style(border_style),
        );
    frame.render_widget(input, area);

    if editing {
        // Count characters, not bytes: a multi-byte character (e.g. an
        // accented letter) must advance the cursor by one column, not two.
        // NOTE(review): double-width glyphs would still be under-counted.
        let typed = u16::try_from(app.input.chars().count()).unwrap_or(u16::MAX);
        // +1 skips the left border. The offset is clamped to the box so a
        // very long URL cannot push the cursor outside of it, and saturating
        // arithmetic rules out u16 overflow.
        let offset = typed
            .saturating_add(1)
            .min(area.width.saturating_sub(1));
        frame.set_cursor_position((
            area.x.saturating_add(offset),
            area.y.saturating_add(1),
        ));
    }
}
/// Renders the central pane.
///
/// * [`Mode::Loading`] / [`Mode::Translating`]: a one-line progress message.
/// * Otherwise: the loaded article if there is one, else the last error, else
///   the built-in help screen.
///
/// When an article is shown, its blocks are pre-wrapped into terminal lines,
/// `app.content_height` is set to the total number of those lines, and
/// `app.scroll` is clamped so it always points at an existing line.
fn draw_content(frame: &mut Frame, app: &mut App, area: Rect) {
    // Markers drawn in front of wrapped text.
    const HEADING_PREFIX: &str = "## ";
    const QUOTE_PREFIX: &str = " | ";
    const LIST_BULLET: &str = " * ";

    // Preferred text width: pane width minus borders and a small margin.
    let inner_width = area.width.saturating_sub(4) as usize;
    // Columns actually available between the left and right border.
    let pane_width = area.width.saturating_sub(2) as usize;
    // Wrap width for text preceded by `prefix`, capped at `cap` columns.
    // The prefix width is reserved so that prefix + text always fits inside
    // the pane; otherwise `Paragraph` would re-wrap the line and the number of
    // rendered rows would no longer match `content_height`. Never below 1.
    let fit = |prefix: &str, cap: usize| {
        pane_width
            .saturating_sub(prefix.len())
            .min(inner_width)
            .min(cap)
            .max(1)
    };
    let body_width = fit("", 80);
    let heading_width = fit(HEADING_PREFIX, 80);
    let quote_width = fit(QUOTE_PREFIX, 76);
    let list_width = fit(LIST_BULLET, 76);

    match app.mode {
        Mode::Loading => {
            let loading = Paragraph::new("Carregando artigo...")
                .style(Style::default().fg(Color::Yellow))
                .block(Block::default().borders(Borders::ALL).title(" Conteudo "));
            frame.render_widget(loading, area);
        }
        Mode::Translating => {
            let translating = Paragraph::new(app.status.clone())
                .style(Style::default().fg(Color::Magenta))
                .block(Block::default().borders(Borders::ALL).title(" Traduzindo "));
            frame.render_widget(translating, area);
        }
        Mode::Input | Mode::Reading => {
            if let Some(ref article) = app.article {
                let mut lines: Vec<Line> = Vec::new();

                // Header: title, source URL and a horizontal rule.
                lines.push(Line::from(Span::styled(
                    &article.title,
                    Style::default()
                        .fg(Color::Cyan)
                        .add_modifier(Modifier::BOLD),
                )));
                lines.push(Line::from(""));
                lines.push(Line::from(Span::styled(
                    &article.source_url,
                    Style::default().fg(Color::DarkGray),
                )));
                lines.push(Line::from(""));
                // Repeating an empty string would always yield an empty
                // line, so the rule is drawn with an explicit dash.
                lines.push(Line::from(Span::styled(
                    "-".repeat(inner_width.min(80)),
                    Style::default().fg(Color::DarkGray),
                )));
                lines.push(Line::from(""));

                for block in &article.content {
                    match block {
                        ContentBlock::Title(text) => {
                            let style = Style::default()
                                .fg(Color::White)
                                .add_modifier(Modifier::BOLD);
                            for wrapped in wrap(text, body_width) {
                                lines.push(Line::from(Span::styled(wrapped.into_owned(), style)));
                            }
                            lines.push(Line::from(""));
                        }
                        ContentBlock::Subtitle(text) => {
                            let style = Style::default()
                                .fg(Color::Gray)
                                .add_modifier(Modifier::ITALIC);
                            for wrapped in wrap(text, body_width) {
                                lines.push(Line::from(Span::styled(wrapped.into_owned(), style)));
                            }
                            lines.push(Line::from(""));
                        }
                        ContentBlock::Heading(text) => {
                            let style = Style::default()
                                .fg(Color::Yellow)
                                .add_modifier(Modifier::BOLD);
                            lines.push(Line::from(""));
                            for wrapped in wrap(text, heading_width) {
                                lines.push(Line::from(Span::styled(
                                    format!("{}{}", HEADING_PREFIX, wrapped),
                                    style,
                                )));
                            }
                            lines.push(Line::from(""));
                        }
                        ContentBlock::Paragraph(text) => {
                            for wrapped in wrap(text, body_width) {
                                lines.push(Line::from(wrapped.into_owned()));
                            }
                            lines.push(Line::from(""));
                        }
                        ContentBlock::Quote(text) => {
                            for wrapped in wrap(text, quote_width) {
                                lines.push(Line::from(vec![
                                    Span::styled(QUOTE_PREFIX, Style::default().fg(Color::DarkGray)),
                                    Span::styled(
                                        wrapped.into_owned(),
                                        Style::default()
                                            .fg(Color::Gray)
                                            .add_modifier(Modifier::ITALIC),
                                    ),
                                ]));
                            }
                            lines.push(Line::from(""));
                        }
                        ContentBlock::ListItem(text) => {
                            // Only the first wrapped line carries the bullet;
                            // list items are not followed by a blank line.
                            for (i, wrapped) in wrap(text, list_width).iter().enumerate() {
                                if i == 0 {
                                    lines.push(Line::from(vec![
                                        Span::styled(LIST_BULLET, Style::default().fg(Color::Cyan)),
                                        Span::raw(wrapped.to_string()),
                                    ]));
                                } else {
                                    lines.push(Line::from(format!(" {}", wrapped)));
                                }
                            }
                        }
                        ContentBlock::Image { url, alt } => {
                            // Images are not rendered; a placeholder with the
                            // alt text and the (truncated) URL is shown.
                            lines.push(Line::from(""));
                            lines.push(Line::from(Span::styled(
                                format!("[IMG: {}]", if alt.is_empty() { "imagem" } else { alt }),
                                Style::default().fg(Color::Magenta),
                            )));
                            lines.push(Line::from(Span::styled(
                                format!(" {}", truncate_url(url, inner_width.saturating_sub(4))),
                                Style::default().fg(Color::DarkGray),
                            )));
                            lines.push(Line::from(""));
                        }
                    }
                }

                app.content_height = lines.len();
                // The line count changes with the pane width (and with the
                // article itself), so a scroll offset computed for an earlier
                // frame may now point past the end. Clamp it instead of
                // showing an empty pane and an "N+1/N" position.
                app.scroll = app.scroll.min(app.content_height.saturating_sub(1));

                let visible_lines: Vec<Line> = lines.into_iter().skip(app.scroll).collect();
                let content = Paragraph::new(Text::from(visible_lines))
                    .block(Block::default().borders(Borders::ALL).title(format!(
                        " Artigo [{}/{}] ",
                        app.scroll + 1,
                        app.content_height.max(1)
                    )))
                    // Safety net only: lines are already wrapped to fit.
                    .wrap(Wrap { trim: false });
                frame.render_widget(content, area);
            } else if let Some(ref error) = app.error {
                let error_text = Paragraph::new(error.as_str())
                    .style(Style::default().fg(Color::Red))
                    .block(Block::default().borders(Borders::ALL).title(" Erro "));
                frame.render_widget(error_text, area);
            } else {
                let help_text = vec![
                    Line::from(""),
                    Line::from(vec![Span::styled(
                        " Remove Paywall - Leitor de Artigos",
                        Style::default().fg(Color::Cyan).add_modifier(Modifier::BOLD),
                    )]),
                    Line::from(""),
                    Line::from(" Digite a URL de um artigo e pressione Enter"),
                    Line::from(""),
                    Line::from(vec![Span::styled(" Comandos:", Style::default().fg(Color::Yellow))]),
                    Line::from(""),
                    Line::from(" Enter - Buscar artigo"),
                    Line::from(" t - Traduzir para Portugues"),
                    Line::from(" j/k - Scroll linha a linha"),
                    Line::from(" Space - Scroll pagina"),
                    Line::from(" g/G - Inicio/Fim"),
                    Line::from(" i ou / - Novo URL"),
                    Line::from(" q - Sair"),
                    Line::from(" Ctrl+C - Sair"),
                ];
                let help = Paragraph::new(Text::from(help_text))
                    .block(Block::default().borders(Borders::ALL).title(" Ajuda "));
                frame.render_widget(help, area);
            }
        }
    }
}
/// Renders the bottom status bar: a bold, colour-coded mode badge followed by
/// the current status message in white.
///
/// In reading mode the badge also indicates whether the article on screen is
/// the translated version.
fn draw_status(frame: &mut Frame, app: &App, area: Rect) {
    // Badge text and colour are decided together, one arm per mode.
    let (label, color) = match app.mode {
        Mode::Loading => ("CARREGANDO", Color::Yellow),
        Mode::Translating => ("TRADUZINDO", Color::Magenta),
        Mode::Reading if app.is_translated => ("LENDO [PT-BR]", Color::Green),
        Mode::Reading => ("LENDO", Color::Green),
        Mode::Input => ("INPUT", Color::Cyan),
    };

    let badge = Span::styled(
        format!(" [{}] ", label),
        Style::default().fg(color).add_modifier(Modifier::BOLD),
    );
    let message = Span::styled(&app.status, Style::default().fg(Color::White));

    let bar = Paragraph::new(Line::from(vec![badge, message]))
        .block(Block::default().borders(Borders::ALL));
    frame.render_widget(bar, area);
}
/// Shortens `url` to at most `max_len` characters for display.
///
/// URLs that already fit are returned unchanged. Longer ones are cut to
/// `max_len - 3` characters and suffixed with `"..."`. When `max_len` is
/// smaller than 3 the result is just `"..."`.
///
/// Lengths are measured in characters rather than bytes, so the cut can never
/// land inside a multi-byte character (which would panic when slicing).
fn truncate_url(url: &str, max_len: usize) -> String {
    if url.chars().count() <= max_len {
        return url.to_string();
    }
    let kept: String = url.chars().take(max_len.saturating_sub(3)).collect();
    format!("{}...", kept)
}