diff --git a/Cargo.toml b/Cargo.toml index 8bda5d4..2f3f439 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,6 @@ [package] name = "readability" +edition = "2021" version = "0.2.0" authors = ["Hiroki Kumamoto "] license = "MIT" @@ -12,14 +13,28 @@ categories = [] [dependencies] regex = "1.4" url = "2.2" -html5ever = "0.25" -markup5ever_rcdom = "0.1" +html5ever = "0.26.0" +markup5ever_rcdom = "0.2.0" lazy_static = "1.4" [dependencies.reqwest] version = "0.11" optional = true -features = ["blocking"] [features] -default = ["reqwest"] +default = ["http-async"] +http-async = ["reqwest"] +http-blocking = ["reqwest", "reqwest/blocking"] + +[dev-dependencies] +tokio = { version = "1.25.0", features = ["full"] } + +[[test]] +required-features = ["http-blocking"] +name = "blocking" +path = "tests/blocking.rs" + +[[test]] +required-features = ["http-async"] +name = "async" +path = "tests/async.rs" diff --git a/src/dom.rs b/src/dom.rs index 14e6e97..f263d96 100644 --- a/src/dom.rs +++ b/src/dom.rs @@ -1,51 +1,52 @@ -use std::rc::Rc; use html5ever::tendril::StrTendril; +use html5ever::Attribute; use markup5ever_rcdom::NodeData::{Element, Text}; use markup5ever_rcdom::{Handle, Node}; -use html5ever::Attribute; +use std::rc::Rc; use std::str::FromStr; pub fn get_tag_name(handle: Handle) -> Option { match handle.data { - Element { ref name, .. } => Some(name.local.as_ref().to_lowercase().to_string()), + Element { ref name, .. } => Some(name.local.as_ref().to_lowercase()), _ => None, } } -pub fn get_attr<'a>(name: &str, handle: Handle) -> Option { +pub fn get_attr(name: &str, handle: Handle) -> Option { match handle.data { - Element { name: _, ref attrs, .. } => attr(name, &attrs.borrow()), - _ => None, + Element { + name: _, ref attrs, .. 
+ } => attr(name, &attrs.borrow()), + _ => None, } } -pub fn attr(attr_name: &str, attrs: &Vec) -> Option { +pub fn attr(attr_name: &str, attrs: &[Attribute]) -> Option { for attr in attrs.iter() { if attr.name.local.as_ref() == attr_name { - return Some(attr.value.to_string()) + return Some(attr.value.to_string()); } } None } pub fn set_attr(attr_name: &str, value: &str, handle: Handle) { - match handle.data { - Element { name: _, ref attrs, .. } => { - let attrs = &mut attrs.borrow_mut(); - if let Some(index) = attrs.iter().position(|attr| { - let name = attr.name.local.as_ref(); - name == attr_name - }) { - match StrTendril::from_str(value) { - Ok(value) => attrs[index] = Attribute { - name: attrs[index].name.clone(), - value: value, - }, - Err(_) => (), + if let Element { + name: _, ref attrs, .. + } = handle.data + { + let attrs = &mut attrs.borrow_mut(); + if let Some(index) = attrs.iter().position(|attr| { + let name = attr.name.local.as_ref(); + name == attr_name + }) { + if let Ok(value) = StrTendril::from_str(value) { + attrs[index] = Attribute { + name: attrs[index].name.clone(), + value, } } } - _ => (), } } @@ -64,54 +65,52 @@ pub fn is_empty(handle: Handle) -> bool { match c.data { Text { ref contents } => { if contents.borrow().trim().len() > 0 { - return false + return false; } - }, + } Element { ref name, .. 
} => { let tag_name = name.local.as_ref(); match tag_name.to_lowercase().as_ref() { "li" | "dt" | "dd" | "p" | "div" => { if !is_empty(child.clone()) { - return false + return false; } - }, + } _ => return false, } - }, - _ => () + } + _ => (), } } - match get_tag_name(handle.clone()).unwrap_or_default().as_ref() { - "li" | "dt" | "dd" | "p" | "div" | "canvas" => true, - _ => false, - } + matches!( + get_tag_name(handle).unwrap_or_default().as_ref(), + "li" | "dt" | "dd" | "p" | "div" | "canvas" + ) } pub fn has_link(handle: Handle) -> bool { if "a" == &get_tag_name(handle.clone()).unwrap_or_default() { - return true + return true; } for child in handle.children.borrow().iter() { if has_link(child.clone()) { - return true + return true; } } - return false + false } pub fn extract_text(handle: Handle, text: &mut String, deep: bool) { for child in handle.children.borrow().iter() { let c = child.clone(); - match c.data { - Text { ref contents } => { - text.push_str(contents.borrow().trim()); - }, + match &c.data { + Text { contents } => text.push_str(contents.borrow().trim()), Element { .. } => { if deep { extract_text(child.clone(), text, deep); } - }, - _ => () + } + _ => (), } } } @@ -123,11 +122,11 @@ pub fn text_len(handle: Handle) -> usize { match c.data { Text { ref contents } => { len += contents.borrow().trim().chars().count(); - }, + } Element { .. } => { len += text_len(child.clone()); - }, - _ => () + } + _ => (), } } len @@ -136,15 +135,12 @@ pub fn text_len(handle: Handle) -> usize { pub fn find_node(handle: Handle, tag_name: &str, nodes: &mut Vec>) { for child in handle.children.borrow().iter() { let c = child.clone(); - match c.data { - Element { ref name, .. } => { - let t = name.local.as_ref(); - if t.to_lowercase() == tag_name { - nodes.push(child.clone()); - }; - find_node(child.clone(), tag_name, nodes) - }, - _ => () + if let Element { name, .. 
} = &c.data { + let t = name.local.as_ref(); + if t.to_lowercase() == tag_name { + nodes.push(child.clone()); + }; + find_node(child.clone(), tag_name, nodes) } } } @@ -153,32 +149,27 @@ pub fn has_nodes(handle: Handle, tag_names: &Vec<&'static str>) -> bool { for child in handle.children.borrow().iter() { let tag_name: &str = &get_tag_name(child.clone()).unwrap_or_default(); if tag_names.iter().any(|&n| n == tag_name) { - return true + return true; } if match child.clone().data { - Element { .. } => { - has_nodes(child.clone(), tag_names) - }, + Element { .. } => has_nodes(child.clone(), tag_names), _ => false, } { - return true + return true; } } - return false + false } pub fn text_children_count(handle: Handle) -> usize { let mut count = 0; for child in handle.children.borrow().iter() { let c = child.clone(); - match c.data { - Text { ref contents } => { - let s = contents.borrow(); - if s.trim().len() >= 20 { - count += 1 - } - }, - _ => () + if let Text { ref contents } = c.data { + let s = contents.borrow(); + if s.trim().len() >= 20 { + count += 1 + } } } count diff --git a/src/error.rs b/src/error.rs index e8d74ba..5aa4fff 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,24 +1,24 @@ -use std::fmt::{Display, Formatter, Result as FmtResult}; -use std::error; -#[cfg(feature = "reqwest")] +#[cfg(any(feature = "http-async", feature = "http-blocking"))] use reqwest; +use std::error; +use std::fmt::{Display, Formatter, Result as FmtResult}; use url; #[derive(Debug)] pub enum Error { - #[cfg(feature = "reqwest")] + #[cfg(any(feature = "http-async", feature = "http-blocking"))] NetworkError(reqwest::Error), UrlParseError(url::ParseError), - Unexpected, + HttpError(reqwest::StatusCode), } impl Display for Error { fn fmt(&self, f: &mut Formatter) -> FmtResult { match *self { - #[cfg(feature = "reqwest")] - Error::NetworkError(ref e) => write!(f, "NetworkError: {}", e), - Error::UrlParseError(ref e) => write!(f, "UrlParseError: {}", e), - Error::Unexpected => 
write!(f, "UnexpectedError"), + #[cfg(any(feature = "http-async", feature = "http-blocking"))] + Error::NetworkError(ref e) => write!(f, "NetworkError: {e}"), + Error::UrlParseError(ref e) => write!(f, "UrlParseError: {e}"), + Error::HttpError(status_code) => write!(f, "Http error, status: {status_code}"), } } } @@ -29,7 +29,7 @@ impl From for Error { } } -#[cfg(feature = "reqwest")] +#[cfg(any(feature = "http-async", feature = "http-blocking"))] impl From for Error { fn from(err: reqwest::Error) -> Error { Error::NetworkError(err) @@ -37,5 +37,7 @@ impl From for Error { } impl error::Error for Error { - fn description(&self) -> &str { "" } + fn description(&self) -> &str { + "" + } } diff --git a/src/extractor/blocking_client.rs b/src/extractor/blocking_client.rs new file mode 100644 index 0000000..51e84c2 --- /dev/null +++ b/src/extractor/blocking_client.rs @@ -0,0 +1,32 @@ +use crate::dom; +use crate::error::Error; +use crate::extractor::{extract, ReadableHtmlPage}; +use crate::scorer; +use crate::scorer::Candidate; +use html5ever::tendril::stream::TendrilSink; +use html5ever::{parse_document, serialize}; +use markup5ever_rcdom::{RcDom, SerializableHandle}; +use reqwest; +use std::cell::Cell; +use std::collections::BTreeMap; +use std::default::Default; +use std::io::Read; +use std::path::Path; +use std::time::Duration; +use url::Url; + +/// Scrape the given url and return a [`ReadableHtmlPage`] +pub fn scrape(url: &str) -> Result { + let client = reqwest::blocking::Client::builder() + .timeout(Duration::new(30, 0)) + .user_agent(super::APP_USER_AGENT) + .build()?; + + let mut res = client.get(url).send()?; + if res.status().is_success() { + let url = Url::parse(url)?; + extract(&mut res, &url) + } else { + Err(Error::HttpError(res.status())) + } +} diff --git a/src/extractor/client.rs b/src/extractor/client.rs new file mode 100644 index 0000000..9a84d5e --- /dev/null +++ b/src/extractor/client.rs @@ -0,0 +1,22 @@ +use crate::error::Error; +use crate::extractor::{extract, 
ReadableHtmlPage}; +use std::time::Duration; +use url::Url; + +/// Scrape the given url and return a [`ReadableHtmlPage`] +pub async fn scrape(url: &str) -> Result { + let client = reqwest::Client::builder() + .timeout(Duration::new(30, 0)) + .user_agent(super::APP_USER_AGENT) + .build()?; + + let res = client.get(url).send().await?; + + if res.status().is_success() { + let url = Url::parse(url)?; + let read = res.text().await?; + extract(&mut read.as_bytes(), &url) + } else { + Err(Error::HttpError(res.status())) + } +} diff --git a/src/extractor.rs b/src/extractor/mod.rs similarity index 50% rename from src/extractor.rs rename to src/extractor/mod.rs index 9c315a2..ae6328a 100644 --- a/src/extractor.rs +++ b/src/extractor/mod.rs @@ -1,57 +1,61 @@ -use std::io::Read; -use std::collections::BTreeMap; -use std::path::Path; -use std::cell::Cell; -use markup5ever_rcdom::{RcDom, SerializableHandle}; -use html5ever::{parse_document, serialize}; +use crate::dom; +use crate::error::Error; +use crate::scorer; +use crate::scorer::Candidate; use html5ever::tendril::stream::TendrilSink; +use html5ever::{parse_document, serialize}; +use markup5ever_rcdom::{RcDom, SerializableHandle}; + +use std::cell::Cell; +use std::collections::BTreeMap; use std::default::Default; -#[cfg(feature = "reqwest")] -use std::time::Duration; -#[cfg(feature = "reqwest")] -use reqwest; +use std::io::Read; +use std::path::Path; + use url::Url; -use error::Error; -use dom; -use scorer; -use scorer::Candidate; + +#[cfg(feature = "http-async")] +#[cfg(not(feature = "http-blocking"))] +mod client; + +#[cfg(feature = "http-async")] +#[cfg(not(feature = "http-blocking"))] +pub use client::scrape; + +#[cfg(feature = "http-blocking")] +#[cfg(not(feature = "http-async"))] +mod blocking_client; + +#[cfg(feature = "http-blocking")] +#[cfg(not(feature = "http-async"))] +pub use blocking_client::scrape; #[derive(Debug)] -pub struct Product { - pub title: String, - pub content: String, - pub text: String, +pub 
struct ReadableHtmlPage { + pub title: String, + pub content: String, + pub text: String, } -#[cfg(feature = "reqwest")] -pub fn scrape(url: &str) -> Result { - let client = reqwest::blocking::Client::builder() - .timeout(Duration::new(30, 0)) - .build()?; - let mut res = client.get(url) - .send()?; - if res.status().is_success() { - let url = Url::parse(url)?; - extract(&mut res, &url) - } else { - Err(Error::Unexpected) - } -} +static APP_USER_AGENT: &str = concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"),); -pub fn extract(input: &mut R, url: &Url) -> Result where R: Read { +pub fn extract(input: &mut R, url: &Url) -> Result +where + R: Read, +{ let mut dom = parse_document(RcDom::default(), Default::default()) .from_utf8() .read_from(input) .unwrap(); - let mut title = String::new(); + let mut title = String::new(); let mut candidates = BTreeMap::new(); - let mut nodes = BTreeMap::new(); + let mut nodes = BTreeMap::new(); let handle = dom.document.clone(); scorer::preprocess(&mut dom, handle.clone(), &mut title); - scorer::find_candidates(&mut dom, Path::new("/"), handle.clone(), &mut candidates, &mut nodes); + scorer::find_candidates(Path::new("/"), handle.clone(), &mut candidates, &mut nodes); let mut id: &str = "/"; let mut top_candidate: &Candidate = &Candidate { - node: handle.clone(), + node: handle, score: Cell::new(0.0), }; for (i, c) in candidates.iter() { @@ -60,7 +64,7 @@ pub fn extract(input: &mut R, url: &Url) -> Result where R: R if score <= top_candidate.score.get() { continue; } - id = i; + id = i; top_candidate = c; } let mut bytes = vec![]; @@ -68,10 +72,19 @@ pub fn extract(input: &mut R, url: &Url) -> Result where R: R let node = top_candidate.node.clone(); scorer::clean(&mut dom, Path::new(id), node.clone(), url, &candidates); - serialize(&mut bytes, &SerializableHandle::from(node.clone()), Default::default()).ok(); + serialize( + &mut bytes, + &SerializableHandle::from(node.clone()), + Default::default(), + ) + .ok(); let 
content = String::from_utf8(bytes).unwrap_or_default(); let mut text: String = String::new(); - dom::extract_text(node.clone(), &mut text, true); - Ok(Product { title: title, content: content, text: text }) + dom::extract_text(node, &mut text, true); + Ok(ReadableHtmlPage { + title, + content, + text, + }) } diff --git a/src/lib.rs b/src/lib.rs index 3427fe9..bfdc6ce 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,14 +1,7 @@ -#[macro_use] extern crate html5ever; -extern crate markup5ever_rcdom; -extern crate regex; -extern crate url; -#[macro_use] extern crate lazy_static; -#[cfg(feature = "reqwest")] -extern crate reqwest; -pub mod extractor; -pub mod scorer; pub mod dom; pub mod error; +pub mod extractor; +pub mod scorer; diff --git a/src/scorer.rs b/src/scorer.rs index 63013f1..280a4eb 100644 --- a/src/scorer.rs +++ b/src/scorer.rs @@ -1,67 +1,68 @@ -use std::rc::Rc; -use std::path::Path; -use std::cell::Cell; -use std::collections::BTreeMap; -use url::Url; -use regex::Regex; +use crate::dom; use html5ever::tree_builder::TreeSink; +use html5ever::tree_builder::{ElementFlags, NodeOrText}; +use html5ever::{namespace_url, ns, LocalName, QualName}; +use lazy_static::lazy_static; +use markup5ever_rcdom::Handle; use markup5ever_rcdom::Node; +use markup5ever_rcdom::NodeData::{Comment, Doctype, Document, ProcessingInstruction}; use markup5ever_rcdom::NodeData::{Element, Text}; -use markup5ever_rcdom::Handle; -use markup5ever_rcdom::NodeData::{ - Document, - Doctype, - Comment, - ProcessingInstruction -}; use markup5ever_rcdom::RcDom; -use html5ever::{QualName, LocalName}; -use html5ever::tree_builder::{NodeOrText, ElementFlags}; -use dom; +use regex::Regex; +use std::cell::Cell; +use std::collections::BTreeMap; +use std::path::Path; +use std::rc::Rc; +use url::Url; -pub static PUNCTUATIONS_REGEX: &'static str = r"([、。,.!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)"; -pub static UNLIKELY_CANDIDATES: &'static str = - "combx|comment|community|disqus|extra|foot|header|menu\ +pub static 
PUNCTUATIONS_REGEX: &str = r"([、。,.!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)"; +pub static UNLIKELY_CANDIDATES: &str = "combx|comment|community|disqus|extra|foot|header|menu\ |remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate\ |pagination|pager|popup|tweet|twitter\ |ssba"; -pub static LIKELY_CANDIDATES: &'static str = "and|article|body|column|main|shadow\ +pub static LIKELY_CANDIDATES: &str = "and|article|body|column|main|shadow\ |content|hentry"; -pub static POSITIVE_CANDIDATES: &'static str = - "article|body|content|entry|hentry|main|page\ +pub static POSITIVE_CANDIDATES: &str = "article|body|content|entry|hentry|main|page\ |pagination|post|text|blog|story"; -pub static NEGATIVE_CANDIDATES: &'static str = - "combx|comment|com|contact|foot|footer|footnote\ +pub static NEGATIVE_CANDIDATES: &str = "combx|comment|com|contact|foot|footer|footnote\ |masthead|media|meta|outbrain|promo|related\ |scroll|shoutbox|sidebar|sponsor|shopping\ |tags|tool|widget|form|textfield\ |uiScale|hidden"; -static BLOCK_CHILD_TAGS: [&'static str; 10] = [ - "a", "blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul", +static BLOCK_CHILD_TAGS: [&str; 10] = [ + "a", + "blockquote", + "dl", + "div", + "img", + "ol", + "p", + "pre", + "table", + "ul", ]; lazy_static! 
{ static ref PUNCTUATIONS: Regex = Regex::new(PUNCTUATIONS_REGEX).unwrap(); - static ref LIKELY: Regex = Regex::new(LIKELY_CANDIDATES).unwrap(); - static ref UNLIKELY: Regex = Regex::new(UNLIKELY_CANDIDATES).unwrap(); - static ref POSITIVE: Regex = Regex::new(POSITIVE_CANDIDATES).unwrap(); - static ref NEGATIVE: Regex = Regex::new(NEGATIVE_CANDIDATES).unwrap(); + static ref LIKELY: Regex = Regex::new(LIKELY_CANDIDATES).unwrap(); + static ref UNLIKELY: Regex = Regex::new(UNLIKELY_CANDIDATES).unwrap(); + static ref POSITIVE: Regex = Regex::new(POSITIVE_CANDIDATES).unwrap(); + static ref NEGATIVE: Regex = Regex::new(NEGATIVE_CANDIDATES).unwrap(); } pub struct Candidate { - pub node: Rc, + pub node: Rc, pub score: Cell, } pub fn fix_img_path(handle: Handle, url: &Url) -> bool { let src = dom::get_attr("src", handle.clone()); if src.is_none() { - return false + return false; } let s = src.unwrap(); if !s.starts_with("//") && !s.starts_with("http://") && !s.starts_with("https://") { - match url.join(&s) { - Ok(new_url) => dom::set_attr("src", new_url.as_str(), handle), - Err(_) => (), + if let Ok(new_url) = url.join(&s) { + dom::set_attr("src", new_url.as_str(), handle) } } true @@ -70,13 +71,12 @@ pub fn fix_img_path(handle: Handle, url: &Url) -> bool { pub fn fix_anchor_path(handle: Handle, url: &Url) -> bool { let src = dom::get_attr("href", handle.clone()); if src.is_none() { - return false + return false; } let s = src.unwrap(); if !s.starts_with("//") && !s.starts_with("http://") && !s.starts_with("https://") { - match url.join(&s) { - Ok(new_url) => dom::set_attr("href", new_url.as_str(), handle), - Err(_) => (), + if let Ok(new_url) = url.join(&s) { + dom::set_attr("href", new_url.as_str(), handle) } } true @@ -89,7 +89,7 @@ pub fn get_link_density(handle: Handle) -> f32 { } let mut link_length = 0.0; let mut links: Vec> = vec![]; - dom::find_node(handle.clone(), "a", &mut links); + dom::find_node(handle, "a", &mut links); for link in links.iter() { link_length 
+= dom::text_len(link.clone()) as f32; } @@ -99,147 +99,154 @@ pub fn get_link_density(handle: Handle) -> f32 { pub fn is_candidate(handle: Handle) -> bool { let text_len = dom::text_len(handle.clone()); if text_len < 20 { - return false + return false; } - let n: &str = &dom::get_tag_name(handle. clone()).unwrap_or_default(); + let n: &str = &dom::get_tag_name(handle.clone()).unwrap_or_default(); match n { "p" => true, - "div" | "article" | "center" | "section" => - !dom::has_nodes(handle.clone(), &BLOCK_CHILD_TAGS.iter().map(|t| *t).collect()), - _ => false + "div" | "article" | "center" | "section" => { + !dom::has_nodes(handle, &BLOCK_CHILD_TAGS.to_vec()) + } + _ => false, } } pub fn init_content_score(handle: Handle) -> f32 { let tag_name = dom::get_tag_name(handle.clone()).unwrap_or_default(); let score = match tag_name.as_ref() { - "article" => 10.0, - "div" => 5.0, + "article" => 10.0, + "div" => 5.0, "blockquote" => 3.0, - "form" => -3.0, - "th" => 5.0, - _ => 0.0, + "form" => -3.0, + "th" => 5.0, + _ => 0.0, }; - score + get_class_weight(handle.clone()) + score + get_class_weight(handle) } pub fn calc_content_score(handle: Handle) -> f32 { let mut score: f32 = 1.0; let mut text = String::new(); - dom::extract_text(handle.clone(), &mut text, true); + dom::extract_text(handle, &mut text, true); let mat = PUNCTUATIONS.find_iter(&text); score += mat.count() as f32; score += f32::min(f32::floor(text.chars().count() as f32 / 100.0), 3.0); - return score + score } pub fn get_class_weight(handle: Handle) -> f32 { let mut weight: f32 = 0.0; - match handle.data { - Element { name: _, ref attrs, .. } => { - for name in ["id", "class"].iter() { - if let Some(val) = dom::attr(name, &attrs.borrow()) { - if POSITIVE.is_match(&val) { - weight += 25.0 - }; - if NEGATIVE.is_match(&val) { - weight -= 25.0 - } + if let Element { + name: _, ref attrs, .. 
+ } = handle.data + { + for name in ["id", "class"].iter() { + if let Some(val) = dom::attr(name, &attrs.borrow()) { + if POSITIVE.is_match(&val) { + weight += 25.0 + }; + if NEGATIVE.is_match(&val) { + weight -= 25.0 } } - }, - _ => (), + } }; weight } -pub fn preprocess(mut dom: &mut RcDom, handle: Handle, mut title: &mut String) -> bool { - match handle.clone().data { - Element { ref name, ref attrs, .. } => { - let tag_name = name.local.as_ref(); - match tag_name.to_lowercase().as_ref() { - "script" | "link" | "style" => { - return true - }, - "title" => dom::extract_text(handle.clone(), &mut title, true), - _ => (), +pub fn preprocess(dom: &mut RcDom, handle: Handle, title: &mut String) -> bool { + if let Element { + ref name, + ref attrs, + .. + } = handle.data + { + let tag_name = name.local.as_ref(); + match tag_name.to_lowercase().as_ref() { + "script" | "link" | "style" => return true, + "title" => { + if title.is_empty() { + dom::extract_text(handle.clone(), title, true) + } } - for name in ["id", "class"].iter() { - if let Some(val) = dom::attr(name, &attrs.borrow()) { - if tag_name != "body" && UNLIKELY.is_match(&val) { - if !LIKELY.is_match(&val) { - return true - } - } + _ => (), + } + + for name in ["id", "class"].iter() { + if let Some(val) = dom::attr(name, &attrs.borrow()) { + if tag_name != "body" && UNLIKELY.is_match(&val) && !LIKELY.is_match(&val) { + return true; } } - }, - _ => (), + } } let mut useless_nodes = vec![]; let mut paragraph_nodes = vec![]; let mut br_count = 0; + for child in handle.children.borrow().iter() { - if preprocess(&mut dom, child.clone(), &mut title) { + if preprocess(dom, child.clone(), title) { useless_nodes.push(child.clone()); } let c = child.clone(); - match c.data { - Element { ref name, .. } => { + match &c.data { + Element { name, .. 
} => { let tag_name = name.local.as_ref(); if "br" == tag_name.to_lowercase() { br_count += 1 } else { br_count = 0 } - }, - Text { ref contents } => { + } + Text { contents } => { let s = contents.borrow(); - if br_count >= 2 && s.trim().len() > 0 { + if br_count >= 2 && !s.trim().is_empty() { paragraph_nodes.push(child.clone()); br_count = 0 } - }, - _ => () + } + _ => (), } } + for node in useless_nodes.iter() { dom.remove_from_parent(node); } + for node in paragraph_nodes.iter() { let name = QualName::new(None, ns!(), LocalName::from("p")); let p = dom.create_element(name, vec![], ElementFlags::default()); dom.append_before_sibling(node, NodeOrText::AppendNode(p.clone())); dom.remove_from_parent(node); - match node.clone().data { - Text { ref contents } => { - let text = contents.clone().into_inner().clone(); - dom.append(&p, NodeOrText::AppendText(text)) - }, - _ => (), + if let Text { contents } = &node.data { + let text = contents.clone().into_inner().clone(); + dom.append(&p, NodeOrText::AppendText(text)) } } + false } -pub fn find_candidates(mut dom: &mut RcDom, - id: &Path, - handle: Handle, - candidates: &mut BTreeMap, - nodes: &mut BTreeMap>) { - +pub fn find_candidates( + id: &Path, + handle: Handle, + candidates: &mut BTreeMap, + nodes: &mut BTreeMap>, +) { if let Some(id) = id.to_str().map(|id| id.to_string()) { nodes.insert(id, handle.clone()); } if is_candidate(handle.clone()) { let score = calc_content_score(handle.clone()); - if let Some(c) = id.parent() + if let Some(c) = id + .parent() .and_then(|pid| find_or_create_candidate(pid, candidates, nodes)) { c.score.set(c.score.get() + score) } - if let Some(c) = id.parent() + if let Some(c) = id + .parent() .and_then(|pid| pid.parent()) .and_then(|gpid| find_or_create_candidate(gpid, candidates, nodes)) { @@ -247,91 +254,110 @@ pub fn find_candidates(mut dom: &mut RcDom, } } - if is_candidate(handle.clone()) { let score = calc_content_score(handle.clone()); - if let Some(c) = id.to_str() + if let 
Some(c) = id + .to_str() .map(|id| id.to_string()) - .and_then(|id| candidates.get(&id)) { - c.score.set(c.score.get() + score) - } - if let Some(c) = id.parent() + .and_then(|id| candidates.get(&id)) + { + c.score.set(c.score.get() + score) + } + if let Some(c) = id + .parent() .and_then(|pid| pid.to_str()) .map(|id| id.to_string()) - .and_then(|pid| candidates.get(&pid)) { - c.score.set(c.score.get() + score) - } - if let Some(c) = id.parent() + .and_then(|pid| candidates.get(&pid)) + { + c.score.set(c.score.get() + score) + } + if let Some(c) = id + .parent() .and_then(|p| p.parent()) .and_then(|pid| pid.to_str()) .map(|id| id.to_string()) - .and_then(|pid| candidates.get(&pid)) { - c.score.set(c.score.get() + score) - } + .and_then(|pid| candidates.get(&pid)) + { + c.score.set(c.score.get() + score) + } } for (i, child) in handle.children.borrow().iter().enumerate() { - find_candidates(&mut dom, - id.join(i.to_string()).as_path(), - child.clone(), - candidates, - nodes) + find_candidates( + id.join(i.to_string()).as_path(), + child.clone(), + candidates, + nodes, + ) } } -fn find_or_create_candidate<'a>(id: &Path, - candidates: &'a mut BTreeMap, - nodes: &BTreeMap>) -> Option<&'a Candidate> { +fn find_or_create_candidate<'a>( + id: &Path, + candidates: &'a mut BTreeMap, + nodes: &BTreeMap>, +) -> Option<&'a Candidate> { if let Some(id) = id.to_str().map(|id| id.to_string()) { if let Some(node) = nodes.get(&id) { if candidates.get(&id).is_none() { - candidates.insert(id.clone(), Candidate { - node: node.clone(), - score: Cell::new(init_content_score(node.clone())), - }); + candidates.insert( + id.clone(), + Candidate { + node: node.clone(), + score: Cell::new(init_content_score(node.clone())), + }, + ); } - return candidates.get(&id) + return candidates.get(&id); } } None } -pub fn clean(mut dom: &mut RcDom, id: &Path, handle: Handle, url: &Url, candidates: &BTreeMap) -> bool { +pub fn clean( + dom: &mut RcDom, + id: &Path, + handle: Handle, + url: &Url, + 
candidates: &BTreeMap, +) -> bool { let mut useless = false; match handle.data { - Document => (), + Document => (), Doctype { .. } => (), Text { ref contents } => { let s = contents.borrow(); - if s.trim().len() == 0 { + if s.trim().is_empty() { useless = true } - }, + } Comment { .. } => useless = true, - Element { ref name, ref attrs, .. } => { + Element { + ref name, + ref attrs, + .. + } => { let tag_name = name.local.as_ref(); match tag_name.to_lowercase().as_ref() { - "script" | "link" | "style" | "noscript" | "meta" - | "h1" | "object" | "header" | "footer" | "aside" => { - useless = true - }, + "script" | "link" | "style" | "noscript" | "meta" | "h1" | "object" | "header" + | "footer" | "aside" => useless = true, "form" | "table" | "ul" | "div" => { useless = is_useless(id, handle.clone(), candidates) - }, + } "img" => useless = !fix_img_path(handle.clone(), url), "a" => useless = !fix_anchor_path(handle.clone(), url), - _ => (), + _ => (), } - dom::clean_attr("id" , &mut *attrs.borrow_mut()); - dom::clean_attr("class", &mut *attrs.borrow_mut()); - dom::clean_attr("style", &mut *attrs.borrow_mut()); - }, - ProcessingInstruction { .. } => unreachable!() + dom::clean_attr("id", &mut attrs.borrow_mut()); + dom::clean_attr("class", &mut attrs.borrow_mut()); + dom::clean_attr("style", &mut attrs.borrow_mut()); + } + ProcessingInstruction { .. 
} => unreachable!(), } let mut useless_nodes = vec![]; for (i, child) in handle.children.borrow().iter().enumerate() { let pid = id.join(i.to_string()); - if clean(&mut dom, pid.as_path(), child.clone(), url, candidates) { + if clean(dom, pid.as_path(), child.clone(), url, candidates) { useless_nodes.push(child.clone()); } } @@ -347,49 +373,51 @@ pub fn clean(mut dom: &mut RcDom, id: &Path, handle: Handle, url: &Url, candidat pub fn is_useless(id: &Path, handle: Handle, candidates: &BTreeMap) -> bool { let tag_name = &dom::get_tag_name(handle.clone()).unwrap_or_default(); let weight = get_class_weight(handle.clone()); - let score = id.to_str() + let score = id + .to_str() .and_then(|id| candidates.get(id)) - .map(|c| c.score.get()).unwrap_or(0.0); + .map(|c| c.score.get()) + .unwrap_or(0.0); if weight + score < 0.0 { - return true + return true; } let text_nodes_len = dom::text_children_count(handle.clone()); - let mut p_nodes: Vec> = vec![]; - let mut img_nodes: Vec> = vec![]; - let mut li_nodes: Vec> = vec![]; + let mut p_nodes: Vec> = vec![]; + let mut img_nodes: Vec> = vec![]; + let mut li_nodes: Vec> = vec![]; let mut input_nodes: Vec> = vec![]; let mut embed_nodes: Vec> = vec![]; - dom::find_node(handle.clone(), "p" , &mut p_nodes); - dom::find_node(handle.clone(), "img" , &mut img_nodes); - dom::find_node(handle.clone(), "li" , &mut li_nodes); - dom::find_node(handle.clone(), "input" , &mut input_nodes); - dom::find_node(handle.clone(), "embed" , &mut embed_nodes); - let p_count = p_nodes.len(); - let img_count = img_nodes.len(); - let li_count = li_nodes.len() as i32 - 100; - let input_count = input_nodes.len(); - let embed_count = embed_nodes.len(); - let link_density = get_link_density(handle.clone()); - let content_length = dom::text_len(handle.clone()); + dom::find_node(handle.clone(), "p", &mut p_nodes); + dom::find_node(handle.clone(), "img", &mut img_nodes); + dom::find_node(handle.clone(), "li", &mut li_nodes); + dom::find_node(handle.clone(), 
"input", &mut input_nodes); + dom::find_node(handle.clone(), "embed", &mut embed_nodes); + let p_count = p_nodes.len(); + let img_count = img_nodes.len(); + let li_count = li_nodes.len() as i32 - 100; + let input_count = input_nodes.len(); + let embed_count = embed_nodes.len(); + let link_density = get_link_density(handle.clone()); + let content_length = dom::text_len(handle); let para_count = text_nodes_len + p_count; if img_count > para_count + text_nodes_len { - return true + return true; } if li_count > para_count as i32 && tag_name != "ul" && tag_name != "ol" { - return true + return true; } if input_count as f32 > f32::floor(para_count as f32 / 3.0) { - return true + return true; } if content_length < 25 && (img_count == 0 || img_count > 2) { - return true + return true; } if weight < 25.0 && link_density > 0.2 { - return true + return true; } if (embed_count == 1 && content_length < 35) || embed_count > 1 { - return true + return true; } - return false + false } diff --git a/tests/async.rs b/tests/async.rs new file mode 100644 index 0000000..37c6b4f --- /dev/null +++ b/tests/async.rs @@ -0,0 +1,45 @@ +use readability::extractor::ReadableHtmlPage; + +#[tokio::test] +async fn should_scrape_blocking() { + let page: ReadableHtmlPage = + readability::extractor::scrape("https://blog.rust-lang.org/2023/02/01/Rustup-1.25.2.html") + .await + .expect("scrape blog entry"); + + assert_eq!(page.title, "Announcing Rustup 1.25.2 | Rust Blog"); + assert!(!page.text.is_empty()); + assert!(!page.content.is_empty()); +} + +#[tokio::test] +async fn should_scrape_website_checking_user_agent() { + let page: ReadableHtmlPage = readability::extractor::scrape( + "https://dev.to/mayashavin/testing-vue-components-the-right-way-2hio", + ) + .await + .expect("scrape blog entry"); + + assert_eq!( + page.title, + "Testing Vue components the right way - DEV Community 👩\u{200d}💻👨\u{200d}💻" + ); + assert!(!page.text.is_empty()); + assert!(!page.content.is_empty()); +} + +#[tokio::test] 
+async fn should_scrape_w() { + let page: ReadableHtmlPage = readability::extractor::scrape( + "https://medium.com/@stevenchayes/so-you-messed-up-your-new-years-resolution-a4052e502906", + ) + .await + .expect("scrape blog entry"); + + assert_eq!( + page.title, + "So You Messed Up Your New Year’s Resolution | by Steven C. Hayes | Jan, 2023 | Medium" + ); + assert_eq!(page.text, ""); + assert_eq!(page.content, ""); +} diff --git a/tests/blocking.rs b/tests/blocking.rs new file mode 100644 index 0000000..7283ed6 --- /dev/null +++ b/tests/blocking.rs @@ -0,0 +1,12 @@ +use readability::extractor::ReadableHtmlPage; + +#[test] +fn should_scrape_blocking() { + let product: ReadableHtmlPage = + readability::extractor::scrape("https://blog.rust-lang.org/2023/02/01/Rustup-1.25.2.html") + .expect("scrape blog entry"); + + assert_eq!(product.title, "Announcing Rustup 1.25.2 | Rust Blog"); + assert!(!product.text.is_empty()); + assert!(!product.content.is_empty()); +} diff --git a/tests/lib.rs b/tests/lib.rs index f1990e4..249ab85 100644 --- a/tests/lib.rs +++ b/tests/lib.rs @@ -6,7 +6,6 @@ use url::Url; #[test] fn test_extract_title() { - assert!(true); let mut file = File::open("./data/title.html").unwrap(); let url = Url::parse("https://example.com").unwrap(); let product = readability::extractor::extract(&mut file, &url).unwrap(); @@ -15,7 +14,6 @@ fn test_extract_title() { #[test] fn test_fix_rel_links() { - assert!(true); let mut file = File::open("./data/rel.html").unwrap(); let url = Url::parse("https://example.com").unwrap(); let product = readability::extractor::extract(&mut file, &url).unwrap(); @@ -24,9 +22,8 @@ fn test_fix_rel_links() { #[test] fn test_fix_img_links() { - assert!(true); let mut file = File::open("./data/img.html").unwrap(); let url = Url::parse("https://example.com").unwrap(); let product = readability::extractor::extract(&mut file, &url).unwrap(); assert_eq!(product.content, "This is title

"); -} \ No newline at end of file +}