diff --git a/Cargo.toml b/Cargo.toml index 1a1f149..2a679fb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ url = "2.4" html5ever = "0.26" markup5ever_rcdom = "0.2" lazy_static = "1.4" +log = "0.4.22" [dependencies.reqwest] version = "0.11" diff --git a/data/comment.html b/data/comment.html new file mode 100644 index 0000000..484e6b2 --- /dev/null +++ b/data/comment.html @@ -0,0 +1,15 @@ + + + + + This is title + + + +
+
My div with more than 25 characters.

My paragraph with more than 25 characters.

+
+
+ + + \ No newline at end of file diff --git a/src/dom.rs b/src/dom.rs index 9a0b636..84976d2 100644 --- a/src/dom.rs +++ b/src/dom.rs @@ -147,7 +147,7 @@ pub fn find_node(handle: Handle, tag_name: &str, nodes: &mut Vec>) { } } -pub fn has_nodes(handle: Handle, tag_names: &Vec<&'static str>) -> bool { +pub fn has_nodes(handle: Handle, tag_names: &[&str]) -> bool { for child in handle.children.borrow().iter() { let tag_name: &str = &get_tag_name(child.clone()).unwrap_or_default(); if tag_names.iter().any(|&n| n == tag_name) { diff --git a/src/extractor.rs b/src/extractor.rs index 1ff2ce9..c10ecf9 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -1,7 +1,9 @@ +use crate::scorer::{Scorer, DEFAULT_SCORER}; use dom; use error::Error; use html5ever::tendril::stream::TendrilSink; use html5ever::{parse_document, serialize}; +use log::debug; use markup5ever_rcdom::{RcDom, SerializableHandle}; #[cfg(feature = "reqwest")] use reqwest; @@ -37,7 +39,8 @@ pub fn scrape(url: &str) -> Result { } } -pub fn extract(input: &mut R, url: &Url) -> Result +/// Extract text with a custom [`Scorer`]. +pub fn extract_with_scorer(input: &mut R, url: &Url, scorer: &Scorer) -> Result where R: Read, { @@ -48,8 +51,11 @@ where let mut candidates = BTreeMap::new(); let mut nodes = BTreeMap::new(); let handle = dom.document.clone(); - scorer::preprocess(&mut dom, handle.clone(), &mut title); - scorer::find_candidates(Path::new("/"), handle.clone(), &mut candidates, &mut nodes); + scorer.preprocess(&mut dom, handle.clone(), &mut title); + scorer.find_candidates(Path::new("/"), handle.clone(), &mut candidates, &mut nodes); + + debug!("Found candidates: {}", candidates.values().len()); + let mut id: &str = "/"; let mut top_candidate: &Candidate = &Candidate { node: handle.clone(), @@ -67,7 +73,7 @@ where let mut bytes = vec![]; let node = top_candidate.node.clone(); - scorer::clean(&mut dom, Path::new(id), node.clone(), url, &candidates); + scorer.clean(&mut dom, Path::new(id), node.clone(), url, &candidates); serialize( &mut bytes, @@ -85,3 +91,11 @@ where text, }) } + +/// Extract text with the default [`Scorer`]. +pub fn extract(input: &mut R, url: &Url) -> Result +where + R: Read, +{ + extract_with_scorer(input, url, &DEFAULT_SCORER) +} diff --git a/src/lib.rs b/src/lib.rs index 8977431..3baa530 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,12 +1,13 @@ #[macro_use] extern crate html5ever; -extern crate markup5ever_rcdom; -extern crate regex; -extern crate url; #[macro_use] extern crate lazy_static; +extern crate log; +extern crate markup5ever_rcdom; +extern crate regex; #[cfg(feature = "reqwest")] extern crate reqwest; +extern crate url; pub mod dom; pub mod error; diff --git a/src/scorer.rs b/src/scorer.rs index 4eb73c2..c24a852 100644 --- a/src/scorer.rs +++ b/src/scorer.rs @@ -14,21 +14,21 @@ use std::path::Path; use std::rc::Rc; use url::Url; -pub static PUNCTUATIONS_REGEX: &str = r"([、。,.!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)"; -pub static UNLIKELY_CANDIDATES: &str = "combx|comment|community|disqus|extra|foot|header|menu\ +const PUNCTUATIONS_REGEX: &str = r"([、。,.!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)"; +const UNLIKELY_CANDIDATES: &str = "combx|comment|community|disqus|extra|foot|header|menu\ |remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate\ |pagination|pager|popup|tweet|twitter\ |ssba"; -pub static LIKELY_CANDIDATES: &str = "and|article|body|column|main|shadow\ +const LIKELY_CANDIDATES: &str = "and|article|body|column|main|shadow\ |content|hentry"; -pub static POSITIVE_CANDIDATES: &str = "article|body|content|entry|hentry|main|page\ +const POSITIVE_CANDIDATES: &str = "article|body|content|entry|hentry|main|page\ |pagination|post|text|blog|story"; -pub static NEGATIVE_CANDIDATES: &str = "combx|comment|com|contact|foot|footer|footnote\ +const NEGATIVE_CANDIDATES: &str = "combx|comment|com|contact|foot|footer|footnote\ |masthead|media|meta|outbrain|promo|related\ |scroll|shoutbox|sidebar|sponsor|shopping\ |tags|tool|widget|form|textfield\ |uiScale|hidden"; -static BLOCK_CHILD_TAGS: [&str; 10] = [ +const BLOCK_CHILD_TAGS: [&str; 10] = [ "a", "blockquote", "dl", @@ -46,6 +46,7 @@ lazy_static! { static ref UNLIKELY: Regex = Regex::new(UNLIKELY_CANDIDATES).unwrap(); static ref POSITIVE: Regex = Regex::new(POSITIVE_CANDIDATES).unwrap(); static ref NEGATIVE: Regex = Regex::new(NEGATIVE_CANDIDATES).unwrap(); + pub static ref DEFAULT_SCORER: Scorer<'static> = Scorer::default(); } pub struct Candidate { @@ -53,361 +54,414 @@ pub struct Candidate { pub score: Cell, } -pub fn fix_img_path(handle: Handle, url: &Url) -> bool { - let src = dom::get_attr("src", handle.clone()); - let s = match src { - Some(src) => src, - None => return false, - }; - if !s.starts_with("//") && !s.starts_with("http://") && !s.starts_with("https://") { - if let Ok(new_url) = url.join(&s) { - dom::set_attr("src", new_url.as_str(), handle) - } - } - true +pub struct Scorer<'a> { + pub punctuations: &'a Regex, + pub unlikely_candidates: &'a Regex, + pub likely_candidates: &'a Regex, + pub positive_candidates: &'a Regex, + pub negative_candidates: &'a Regex, + pub block_child_tags: &'a [&'a str], } -pub fn fix_anchor_path(handle: Handle, url: &Url) -> bool { - let src = dom::get_attr("href", handle.clone()); - let s = match src { - Some(src) => src, - None => return false, - }; - if !s.starts_with("//") && !s.starts_with("http://") && !s.starts_with("https://") { - if let Ok(new_url) = url.join(&s) { - dom::set_attr("href", new_url.as_str(), handle) +impl<'a> Default for Scorer<'a> { + fn default() -> Self { + Self { + punctuations: &PUNCTUATIONS, + likely_candidates: &LIKELY, + unlikely_candidates: &UNLIKELY, + positive_candidates: &POSITIVE, + negative_candidates: &NEGATIVE, + block_child_tags: &BLOCK_CHILD_TAGS, } } - true } -pub fn get_link_density(handle: Handle) -> f32 { - let text_length = dom::text_len(handle.clone()) as f32; - if text_length == 0.0 { - return 0.0; - } - let mut link_length = 0.0; - let mut links: Vec> = vec![]; - dom::find_node(handle.clone(), "a", &mut links); - for link in links.iter() { - link_length += dom::text_len(link.clone()) as f32; +impl<'a> Scorer<'a> { + pub fn new( + punctuations: &'a Regex, + likely_candidates: &'a Regex, + unlikely_candidates: &'a Regex, + positive_candidates: &'a Regex, + negative_candidates: &'a Regex, + block_child_tags: &'a [&'a str], + ) -> Self { + Scorer { + punctuations, + unlikely_candidates, + likely_candidates, + positive_candidates, + negative_candidates, + block_child_tags, + } } - link_length / text_length -} -pub fn is_candidate(handle: Handle) -> bool { - let text_len = dom::text_len(handle.clone()); - if text_len < 20 { - return false; - } - let n: &str = &dom::get_tag_name(handle.clone()).unwrap_or_default(); - match n { - "p" => true, - "div" | "article" | "center" | "section" => { - !dom::has_nodes(handle.clone(), &BLOCK_CHILD_TAGS.to_vec()) + pub fn preprocess(&self, dom: &mut RcDom, handle: Handle, title: &mut String) -> bool { + if let Element { + ref name, + ref attrs, + .. + } = handle.clone().data + { + let tag_name = name.local.as_ref(); + match tag_name.to_lowercase().as_ref() { + "script" | "link" | "style" => return true, + "title" => dom::extract_text(handle.clone(), title, true), + _ => (), + } + for name in ["id", "class"].iter() { + if let Some(val) = dom::attr(name, &attrs.borrow()) { + if tag_name != "body" + && self.unlikely_candidates.is_match(&val) + && !self.likely_candidates.is_match(&val) + { + return true; + } + } + } + } + let mut useless_nodes = vec![]; + let mut paragraph_nodes = vec![]; + let mut br_count = 0; + for child in handle.children.borrow().iter() { + if self.preprocess(dom, child.clone(), title) { + useless_nodes.push(child.clone()); + } + let c = child.clone(); + match c.data { + Element { ref name, .. } => { + let tag_name = name.local.as_ref(); + if "br" == tag_name.to_lowercase() { + br_count += 1 + } else { + br_count = 0 + } + } + Text { ref contents } => { + let s = contents.borrow(); + if br_count >= 2 && !s.trim().is_empty() { + paragraph_nodes.push(child.clone()); + br_count = 0 + } + } + _ => (), + } + } + for node in useless_nodes.iter() { + dom.remove_from_parent(node); + } + for node in paragraph_nodes.iter() { + let name = QualName::new(None, ns!(), LocalName::from("p")); + let p = dom.create_element(name, vec![], ElementFlags::default()); + dom.append_before_sibling(node, NodeOrText::AppendNode(p.clone())); + dom.remove_from_parent(node); + if let Text { ref contents } = node.clone().data { + let text = contents.clone().into_inner().clone(); + dom.append(&p, NodeOrText::AppendText(text)) + } } - _ => false, + false } -} -pub fn init_content_score(handle: Handle) -> f32 { - let tag_name = dom::get_tag_name(handle.clone()).unwrap_or_default(); - let score = match tag_name.as_ref() { - "article" => 10.0, - "div" => 5.0, - "blockquote" => 3.0, - "form" => -3.0, - "th" => 5.0, - _ => 0.0, - }; - score + get_class_weight(handle.clone()) -} + pub fn find_candidates( + &self, + id: &Path, + handle: Handle, + candidates: &mut BTreeMap, + nodes: &mut BTreeMap>, + ) { + if let Some(id) = id.to_str().map(|id| id.to_string()) { + nodes.insert(id, handle.clone()); + } -pub fn calc_content_score(handle: Handle) -> f32 { - let mut score: f32 = 1.0; - let mut text = String::new(); - dom::extract_text(handle.clone(), &mut text, true); - let mat = PUNCTUATIONS.find_iter(&text); - score += mat.count() as f32; - score += f32::min(f32::floor(text.chars().count() as f32 / 100.0), 3.0); - score -} + if self.is_candidate(handle.clone()) { + let score = self.calc_content_score(handle.clone()); + if let Some(c) = id + .parent() + .and_then(|pid| self.find_or_create_candidate(pid, candidates, nodes)) + { + c.score.set(c.score.get() + score) + } + if let Some(c) = id + .parent() + .and_then(|pid| pid.parent()) + .and_then(|gpid| self.find_or_create_candidate(gpid, candidates, nodes)) + { + c.score.set(c.score.get() + score / 2.0) + } + } -pub fn get_class_weight(handle: Handle) -> f32 { - let mut weight: f32 = 0.0; - if let Element { - name: _, ref attrs, .. - } = handle.data - { - for name in ["id", "class"].iter() { - if let Some(val) = dom::attr(name, &attrs.borrow()) { - if POSITIVE.is_match(&val) { - weight += 25.0 - }; - if NEGATIVE.is_match(&val) { - weight -= 25.0 - } + if self.is_candidate(handle.clone()) { + let score = self.calc_content_score(handle.clone()); + if let Some(c) = id + .to_str() + .map(|id| id.to_string()) + .and_then(|id| candidates.get(&id)) + { + c.score.set(c.score.get() + score) + } + if let Some(c) = id + .parent() + .and_then(|pid| pid.to_str()) + .map(|id| id.to_string()) + .and_then(|pid| candidates.get(&pid)) + { + c.score.set(c.score.get() + score) + } + if let Some(c) = id + .parent() + .and_then(|p| p.parent()) + .and_then(|pid| pid.to_str()) + .map(|id| id.to_string()) + .and_then(|pid| candidates.get(&pid)) + { + c.score.set(c.score.get() + score) } } - }; - weight -} -pub fn preprocess(dom: &mut RcDom, handle: Handle, title: &mut String) -> bool { - if let Element { - ref name, - ref attrs, - .. - } = handle.clone().data - { - let tag_name = name.local.as_ref(); - match tag_name.to_lowercase().as_ref() { - "script" | "link" | "style" => return true, - "title" => dom::extract_text(handle.clone(), title, true), - _ => (), + for (i, child) in handle.children.borrow().iter().enumerate() { + self.find_candidates( + id.join(i.to_string()).as_path(), + child.clone(), + candidates, + nodes, + ) } - for name in ["id", "class"].iter() { - if let Some(val) = dom::attr(name, &attrs.borrow()) { - if tag_name != "body" && UNLIKELY.is_match(&val) && !LIKELY.is_match(&val) { - return true; + } + + pub fn clean( + &self, + dom: &mut RcDom, + id: &Path, + handle: Handle, + url: &Url, + candidates: &BTreeMap, + ) -> bool { + let mut useless = false; + match handle.data { + Document => (), + Doctype { .. } => (), + Text { ref contents } => { + let s = contents.borrow(); + if s.trim().is_empty() { + useless = true } } - } - } - let mut useless_nodes = vec![]; - let mut paragraph_nodes = vec![]; - let mut br_count = 0; - for child in handle.children.borrow().iter() { - if preprocess(dom, child.clone(), title) { - useless_nodes.push(child.clone()); - } - let c = child.clone(); - match c.data { - Element { ref name, .. } => { + Comment { .. } => useless = true, + Element { + ref name, + ref attrs, + .. + } => { let tag_name = name.local.as_ref(); - if "br" == tag_name.to_lowercase() { - br_count += 1 - } else { - br_count = 0 + match tag_name.to_lowercase().as_ref() { + "script" | "link" | "style" | "noscript" | "meta" | "h1" | "object" + | "header" | "footer" | "aside" => useless = true, + "form" | "table" | "ul" | "div" => { + useless = self.is_useless(id, handle.clone(), candidates) + } + "img" => useless = !fix_img_path(handle.clone(), url), + "a" => useless = !fix_anchor_path(handle.clone(), url), + _ => (), } + dom::clean_attr("id", &mut attrs.borrow_mut()); + dom::clean_attr("class", &mut attrs.borrow_mut()); + dom::clean_attr("style", &mut attrs.borrow_mut()); } - Text { ref contents } => { - let s = contents.borrow(); - if br_count >= 2 && !s.trim().is_empty() { - paragraph_nodes.push(child.clone()); - br_count = 0 - } + ProcessingInstruction { .. } => unreachable!(), + } + let mut useless_nodes = vec![]; + for (i, child) in handle.children.borrow().iter().enumerate() { + let pid = id.join(i.to_string()); + if self.clean(dom, pid.as_path(), child.clone(), url, candidates) { + useless_nodes.push(child.clone()); } - _ => (), } - } - for node in useless_nodes.iter() { - dom.remove_from_parent(node); - } - for node in paragraph_nodes.iter() { - let name = QualName::new(None, ns!(), LocalName::from("p")); - let p = dom.create_element(name, vec![], ElementFlags::default()); - dom.append_before_sibling(node, NodeOrText::AppendNode(p.clone())); - dom.remove_from_parent(node); - if let Text { ref contents } = node.clone().data { - let text = contents.clone().into_inner().clone(); - dom.append(&p, NodeOrText::AppendText(text)) + for node in useless_nodes.iter() { + dom.remove_from_parent(node); } + if dom::is_empty(handle) { + useless = true + } + useless } - false -} -pub fn find_candidates( - id: &Path, - handle: Handle, - candidates: &mut BTreeMap, - nodes: &mut BTreeMap>, -) { - if let Some(id) = id.to_str().map(|id| id.to_string()) { - nodes.insert(id, handle.clone()); + fn calc_content_score(&self, handle: Handle) -> f32 { + let mut score: f32 = 1.0; + let mut text = String::new(); + dom::extract_text(handle.clone(), &mut text, true); + let mat = self.punctuations.find_iter(&text); + score += mat.count() as f32; + score += f32::min(f32::floor(text.chars().count() as f32 / 100.0), 3.0); + score } - if is_candidate(handle.clone()) { - let score = calc_content_score(handle.clone()); - if let Some(c) = id - .parent() - .and_then(|pid| find_or_create_candidate(pid, candidates, nodes)) + fn get_class_weight(&self, handle: Handle) -> f32 { + let mut weight: f32 = 0.0; + if let Element { + name: _, ref attrs, .. + } = handle.data { - c.score.set(c.score.get() + score) - } - if let Some(c) = id - .parent() - .and_then(|pid| pid.parent()) - .and_then(|gpid| find_or_create_candidate(gpid, candidates, nodes)) - { - c.score.set(c.score.get() + score / 2.0) + for name in ["id", "class"].iter() { + if let Some(val) = dom::attr(name, &attrs.borrow()) { + if self.positive_candidates.is_match(&val) { + weight += 25.0 + }; + if self.negative_candidates.is_match(&val) { + weight -= 25.0 + } + } + } + }; + weight + } + + fn init_content_score(&self, handle: Handle) -> f32 { + let tag_name = dom::get_tag_name(handle.clone()).unwrap_or_default(); + let score = match tag_name.as_ref() { + "article" => 10.0, + "div" => 5.0, + "blockquote" => 3.0, + "form" => -3.0, + "th" => 5.0, + _ => 0.0, + }; + score + self.get_class_weight(handle.clone()) + } + + fn find_or_create_candidate( + &self, + id: &Path, + candidates: &'a mut BTreeMap, + nodes: &BTreeMap>, + ) -> Option<&'a Candidate> { + if let Some(id) = id.to_str().map(|id| id.to_string()) { + if let Some(node) = nodes.get(&id) { + if candidates.get(&id).is_none() { + candidates.insert( + id.clone(), + Candidate { + node: node.clone(), + score: Cell::new(self.init_content_score(node.clone())), + }, + ); + } + return candidates.get(&id); + } } + None } - if is_candidate(handle.clone()) { - let score = calc_content_score(handle.clone()); - if let Some(c) = id + fn is_useless( + &self, + id: &Path, + handle: Handle, + candidates: &BTreeMap, + ) -> bool { + let tag_name = &dom::get_tag_name(handle.clone()).unwrap_or_default(); + let weight = self.get_class_weight(handle.clone()); + let score = id .to_str() - .map(|id| id.to_string()) - .and_then(|id| candidates.get(&id)) - { - c.score.set(c.score.get() + score) + .and_then(|id| candidates.get(id)) + .map(|c| c.score.get()) + .unwrap_or(0.0); + if weight + score < 0.0 { + return true; } - if let Some(c) = id - .parent() - .and_then(|pid| pid.to_str()) - .map(|id| id.to_string()) - .and_then(|pid| candidates.get(&pid)) - { - c.score.set(c.score.get() + score) + let text_nodes_len = dom::text_children_count(handle.clone()); + let mut p_nodes: Vec> = vec![]; + let mut img_nodes: Vec> = vec![]; + let mut li_nodes: Vec> = vec![]; + let mut input_nodes: Vec> = vec![]; + let mut embed_nodes: Vec> = vec![]; + dom::find_node(handle.clone(), "p", &mut p_nodes); + dom::find_node(handle.clone(), "img", &mut img_nodes); + dom::find_node(handle.clone(), "li", &mut li_nodes); + dom::find_node(handle.clone(), "input", &mut input_nodes); + dom::find_node(handle.clone(), "embed", &mut embed_nodes); + let p_count = p_nodes.len(); + let img_count = img_nodes.len(); + let li_count = li_nodes.len() as i32 - 100; + let input_count = input_nodes.len(); + let embed_count = embed_nodes.len(); + let link_density = get_link_density(handle.clone()); + let content_length = dom::text_len(handle.clone()); + let para_count = text_nodes_len + p_count; + + if img_count > para_count + text_nodes_len { + return true; } - if let Some(c) = id - .parent() - .and_then(|p| p.parent()) - .and_then(|pid| pid.to_str()) - .map(|id| id.to_string()) - .and_then(|pid| candidates.get(&pid)) - { - c.score.set(c.score.get() + score) + if li_count > para_count as i32 && tag_name != "ul" && tag_name != "ol" { + return true; } - } - - for (i, child) in handle.children.borrow().iter().enumerate() { - find_candidates( - id.join(i.to_string()).as_path(), - child.clone(), - candidates, - nodes, - ) - } -} - -fn find_or_create_candidate<'a>( - id: &Path, - candidates: &'a mut BTreeMap, - nodes: &BTreeMap>, -) -> Option<&'a Candidate> { - if let Some(id) = id.to_str().map(|id| id.to_string()) { - if let Some(node) = nodes.get(&id) { - if candidates.get(&id).is_none() { - candidates.insert( - id.clone(), - Candidate { - node: node.clone(), - score: Cell::new(init_content_score(node.clone())), - }, - ); - } - return candidates.get(&id); + if input_count as f32 > f32::floor(para_count as f32 / 3.0) { + return true; } + if content_length < 25 && (img_count == 0 || img_count > 2) { + return true; + } + if weight < 25.0 && link_density > 0.2 { + return true; + } + if (embed_count == 1 && content_length < 35) || embed_count > 1 { + return true; + } + false } - None -} -pub fn clean( - dom: &mut RcDom, - id: &Path, - handle: Handle, - url: &Url, - candidates: &BTreeMap, -) -> bool { - let mut useless = false; - match handle.data { - Document => (), - Doctype { .. } => (), - Text { ref contents } => { - let s = contents.borrow(); - if s.trim().is_empty() { - useless = true - } + fn is_candidate(&self, handle: Handle) -> bool { + let text_len = dom::text_len(handle.clone()); + if text_len < 20 { + return false; } - Comment { .. } => useless = true, - Element { - ref name, - ref attrs, - .. - } => { - let tag_name = name.local.as_ref(); - match tag_name.to_lowercase().as_ref() { - "script" | "link" | "style" | "noscript" | "meta" | "h1" | "object" | "header" - | "footer" | "aside" => useless = true, - "form" | "table" | "ul" | "div" => { - useless = is_useless(id, handle.clone(), candidates) - } - "img" => useless = !fix_img_path(handle.clone(), url), - "a" => useless = !fix_anchor_path(handle.clone(), url), - _ => (), + let n: &str = &dom::get_tag_name(handle.clone()).unwrap_or_default(); + match n { + "p" => true, + "div" | "article" | "center" | "section" => { + !dom::has_nodes(handle.clone(), self.block_child_tags) } - dom::clean_attr("id", &mut attrs.borrow_mut()); - dom::clean_attr("class", &mut attrs.borrow_mut()); - dom::clean_attr("style", &mut attrs.borrow_mut()); + _ => false, } - ProcessingInstruction { .. } => unreachable!(), } - let mut useless_nodes = vec![]; - for (i, child) in handle.children.borrow().iter().enumerate() { - let pid = id.join(i.to_string()); - if clean(dom, pid.as_path(), child.clone(), url, candidates) { - useless_nodes.push(child.clone()); +} + +pub fn fix_img_path(handle: Handle, url: &Url) -> bool { + let src = dom::get_attr("src", handle.clone()); + let s = match src { + Some(src) => src, + None => return false, + }; + if !s.starts_with("//") && !s.starts_with("http://") && !s.starts_with("https://") { + if let Ok(new_url) = url.join(&s) { + dom::set_attr("src", new_url.as_str(), handle) } } - for node in useless_nodes.iter() { - dom.remove_from_parent(node); - } - if dom::is_empty(handle) { - useless = true - } - useless + true } -pub fn is_useless(id: &Path, handle: Handle, candidates: &BTreeMap) -> bool { - let tag_name = &dom::get_tag_name(handle.clone()).unwrap_or_default(); - let weight = get_class_weight(handle.clone()); - let score = id - .to_str() - .and_then(|id| candidates.get(id)) - .map(|c| c.score.get()) - .unwrap_or(0.0); - if weight + score < 0.0 { - return true; +pub fn fix_anchor_path(handle: Handle, url: &Url) -> bool { + let src = dom::get_attr("href", handle.clone()); + let s = match src { + Some(src) => src, + None => return false, + }; + if !s.starts_with("//") && !s.starts_with("http://") && !s.starts_with("https://") { + if let Ok(new_url) = url.join(&s) { + dom::set_attr("href", new_url.as_str(), handle) + } } - let text_nodes_len = dom::text_children_count(handle.clone()); - let mut p_nodes: Vec> = vec![]; - let mut img_nodes: Vec> = vec![]; - let mut li_nodes: Vec> = vec![]; - let mut input_nodes: Vec> = vec![]; - let mut embed_nodes: Vec> = vec![]; - dom::find_node(handle.clone(), "p", &mut p_nodes); - dom::find_node(handle.clone(), "img", &mut img_nodes); - dom::find_node(handle.clone(), "li", &mut li_nodes); - dom::find_node(handle.clone(), "input", &mut input_nodes); - dom::find_node(handle.clone(), "embed", &mut embed_nodes); - let p_count = p_nodes.len(); - let img_count = img_nodes.len(); - let li_count = li_nodes.len() as i32 - 100; - let input_count = input_nodes.len(); - let embed_count = embed_nodes.len(); - let link_density = get_link_density(handle.clone()); - let content_length = dom::text_len(handle.clone()); - let para_count = text_nodes_len + p_count; + true +} - if img_count > para_count + text_nodes_len { - return true; - } - if li_count > para_count as i32 && tag_name != "ul" && tag_name != "ol" { - return true; - } - if input_count as f32 > f32::floor(para_count as f32 / 3.0) { - return true; - } - if content_length < 25 && (img_count == 0 || img_count > 2) { - return true; - } - if weight < 25.0 && link_density > 0.2 { - return true; +pub fn get_link_density(handle: Handle) -> f32 { + let text_length = dom::text_len(handle.clone()) as f32; + if text_length == 0.0 { + return 0.0; } - if (embed_count == 1 && content_length < 35) || embed_count > 1 { - return true; + let mut link_length = 0.0; + let mut links: Vec> = vec![]; + dom::find_node(handle.clone(), "a", &mut links); + for link in links.iter() { + link_length += dom::text_len(link.clone()) as f32; } - false + link_length / text_length } diff --git a/tests/lib.rs b/tests/lib.rs index 249ab85..65696b7 100644 --- a/tests/lib.rs +++ b/tests/lib.rs @@ -1,6 +1,9 @@ extern crate readability; +extern crate regex; extern crate url; +use readability::{extractor, scorer::Scorer}; +use regex::Regex; use std::fs::File; use url::Url; @@ -8,7 +11,7 @@ use url::Url; fn test_extract_title() { let mut file = File::open("./data/title.html").unwrap(); let url = Url::parse("https://example.com").unwrap(); - let product = readability::extractor::extract(&mut file, &url).unwrap(); + let product = extractor::extract(&mut file, &url).unwrap(); assert_eq!(product.title, "This is title"); } @@ -16,7 +19,7 @@ fn test_extract_title() { fn test_fix_rel_links() { let mut file = File::open("./data/rel.html").unwrap(); let url = Url::parse("https://example.com").unwrap(); - let product = readability::extractor::extract(&mut file, &url).unwrap(); + let product = extractor::extract(&mut file, &url).unwrap(); assert_eq!(product.content, "This is title

poop

"); } @@ -24,6 +27,35 @@ fn test_fix_rel_links() { fn test_fix_img_links() { let mut file = File::open("./data/img.html").unwrap(); let url = Url::parse("https://example.com").unwrap(); - let product = readability::extractor::extract(&mut file, &url).unwrap(); + let product = extractor::extract(&mut file, &url).unwrap(); assert_eq!(product.content, "This is title

"); } + +#[test] +fn test_comment() { + let mut file = File::open("./data/comment.html").unwrap(); + let url = Url::parse("https://example.com").unwrap(); + let product = extractor::extract(&mut file, &url).unwrap(); + assert_eq!( + product.content, + "This is title" + ); +} + +#[test] +fn test_comment_custom() { + let mut file = File::open("./data/comment.html").unwrap(); + let url = Url::parse("https://example.com").unwrap(); + let scorer = Scorer { + unlikely_candidates: &Regex::new( + "combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter|ssba", + ) + .unwrap(), + ..Default::default() + }; + let product = extractor::extract_with_scorer(&mut file, &url, &scorer).unwrap(); + assert_eq!( + product.content, + "My div with more than 25 characters.

My paragraph with more than 25 characters.

" + ); +}