diff --git a/Cargo.toml b/Cargo.toml
index 1a1f149..2a679fb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,6 +15,7 @@ url = "2.4"
html5ever = "0.26"
markup5ever_rcdom = "0.2"
lazy_static = "1.4"
+log = "0.4.22"
[dependencies.reqwest]
version = "0.11"
diff --git a/data/comment.html b/data/comment.html
new file mode 100644
index 0000000..484e6b2
--- /dev/null
+++ b/data/comment.html
@@ -0,0 +1,15 @@
+
+
+
+
+ This is title
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/dom.rs b/src/dom.rs
index 9a0b636..84976d2 100644
--- a/src/dom.rs
+++ b/src/dom.rs
@@ -147,7 +147,7 @@ pub fn find_node(handle: Handle, tag_name: &str, nodes: &mut Vec>) {
}
}
-pub fn has_nodes(handle: Handle, tag_names: &Vec<&'static str>) -> bool {
+pub fn has_nodes(handle: Handle, tag_names: &[&str]) -> bool {
for child in handle.children.borrow().iter() {
let tag_name: &str = &get_tag_name(child.clone()).unwrap_or_default();
if tag_names.iter().any(|&n| n == tag_name) {
diff --git a/src/extractor.rs b/src/extractor.rs
index 1ff2ce9..c10ecf9 100644
--- a/src/extractor.rs
+++ b/src/extractor.rs
@@ -1,7 +1,9 @@
+use crate::scorer::{Scorer, DEFAULT_SCORER};
use dom;
use error::Error;
use html5ever::tendril::stream::TendrilSink;
use html5ever::{parse_document, serialize};
+use log::debug;
use markup5ever_rcdom::{RcDom, SerializableHandle};
#[cfg(feature = "reqwest")]
use reqwest;
@@ -37,7 +39,8 @@ pub fn scrape(url: &str) -> Result {
}
}
-pub fn extract(input: &mut R, url: &Url) -> Result
+/// Extract text with a custom [`Scorer`].
+pub fn extract_with_scorer<R>(input: &mut R, url: &Url, scorer: &Scorer) -> Result<Product, Error>
where
R: Read,
{
@@ -48,8 +51,11 @@ where
let mut candidates = BTreeMap::new();
let mut nodes = BTreeMap::new();
let handle = dom.document.clone();
- scorer::preprocess(&mut dom, handle.clone(), &mut title);
- scorer::find_candidates(Path::new("/"), handle.clone(), &mut candidates, &mut nodes);
+ scorer.preprocess(&mut dom, handle.clone(), &mut title);
+ scorer.find_candidates(Path::new("/"), handle.clone(), &mut candidates, &mut nodes);
+
+ debug!("Found candidates: {}", candidates.values().len());
+
let mut id: &str = "/";
let mut top_candidate: &Candidate = &Candidate {
node: handle.clone(),
@@ -67,7 +73,7 @@ where
let mut bytes = vec![];
let node = top_candidate.node.clone();
- scorer::clean(&mut dom, Path::new(id), node.clone(), url, &candidates);
+ scorer.clean(&mut dom, Path::new(id), node.clone(), url, &candidates);
serialize(
&mut bytes,
@@ -85,3 +91,11 @@ where
text,
})
}
+
+/// Extract text with the default [`Scorer`].
+pub fn extract<R>(input: &mut R, url: &Url) -> Result<Product, Error>
+where
+ R: Read,
+{
+ extract_with_scorer(input, url, &DEFAULT_SCORER)
+}
diff --git a/src/lib.rs b/src/lib.rs
index 8977431..3baa530 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,12 +1,13 @@
#[macro_use]
extern crate html5ever;
-extern crate markup5ever_rcdom;
-extern crate regex;
-extern crate url;
#[macro_use]
extern crate lazy_static;
+extern crate log;
+extern crate markup5ever_rcdom;
+extern crate regex;
#[cfg(feature = "reqwest")]
extern crate reqwest;
+extern crate url;
pub mod dom;
pub mod error;
diff --git a/src/scorer.rs b/src/scorer.rs
index 4eb73c2..c24a852 100644
--- a/src/scorer.rs
+++ b/src/scorer.rs
@@ -14,21 +14,21 @@ use std::path::Path;
use std::rc::Rc;
use url::Url;
-pub static PUNCTUATIONS_REGEX: &str = r"([、。,.!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)";
-pub static UNLIKELY_CANDIDATES: &str = "combx|comment|community|disqus|extra|foot|header|menu\
+const PUNCTUATIONS_REGEX: &str = r"([、。,.!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)";
+const UNLIKELY_CANDIDATES: &str = "combx|comment|community|disqus|extra|foot|header|menu\
|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate\
|pagination|pager|popup|tweet|twitter\
|ssba";
-pub static LIKELY_CANDIDATES: &str = "and|article|body|column|main|shadow\
+const LIKELY_CANDIDATES: &str = "and|article|body|column|main|shadow\
|content|hentry";
-pub static POSITIVE_CANDIDATES: &str = "article|body|content|entry|hentry|main|page\
+const POSITIVE_CANDIDATES: &str = "article|body|content|entry|hentry|main|page\
|pagination|post|text|blog|story";
-pub static NEGATIVE_CANDIDATES: &str = "combx|comment|com|contact|foot|footer|footnote\
+const NEGATIVE_CANDIDATES: &str = "combx|comment|com|contact|foot|footer|footnote\
|masthead|media|meta|outbrain|promo|related\
|scroll|shoutbox|sidebar|sponsor|shopping\
|tags|tool|widget|form|textfield\
|uiScale|hidden";
-static BLOCK_CHILD_TAGS: [&str; 10] = [
+const BLOCK_CHILD_TAGS: [&str; 10] = [
"a",
"blockquote",
"dl",
@@ -46,6 +46,7 @@ lazy_static! {
static ref UNLIKELY: Regex = Regex::new(UNLIKELY_CANDIDATES).unwrap();
static ref POSITIVE: Regex = Regex::new(POSITIVE_CANDIDATES).unwrap();
static ref NEGATIVE: Regex = Regex::new(NEGATIVE_CANDIDATES).unwrap();
+ pub static ref DEFAULT_SCORER: Scorer<'static> = Scorer::default();
}
pub struct Candidate {
@@ -53,361 +54,414 @@ pub struct Candidate {
pub score: Cell,
}
-pub fn fix_img_path(handle: Handle, url: &Url) -> bool {
- let src = dom::get_attr("src", handle.clone());
- let s = match src {
- Some(src) => src,
- None => return false,
- };
- if !s.starts_with("//") && !s.starts_with("http://") && !s.starts_with("https://") {
- if let Ok(new_url) = url.join(&s) {
- dom::set_attr("src", new_url.as_str(), handle)
- }
- }
- true
+pub struct Scorer<'a> {
+ pub punctuations: &'a Regex,
+ pub unlikely_candidates: &'a Regex,
+ pub likely_candidates: &'a Regex,
+ pub positive_candidates: &'a Regex,
+ pub negative_candidates: &'a Regex,
+ pub block_child_tags: &'a [&'a str],
}
-pub fn fix_anchor_path(handle: Handle, url: &Url) -> bool {
- let src = dom::get_attr("href", handle.clone());
- let s = match src {
- Some(src) => src,
- None => return false,
- };
- if !s.starts_with("//") && !s.starts_with("http://") && !s.starts_with("https://") {
- if let Ok(new_url) = url.join(&s) {
- dom::set_attr("href", new_url.as_str(), handle)
+impl<'a> Default for Scorer<'a> {
+ fn default() -> Self {
+ Self {
+ punctuations: &PUNCTUATIONS,
+ likely_candidates: &LIKELY,
+ unlikely_candidates: &UNLIKELY,
+ positive_candidates: &POSITIVE,
+ negative_candidates: &NEGATIVE,
+ block_child_tags: &BLOCK_CHILD_TAGS,
}
}
- true
}
-pub fn get_link_density(handle: Handle) -> f32 {
- let text_length = dom::text_len(handle.clone()) as f32;
- if text_length == 0.0 {
- return 0.0;
- }
- let mut link_length = 0.0;
- let mut links: Vec> = vec![];
- dom::find_node(handle.clone(), "a", &mut links);
- for link in links.iter() {
- link_length += dom::text_len(link.clone()) as f32;
+impl<'a> Scorer<'a> {
+ pub fn new(
+ punctuations: &'a Regex,
+ likely_candidates: &'a Regex,
+ unlikely_candidates: &'a Regex,
+ positive_candidates: &'a Regex,
+ negative_candidates: &'a Regex,
+ block_child_tags: &'a [&'a str],
+ ) -> Self {
+ Scorer {
+ punctuations,
+ unlikely_candidates,
+ likely_candidates,
+ positive_candidates,
+ negative_candidates,
+ block_child_tags,
+ }
}
- link_length / text_length
-}
-pub fn is_candidate(handle: Handle) -> bool {
- let text_len = dom::text_len(handle.clone());
- if text_len < 20 {
- return false;
- }
- let n: &str = &dom::get_tag_name(handle.clone()).unwrap_or_default();
- match n {
- "p" => true,
- "div" | "article" | "center" | "section" => {
- !dom::has_nodes(handle.clone(), &BLOCK_CHILD_TAGS.to_vec())
+ pub fn preprocess(&self, dom: &mut RcDom, handle: Handle, title: &mut String) -> bool {
+ if let Element {
+ ref name,
+ ref attrs,
+ ..
+ } = handle.clone().data
+ {
+ let tag_name = name.local.as_ref();
+ match tag_name.to_lowercase().as_ref() {
+ "script" | "link" | "style" => return true,
+ "title" => dom::extract_text(handle.clone(), title, true),
+ _ => (),
+ }
+ for name in ["id", "class"].iter() {
+ if let Some(val) = dom::attr(name, &attrs.borrow()) {
+ if tag_name != "body"
+ && self.unlikely_candidates.is_match(&val)
+ && !self.likely_candidates.is_match(&val)
+ {
+ return true;
+ }
+ }
+ }
+ }
+ let mut useless_nodes = vec![];
+ let mut paragraph_nodes = vec![];
+ let mut br_count = 0;
+ for child in handle.children.borrow().iter() {
+ if self.preprocess(dom, child.clone(), title) {
+ useless_nodes.push(child.clone());
+ }
+ let c = child.clone();
+ match c.data {
+ Element { ref name, .. } => {
+ let tag_name = name.local.as_ref();
+ if "br" == tag_name.to_lowercase() {
+ br_count += 1
+ } else {
+ br_count = 0
+ }
+ }
+ Text { ref contents } => {
+ let s = contents.borrow();
+ if br_count >= 2 && !s.trim().is_empty() {
+ paragraph_nodes.push(child.clone());
+ br_count = 0
+ }
+ }
+ _ => (),
+ }
+ }
+ for node in useless_nodes.iter() {
+ dom.remove_from_parent(node);
+ }
+ for node in paragraph_nodes.iter() {
+ let name = QualName::new(None, ns!(), LocalName::from("p"));
+ let p = dom.create_element(name, vec![], ElementFlags::default());
+ dom.append_before_sibling(node, NodeOrText::AppendNode(p.clone()));
+ dom.remove_from_parent(node);
+ if let Text { ref contents } = node.clone().data {
+ let text = contents.clone().into_inner().clone();
+ dom.append(&p, NodeOrText::AppendText(text))
+ }
}
- _ => false,
+ false
}
-}
-pub fn init_content_score(handle: Handle) -> f32 {
- let tag_name = dom::get_tag_name(handle.clone()).unwrap_or_default();
- let score = match tag_name.as_ref() {
- "article" => 10.0,
- "div" => 5.0,
- "blockquote" => 3.0,
- "form" => -3.0,
- "th" => 5.0,
- _ => 0.0,
- };
- score + get_class_weight(handle.clone())
-}
+ pub fn find_candidates(
+ &self,
+ id: &Path,
+ handle: Handle,
+ candidates: &mut BTreeMap,
+ nodes: &mut BTreeMap>,
+ ) {
+ if let Some(id) = id.to_str().map(|id| id.to_string()) {
+ nodes.insert(id, handle.clone());
+ }
-pub fn calc_content_score(handle: Handle) -> f32 {
- let mut score: f32 = 1.0;
- let mut text = String::new();
- dom::extract_text(handle.clone(), &mut text, true);
- let mat = PUNCTUATIONS.find_iter(&text);
- score += mat.count() as f32;
- score += f32::min(f32::floor(text.chars().count() as f32 / 100.0), 3.0);
- score
-}
+ if self.is_candidate(handle.clone()) {
+ let score = self.calc_content_score(handle.clone());
+ if let Some(c) = id
+ .parent()
+ .and_then(|pid| self.find_or_create_candidate(pid, candidates, nodes))
+ {
+ c.score.set(c.score.get() + score)
+ }
+ if let Some(c) = id
+ .parent()
+ .and_then(|pid| pid.parent())
+ .and_then(|gpid| self.find_or_create_candidate(gpid, candidates, nodes))
+ {
+ c.score.set(c.score.get() + score / 2.0)
+ }
+ }
-pub fn get_class_weight(handle: Handle) -> f32 {
- let mut weight: f32 = 0.0;
- if let Element {
- name: _, ref attrs, ..
- } = handle.data
- {
- for name in ["id", "class"].iter() {
- if let Some(val) = dom::attr(name, &attrs.borrow()) {
- if POSITIVE.is_match(&val) {
- weight += 25.0
- };
- if NEGATIVE.is_match(&val) {
- weight -= 25.0
- }
+ if self.is_candidate(handle.clone()) {
+ let score = self.calc_content_score(handle.clone());
+ if let Some(c) = id
+ .to_str()
+ .map(|id| id.to_string())
+ .and_then(|id| candidates.get(&id))
+ {
+ c.score.set(c.score.get() + score)
+ }
+ if let Some(c) = id
+ .parent()
+ .and_then(|pid| pid.to_str())
+ .map(|id| id.to_string())
+ .and_then(|pid| candidates.get(&pid))
+ {
+ c.score.set(c.score.get() + score)
+ }
+ if let Some(c) = id
+ .parent()
+ .and_then(|p| p.parent())
+ .and_then(|pid| pid.to_str())
+ .map(|id| id.to_string())
+ .and_then(|pid| candidates.get(&pid))
+ {
+ c.score.set(c.score.get() + score)
}
}
- };
- weight
-}
-pub fn preprocess(dom: &mut RcDom, handle: Handle, title: &mut String) -> bool {
- if let Element {
- ref name,
- ref attrs,
- ..
- } = handle.clone().data
- {
- let tag_name = name.local.as_ref();
- match tag_name.to_lowercase().as_ref() {
- "script" | "link" | "style" => return true,
- "title" => dom::extract_text(handle.clone(), title, true),
- _ => (),
+ for (i, child) in handle.children.borrow().iter().enumerate() {
+ self.find_candidates(
+ id.join(i.to_string()).as_path(),
+ child.clone(),
+ candidates,
+ nodes,
+ )
}
- for name in ["id", "class"].iter() {
- if let Some(val) = dom::attr(name, &attrs.borrow()) {
- if tag_name != "body" && UNLIKELY.is_match(&val) && !LIKELY.is_match(&val) {
- return true;
+ }
+
+ pub fn clean(
+ &self,
+ dom: &mut RcDom,
+ id: &Path,
+ handle: Handle,
+ url: &Url,
+ candidates: &BTreeMap,
+ ) -> bool {
+ let mut useless = false;
+ match handle.data {
+ Document => (),
+ Doctype { .. } => (),
+ Text { ref contents } => {
+ let s = contents.borrow();
+ if s.trim().is_empty() {
+ useless = true
}
}
- }
- }
- let mut useless_nodes = vec![];
- let mut paragraph_nodes = vec![];
- let mut br_count = 0;
- for child in handle.children.borrow().iter() {
- if preprocess(dom, child.clone(), title) {
- useless_nodes.push(child.clone());
- }
- let c = child.clone();
- match c.data {
- Element { ref name, .. } => {
+ Comment { .. } => useless = true,
+ Element {
+ ref name,
+ ref attrs,
+ ..
+ } => {
let tag_name = name.local.as_ref();
- if "br" == tag_name.to_lowercase() {
- br_count += 1
- } else {
- br_count = 0
+ match tag_name.to_lowercase().as_ref() {
+ "script" | "link" | "style" | "noscript" | "meta" | "h1" | "object"
+ | "header" | "footer" | "aside" => useless = true,
+ "form" | "table" | "ul" | "div" => {
+ useless = self.is_useless(id, handle.clone(), candidates)
+ }
+ "img" => useless = !fix_img_path(handle.clone(), url),
+ "a" => useless = !fix_anchor_path(handle.clone(), url),
+ _ => (),
}
+ dom::clean_attr("id", &mut attrs.borrow_mut());
+ dom::clean_attr("class", &mut attrs.borrow_mut());
+ dom::clean_attr("style", &mut attrs.borrow_mut());
}
- Text { ref contents } => {
- let s = contents.borrow();
- if br_count >= 2 && !s.trim().is_empty() {
- paragraph_nodes.push(child.clone());
- br_count = 0
- }
+ ProcessingInstruction { .. } => unreachable!(),
+ }
+ let mut useless_nodes = vec![];
+ for (i, child) in handle.children.borrow().iter().enumerate() {
+ let pid = id.join(i.to_string());
+ if self.clean(dom, pid.as_path(), child.clone(), url, candidates) {
+ useless_nodes.push(child.clone());
}
- _ => (),
}
- }
- for node in useless_nodes.iter() {
- dom.remove_from_parent(node);
- }
- for node in paragraph_nodes.iter() {
- let name = QualName::new(None, ns!(), LocalName::from("p"));
- let p = dom.create_element(name, vec![], ElementFlags::default());
- dom.append_before_sibling(node, NodeOrText::AppendNode(p.clone()));
- dom.remove_from_parent(node);
- if let Text { ref contents } = node.clone().data {
- let text = contents.clone().into_inner().clone();
- dom.append(&p, NodeOrText::AppendText(text))
+ for node in useless_nodes.iter() {
+ dom.remove_from_parent(node);
}
+ if dom::is_empty(handle) {
+ useless = true
+ }
+ useless
}
- false
-}
-pub fn find_candidates(
- id: &Path,
- handle: Handle,
- candidates: &mut BTreeMap,
- nodes: &mut BTreeMap>,
-) {
- if let Some(id) = id.to_str().map(|id| id.to_string()) {
- nodes.insert(id, handle.clone());
+ fn calc_content_score(&self, handle: Handle) -> f32 {
+ let mut score: f32 = 1.0;
+ let mut text = String::new();
+ dom::extract_text(handle.clone(), &mut text, true);
+ let mat = self.punctuations.find_iter(&text);
+ score += mat.count() as f32;
+ score += f32::min(f32::floor(text.chars().count() as f32 / 100.0), 3.0);
+ score
}
- if is_candidate(handle.clone()) {
- let score = calc_content_score(handle.clone());
- if let Some(c) = id
- .parent()
- .and_then(|pid| find_or_create_candidate(pid, candidates, nodes))
+ fn get_class_weight(&self, handle: Handle) -> f32 {
+ let mut weight: f32 = 0.0;
+ if let Element {
+ name: _, ref attrs, ..
+ } = handle.data
{
- c.score.set(c.score.get() + score)
- }
- if let Some(c) = id
- .parent()
- .and_then(|pid| pid.parent())
- .and_then(|gpid| find_or_create_candidate(gpid, candidates, nodes))
- {
- c.score.set(c.score.get() + score / 2.0)
+ for name in ["id", "class"].iter() {
+ if let Some(val) = dom::attr(name, &attrs.borrow()) {
+ if self.positive_candidates.is_match(&val) {
+ weight += 25.0
+ };
+ if self.negative_candidates.is_match(&val) {
+ weight -= 25.0
+ }
+ }
+ }
+ };
+ weight
+ }
+
+ fn init_content_score(&self, handle: Handle) -> f32 {
+ let tag_name = dom::get_tag_name(handle.clone()).unwrap_or_default();
+ let score = match tag_name.as_ref() {
+ "article" => 10.0,
+ "div" => 5.0,
+ "blockquote" => 3.0,
+ "form" => -3.0,
+ "th" => 5.0,
+ _ => 0.0,
+ };
+ score + self.get_class_weight(handle.clone())
+ }
+
+ fn find_or_create_candidate(
+ &self,
+ id: &Path,
+ candidates: &'a mut BTreeMap,
+ nodes: &BTreeMap>,
+ ) -> Option<&'a Candidate> {
+ if let Some(id) = id.to_str().map(|id| id.to_string()) {
+ if let Some(node) = nodes.get(&id) {
+ if candidates.get(&id).is_none() {
+ candidates.insert(
+ id.clone(),
+ Candidate {
+ node: node.clone(),
+ score: Cell::new(self.init_content_score(node.clone())),
+ },
+ );
+ }
+ return candidates.get(&id);
+ }
}
+ None
}
- if is_candidate(handle.clone()) {
- let score = calc_content_score(handle.clone());
- if let Some(c) = id
+ fn is_useless(
+ &self,
+ id: &Path,
+ handle: Handle,
+ candidates: &BTreeMap,
+ ) -> bool {
+ let tag_name = &dom::get_tag_name(handle.clone()).unwrap_or_default();
+ let weight = self.get_class_weight(handle.clone());
+ let score = id
.to_str()
- .map(|id| id.to_string())
- .and_then(|id| candidates.get(&id))
- {
- c.score.set(c.score.get() + score)
+ .and_then(|id| candidates.get(id))
+ .map(|c| c.score.get())
+ .unwrap_or(0.0);
+ if weight + score < 0.0 {
+ return true;
}
- if let Some(c) = id
- .parent()
- .and_then(|pid| pid.to_str())
- .map(|id| id.to_string())
- .and_then(|pid| candidates.get(&pid))
- {
- c.score.set(c.score.get() + score)
+ let text_nodes_len = dom::text_children_count(handle.clone());
+ let mut p_nodes: Vec> = vec![];
+ let mut img_nodes: Vec> = vec![];
+ let mut li_nodes: Vec> = vec![];
+ let mut input_nodes: Vec> = vec![];
+ let mut embed_nodes: Vec> = vec![];
+ dom::find_node(handle.clone(), "p", &mut p_nodes);
+ dom::find_node(handle.clone(), "img", &mut img_nodes);
+ dom::find_node(handle.clone(), "li", &mut li_nodes);
+ dom::find_node(handle.clone(), "input", &mut input_nodes);
+ dom::find_node(handle.clone(), "embed", &mut embed_nodes);
+ let p_count = p_nodes.len();
+ let img_count = img_nodes.len();
+ let li_count = li_nodes.len() as i32 - 100;
+ let input_count = input_nodes.len();
+ let embed_count = embed_nodes.len();
+ let link_density = get_link_density(handle.clone());
+ let content_length = dom::text_len(handle.clone());
+ let para_count = text_nodes_len + p_count;
+
+ if img_count > para_count + text_nodes_len {
+ return true;
}
- if let Some(c) = id
- .parent()
- .and_then(|p| p.parent())
- .and_then(|pid| pid.to_str())
- .map(|id| id.to_string())
- .and_then(|pid| candidates.get(&pid))
- {
- c.score.set(c.score.get() + score)
+ if li_count > para_count as i32 && tag_name != "ul" && tag_name != "ol" {
+ return true;
}
- }
-
- for (i, child) in handle.children.borrow().iter().enumerate() {
- find_candidates(
- id.join(i.to_string()).as_path(),
- child.clone(),
- candidates,
- nodes,
- )
- }
-}
-
-fn find_or_create_candidate<'a>(
- id: &Path,
- candidates: &'a mut BTreeMap,
- nodes: &BTreeMap>,
-) -> Option<&'a Candidate> {
- if let Some(id) = id.to_str().map(|id| id.to_string()) {
- if let Some(node) = nodes.get(&id) {
- if candidates.get(&id).is_none() {
- candidates.insert(
- id.clone(),
- Candidate {
- node: node.clone(),
- score: Cell::new(init_content_score(node.clone())),
- },
- );
- }
- return candidates.get(&id);
+ if input_count as f32 > f32::floor(para_count as f32 / 3.0) {
+ return true;
}
+ if content_length < 25 && (img_count == 0 || img_count > 2) {
+ return true;
+ }
+ if weight < 25.0 && link_density > 0.2 {
+ return true;
+ }
+ if (embed_count == 1 && content_length < 35) || embed_count > 1 {
+ return true;
+ }
+ false
}
- None
-}
-pub fn clean(
- dom: &mut RcDom,
- id: &Path,
- handle: Handle,
- url: &Url,
- candidates: &BTreeMap,
-) -> bool {
- let mut useless = false;
- match handle.data {
- Document => (),
- Doctype { .. } => (),
- Text { ref contents } => {
- let s = contents.borrow();
- if s.trim().is_empty() {
- useless = true
- }
+ fn is_candidate(&self, handle: Handle) -> bool {
+ let text_len = dom::text_len(handle.clone());
+ if text_len < 20 {
+ return false;
}
- Comment { .. } => useless = true,
- Element {
- ref name,
- ref attrs,
- ..
- } => {
- let tag_name = name.local.as_ref();
- match tag_name.to_lowercase().as_ref() {
- "script" | "link" | "style" | "noscript" | "meta" | "h1" | "object" | "header"
- | "footer" | "aside" => useless = true,
- "form" | "table" | "ul" | "div" => {
- useless = is_useless(id, handle.clone(), candidates)
- }
- "img" => useless = !fix_img_path(handle.clone(), url),
- "a" => useless = !fix_anchor_path(handle.clone(), url),
- _ => (),
+ let n: &str = &dom::get_tag_name(handle.clone()).unwrap_or_default();
+ match n {
+ "p" => true,
+ "div" | "article" | "center" | "section" => {
+ !dom::has_nodes(handle.clone(), self.block_child_tags)
}
- dom::clean_attr("id", &mut attrs.borrow_mut());
- dom::clean_attr("class", &mut attrs.borrow_mut());
- dom::clean_attr("style", &mut attrs.borrow_mut());
+ _ => false,
}
- ProcessingInstruction { .. } => unreachable!(),
}
- let mut useless_nodes = vec![];
- for (i, child) in handle.children.borrow().iter().enumerate() {
- let pid = id.join(i.to_string());
- if clean(dom, pid.as_path(), child.clone(), url, candidates) {
- useless_nodes.push(child.clone());
+}
+
+pub fn fix_img_path(handle: Handle, url: &Url) -> bool {
+ let src = dom::get_attr("src", handle.clone());
+ let s = match src {
+ Some(src) => src,
+ None => return false,
+ };
+ if !s.starts_with("//") && !s.starts_with("http://") && !s.starts_with("https://") {
+ if let Ok(new_url) = url.join(&s) {
+ dom::set_attr("src", new_url.as_str(), handle)
}
}
- for node in useless_nodes.iter() {
- dom.remove_from_parent(node);
- }
- if dom::is_empty(handle) {
- useless = true
- }
- useless
+ true
}
-pub fn is_useless(id: &Path, handle: Handle, candidates: &BTreeMap) -> bool {
- let tag_name = &dom::get_tag_name(handle.clone()).unwrap_or_default();
- let weight = get_class_weight(handle.clone());
- let score = id
- .to_str()
- .and_then(|id| candidates.get(id))
- .map(|c| c.score.get())
- .unwrap_or(0.0);
- if weight + score < 0.0 {
- return true;
+pub fn fix_anchor_path(handle: Handle, url: &Url) -> bool {
+ let src = dom::get_attr("href", handle.clone());
+ let s = match src {
+ Some(src) => src,
+ None => return false,
+ };
+ if !s.starts_with("//") && !s.starts_with("http://") && !s.starts_with("https://") {
+ if let Ok(new_url) = url.join(&s) {
+ dom::set_attr("href", new_url.as_str(), handle)
+ }
}
- let text_nodes_len = dom::text_children_count(handle.clone());
- let mut p_nodes: Vec> = vec![];
- let mut img_nodes: Vec> = vec![];
- let mut li_nodes: Vec> = vec![];
- let mut input_nodes: Vec> = vec![];
- let mut embed_nodes: Vec> = vec![];
- dom::find_node(handle.clone(), "p", &mut p_nodes);
- dom::find_node(handle.clone(), "img", &mut img_nodes);
- dom::find_node(handle.clone(), "li", &mut li_nodes);
- dom::find_node(handle.clone(), "input", &mut input_nodes);
- dom::find_node(handle.clone(), "embed", &mut embed_nodes);
- let p_count = p_nodes.len();
- let img_count = img_nodes.len();
- let li_count = li_nodes.len() as i32 - 100;
- let input_count = input_nodes.len();
- let embed_count = embed_nodes.len();
- let link_density = get_link_density(handle.clone());
- let content_length = dom::text_len(handle.clone());
- let para_count = text_nodes_len + p_count;
+ true
+}
- if img_count > para_count + text_nodes_len {
- return true;
- }
- if li_count > para_count as i32 && tag_name != "ul" && tag_name != "ol" {
- return true;
- }
- if input_count as f32 > f32::floor(para_count as f32 / 3.0) {
- return true;
- }
- if content_length < 25 && (img_count == 0 || img_count > 2) {
- return true;
- }
- if weight < 25.0 && link_density > 0.2 {
- return true;
+pub fn get_link_density(handle: Handle) -> f32 {
+ let text_length = dom::text_len(handle.clone()) as f32;
+ if text_length == 0.0 {
+ return 0.0;
}
- if (embed_count == 1 && content_length < 35) || embed_count > 1 {
- return true;
+ let mut link_length = 0.0;
+ let mut links: Vec> = vec![];
+ dom::find_node(handle.clone(), "a", &mut links);
+ for link in links.iter() {
+ link_length += dom::text_len(link.clone()) as f32;
}
- false
+ link_length / text_length
}
diff --git a/tests/lib.rs b/tests/lib.rs
index 249ab85..65696b7 100644
--- a/tests/lib.rs
+++ b/tests/lib.rs
@@ -1,6 +1,9 @@
extern crate readability;
+extern crate regex;
extern crate url;
+use readability::{extractor, scorer::Scorer};
+use regex::Regex;
use std::fs::File;
use url::Url;
@@ -8,7 +11,7 @@ use url::Url;
fn test_extract_title() {
let mut file = File::open("./data/title.html").unwrap();
let url = Url::parse("https://example.com").unwrap();
- let product = readability::extractor::extract(&mut file, &url).unwrap();
+ let product = extractor::extract(&mut file, &url).unwrap();
assert_eq!(product.title, "This is title");
}
@@ -16,7 +19,7 @@ fn test_extract_title() {
fn test_fix_rel_links() {
let mut file = File::open("./data/rel.html").unwrap();
let url = Url::parse("https://example.com").unwrap();
- let product = readability::extractor::extract(&mut file, &url).unwrap();
+ let product = extractor::extract(&mut file, &url).unwrap();
assert_eq!(product.content, "This is title poop
");
}
@@ -24,6 +27,35 @@ fn test_fix_rel_links() {
fn test_fix_img_links() {
let mut file = File::open("./data/img.html").unwrap();
let url = Url::parse("https://example.com").unwrap();
- let product = readability::extractor::extract(&mut file, &url).unwrap();
+ let product = extractor::extract(&mut file, &url).unwrap();
assert_eq!(product.content, "This is title
");
}
+
+#[test]
+fn test_comment() {
+ let mut file = File::open("./data/comment.html").unwrap();
+ let url = Url::parse("https://example.com").unwrap();
+ let product = extractor::extract(&mut file, &url).unwrap();
+ assert_eq!(
+ product.content,
+ "This is title"
+ );
+}
+
+#[test]
+fn test_comment_custom() {
+ let mut file = File::open("./data/comment.html").unwrap();
+ let url = Url::parse("https://example.com").unwrap();
+ let scorer = Scorer {
+ unlikely_candidates: &Regex::new(
+ "combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter|ssba",
+ )
+ .unwrap(),
+ ..Default::default()
+ };
+ let product = extractor::extract_with_scorer(&mut file, &url, &scorer).unwrap();
+ assert_eq!(
+ product.content,
+        "My div with more than 25 characters.My paragraph with more than 25 characters.
"
+    );
+}