diff --git a/data/muddy_waters.html b/data/muddy_waters.html new file mode 100644 index 0000000..99d0542 --- /dev/null +++ b/data/muddy_waters.html @@ -0,0 +1,42 @@ +

+ McKinley Morganfield (April 4, 1914 – April 30, 1983),[1][2] + better known as Muddy Waters was an American + blues + singer-songwriter + and musician who was an important figure in the post-World War II blues + scene, and is often cited as the "father of modern + Chicago blues".[3] + His style of playing has been described as "raining down + Delta + beatitude".[4] +

diff --git a/src/dom.rs b/src/dom.rs
index 9a0b636..3634dbe 100644
--- a/src/dom.rs
+++ b/src/dom.rs
@@ -105,7 +105,25 @@ pub fn extract_text(handle: Handle, text: &mut String, deep: bool) {
         let c = child.clone();
         match c.data {
             Text { ref contents } => {
-                text.push_str(contents.borrow().as_ref());
+                let cc = contents.borrow().to_string();
+
+                // Each text node is trimmed and joined to the buffer with one space so
+                // words from neighbouring nodes do not run together. No space is added
+                // when the buffer already ends in one, or when its last visible character
+                // is one of the punctuation marks matched below.
+                if !text.is_empty() && !text.ends_with(' ') {
+                    let needs_space = match text.trim_end().chars().last() {
+                        // Whitespace-only buffer: nothing visible to separate from, and
+                        // unwrapping here would panic.
+                        None => false,
+                        Some('.') | Some('!') | Some(',') | Some('"') | Some('\'') => false,
+                        Some(_) => true,
+                    };
+                    if needs_space {
+                        text.push(' ');
+                    }
+                }
+                text.push_str(cc.trim());
             }
             Element { .. } => {
                 if deep {
diff --git a/src/scorer.rs b/src/scorer.rs
index 4eb73c2..176ede7 100644
--- a/src/scorer.rs
+++ b/src/scorer.rs
@@ -18,7 +18,7 @@ pub static PUNCTUATIONS_REGEX: &str = r"([、。,.!?]|\.[^A-Za-z0-9]|,[^
 pub static UNLIKELY_CANDIDATES: &str = "combx|comment|community|disqus|extra|foot|header|menu\
     |remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate\
     |pagination|pager|popup|tweet|twitter\
-    |ssba";
+    |ssba|mw-editsection|cite_ref-";
 pub static LIKELY_CANDIDATES: &str = "and|article|body|column|main|shadow\
     |content|hentry";
 pub static POSITIVE_CANDIDATES: &str = "article|body|content|entry|hentry|main|page\
diff --git a/tests/lib.rs b/tests/lib.rs
index 249ab85..a6131c8 100644
--- a/tests/lib.rs
+++ b/tests/lib.rs
@@ -27,3 +27,16 @@ fn test_fix_img_links() {
     let product = readability::extractor::extract(&mut file, &url).unwrap();
     assert_eq!(product.content, "This is title

");
 }
+
+#[test]
+fn test_extract_text() {
+    // Before: adjacent text nodes ran together, e.g. "better known asMuddy Waterswas an Americanbluessinger-songwriterand musician who was an important figure".
+    // After: they are space-separated, e.g. "better known as Muddy Waters was an American blues singer-songwriter and musician who was an important figure".
+    let mut file = File::open("./data/muddy_waters.html").unwrap();
+    let url = Url::parse("https://example.com").unwrap();
+    let product = readability::extractor::extract(&mut file, &url).unwrap();
+    println!("{}", product.text);
+    let expected = r#"McKinley Morganfield (April 4, 1914 – April 30, 1983),better known as Muddy Waters was an American blues singer-songwriter and musician who was an important figure in the post-World War II blues
+ scene, and is often cited as the "father of modern Chicago blues ".His style of playing has been described as "raining down Delta beatitude"."#;
+    assert_eq!(expected, product.text);
+}