diff --git a/src/parser/base.rs b/src/parser/base.rs index 7b8c498..3d2cc5c 100644 --- a/src/parser/base.rs +++ b/src/parser/base.rs @@ -118,8 +118,7 @@ impl<'a> Parser<'a> { // If we do not find any characters that are not identifiers // then we are probably at the end of the stream - let end = simd::search_non_ident(bytes) - .unwrap_or_else(|| self.stream.len() - start); + let end = simd::search_non_ident(bytes).unwrap_or_else(|| self.stream.len() - start); self.stream.idx += end; Some(self.stream.slice(start, start + end)) @@ -219,10 +218,12 @@ impl<'a> Parser<'a> { self.stream.advance(); let closing_tag_name = self.read_to(b'>'); - + self.stream.expect_and_skip_cond(b'>'); - let closing_tag_matches_parent = self.stack.last() + let closing_tag_matches_parent = self + .stack + .last() .and_then(|last_handle| last_handle.get(self)) .and_then(|last_item| last_item.as_tag()) .map_or(false, |last_tag| last_tag.name() == closing_tag_name); @@ -296,10 +297,9 @@ impl<'a> Parser<'a> { if simd::matches_case_insensitive(tag, *b"doctype") { let doctype = self.read_ident()?; - let html5 = simd::matches_case_insensitive(doctype, *b"html"); - - if html5 { - self.version = Some(HTMLVersion::HTML5); + if simd::matches_case_insensitive(doctype, *b"html") { + self.skip_whitespaces(); + self.version = self.detect_html_version(); } self.skip_whitespaces(); @@ -310,6 +310,36 @@ impl<'a> Parser<'a> { Some(()) } + fn detect_html_version(&mut self) -> Option { + if let Some(byte) = self.stream.current() { + if byte == &b'>' { + return Some(HTMLVersion::HTML5); + } + } + + // it's old HTML 4 doctype. skipping the ' PUBLIC "-//W3C//DTD HTML 4.01"' part + self.stream.advance_by(29); + + if let Some(byte) = self.stream.current() { + if byte == &b'/' { + return Some(HTMLVersion::StrictHTML401); + } + } + + self.stream.advance(); + let variant_marker = self.read_ident()?; + + if simd::matches_case_insensitive(variant_marker, *b"transitional//en") { + return Some(HTMLVersion::TransitionalHTML401); + }; + + if simd::matches_case_insensitive(variant_marker, *b"frameset//en") { + return Some(HTMLVersion::FramesetHTML401); + }; + + None + } + fn parse_tag(&mut self) -> Option<()> { let start = self.stream.idx; diff --git a/src/simd/nightly.rs b/src/simd/nightly.rs index 08e66ab..bb3d9e5 100644 --- a/src/simd/nightly.rs +++ b/src/simd/nightly.rs @@ -101,6 +101,9 @@ pub fn search_non_ident(haystack: &[u8]) -> Option { let needle_uc_z = u8x16::splat(b'Z'); let needle_minus = u8x16::splat(b'-'); let needle_underscore = u8x16::splat(b'_'); + let needle_colon = u8x16::splat(b':'); + let needle_plus = u8x16::splat(b'+'); + let needle_slash = u8x16::splat(b'/'); while i <= len - 16 { let mut bytes = [0; 16]; @@ -122,7 +125,10 @@ pub fn search_non_ident(haystack: &[u8]) -> Option { let eq_minus = bytes.simd_eq(needle_minus); let eq_underscore = bytes.simd_eq(needle_underscore); - let symbol = eq_minus | eq_underscore; + let eq_plus = bytes.simd_eq(needle_plus); + let eq_colon = bytes.simd_eq(needle_colon); + let eq_slash = bytes.simd_eq(needle_slash); + let symbol = eq_minus | eq_underscore | eq_plus | eq_colon | eq_slash; let or = !(digit | lowercase | uppercase | symbol).to_int(); diff --git a/src/tests.rs b/src/tests.rs index 046861a..542a7fa 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -126,6 +126,30 @@ fn html5() { assert_eq!(dom.children().len(), 1) } +#[test] +fn html4_strict() { + let dom = parse(r#" hello"#, ParserOptions::default()).unwrap(); + + assert_eq!(dom.version(), Some(HTMLVersion::StrictHTML401)); + assert_eq!(dom.children().len(), 1) +} + +#[test] +fn html4_transitional() { + let dom = parse(r#" hello"#, ParserOptions::default()).unwrap(); + + assert_eq!(dom.version(), Some(HTMLVersion::TransitionalHTML401)); + assert_eq!(dom.children().len(), 1) +} + +#[test] +fn html4_frameset() { + let dom = parse(r#" hello"#, ParserOptions::default()).unwrap(); + + assert_eq!(dom.version(), Some(HTMLVersion::FramesetHTML401)); + assert_eq!(dom.children().len(), 1) +} + #[test] fn ignore_void_closing_tags() { let input = r#"