Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 38 additions & 8 deletions src/parser/base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,7 @@ impl<'a> Parser<'a> {

// If we do not find any characters that are not identifiers
// then we are probably at the end of the stream
let end = simd::search_non_ident(bytes)
.unwrap_or_else(|| self.stream.len() - start);
let end = simd::search_non_ident(bytes).unwrap_or_else(|| self.stream.len() - start);

self.stream.idx += end;
Some(self.stream.slice(start, start + end))
Expand Down Expand Up @@ -219,10 +218,12 @@ impl<'a> Parser<'a> {
self.stream.advance();

let closing_tag_name = self.read_to(b'>');

self.stream.expect_and_skip_cond(b'>');

let closing_tag_matches_parent = self.stack.last()
let closing_tag_matches_parent = self
.stack
.last()
.and_then(|last_handle| last_handle.get(self))
.and_then(|last_item| last_item.as_tag())
.map_or(false, |last_tag| last_tag.name() == closing_tag_name);
Expand Down Expand Up @@ -296,10 +297,9 @@ impl<'a> Parser<'a> {
if simd::matches_case_insensitive(tag, *b"doctype") {
let doctype = self.read_ident()?;

let html5 = simd::matches_case_insensitive(doctype, *b"html");

if html5 {
self.version = Some(HTMLVersion::HTML5);
if simd::matches_case_insensitive(doctype, *b"html") {
self.skip_whitespaces();
self.version = self.detect_html_version();
}

self.skip_whitespaces();
Expand All @@ -310,6 +310,36 @@ impl<'a> Parser<'a> {
Some(())
}

/// Determines which HTML version a `<!DOCTYPE html ...>` declaration names.
///
/// Expects the stream to sit on the first non-whitespace byte after the
/// `html` keyword. Returns `None` when no known variant is recognised or
/// when `read_ident` yields nothing. The stream is advanced as a side effect.
fn detect_html_version(&mut self) -> Option<HTMLVersion> {
    // A declaration that closes right after `html` is the HTML5 form.
    if matches!(self.stream.current(), Some(b'>')) {
        return Some(HTMLVersion::HTML5);
    }

    // Anything else is treated as an HTML 4.01 doctype: jump over the
    // 29-byte `PUBLIC "-//W3C//DTD HTML 4.01` prefix.
    // NOTE(review): the skipped bytes are not compared against that prefix,
    // so other doctypes are skipped blindly as well — confirm this is intended.
    self.stream.advance_by(29);

    // The strict identifier continues straight into `//EN`.
    if matches!(self.stream.current(), Some(b'/')) {
        return Some(HTMLVersion::StrictHTML401);
    }

    // Step over the separator byte, then read the variant word.
    self.stream.advance();
    let marker = self.read_ident()?;

    if simd::matches_case_insensitive(marker, *b"transitional//en") {
        Some(HTMLVersion::TransitionalHTML401)
    } else if simd::matches_case_insensitive(marker, *b"frameset//en") {
        Some(HTMLVersion::FramesetHTML401)
    } else {
        None
    }
}

fn parse_tag(&mut self) -> Option<()> {
let start = self.stream.idx;

Expand Down
8 changes: 7 additions & 1 deletion src/simd/nightly.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,9 @@ pub fn search_non_ident(haystack: &[u8]) -> Option<usize> {
let needle_uc_z = u8x16::splat(b'Z');
let needle_minus = u8x16::splat(b'-');
let needle_underscore = u8x16::splat(b'_');
let needle_colon = u8x16::splat(b':');
let needle_plus = u8x16::splat(b'+');
let needle_slash = u8x16::splat(b'/');

while i <= len - 16 {
let mut bytes = [0; 16];
Expand All @@ -122,7 +125,10 @@ pub fn search_non_ident(haystack: &[u8]) -> Option<usize> {

let eq_minus = bytes.simd_eq(needle_minus);
let eq_underscore = bytes.simd_eq(needle_underscore);
let symbol = eq_minus | eq_underscore;
let eq_plus = bytes.simd_eq(needle_plus);
let eq_colon = bytes.simd_eq(needle_colon);
let eq_slash = bytes.simd_eq(needle_slash);
let symbol = eq_minus | eq_underscore | eq_plus | eq_colon | eq_slash;

let or = !(digit | lowercase | uppercase | symbol).to_int();

Expand Down
24 changes: 24 additions & 0 deletions src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,30 @@ fn html5() {
assert_eq!(dom.children().len(), 1)
}

#[test]
fn html4_strict() {
    // HTML 4.01 Strict doctype followed by a single text node.
    let input = r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> hello"#;
    let dom = parse(input, ParserOptions::default()).unwrap();

    // The doctype must be reported as strict and must not become a child node.
    assert_eq!(dom.version(), Some(HTMLVersion::StrictHTML401));
    assert_eq!(dom.children().len(), 1);
}

#[test]
fn html4_transitional() {
    // HTML 4.01 Transitional doctype followed by a single text node.
    // The public identifier is closed with `"` before the system identifier,
    // so the fixture is a well-formed doctype declaration.
    let dom = parse(r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd"> hello"#, ParserOptions::default()).unwrap();

    // The doctype must be reported as transitional and must not become a child node.
    assert_eq!(dom.version(), Some(HTMLVersion::TransitionalHTML401));
    assert_eq!(dom.children().len(), 1)
}

#[test]
fn html4_frameset() {
    // HTML 4.01 Frameset doctype followed by a single text node.
    let input = r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN" "http://www.w3.org/TR/1999/REC-html401-19991224/frameset.dtd"> hello"#;
    let dom = parse(input, ParserOptions::default()).unwrap();

    // The doctype must be reported as frameset and must not become a child node.
    assert_eq!(dom.version(), Some(HTMLVersion::FramesetHTML401));
    assert_eq!(dom.children().len(), 1);
}

#[test]
fn ignore_void_closing_tags() {
let input = r#"
Expand Down