From 3f4ceb05e6166da0312a101cc3df35b91b565fec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Buczy=C5=84ski?= <56112903+tomekb234@users.noreply.github.com> Date: Tue, 28 Oct 2025 21:01:50 +0100 Subject: [PATCH 1/2] Allow &[u8] and Vec input Fixes #61. --- src/lib.rs | 10 ++++++++++ src/parser/base.rs | 4 ++-- src/vdom.rs | 28 ++++++++++++++-------------- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 6417134..3bb058d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -47,6 +47,11 @@ pub use vdom::{VDom, VDomGuard}; /// assert_eq!(dom.query_selector("div").unwrap().count(), 1); /// ``` pub fn parse(input: &str, options: ParserOptions) -> Result, ParseError> { + parse_bytes(input.as_bytes(), options) +} + +/// Same as `parse()`, but with `&[u8]` input. +pub fn parse_bytes(input: &[u8], options: ParserOptions) -> Result, ParseError> { let mut parser = Parser::new(input, options); parser.parse()?; Ok(VDom::from(parser)) @@ -83,5 +88,10 @@ pub fn parse_query_selector(input: &str) -> Option> { /// Once `VDomGuard` goes out of scope, the string will be freed. /// It should not be possible to cause UB in its current form and might become a safe function in the future. pub unsafe fn parse_owned(input: String, options: ParserOptions) -> Result { + parse_bytes_owned(input.into_bytes(), options) +} + +/// Same as `parse_owned()`, but with `Vec` input. +pub unsafe fn parse_bytes_owned(input: Vec, options: ParserOptions) -> Result { VDomGuard::parse(input, options) } diff --git a/src/parser/base.rs b/src/parser/base.rs index 7b8c498..188e94a 100644 --- a/src/parser/base.rs +++ b/src/parser/base.rs @@ -53,12 +53,12 @@ pub struct Parser<'a> { } impl<'a> Parser<'a> { - pub(crate) fn new(input: &str, options: ParserOptions) -> Parser { + pub(crate) fn new(input: &[u8], options: ParserOptions) -> Parser { Parser { stack: Vec::with_capacity(4), options, tags: Vec::new(), - stream: Stream::new(input.as_bytes()), + stream: Stream::new(input), ast: Vec::new(), ids: HashMap::new(), classes: HashMap::new(), diff --git a/src/vdom.rs b/src/vdom.rs index 5a98a6c..0d9331d 100644 --- a/src/vdom.rs +++ b/src/vdom.rs @@ -188,9 +188,9 @@ pub struct VDomGuard { /// Wrapped VDom instance dom: VDom<'static>, /// The leaked input string that is referenced by self.dom - _s: RawString, + _b: RawBytes, /// PhantomData for self.dom - _phantom: PhantomData<&'static str>, + _phantom: PhantomData<&'static [u8]>, } unsafe impl Send for VDomGuard {} @@ -198,23 +198,23 @@ unsafe impl Sync for VDomGuard {} impl VDomGuard { /// Parses the input string - pub(crate) fn parse(input: String, options: ParserOptions) -> Result { - let input = RawString::new(input); + pub(crate) fn parse(input: Vec, options: ParserOptions) -> Result { + let input = RawBytes::new(input); let ptr = input.as_ptr(); - let input_ref: &'static str = unsafe { &*ptr }; + let input_ref: &'static [u8] = unsafe { &*ptr }; // Parsing will either: // a) succeed, and we return a VDom instance // that, when dropped, will free the input string // b) fail, and we return a ParseError - // and `RawString`s destructor will run and deallocate the string properly + // and `RawBytes`s destructor will run and deallocate the string properly let mut parser = Parser::new(input_ref, options); parser.parse()?; Ok(Self { - _s: input, + _b: input, dom: VDom::from(parser), _phantom: PhantomData, }) @@ -238,21 +238,21 @@ impl VDomGuard { } #[derive(Debug)] -struct RawString(*mut str); +struct RawBytes(*mut [u8]); -impl RawString { - pub fn new(s: String) -> Self { - Self(Box::into_raw(s.into_boxed_str())) +impl RawBytes { + pub fn new(s: Vec) -> Self { + Self(Box::into_raw(s.into_boxed_slice())) } - pub fn as_ptr(&self) -> *mut str { + pub fn as_ptr(&self) -> *mut [u8] { self.0 } } -impl Drop for RawString { +impl Drop for RawBytes { fn drop(&mut self) { - // SAFETY: the pointer is always valid because `RawString` can only be constructed through `RawString::new()` + // SAFETY: the pointer is always valid because `RawBytes` can only be constructed through `RawBytes::new()` unsafe { drop(Box::from_raw(self.0)); }; From ff062a4adc531f32a5d7d3dfbc3bcaef6265f6d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Buczy=C5=84ski?= <56112903+tomekb234@users.noreply.github.com> Date: Tue, 28 Oct 2025 21:43:43 +0100 Subject: [PATCH 2/2] Add test for non-UTF-8 input --- src/tests.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/tests.rs b/src/tests.rs index 046861a..d7c07c2 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -1,4 +1,4 @@ -use crate::{parse, parse_owned, Bytes}; +use crate::{parse, parse_bytes, parse_owned, Bytes}; use crate::{parser::*, HTMLTag, Node}; fn force_as_tag<'a, 'b>(actual: &'a Node<'b>) -> &'a HTMLTag<'b> { @@ -778,3 +778,18 @@ fn tag_raw_abrupt_stop() { let from_raw = first_tag.raw().try_as_utf8_str().unwrap(); assert_eq!(from_raw, "

abcd\xc3\x28

".as_ref(); + + let vdom = parse_bytes(input, Default::default()).unwrap(); + + let first_tag = vdom.children()[0] + .get(vdom.parser()) + .unwrap() + .as_tag() + .unwrap(); + + assert_eq!(first_tag.raw().as_bytes(), b"

\xc3\x28

"); +}