From 3f4ceb05e6166da0312a101cc3df35b91b565fec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomasz=20Buczy=C5=84ski?=
<56112903+tomekb234@users.noreply.github.com>
Date: Tue, 28 Oct 2025 21:01:50 +0100
Subject: [PATCH 1/2] Allow &[u8] and Vec<u8> input
Fixes #61.
---
src/lib.rs | 10 ++++++++++
src/parser/base.rs | 4 ++--
src/vdom.rs | 28 ++++++++++++++--------------
3 files changed, 26 insertions(+), 16 deletions(-)
diff --git a/src/lib.rs b/src/lib.rs
index 6417134..3bb058d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -47,6 +47,11 @@ pub use vdom::{VDom, VDomGuard};
/// assert_eq!(dom.query_selector("div").unwrap().count(), 1);
/// ```
pub fn parse(input: &str, options: ParserOptions) -> Result, ParseError> {
+ parse_bytes(input.as_bytes(), options)
+}
+
+/// Same as `parse()`, but with `&[u8]` input.
+pub fn parse_bytes(input: &[u8], options: ParserOptions) -> Result, ParseError> {
let mut parser = Parser::new(input, options);
parser.parse()?;
Ok(VDom::from(parser))
@@ -83,5 +88,10 @@ pub fn parse_query_selector(input: &str) -> Option> {
/// Once `VDomGuard` goes out of scope, the string will be freed.
/// It should not be possible to cause UB in its current form and might become a safe function in the future.
pub unsafe fn parse_owned(input: String, options: ParserOptions) -> Result {
+ parse_bytes_owned(input.into_bytes(), options)
+}
+
+/// Same as `parse_owned()`, but with `Vec<u8>` input.
+pub unsafe fn parse_bytes_owned(input: Vec, options: ParserOptions) -> Result {
VDomGuard::parse(input, options)
}
diff --git a/src/parser/base.rs b/src/parser/base.rs
index 7b8c498..188e94a 100644
--- a/src/parser/base.rs
+++ b/src/parser/base.rs
@@ -53,12 +53,12 @@ pub struct Parser<'a> {
}
impl<'a> Parser<'a> {
- pub(crate) fn new(input: &str, options: ParserOptions) -> Parser {
+ pub(crate) fn new(input: &[u8], options: ParserOptions) -> Parser {
Parser {
stack: Vec::with_capacity(4),
options,
tags: Vec::new(),
- stream: Stream::new(input.as_bytes()),
+ stream: Stream::new(input),
ast: Vec::new(),
ids: HashMap::new(),
classes: HashMap::new(),
diff --git a/src/vdom.rs b/src/vdom.rs
index 5a98a6c..0d9331d 100644
--- a/src/vdom.rs
+++ b/src/vdom.rs
@@ -188,9 +188,9 @@ pub struct VDomGuard {
/// Wrapped VDom instance
dom: VDom<'static>,
/// The leaked input string that is referenced by self.dom
- _s: RawString,
+ _b: RawBytes,
/// PhantomData for self.dom
- _phantom: PhantomData<&'static str>,
+ _phantom: PhantomData<&'static [u8]>,
}
unsafe impl Send for VDomGuard {}
@@ -198,23 +198,23 @@ unsafe impl Sync for VDomGuard {}
impl VDomGuard {
/// Parses the input string
- pub(crate) fn parse(input: String, options: ParserOptions) -> Result {
- let input = RawString::new(input);
+ pub(crate) fn parse(input: Vec, options: ParserOptions) -> Result {
+ let input = RawBytes::new(input);
let ptr = input.as_ptr();
- let input_ref: &'static str = unsafe { &*ptr };
+ let input_ref: &'static [u8] = unsafe { &*ptr };
// Parsing will either:
// a) succeed, and we return a VDom instance
// that, when dropped, will free the input string
// b) fail, and we return a ParseError
- // and `RawString`s destructor will run and deallocate the string properly
+ // and `RawBytes`'s destructor will run and deallocate the buffer properly
let mut parser = Parser::new(input_ref, options);
parser.parse()?;
Ok(Self {
- _s: input,
+ _b: input,
dom: VDom::from(parser),
_phantom: PhantomData,
})
@@ -238,21 +238,21 @@ impl VDomGuard {
}
#[derive(Debug)]
-struct RawString(*mut str);
+struct RawBytes(*mut [u8]);
-impl RawString {
- pub fn new(s: String) -> Self {
- Self(Box::into_raw(s.into_boxed_str()))
+impl RawBytes {
+ pub fn new(s: Vec) -> Self {
+ Self(Box::into_raw(s.into_boxed_slice()))
}
- pub fn as_ptr(&self) -> *mut str {
+ pub fn as_ptr(&self) -> *mut [u8] {
self.0
}
}
-impl Drop for RawString {
+impl Drop for RawBytes {
fn drop(&mut self) {
- // SAFETY: the pointer is always valid because `RawString` can only be constructed through `RawString::new()`
+ // SAFETY: the pointer is always valid because `RawBytes` can only be constructed through `RawBytes::new()`
unsafe {
drop(Box::from_raw(self.0));
};
From ff062a4adc531f32a5d7d3dfbc3bcaef6265f6d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomasz=20Buczy=C5=84ski?=
<56112903+tomekb234@users.noreply.github.com>
Date: Tue, 28 Oct 2025 21:43:43 +0100
Subject: [PATCH 2/2] Add test for non-UTF-8 input
---
src/tests.rs | 17 ++++++++++++++++-
1 file changed, 16 insertions(+), 1 deletion(-)
diff --git a/src/tests.rs b/src/tests.rs
index 046861a..d7c07c2 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -1,4 +1,4 @@
-use crate::{parse, parse_owned, Bytes};
+use crate::{parse, parse_bytes, parse_owned, Bytes};
use crate::{parser::*, HTMLTag, Node};
fn force_as_tag<'a, 'b>(actual: &'a Node<'b>) -> &'a HTMLTag<'b> {
@@ -778,3 +778,18 @@ fn tag_raw_abrupt_stop() {
let from_raw = first_tag.raw().try_as_utf8_str().unwrap();
assert_eq!(from_raw, "abcd
\xc3\x28
".as_ref();
+
+ let vdom = parse_bytes(input, Default::default()).unwrap();
+
+ let first_tag = vdom.children()[0]
+ .get(vdom.parser())
+ .unwrap()
+ .as_tag()
+ .unwrap();
+
+ assert_eq!(first_tag.raw().as_bytes(), b"\xc3\x28
");
+}