Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 103 additions & 64 deletions thorlog/v3/matchstrings.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package thorlog
import (
"bytes"
"encoding/hex"
"encoding/json"
"fmt"
"regexp"
"strings"
Expand All @@ -13,57 +12,108 @@ import (
"github.com/NextronSystems/jsonlog"
)

type MatchData struct {
Data []byte
FullHex bool
type StringWithEncoding struct {
Data string `json:"data"`
Encoding StringEncoding `json:"encoding"`
}

func (f MatchData) MarshalJSON() ([]byte, error) {
matchingString := f.String()
return InvalidUnicodeString(matchingString).MarshalJSON()
// StringEncoding names the way the Data field of a StringWithEncoding
// represents the underlying bytes.
type StringEncoding string

// Known StringEncoding values.
const (
// Plain means Data holds the content itself, unmodified.
Plain StringEncoding = "plain"
// Hex means Data holds the content as a hexadecimal string.
Hex StringEncoding = "hex"
)

// Encode wraps raw bytes in a StringWithEncoding. Input that is valid UTF-8
// is stored verbatim and tagged Plain; any other input is stored as a
// hexadecimal string and tagged Hex.
func Encode(s []byte) StringWithEncoding {
	if !utf8.Valid(s) {
		// Not representable as a UTF-8 string: fall back to hex.
		return StringWithEncoding{Encoding: Hex, Data: hex.EncodeToString(s)}
	}
	return StringWithEncoding{Encoding: Plain, Data: string(s)}
}

func (f *MatchData) UnmarshalJSON(data []byte) error {
var matchingString string
err := json.Unmarshal(data, &matchingString)
if err != nil {
return err
// EncodeString encodes the given string into a StringWithEncoding.
// Valid UTF-8 is stored unchanged with the Plain encoding; any other
// content is stored hex-encoded with the Hex encoding.
func EncodeString(s string) StringWithEncoding {
if utf8.ValidString(s) {
return StringWithEncoding{
Data: s,
Encoding: Plain,
}
} else {
return StringWithEncoding{
Data: hex.EncodeToString([]byte(s)),
Encoding: Hex,
}
}
f.Data = []byte(matchingString)
return nil
}

func (f MatchData) JSONSchemaAlias() any {
return ""
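// Plaintext returns the raw byte sequence represented by the StringWithEncoding.
// If the hex data cannot be decoded or the encoding is unknown, a placeholder
// message describing the problem is returned instead.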
func (s StringWithEncoding) Plaintext() []byte {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that one of the encoding types is plain, I don't know if this is a good choice for a name. What about Data or Raw?

switch s.Encoding {
case Plain:
return []byte(s.Data)
case Hex:
data, err := hex.DecodeString(s.Data)
if err != nil {
return []byte("<invalid hex data: " + err.Error() + ">")
}
return data
default:
return []byte(fmt.Sprintf("<unknown encoding %s>", s.Encoding))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This return value is ambiguous because it could be mistaken for a string that actually has this content. This branch should therefore end in an error state instead. On the other hand, we should never end up here.

I'd be fine with a comment stating that this shouldn't happen normally. But maybe we still want to include s.Data in the output in some way (e.g., remove all invalid UTF-8 characters but keep the message about the unknown encoding?).

}
}

// notOnlyASCII matches any run of one or more characters that are neither
// printable ASCII (0x20-0x7E) nor \r, \n or \t. A string that matches
// therefore contains something that cannot be shown as plain ASCII text.
var notOnlyASCII = regexp.MustCompile(`[^\x20-\x7E\x0d\x0a\x09]+`) // printable chars + \r,\n,\t

func (f MatchData) String() string {
if f.FullHex {
return hex.EncodeToString(f.Data)
// String returns a human-readable representation of the encoded string.
// The representation is guaranteed to be valid UTF-8.
func (s StringWithEncoding) String() string {
data := s.decode()
if needsQuoting.MatchString(data) {
return quote(data)
}
data := f.Data
matchingString := string(data) // Try to directly convert
return data
}

if !f.FullHex && notOnlyASCII.MatchString(matchingString) { // Check if any non-printable chars occur
var utf16Data = data
// Try UTF16 encoding
if len(utf16Data) > 1 && utf16Data[0] == 0xFF && utf16Data[1] == 0xFE {
// Remove byte order mark
utf16Data = utf16Data[2:]
}
if len(utf16Data) > 0 && utf16Data[0] == 0 {
// Might be UTF16 shifted by one byte
utf16Data = utf16Data[1:]
}
matchingString, _ = decodeUTF16(utf16Data)
if notOnlyASCII.MatchString(matchingString) || len(matchingString) == 0 {
// Can't cleanly be rendered as UTF-16
matchingString = string(data)
}
// decode returns the plain text, after decoding it from UTF-16, if applicable.
func (s StringWithEncoding) decode() string {
plaintext := s.Plaintext()

if decoded, ok := attemptDecodeUTF16(plaintext); ok {
return decoded
}
return matchingString

return string(plaintext)
}

// attemptDecodeUTF16 tries to interpret b as UTF-16 text.
//
// A leading UTF-16 little-endian byte order mark (0xFF 0xFE) is removed
// first. After that, a single leading zero byte is dropped, because the data
// might be UTF-16 that is shifted by one byte. The remainder is handed to
// decodeUTF16.
//
// It returns the decoded string and true only if decoding succeeded and
// produced a non-empty string made up solely of printable ASCII characters,
// \r, \n and \t. In every other case it returns "" and false.
func attemptDecodeUTF16(b []byte) (string, bool) {
	if len(b) > 1 && b[0] == 0xFF && b[1] == 0xFE {
		// Remove byte order mark
		b = b[2:]
	}
	if len(b) > 0 && b[0] == 0 {
		// Might be UTF16 shifted by one byte
		b = b[1:]
	}
	decoded, err := decodeUTF16(b)
	if err != nil {
		// A failed decode must not be reported as a success, whatever
		// string came back alongside the error.
		return "", false
	}
	if decoded == "" || notOnlyASCII.MatchString(decoded) {
		// Can't cleanly be rendered as UTF-16
		return "", false
	}
	return decoded, true
}

// https://gist.github.com/bradleypeabody/185b1d7ed6c0c2ab6cec
Expand All @@ -84,31 +134,30 @@ func decodeUTF16(b []byte) (string, error) {
return ret.String(), nil
}

func (f MatchData) QuotedString() string {
matchingString := f.String()
matchingString = escaper.Replace(matchingString)
var replacedString bytes.Buffer
for _, char := range []byte(matchingString) {
func quote(s string) string {
s = escaper.Replace(s)
var quotedString bytes.Buffer
quotedString.WriteString(`"`)
for _, char := range []byte(s) {
if char < 0x20 || char > 0x7E { // non ASCII
replacedString.WriteString("\\x")
replacedString.WriteString(hex.EncodeToString([]byte{char}))
quotedString.WriteString("\\x")
quotedString.WriteString(hex.EncodeToString([]byte{char}))
} else {
replacedString.WriteByte(char)
quotedString.WriteByte(char)
}
}
matchingString = replacedString.String()
matchingString = fmt.Sprintf("\"%s\"", matchingString)
return matchingString
quotedString.WriteString(`"`)
return quotedString.String()
}

// MatchString describes a sequence of bytes in an object
// that was matched on by a signature.
type MatchString struct {
// Match contains the bytes that were matched.
Match MatchData `json:"data"`
Match StringWithEncoding `json:"data"`
// Context contains the bytes surrounding the matched bytes.
// This may be missing if no context is available.
Context *MatchData `json:"context,omitempty"`
Context *StringWithEncoding `json:"context,omitempty"`
// Offset contains the Match's offset within the Field
// where the data was matched.
Offset *uint64 `json:"offset,omitempty"`
Expand All @@ -120,26 +169,16 @@ type MatchString struct {
// needsQuoting matches any character other than '!' (0x21) and the range
// '#' through '~' (0x23-0x7E). Spaces, double quotes, control characters and
// everything outside ASCII therefore match.
var needsQuoting = regexp.MustCompile(`[^\x21\x23-\x7E]`)

func (f MatchString) String() string {
var matchString string
if needsQuoting.MatchString(f.Match.String()) && !f.Match.FullHex {
matchString += f.Match.QuotedString()
} else {
matchString += f.Match.String()
}
matchString := f.Match.String()
if f.Context != nil {
matchString += " in "
if needsQuoting.MatchString(f.Context.String()) && !f.Context.FullHex {
matchString += f.Context.QuotedString()
} else {
matchString += f.Context.String()
}
matchString += " in " + f.Context.String()
}
if f.Offset != nil {
// Only show the offset if this match does not encompass the full field and it's not explicitly hidden
var showOffset = !f.HideOffset
if f.Field != nil && *f.Offset == 0 {
if targetString, isString := f.Field.Value().(string); isString {
if targetString == string(f.Match.Data) {
if targetString == f.Match.Data {
showOffset = false
}
}
Expand Down
Loading