diff --git a/thorlog/v3/matchstrings.go b/thorlog/v3/matchstrings.go index 5b6553a..a3c728b 100644 --- a/thorlog/v3/matchstrings.go +++ b/thorlog/v3/matchstrings.go @@ -3,7 +3,6 @@ package thorlog import ( "bytes" "encoding/hex" - "encoding/json" "fmt" "regexp" "strings" @@ -13,57 +12,108 @@ import ( "github.com/NextronSystems/jsonlog" ) -type MatchData struct { - Data []byte - FullHex bool +type StringWithEncoding struct { + Data string `json:"data"` + Encoding StringEncoding `json:"encoding"` } -func (f MatchData) MarshalJSON() ([]byte, error) { - matchingString := f.String() - return InvalidUnicodeString(matchingString).MarshalJSON() +type StringEncoding string + +const ( + Plain StringEncoding = "plain" + Hex StringEncoding = "hex" +) + +// Encode encodes the given data into a StringWithEncoding, +// choosing the most appropriate encoding based on its content. +func Encode(s []byte) StringWithEncoding { + if utf8.Valid(s) { + return StringWithEncoding{ + Data: string(s), + Encoding: Plain, + } + } else { + return StringWithEncoding{ + Data: hex.EncodeToString(s), + Encoding: Hex, + } + } } -func (f *MatchData) UnmarshalJSON(data []byte) error { - var matchingString string - err := json.Unmarshal(data, &matchingString) - if err != nil { - return err +// EncodeString encodes the given data into a StringWithEncoding, +// choosing the most appropriate encoding based on its content. +func EncodeString(s string) StringWithEncoding { + if utf8.ValidString(s) { + return StringWithEncoding{ + Data: s, + Encoding: Plain, + } + } else { + return StringWithEncoding{ + Data: hex.EncodeToString([]byte(s)), + Encoding: Hex, + } } - f.Data = []byte(matchingString) - return nil } -func (f MatchData) JSONSchemaAlias() any { - return "" +// Plaintext returns the raw byte sequence represented by the StringWithEncoding. +func (s StringWithEncoding) Plaintext() []byte { + switch s.Encoding { + case Plain: + return []byte(s.Data) + case Hex: + data, err := hex.DecodeString(s.Data) + if err != nil { + return []byte("") + } + return data + default: + return []byte(fmt.Sprintf("", s.Encoding)) + } } var notOnlyASCII = regexp.MustCompile(`[^\x20-\x7E\x0d\x0a\x09]+`) // printable chars + \r,\n,\t -func (f MatchData) String() string { - if f.FullHex { - return hex.EncodeToString(f.Data) +// String returns a human-readable representation of the encoded string. +// The representation is guaranteed to be valid UTF-8. +func (s StringWithEncoding) String() string { + data := s.decode() + if needsQuoting.MatchString(data) { + return quote(data) } - data := f.Data - matchingString := string(data) // Try to directly convert + return data +} - if !f.FullHex && notOnlyASCII.MatchString(matchingString) { // Check if any non-printable chars occur - var utf16Data = data - // Try UTF16 encoding - if len(utf16Data) > 1 && utf16Data[0] == 0xFF && utf16Data[1] == 0xFE { - // Remove byte order mark - utf16Data = utf16Data[2:] - } - if len(utf16Data) > 0 && utf16Data[0] == 0 { - // Might be UTF16 shifted by one byte - utf16Data = utf16Data[1:] - } - matchingString, _ = decodeUTF16(utf16Data) - if notOnlyASCII.MatchString(matchingString) || len(matchingString) == 0 { - // Can't cleanly be rendered as UTF-16 - matchingString = string(data) - } +// decode returns the plain text, after decoding it from UTF-16, if applicable. +func (s StringWithEncoding) decode() string { + plaintext := s.Plaintext() + + if decoded, ok := attemptDecodeUTF16(plaintext); ok { + return decoded } - return matchingString + + return string(plaintext) +} + +// attemptDecodeUTF16 tries to decode the given byte slice as UTF-16 and checks +// whether the decoded string contains non-ASCII characters. +// It returns the decoded string and a boolean indicating whether the decoding was successful. +func attemptDecodeUTF16(b []byte) (string, bool) { + // Try UTF16 encoding + if len(b) > 1 && b[0] == 0xFF && b[1] == 0xFE { + // Remove byte order mark + b = b[2:] + } + if len(b) > 0 && b[0] == 0 { + // Might be UTF16 shifted by one byte + b = b[1:] + } + decodedUtf16, _ := decodeUTF16(b) + if !notOnlyASCII.MatchString(decodedUtf16) && len(decodedUtf16) > 0 { + // Can cleanly be rendered as UTF-16 + return decodedUtf16, true + } + return "", false } // https://gist.github.com/bradleypeabody/185b1d7ed6c0c2ab6cec @@ -84,31 +134,30 @@ func decodeUTF16(b []byte) (string, error) { return ret.String(), nil } -func (f MatchData) QuotedString() string { - matchingString := f.String() - matchingString = escaper.Replace(matchingString) - var replacedString bytes.Buffer - for _, char := range []byte(matchingString) { +func quote(s string) string { + s = escaper.Replace(s) + var quotedString bytes.Buffer + quotedString.WriteString(`"`) + for _, char := range []byte(s) { if char < 0x20 || char > 0x7E { // non ASCII - replacedString.WriteString("\\x") - replacedString.WriteString(hex.EncodeToString([]byte{char})) + quotedString.WriteString("\\x") + quotedString.WriteString(hex.EncodeToString([]byte{char})) } else { - replacedString.WriteByte(char) + quotedString.WriteByte(char) } } - matchingString = replacedString.String() - matchingString = fmt.Sprintf("\"%s\"", matchingString) - return matchingString + quotedString.WriteString(`"`) + return quotedString.String() } // MatchString describes a sequence of bytes in an object // that was matched on by a signature. type MatchString struct { // Match contains the bytes that were matched. - Match MatchData `json:"data"` + Match StringWithEncoding `json:"data"` // Context contains the bytes surrounding the matched bytes. // This may be missing if no context is available. - Context *MatchData `json:"context,omitempty"` + Context *StringWithEncoding `json:"context,omitempty"` // Offset contains the Match's offset within the Field // where the data was matched. Offset *uint64 `json:"offset,omitempty"` @@ -120,26 +169,16 @@ type MatchString struct { var needsQuoting = regexp.MustCompile(`[^\x21\x23-\x7E]`) func (f MatchString) String() string { - var matchString string - if needsQuoting.MatchString(f.Match.String()) && !f.Match.FullHex { - matchString += f.Match.QuotedString() - } else { - matchString += f.Match.String() - } + matchString := f.Match.String() if f.Context != nil { - matchString += " in " - if needsQuoting.MatchString(f.Context.String()) && !f.Context.FullHex { - matchString += f.Context.QuotedString() - } else { - matchString += f.Context.String() - } + matchString += " in " + f.Context.String() } if f.Offset != nil { // Only show the offset if this match does not encompass the full field and it's not explicitly hidden var showOffset = !f.HideOffset if f.Field != nil && *f.Offset == 0 { if targetString, isString := f.Field.Value().(string); isString { - if targetString == string(f.Match.Data) { + if targetString == f.Match.Data { showOffset = false } } diff --git a/thorlog/v3/matchstrings_test.go b/thorlog/v3/matchstrings_test.go index 7d68cb9..f3952a1 100644 --- a/thorlog/v3/matchstrings_test.go +++ b/thorlog/v3/matchstrings_test.go @@ -6,6 +6,125 @@ import ( "github.com/NextronSystems/jsonlog" ) +func TestEncodeString(t *testing.T) { + for _, tt := range []struct { + input string + expected StringWithEncoding + }{ + { + input: "hello", + expected: StringWithEncoding{ + Data: "hello", + Encoding: Plain, + }, + }, + { + input: string([]byte{0xff, 0xfe, 0xfd}), + expected: StringWithEncoding{ + Data: "fffefd", + Encoding: Hex, + }, + }, + } { + t.Run(tt.input, func(t *testing.T) { + if got := EncodeString(tt.input); got != tt.expected { + t.Fatalf("expected %v, got %v", tt.expected, got) + } + }) + } +} + +func TestStringWithEncoding_Plaintext(t *testing.T) { + for _, tt := range []struct { + input StringWithEncoding + expected []byte + }{ + { + input: StringWithEncoding{ + Data: "hello", + Encoding: Plain, + }, + expected: []byte("hello"), + }, + { + input: StringWithEncoding{ + Data: "68656c6c6f", + Encoding: Hex, + }, + expected: []byte("hello"), + }, + } { + t.Run(tt.input.Data, func(t *testing.T) { + if got := tt.input.Plaintext(); string(got) != string(tt.expected) { + t.Fatalf("expected %v, got %v", tt.expected, got) + } + }) + } +} +func TestStringWithEncoding_String(t *testing.T) { + for _, tt := range []struct { + input StringWithEncoding + expected string + }{ + { + input: StringWithEncoding{ + Data: "hello", + Encoding: Plain, + }, + expected: "hello", + }, + + { + input: StringWithEncoding{ + Data: "666f6f", + Encoding: Hex, + }, + expected: `foo`, + }, + { + input: StringWithEncoding{ + Data: "010203", + Encoding: Hex, + }, + expected: `"\x01\x02\x03"`, + }, + { + input: StringWithEncoding{ + Data: "fo\x00o", + Encoding: Plain, + }, + expected: `"fo\x00o"`, + }, + { + input: StringWithEncoding{ + Data: `"quoted" data`, + Encoding: Plain, + }, + expected: `"\"quoted\" data"`, + }, + { + input: StringWithEncoding{ + Data: `8081fe`, // invalid UTF-8 byte sequence + Encoding: Hex, + }, + expected: `"\x80\x81\xfe"`, + }, + { + input: StringWithEncoding{ + Data: "a\x00b\x00c\x00", // UTF-16LE encoded "abc" + Encoding: Plain, + }, + expected: `abc`, + }, + } { + t.Run(tt.input.Data, func(t *testing.T) { + if got := tt.input.String(); got != tt.expected { + t.Fatalf("expected %v, got %v", tt.expected, got) + } + }) + } +} + func TestMatchString_String(t *testing.T) { var testObject = new(struct { jsonlog.ObjectHeader @@ -19,57 +138,30 @@ func TestMatchString_String(t *testing.T) { }{ { input: MatchString{ - Match: MatchData{ - Data: []byte("foo"), - }, + Match: EncodeString("foo"), }, expected: "foo", }, { input: MatchString{ - Match: MatchData{ - Data: []byte("foo"), - }, - Context: &MatchData{ - Data: []byte("a wild foo appears"), - }, - }, - expected: `foo in "a wild foo appears"`, - }, - { - input: MatchString{ - Match: MatchData{ - Data: []byte("foo"), - }, - Context: &MatchData{ - Data: []byte("a wild foo appears"), + Match: EncodeString("foo"), + Context: &StringWithEncoding{ + Data: "a wild foo appears", + Encoding: Plain, }, }, expected: `foo in "a wild foo appears"`, }, { input: MatchString{ - Match: MatchData{ - Data: []byte("foo"), - FullHex: true, - }, - }, - expected: `666f6f`, - }, - { - input: MatchString{ - Match: MatchData{ - Data: []byte("foo"), - }, + Match: EncodeString("foo"), Offset: asOptional(0x10), }, expected: `foo at 0x10`, }, { input: MatchString{ - Match: MatchData{ - Data: []byte("foo"), - }, + Match: EncodeString("foo"), Offset: asOptional(0x10), HideOffset: true, }, @@ -77,42 +169,12 @@ func TestMatchString_String(t *testing.T) { }, { input: MatchString{ - Match: MatchData{ - Data: []byte("foo"), - }, + Match: EncodeString("foo"), Offset: asOptional(0x10), Field: ref, }, expected: `foo at 0x10 in MY_FIELD`, }, - { - input: MatchString{ - Match: MatchData{ - Data: []byte("bar"), - }, - Offset: asOptional(0x10), - Field: ref, - }, - expected: `bar at 0x10 in MY_FIELD`, - }, - { - input: MatchString{ - Match: MatchData{ - Data: []byte("bar"), - }, - Offset: asOptional(0), - Field: ref, - }, - expected: `bar in MY_FIELD`, - }, - { - input: MatchString{ - Match: MatchData{ - Data: []byte("fo\x00o"), - }, - }, - expected: `"fo\x00o"`, - }, } { t.Run(tt.expected, func(t *testing.T) { if got := tt.input.String(); got != tt.expected { diff --git a/thorlog/v3/sparsedata.go b/thorlog/v3/sparsedata.go index de863ec..4088cc2 100644 --- a/thorlog/v3/sparsedata.go +++ b/thorlog/v3/sparsedata.go @@ -1,8 +1,6 @@ package thorlog import ( - "bytes" - "encoding/hex" "fmt" "strings" @@ -40,8 +38,8 @@ func (s SparseData) String() string { _, _ = w.WriteString(truncateSequence) } for _, element := range s.Elements { - _, _ = nonAsciiEscaper.WriteString(&w, string(element.Data)) - if element.Offset+uint64(len(element.Data)) < uint64(s.Length) { + _, _ = nonAsciiEscaper.WriteString(&w, string(element.Data.Plaintext())) + if element.Offset+uint64(len(element.Data.Plaintext())) < uint64(s.Length) { _, _ = w.WriteString(truncateSequence) } } @@ -49,26 +47,8 @@ func (s SparseData) String() string { } type SparseDataElement struct { - Offset uint64 `json:"offset"` - Data InvalidUnicodeString `json:"data"` -} - -type InvalidUnicodeString string - -func (s InvalidUnicodeString) MarshalJSON() ([]byte, error) { - matchingString := escaper.Replace(string(s)) - var replacedString bytes.Buffer - for _, char := range []byte(matchingString) { - if char < 0x20 || char > 0x7E { // non ASCII - replacedString.WriteString("\\u00") - replacedString.WriteString(hex.EncodeToString([]byte{char})) - } else { - replacedString.WriteByte(char) - } - } - matchingString = replacedString.String() - matchingString = fmt.Sprintf("\"%s\"", matchingString) - return []byte(matchingString), nil + Offset uint64 `json:"offset"` + Data StringWithEncoding `json:"data"` } var escaper = strings.NewReplacer("\\", "\\\\", "\"", "\\\"")