diff --git a/lib/Grok.php b/lib/Grok.php index 049fc7a..f78b7b3 100644 --- a/lib/Grok.php +++ b/lib/Grok.php @@ -11,76 +11,77 @@ class Grok { protected $pattern_regex = null; protected $matchCount = 0; + protected $cache = []; protected $patterns = array( - 'USERNAME' => '[a-zA-Z0-9_-]+', - 'USER' => '%{USERNAME}', - 'INT' => '(?:[+-]?(?:[0-9]+))', - 'BASE10NUM' => '(?[+-]?(?:(?:[0-9]+(?:\.[0-9]+)?)|(?:\.[0-9]+)))', - 'NUMBER' => '(?:%{BASE10NUM})', - 'BASE16NUM' => '(? '\b(? '\b(?:[1-9][0-9]*)\b', - 'NONNEGINT' => '\b(?:[0-9]+)\b', - 'WORD' => '\b\w+\b', - 'NOTSPACE' => '\S+', - 'SPACE' => '\s*', - 'DATA' => '.*?', - 'GREEDYDATA' => '.*', + 'USERNAME' => '[a-zA-Z0-9_-]+', + 'USER' => '%{USERNAME}', + 'INT' => '(?:[+-]?(?:[0-9]+))', + 'BASE10NUM' => '(?[+-]?(?:(?:[0-9]+(?:\.[0-9]+)?)|(?:\.[0-9]+)))', + 'NUMBER' => '(?:%{BASE10NUM})', + 'BASE16NUM' => '(? '\b(? '\b(?:[1-9][0-9]*)\b', + 'NONNEGINT' => '\b(?:[0-9]+)\b', + 'WORD' => '\b\w+\b', + 'NOTSPACE' => '\S+', + 'SPACE' => '\s*', + 'DATA' => '.*?', + 'GREEDYDATA' => '.*', //'QUOTEDSTRING' => '(?:(? '(?:(? '[A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12}', + 'QUOTEDSTRING' => '(?:(? '[A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12}', # Networking - 'MAC' => '(?:%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC})', - 'CISCOMAC' => '(?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4})', - 'WINDOWSMAC' => '(?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2})', - 'COMMONMAC' => '(?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2})', - 'IP' => '(? '\b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\.?|\b)', - 'HOST' => '%{HOSTNAME}', - 'IPORHOST' => '(?:%{HOSTNAME}|%{IP})', - 'HOSTPORT' => '(?:%{IPORHOST=~/\./}:%{POSINT})', + 'MAC' => '(?:%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC})', + 'CISCOMAC' => '(?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4})', + 'WINDOWSMAC' => '(?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2})', + 'COMMONMAC' => '(?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2})', + 'IP' => '(? '\b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\.?|\b)', + 'HOST' => '%{HOSTNAME}', + 'IPORHOST' => '(?:%{HOSTNAME}|%{IP})', + 'HOSTPORT' => '(?:%{IPORHOST=~/\./}:%{POSINT})', # paths - 'PATH' => '(?:%{UNIXPATH}|%{WINPATH})', - 'UNIXPATH' => '(?:/(?:[\w_%!$@:.,-]+|\\.)*)+', + 'PATH' => '(?:%{UNIXPATH}|%{WINPATH})', + 'UNIXPATH' => '(?:/(?:[\w_%!$@:.,-]+|\\.)*)+', #'UNIXPATH' => '(? '(?:/dev/pts/%{NONNEGINT})', - 'BSDTTY' => '(?:/dev/tty[pq][a-z0-9])', - 'TTY' => '(?:%{BSDTTY}|%{LINUXTTY})', - 'WINPATH' => '(?:[A-Za-z]+:|\\)(?:\\[^\\?*]*)+', - 'URIPROTO' => '[A-Za-z]+(\+[A-Za-z+]+)?', - 'URIHOST' => '%{IPORHOST}(?::%{POSINT:port})?', + 'LINUXTTY' => '(?:/dev/pts/%{NONNEGINT})', + 'BSDTTY' => '(?:/dev/tty[pq][a-z0-9])', + 'TTY' => '(?:%{BSDTTY}|%{LINUXTTY})', + 'WINPATH' => '(?:[A-Za-z]+:|\\)(?:\\[^\\?*]*)+', + 'URIPROTO' => '[A-Za-z]+(\+[A-Za-z+]+)?', + 'URIHOST' => '%{IPORHOST}(?::%{POSINT:port})?', # uripath comes loosely from RFC1738, but mostly from what Firefox # doesn't turn into %XX - 'URIPATH' => '(?:/[A-Za-z0-9$.+!*\'(){},~:;=#%_-]*)+', + 'URIPATH' => '(?:/[A-Za-z0-9$.+!*\'(){},~:;=#%_-]*)+', #'URIPARAM' => '\?(?:[A-Za-z0-9]+(?:=(?:[^&]*))?(?:&(?:[A-Za-z0-9]+(?:=(?:[^&]*))?)?)*)?' - 'URIPARAM' => '\?[A-Za-z0-9$.+!*\'|(){},~#%&/=:;_-]*', - 'URIPATHPARAM' => '%{URIPATH}(?:%{URIPARAM})?', - 'URI' => '%{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})?', + 'URIPARAM' => '\?[A-Za-z0-9$.+!*\'|(){},~#%&/=:;_-]*', + 'URIPATHPARAM' => '%{URIPATH}(?:%{URIPARAM})?', + 'URI' => '%{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})?', # Months: January, Feb, 3, 03, 12, December - 'MONTH' => '\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b', - 'MONTHNUM' => '(?:0?[1-9]|1[0-2])', - 'MONTHDAY' => '(?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9])', + 'MONTH' => '\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b', + 'MONTHNUM' => '(?:0?[1-9]|1[0-2])', + 'MONTHDAY' => '(?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9])', # Days: Monday, Tue, Thu, etc... - 'DAY' => '(?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?)', + 'DAY' => '(?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?)', # Years? - 'YEAR' => '[0-9]+', + 'YEAR' => '[0-9]+', # Time: HH:MM:SS #'TIME' => '\d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?', # I'm still on the fence about using grok to perform the time match, # since it's probably slower. #'TIME' => '%{POSINT<24}:%{POSINT<60}(?::%{POSINT<60}(?:\.%{POSINT})?)?', - 'HOUR' => '(?:2[0123]|[01][0-9])', - 'MINUTE' => '(?:[0-5][0-9])', + 'HOUR' => '(?:2[0123]|[01][0-9])', + 'MINUTE' => '(?:[0-5][0-9])', # '60' is a leap second in most time standards and thus is valid. - 'SECOND' => '(?:(?:[0-5][0-9]|60)(?:[.,][0-9]+)?)', - 'TIME' => '(?!<[0-9])%{HOUR}:%{MINUTE}(?::%{SECOND})(?![0-9])', + 'SECOND' => '(?:(?:[0-5][0-9]|60)(?:[.,][0-9]+)?)', + 'TIME' => '(?!<[0-9])%{HOUR}:%{MINUTE}(?::%{SECOND})(?![0-9])', # datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it) 'DATE_US' => '%{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR}', 'DATE_EU' => '%{YEAR}[/-]%{MONTHNUM}[/-]%{MONTHDAY}', @@ -102,14 +103,14 @@ class Grok 'HTTPDATE' => '%{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{INT}', # Shortcuts - 'QS' => '%{QUOTEDSTRING}', + 'QS' => '%{QUOTEDSTRING}', # Log formats 'SYSLOGBASE' => '%{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}:', 'COMBINEDAPACHELOG' => '%{IPORHOST:clientip} %{USER:ident} %{USER:auth} \[%{HTTPDATE:timestamp}\] "(?:%{WORD:verb} %{URIPATHPARAM:request}(?: HTTP/%{NUMBER:httpversion})?|-)" %{NUMBER:response} (?:%{NUMBER:bytes}|-) "(?:%{URI:referrer}|-)" %{QS:agent}', # Log Levels - 'LOGLEVEL' => '([D|d]ebug|DEBUG|[N|n]otice|NOTICE|[I|i]nfo|INFO|[W|w]arn?(?:ing)?|WARN?(?:ING)?|[E|e]rr?(?:or)?|ERR?(?:OR)?|[C|c]rit?(?:ical)?|CRIT?(?:ICAL)?|[F|f]atal|FATAL|[S|s]evere|SEVERE)', + 'LOGLEVEL' => '([D|d]ebug|DEBUG|[N|n]otice|NOTICE|[I|i]nfo|INFO|[W|w]arn?(?:ing)?|WARN?(?:ING)?|[E|e]rr?(?:or)?|ERR?(?:OR)?|[C|c]rit?(?:ical)?|CRIT?(?:ICAL)?|[F|f]atal|FATAL|[S|s]evere|SEVERE)', ); protected $fieldMap = array(); @@ -124,27 +125,27 @@ public function __construct($patterns = null) // Pattern to match %{FOO:bar} or %{FOO<=3} // currently no predicate supported $this->pattern_regex = "/(?!<\\\\)%\{" - ."(?" - . "(?[A-z0-9]+)" - . "(?::(?[A-z0-9_:]+))?" - .")" - ."(?:=" - . "(?" - . "(?:" - . "(?P\\{(?:(?>[^{}]+|(?>\\\\[{}])+)|(?P>curly2))*\\})+" - . "|" - . "(?:[^{}]+|\\\\[{}])+" - . ")+" - . ")" - .")?" - ."\\s*(?" - . "(?:" - . "(?P\\{(?:(?>[^{}]+|(?>\\\\[{}])+)|(?P>curly))*\\})" - . "|" - . "(?:[^{}]+|\\\\[{}])+" - . ")+" - .")?" - ."\\}/"; + . "(?" + . "(?[A-z0-9]+)" + . "(?::(?[A-z0-9_:]+))?" + . ")" + . "(?:=" + . "(?" + . "(?:" + . "(?P\\{(?:(?>[^{}]+|(?>\\\\[{}])+)|(?P>curly2))*\\})+" + . "|" + . "(?:[^{}]+|\\\\[{}])+" + . ")+" + . ")" + . ")?" + . "\\s*(?" + . "(?:" + . "(?P\\{(?:(?>[^{}]+|(?>\\\\[{}])+)|(?P>curly))*\\})" + . "|" + . "(?:[^{}]+|\\\\[{}])+" + . ")+" + . ")?" + . "\\}/"; if (!is_null($patterns)) { $this->patterns = $patterns; @@ -154,7 +155,7 @@ public function __construct($patterns = null) /** * Add one additional pattern * - * @param string $name Name + * @param string $name Name * @param string $pattern Pattern */ public function addPattern($name, $pattern) @@ -173,41 +174,6 @@ public function addPatterns(array $patterns) $this->patterns = array_merge($this->patterns, $patterns); } - /** - * Reset internal data - */ - protected function reset() - { - $this->matchCount = 0; - $this->fieldMap = array(); - } - - /** - * Resolve and merge grok pattern - * - * @param string $pattern Pattern - * - * @return string Merged pattern - */ - public function resolve($pattern) - { - //var_dump('resolve pattern:', $pattern); - if (preg_match_all($this->pattern_regex, $pattern, $matches, PREG_SET_ORDER)) { - //var_dump($matches); - foreach ($matches as $match) { - $subPattern = $this->resolve($this->patterns[$match['pattern']]); - if (isset($match['subname']) && !empty($match['subname'])) { - //$this->fieldMap[$match['subname']] = ++$this->matchCount; //$subPattern; - $this->fieldMap[++$this->matchCount] = $match['subname']; - $subPattern = '(?<'.$match['subname'].'>'.$subPattern.')'; - //var_dump($subPattern); - } - $pattern = str_replace($match[0], $subPattern, $pattern, $replaced); - } - } - return $pattern; - } - /** * Parse given content with pattern. * Returns matches as named array. @@ -222,8 +188,13 @@ public function parse($pattern, $content, $options = '') { $results = array(); $this->reset(); - $pattern = "/".str_replace('/', '\/', $this->resolve($pattern))."/".$options; - //var_dump('resolved pattern:', $pattern); + + $result = $this->resolve($pattern); + $resolvedPattern = $result['pattern']; + $this->matchCount += $result['matchCount']; + $this->fieldMap = array_merge($this->fieldMap, $result['fieldMap']); + + $pattern = "/" . str_replace('/', '\/', $resolvedPattern) . "/" . $options; if (preg_match_all($pattern, $content, $matches, PREG_SET_ORDER)) { if (count($matches) > 0 && isset($matches[0]) && is_array($matches[0])) { foreach ($this->fieldMap as $pos => $key) { @@ -233,6 +204,53 @@ public function parse($pattern, $content, $options = '') } } } + return !empty($results) ? $results : false; } + + /** + * Resolve and merge grok pattern + * + * @param string $pattern Pattern + * + * @return array of pattern, fieldMap, matchCount + */ + public function resolve($pattern) + { + $initialPatternHash = md5($pattern); + if (!isset($this->cache[$initialPatternHash])) { + if (preg_match_all($this->pattern_regex, $pattern, $matches, PREG_SET_ORDER)) { + foreach ($matches as $match) { + + $result = $this->resolve($this->patterns[$match['pattern']]); + $subPattern = $result['pattern']; + $this->matchCount += $result['matchCount']; + $this->fieldMap = array_merge($this->fieldMap, $result['fieldMap']); + + if (isset($match['subname']) && !empty($match['subname'])) { + + $this->fieldMap[++$this->matchCount] = $match['subname']; + $subPattern = '(?<' . $match['subname'] . '>' . $subPattern . ')'; + + } + $pattern = str_replace($match[0], $subPattern, $pattern, $replaced); + } + } + $this->cache[$initialPatternHash] = []; + $this->cache[$initialPatternHash]['pattern'] = $pattern; + $this->cache[$initialPatternHash]['fieldMap'] = $this->fieldMap; + $this->cache[$initialPatternHash]['matchCount'] = $this->matchCount; + } + + return $this->cache[$initialPatternHash]; + } + + /** + * Reset internal data + */ + protected function reset() + { + $this->matchCount = 0; + $this->fieldMap = array(); + } }