From 44a4d74e0b66bb16f1d5534a73b0ab5abd73a984 Mon Sep 17 00:00:00 2001
From: Janmejay Singh
Date: Tue, 28 Oct 2014 10:52:12 +0530
Subject: [PATCH 2/2] added support for field_type tokenized, which parses
 fields with multiple values separated by a given token

Eg. given string:
%--
1.2.3.4, 5.6.7.8, 9.10.11.12
--%
and given rule
%--
rule=ips:%my_ips:tokenized:, :ipv4%
--%
it would generate json:
%--
{ "my_ips": [ "1.2.3.4", "5.6.7.8", "9.10.11.12" ] }
--%
---
 src/liblognorm.h |   1 +
 src/parser.c     | 178 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/parser.h     |   9 +++
 src/ptree.c      |   2 +
 src/ptree.h      |   2 +
 src/samp.c       |  10 ++++
 6 files changed, 202 insertions(+)

diff --git a/src/liblognorm.h b/src/liblognorm.h
index 888e624..d409730 100644
--- a/src/liblognorm.h
+++ b/src/liblognorm.h
@@ -67,6 +67,7 @@
 /* error codes */
 #define LN_NOMEM -1
 #define LN_INVLDFDESCR -1
+#define LN_BADPARSERSTATE -500
 #define LN_WRONGPARSER -1000
 
 /**
diff --git a/src/parser.c b/src/parser.c
index 9d3879f..a3b85b0 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -32,6 +32,7 @@
 #include
 
 #include "liblognorm.h"
+#include "lognorm.h"
 #include "internal.h"
 #include "parser.h"
 
@@ -538,6 +539,183 @@ BEGINParser(CharSeparated)
 
 ENDParser
 
+/**
+ * Parse string tokenized by given char-sequence.
+ * The sequence may appear 0 or more times, but zero times means 1 token.
+ * NOTE: it's not 0 tokens, but 1 token.
+ *
+ * The token found is parsed according to the field-type provided after
+ * tokenizer char-seq. The parsed tokens are returned as a json array.
+ */
+struct tokenized_parser_data_s {
+	es_str_t *tok_str; /**< tokenizer char-sequence (unescaped) */
+	ln_ctx ctx; /**< context loaded with the rules used to parse a single token */
+};
+typedef struct tokenized_parser_data_s tokenized_parser_data_t;
+static void load_tokenized_parser_samples(ln_ctx, const char* const, const int, const char* const, const int);
+void* tokenized_parser_data_constructor(ln_fieldList_t *);
+
+BEGINParser(Tokenized)
+	assert(str != NULL);
+	assert(offs != NULL);
+	assert(parsed != NULL);
+
+	json_object *json_p = NULL;
+	json_object *matches = NULL;
+	json_object *remaining = NULL;
+	json_object *held_tail = NULL;
+	json_object *match = NULL;
+
+	tokenized_parser_data_t *pData = (tokenized_parser_data_t*) node->parser_data;
+
+	/* check the state before allocating, so this exit has nothing to release */
+	if (! pData) {
+		r = LN_BADPARSERSTATE;
+		goto fail;
+	}
+
+	json_p = json_object_new_object();
+	matches = json_object_new_array();
+	if (json_p == NULL || matches == NULL) {
+		if (json_p) json_object_put(json_p);
+		if (matches) json_object_put(matches);
+		r = LN_NOMEM;
+		goto fail;
+	}
+
+	int remaining_len = strLen - *offs;
+	const char *remaining_str = str + *offs;
+
+	while (remaining_len > 0) {
+		ln_normalize(pData->ctx, remaining_str, remaining_len, &json_p);
+
+		/* the previous tail is not needed once it has been normalized */
+		if (held_tail) {
+			json_object_put(held_tail);
+			held_tail = NULL;
+		}
+
+		if (json_object_object_get_ex(json_p, "default", &match)) {
+			json_object_array_add(matches, json_object_get(match));
+		} else {
+			if (json_object_array_length(matches) > 0) {
+				/* hand back the tokenizer that was skipped before this token */
+				remaining_len += es_strlen(pData->tok_str);
+				break;
+			} else {
+				json_object_put(json_p);
+				json_object_put(matches);
+				r = LN_WRONGPARSER;
+				goto fail;
+			}
+		}
+
+		if (json_object_object_get_ex(json_p, "tail", &remaining)) {
+			remaining_len = json_object_get_string_len(remaining);
+			if (remaining_len > 0) {
+				remaining_str = json_object_get_string(remaining);
+				if (es_strbufcmp(pData->tok_str, (const unsigned char *)remaining_str, es_strlen(pData->tok_str))) {
+					break;
+				} else {
+					/* remaining_str points into the tail, keep it alive */
+					held_tail = json_object_get(remaining);
+					remaining_str += es_strlen(pData->tok_str);
+					remaining_len -= es_strlen(pData->tok_str);
+				}
+			}
+		} else {
+			remaining_len = 0;
+			break;
+		}
+
+		json_object_object_del(json_p, "default");
+		json_object_object_del(json_p, "tail");
+	}
+	if (held_tail) json_object_put(held_tail);
+	json_object_put(json_p);
+
+	/* success, persist */
+	*parsed = (strLen - *offs) - remaining_len;
+	*value = matches;
+
+ENDParser
+
+/**
+ * Free a tokenized-parser data block and set the caller's pointer to NULL.
+ */
+void tokenized_parser_data_destructor(void** dataPtr) {
+	tokenized_parser_data_t *data = (tokenized_parser_data_t*) *dataPtr;
+	if (data == NULL) return;
+	if (data->tok_str) es_deleteStr(data->tok_str);
+	if (data->ctx) ln_exitCtx(data->ctx);
+	free(data);
+	*dataPtr = NULL;
+}
+
+/**
+ * Load the rule "rule=:%default:<field_type>%<suffix>" into ctx.
+ * Failures are not reported to the caller.
+ */
+static void load_tokenized_parser_samples(ln_ctx ctx, const char* const field_type, const int field_type_len, const char* const suffix, const int length) {
+	static const char* const RULE_PREFIX = "rule=:%default:";//TODO: extract nice constants
+	static const int RULE_PREFIX_LEN = 15;
+
+	/* initialized up front: the cleanup label is reachable before it is assigned */
+	char *sample_str = NULL;
+	es_str_t *field_decl = es_newStrFromCStr(RULE_PREFIX, RULE_PREFIX_LEN);
+	if (! field_decl) goto free;
+
+	if (es_addBuf(&field_decl, field_type, field_type_len) || es_addBuf(&field_decl, "%", 1) || es_addBuf(&field_decl, suffix, length)) {
+		ln_dbgprintf(ctx, "couldn't prepare field for tokenized field-picking: '%s'", field_type);
+		goto free;
+	}
+	sample_str = es_str2cstr(field_decl, NULL);
+	if (! sample_str) {
+		ln_dbgprintf(ctx, "couldn't prepare sample-string for: '%s'", field_type);
+		goto free;
+	}
+	ln_loadSample(ctx, sample_str);
+free:
+	free(sample_str);
+	if (field_decl) es_deleteStr(field_decl);
+}
+
+/**
+ * Build the parser data from the raw extra data "<tokenizer>:<field_type>".
+ * Returns NULL when that data is missing, has no ':' separator, or when an
+ * allocation or context initialization fails.
+ */
+void* tokenized_parser_data_constructor(ln_fieldList_t *node) {
+	es_str_t *raw_data = node->raw_data;
+	static const char* const ARG_SEP = ":";
+	static const char* const TAIL_FIELD = "%tail:rest%";
+	static const int TAIL_FIELD_LEN = 11;
+	tokenized_parser_data_t *pData = NULL;
+
+	if (! raw_data) return NULL;
+	char *args = es_str2cstr(raw_data, NULL);
+	if (! args) return NULL;
+	char *field_type = strstr(args, ARG_SEP);
+	if (! field_type) goto fail; /* no field-type given after the tokenizer */
+
+	/* zeroed, so the destructor can tell which members were set */
+	pData = calloc(1, sizeof(*pData));
+	if (! pData) goto fail;
+	if (! (pData->tok_str = es_newStrFromCStr(args, field_type - args))) goto fail;
+	es_unescapeStr(pData->tok_str);
+	if (! (pData->ctx = ln_initCtx())) goto fail;
+	field_type++;//skip :
+	const int field_type_len = strlen(field_type);
+	load_tokenized_parser_samples(pData->ctx, field_type, field_type_len, TAIL_FIELD, TAIL_FIELD_LEN);
+	load_tokenized_parser_samples(pData->ctx, field_type, field_type_len, "", 0);
+	goto free;
+fail:
+	if (pData) tokenized_parser_data_destructor((void**) &pData);
+	pData = NULL;
+free:
+	if (args) free(args);
+	return pData;
+}
 
 /**
  * Just get everything till the end of string.
diff --git a/src/parser.h b/src/parser.h
index cbec884..35cbd34 100644
--- a/src/parser.h
+++ b/src/parser.h
@@ -103,4 +103,13 @@ int ln_parseTime24hr(const char *str, size_t strlen, size_t *offs, const ln_fiel
  */
 int ln_parseIPv4(const char *str, size_t strlen, size_t *offs, const ln_fieldList_t *node, size_t *parsed, struct json_object **value);
 
+/**
+ * Get all tokens separated by tokenizer-string as array.
+ */
+int ln_parseTokenized(const char *str, size_t strlen, size_t *offs, const ln_fieldList_t *node, size_t *parsed, struct json_object **value);
+
+void* tokenized_parser_data_constructor(ln_fieldList_t *node);
+void tokenized_parser_data_destructor(void** dataPtr);
+
+
 #endif /* #ifndef LIBLOGNORM_PARSER_H_INCLUDED */
diff --git a/src/ptree.c b/src/ptree.c
index 4a335eb..a72aaa3 100644
--- a/src/ptree.c
+++ b/src/ptree.c
@@ -78,6 +78,8 @@ ln_deletePTreeNode(ln_fieldList_t *node)
 		es_deleteStr(node->data);
 	if(node->raw_data != NULL)
 		es_deleteStr(node->raw_data);
+	if(node->parser_data != NULL)
+		node->parser_data_destructor(&(node->parser_data));
 	free(node);
 }
 
diff --git a/src/ptree.h b/src/ptree.h
index 529b806..66b7bdd 100644
--- a/src/ptree.h
+++ b/src/ptree.h
@@ -51,6 +51,8 @@ struct ln_fieldList_s {
 	es_str_t *name; /**< field name */
 	es_str_t *data; /**< extra data to be passed to parser */
 	es_str_t *raw_data; /**< extra untouched (unescaping is not done) data availble to be used by parser */
+	void *parser_data; /**< opaque data that the field-parser understands */
+	void (*parser_data_destructor)(void **); /**< destroy opaque data that field-parser understands */
 	int (*parser)(const char*, size_t, size_t*, const ln_fieldList_t *, size_t*,
 		struct json_object **); /**< parser to use */
 	ln_ptree *subtree; /**< subtree to follow if parser succeeded */
diff --git a/src/samp.c b/src/samp.c
index d368cb1..87df4a4 100644
--- a/src/samp.c
+++ b/src/samp.c
@@ -114,6 +114,7 @@ parseFieldDescr(ln_ctx ctx, struct ln_ptree **subtree, es_str_t *rule,
 	char *cstr; /* for debug mode strings */
 	unsigned char *buf;
 	es_size_t lenBuf;
+	void* (*constructor_fn)(ln_fieldList_t *) = NULL;
 
 	assert(subtree != NULL);
 
@@ -125,6 +126,9 @@ parseFieldDescr(ln_ctx ctx, struct ln_ptree **subtree, es_str_t *rule,
 	node->subtree = NULL;
 	node->next = NULL;
 	node->data = NULL;
+	node->raw_data = NULL;
+	node->parser_data = NULL;
+	node->parser_data_destructor = NULL;
 	CHKN(node->name = es_newStr(16));
 
 	while(i < lenBuf && buf[i] != ':') {
@@ -191,6 +195,10 @@ parseFieldDescr(ln_ctx ctx, struct ln_ptree **subtree, es_str_t *rule,
 	} else if(!es_strconstcmp(*str, "char-sep")) {
 		// TODO: check extra data!!!! (very important)
 		node->parser = ln_parseCharSeparated;
+	} else if(!es_strconstcmp(*str, "tokenized")) {
+		node->parser = ln_parseTokenized;
+		constructor_fn = tokenized_parser_data_constructor;
+		node->parser_data_destructor = tokenized_parser_data_destructor;
 	} else {
 		cstr = es_str2cstr(*str, NULL);
 		ln_dbgprintf(ctx, "ERROR: invalid field type '%s'", cstr);
@@ -220,6 +228,8 @@ parseFieldDescr(ln_ctx ctx, struct ln_ptree **subtree, es_str_t *rule,
 		}
 	}
 
+	if (constructor_fn) node->parser_data = constructor_fn(node);
+
 	/* finished */
 	CHKR(ln_addFDescrToPTree(subtree, node));
 
-- 
2.0.4