From 44a4d74e0b66bb16f1d5534a73b0ab5abd73a984 Mon Sep 17 00:00:00 2001
From: Janmejay Singh <singh.janmejay@gmail.com>
Date: Tue, 28 Oct 2014 10:52:12 +0530
Subject: [PATCH 2/2] added support for field_type tokenized, which parses
 fields with multiple values separated by a given token

E.g., given the string:
%--
1.2.3.4, 5.6.7.8, 9.10.11.12
--%

and given the rule
%--
rule=ips:%my_ips:tokenized:, :ipv4%
--%

it would generate the JSON:
%--
{ "my_ips": [ "1.2.3.4", "5.6.7.8", "9.10.11.12" ] }
--%
---
 src/liblognorm.h |   1 +
 src/parser.c     | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/parser.h     |   9 ++++
 src/ptree.c      |   2 +
 src/ptree.h      |   2 +
 src/samp.c       |  10 ++++
 6 files changed, 167 insertions(+)

diff --git a/src/liblognorm.h b/src/liblognorm.h
index 888e624..d409730 100644
--- a/src/liblognorm.h
+++ b/src/liblognorm.h
@@ -67,6 +67,7 @@
 /* error codes */
 #define LN_NOMEM -1
 #define LN_INVLDFDESCR -1
+#define LN_BADPARSERSTATE -500
 #define LN_WRONGPARSER -1000
 
 /**
diff --git a/src/parser.c b/src/parser.c
index 9d3879f..a3b85b0 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -32,6 +32,7 @@
 #include <json.h>
 
 #include "liblognorm.h"
+#include "lognorm.h"
 #include "internal.h"
 #include "parser.h"
 
@@ -538,6 +539,148 @@ BEGINParser(CharSeparated)
 
 ENDParser
 
+/**
+ * Parse a string tokenized by the given char-sequence.
+ * The sequence may appear 0 or more times; zero occurrences still yields
+ * 1 token (not 0 tokens).
+ *
+ * Each token found is parsed according to the field-type provided after
+ * the tokenizer char-seq.
+ */
+struct tokenized_parser_data_s {
+	es_str_t *tok_str; /* separator between tokens; unescaped by the constructor */
+	ln_ctx ctx; /* private context the constructor loads the per-token rules into */
+};
+typedef struct tokenized_parser_data_s tokenized_parser_data_t;
+static void load_tokenized_parser_samples(ln_ctx, const char* const, const int, const char* const, const int);
+void* tokenized_parser_data_constructor(ln_fieldList_t *);
+
+BEGINParser(Tokenized)
+	assert(str != NULL);
+	assert(offs != NULL);
+	assert(parsed != NULL);
+
+	/* validate parser state before allocating, so this exit cannot leak */
+	tokenized_parser_data_t *pData = (tokenized_parser_data_t*) node->parser_data;
+	if (! pData) {
+		r = LN_BADPARSERSTATE;
+		goto fail;
+	}
+
+	json_object *json_p = NULL;
+	CHKN(json_p = json_object_new_object());
+	json_object *matches = NULL;
+	CHKN(matches = json_object_new_array());
+
+	int remaining_len = strLen - *offs;
+	const char *remaining_str = str + *offs;
+	json_object *tail = NULL; /* "tail" of the current pass, borrowed from json_p */
+	json_object *held_tail = NULL; /* extra reference keeping remaining_str valid */
+	json_object *match = NULL;
+
+	while (remaining_len > 0) {
+		ln_normalize(pData->ctx, remaining_str, remaining_len, &json_p);
+		if (held_tail) json_object_put(held_tail); /* its text has just been consumed */
+		held_tail = NULL;
+		if (json_object_object_get_ex(json_p, "default", &match)) {
+			json_object_array_add(matches, json_object_get(match));
+		} else {
+			if (json_object_array_length(matches) > 0) {
+				remaining_len += es_strlen(pData->tok_str); /* hand the separator back */
+				break;
+			} else {
+				json_object_put(json_p);
+				json_object_put(matches); /* no token matched: drop the empty array */
+				r = LN_WRONGPARSER;
+				goto fail;
+			}
+		}
+
+		if (json_object_object_get_ex(json_p, "tail", &tail)) {
+			remaining_len = json_object_get_string_len(tail);
+			if (remaining_len > 0) {
+				remaining_str = json_object_get_string(tail);
+				if (es_strbufcmp(pData->tok_str, (const unsigned char *)remaining_str, es_strlen(pData->tok_str))) {
+					break;
+				} else {
+					held_tail = json_object_get(tail); /* must outlive the del below */
+					remaining_str += es_strlen(pData->tok_str);
+					remaining_len -= es_strlen(pData->tok_str);
+				}
+			}
+		} else {
+			remaining_len = 0;
+			break;
+		}
+
+		json_object_object_del(json_p, "default");
+		json_object_object_del(json_p, "tail");
+	}
+	json_object_put(json_p);
+	if (held_tail) json_object_put(held_tail); /* input ended right after a separator */
+	/* success, persist */
+	*parsed = (strLen - *offs) - remaining_len;
+	*value =  matches;
+
+ENDParser
+
+void tokenized_parser_data_destructor(void** dataPtr) {
+	tokenized_parser_data_t *const pData = *dataPtr; /* dereferenced below: must not be NULL */
+	if (pData->tok_str != NULL) es_deleteStr(pData->tok_str); /* separator string */
+	if (pData->ctx != NULL) ln_exitCtx(pData->ctx); /* nested context */
+	free(pData);
+	*dataPtr = NULL; /* leave no dangling pointer with the caller */
+}
+
+/* Load the rule "rule=:%default:<field_type>%<suffix>" into ctx; errors are not reported to the caller. */
+static void load_tokenized_parser_samples(ln_ctx ctx, const char* const field_type, const int field_type_len, const char* const suffix, const int length) {
+	static const char* const RULE_PREFIX = "rule=:%default:";//TODO: extract nice constants
+	static const int RULE_PREFIX_LEN = 15;
+	char *sample_str = NULL; /* initialised up front: every 'goto free' below reads it */
+	es_str_t *field_decl = es_newStrFromCStr(RULE_PREFIX, RULE_PREFIX_LEN);
+	if (! field_decl) goto free;
+	if (es_addBuf(&field_decl, field_type, field_type_len) || es_addBuf(&field_decl, "%", 1) || es_addBuf(&field_decl, suffix, length)) {
+		ln_dbgprintf(ctx, "couldn't prepare field for tokenized field-picking: '%s'", field_type);
+		goto free;
+	}
+	sample_str = es_str2cstr(field_decl, NULL);
+	if (! sample_str) {
+		ln_dbgprintf(ctx, "couldn't prepare sample-string for: '%s'", field_type);
+		goto free;
+	}
+	ln_loadSample(ctx, sample_str);
+free:
+	free(sample_str);
+	if (field_decl) es_deleteStr(field_decl);
+}
+
+/* Build parser data from raw_data "<separator>:<field-type>"; NULL if malformed or out of memory. */
+void* tokenized_parser_data_constructor(ln_fieldList_t *node) {
+	es_str_t *raw_data = node->raw_data;
+	static const char* const ARG_SEP = ":";
+	static const char* const TAIL_FIELD = "%tail:rest%";
+	static const int TAIL_FIELD_LEN = 11;
+	tokenized_parser_data_t *pData = NULL;
+	char *args = raw_data ? es_str2cstr(raw_data, NULL) : NULL;
+	if (! args) return NULL;
+	char *field_type = strstr(args, ARG_SEP);
+	if (! field_type) goto fail; /* argument lacks the ':' before the field-type */
+	pData = calloc(1, sizeof(*pData)); /* zeroed so the destructor is safe after a partial init */
+	if (! pData)  goto fail;
+	if (! (pData->tok_str = es_newStrFromCStr(args, field_type - args))) goto fail;
+	es_unescapeStr(pData->tok_str);
+	if (! (pData->ctx = ln_initCtx())) goto fail;
+	field_type++;//skip :
+	const int field_type_len = strlen(field_type);
+	load_tokenized_parser_samples(pData->ctx, field_type, field_type_len, TAIL_FIELD, TAIL_FIELD_LEN);
+	load_tokenized_parser_samples(pData->ctx, field_type, field_type_len, "", 0);
+	goto free;
+fail:
+	if (pData) tokenized_parser_data_destructor((void**) &pData); /* also resets pData to NULL */
+free:
+	free(args);
+	return pData;
+}
 
 /**
  * Just get everything till the end of string.
diff --git a/src/parser.h b/src/parser.h
index cbec884..35cbd34 100644
--- a/src/parser.h
+++ b/src/parser.h
@@ -103,4 +103,13 @@ int ln_parseTime24hr(const char *str, size_t strlen, size_t *offs, const ln_fiel
  */
 int ln_parseIPv4(const char *str, size_t strlen, size_t *offs, const ln_fieldList_t *node, size_t *parsed, struct json_object **value);
 
+/**
+ * Get all tokens separated by tokenizer-string as array.
+ */
+int ln_parseTokenized(const char *str, size_t strlen, size_t *offs, const ln_fieldList_t *node, size_t *parsed, struct json_object **value);
+/** Build the parser data for a "tokenized" field from node->raw_data; may return NULL. */
+void* tokenized_parser_data_constructor(ln_fieldList_t *node);
+/** Free data built by tokenized_parser_data_constructor and set *dataPtr to NULL. */
+void tokenized_parser_data_destructor(void** dataPtr);
+
 #endif /* #ifndef LIBLOGNORM_PARSER_H_INCLUDED */
diff --git a/src/ptree.c b/src/ptree.c
index 4a335eb..a72aaa3 100644
--- a/src/ptree.c
+++ b/src/ptree.c
@@ -78,6 +78,8 @@ ln_deletePTreeNode(ln_fieldList_t *node)
 		es_deleteStr(node->data);
 	if(node->raw_data != NULL)
 		es_deleteStr(node->raw_data);
+	if(node->parser_data != NULL)
+		node->parser_data_destructor(&(node->parser_data));
 	free(node);
 }
 
diff --git a/src/ptree.h b/src/ptree.h
index 529b806..66b7bdd 100644
--- a/src/ptree.h
+++ b/src/ptree.h
@@ -51,6 +51,8 @@ struct ln_fieldList_s {
 	es_str_t *name;		/**< field name */
 	es_str_t *data;		/**< extra data to be passed to parser */
 	es_str_t *raw_data;		/**< extra untouched (unescaping is not done) data availble to be used by parser */
+	void *parser_data; /** opaque data that the field-parser understands */
+	void (*parser_data_destructor)(void **); /** destroy opaque data that field-parser understands */
 	int (*parser)(const char*, size_t, size_t*, const ln_fieldList_t *,
 				  size_t*, struct json_object **); /**< parser to use */
 	ln_ptree *subtree;	/**< subtree to follow if parser succeeded */
diff --git a/src/samp.c b/src/samp.c
index d368cb1..87df4a4 100644
--- a/src/samp.c
+++ b/src/samp.c
@@ -114,6 +114,7 @@ parseFieldDescr(ln_ctx ctx, struct ln_ptree **subtree, es_str_t *rule,
 	char *cstr;	/* for debug mode strings */
 	unsigned char *buf;
 	es_size_t lenBuf;
+	void* (*constructor_fn)(ln_fieldList_t *) = NULL;
 
 	assert(subtree != NULL);
 
@@ -125,6 +126,9 @@ parseFieldDescr(ln_ctx ctx, struct ln_ptree **subtree, es_str_t *rule,
 	node->subtree = NULL;
 	node->next = NULL;
 	node->data = NULL;
+	node->raw_data = NULL;
+	node->parser_data = NULL;
+	node->parser_data_destructor = NULL;
 	CHKN(node->name = es_newStr(16));
 
 	while(i < lenBuf && buf[i] != ':') {
@@ -191,6 +195,10 @@ parseFieldDescr(ln_ctx ctx, struct ln_ptree **subtree, es_str_t *rule,
 	} else if(!es_strconstcmp(*str, "char-sep")) {
 		// TODO: check extra data!!!! (very important)
 		node->parser = ln_parseCharSeparated;
+	} else if(!es_strconstcmp(*str, "tokenized")) {
+		node->parser = ln_parseTokenized;
+		constructor_fn = tokenized_parser_data_constructor;
+		node->parser_data_destructor = tokenized_parser_data_destructor;
 	} else {
 		cstr = es_str2cstr(*str, NULL);
 		ln_dbgprintf(ctx, "ERROR: invalid field type '%s'", cstr);
@@ -220,6 +228,8 @@ parseFieldDescr(ln_ctx ctx, struct ln_ptree **subtree, es_str_t *rule,
 		}
 	}
 
+	if (constructor_fn) node->parser_data = constructor_fn(node);
+
 
 	/* finished */
 	CHKR(ln_addFDescrToPTree(subtree, node));
-- 
2.0.4