1 files changed, 484 insertions, 0 deletions
diff --git a/trunk/main/minimime/mimeparser.l b/trunk/main/minimime/mimeparser.l
new file mode 100644
index 000000000..19d42cf3a
--- /dev/null
+++ b/trunk/main/minimime/mimeparser.l
@@ -0,0 +1,484 @@
+%{
+/*
+ * Copyright (c) 2004 Jann Fischer. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/**
+ * This is a lexer file for parsing MIME compatible messages. It is intended
+ * to satisfy at least RFC 2045 (Format of Internet Message Bodies). It still
+ * has quite a few problems:
+ *
+ *	- The parsing could probably be done in a more elegant way
+ *	- I don't know what performance impact REJECT has on the parser
+ */
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+
+#include "mimeparser.h"
+#include "mimeparser.tab.h"
+
+#define NAMEOF(v) #v
+/* BC(x): BEGIN(x) plus a copy of x in lstate->condition (debug aid); no trailing ';' so "BC(x);" is ONE statement */
+#define BC(x) do { \
+	struct lexer_state *lstate = yyget_extra(yyscanner); \
+	BEGIN(x); \
+	lstate->condition = (x); \
+} while (0)
+
+#define ZERO(x) memset(x, '\0', sizeof(x))	/* zero-fills sizeof(x) bytes: x must be an array object, not a pointer */
+
+#define PREALLOC_BUFFER	100000
+#undef YY_BUF_SIZE
+#define YY_BUF_SIZE 65536
+
+enum header_states	/* kind of header whose value is scanned next (stored in lstate->header_state) */
+{
+	STATE_MAIL = 0,	/* MIME-Version and every header not listed below */
+	STATE_CTYPE,	/* Content-Type */
+	STATE_CDISP,	/* Content-Disposition */
+	STATE_CENC,	/* Content-Transfer-Encoding */
+	STATE_MIME	/* not assigned by any rule visible in this file */
+};
+
+
+
+%}
+
+%option reentrant
+%option yylineno
+%option bison-bridge
+
+%s headers
+%s header
+%s headervalue
+%s tspecialvalue
+%s comment
+%s body
+%s postamble
+%s preamble
+%s boundary
+%s endboundary
+%s endoffile
+
+STRING	[a-zA-Z0-9\-\.\_]
+TSPECIAL [a-zA-Z0-9)(<>@,;:/\-.=_\+'? ]
+TSPECIAL_LITE [a-zA-Z0-9)(<>@,-._+'?\[\]]
+
+%%
+
+<INITIAL,headers>^[a-zA-Z]+[a-zA-Z0-9\-\_]* {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+
+	yylval_param->string=strdup(yytext); /* heap copy of the header name, passed on as the token value */
+	lstate->current_pos += yyleng;
+	BC(header);	/* the <header> rules expect the ':' (or a premature end of line) next */
+
+	/* Classify the header name case-insensitively: header_state records
+	 * which kind of value follows, the returned token names the header.
+	 */
+	if (!strcasecmp(yytext, "Content-Type")) {
+		lstate->header_state = STATE_CTYPE;
+		return CONTENTTYPE_HEADER;
+	} else if (!strcasecmp(yytext, "Content-Transfer-Encoding")) {
+		lstate->header_state = STATE_CENC;
+		return CONTENTENCODING_HEADER;
+	} else if (!strcasecmp(yytext, "Content-Disposition")) {
+		lstate->header_state = STATE_CDISP;
+		return CONTENTDISPOSITION_HEADER;
+	} else if (!strcasecmp(yytext, "MIME-Version")) {
+		lstate->header_state = STATE_MAIL;	/* value is scanned like any plain header */
+		return MIMEVERSION_HEADER;
+	} else {
+		lstate->header_state = STATE_MAIL;
+		return MAIL_HEADER;
+	}
+}
+
+<INITIAL,headers>. {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	/* dprintf2("Unknown header char: %c\n", *yytext); */
+	lstate->current_pos += yyleng;
+	return ANY;
+}
+
+<headers>^(\r\n|\n) {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	lstate->lineno++; 
+
+	lstate->current_pos += yyleng;	/* now points just past the blank line */
+
+	/* An empty line marks the end of headers. Without a boundary string,
+	 * or once the envelope headers are behind us, what follows is a body;
+	 * right after the envelope of a message with a boundary it is the
+	 * preamble.
+	 */
+	if (lstate->is_envelope == 0 || lstate->boundary_string == NULL) {
+		BC(body);
+		lstate->body_start = lstate->current_pos;	/* offset of the first body byte */
+	} else {
+		lstate->is_envelope = 0;	/* later header blocks belong to parts, not the envelope */
+		lstate->preamble_start = lstate->current_pos;
+		BC(preamble);
+	}	
+
+	return ENDOFHEADERS;
+}
+
+<header>\: {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	BC(headervalue); 
+	lstate->current_pos += yyleng;
+	return COLON;
+}	
+
+<header>(\r\n|\n) {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	BC(headers);
+	/* dprintf2("Invalid header, returning EOL\n"); */
+	lstate->current_pos += yyleng;
+	return EOL;
+}	
+
+<headervalue>(\n|\r\n)[\ \t]+	{
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	lstate->current_pos += yyleng;
+}
+
+<headervalue>.+|(.+(\n|\r\n)[\ \t]+.+)+ {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	char *value = yytext;	/* local cursor: the scanner-owned yytext pointer is left alone */
+	if (lstate->header_state != STATE_MAIL && lstate->header_state != STATE_CENC) {	/* other headers: token-level rules */
+		REJECT;
+	}
+	lstate->current_pos += yyleng;
+	while (*value && isspace((unsigned char)*value)) value++;	/* cast: isspace() is undefined for negative char */
+	if (*value == '\0') {	/* nothing but whitespace: report an empty value */
+		yylval_param->string = strdup("");
+	} else {
+		yylval_param->string = strdup(value);	/* whole (possibly folded) value, leading blanks dropped */
+		lstate->lineno += count_lines(value);	/* folded values span several lines */
+	}
+	return WORD;
+}
+
+<headervalue,tspecialvalue>(\r\n|\n) {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	/* marks the end of one header line */
+	lstate->lineno++;
+	BC(headers);
+	lstate->current_pos += yyleng;
+	return EOL;
+}
+
+<headervalue>;|;(\r\n|\n)[\ \t]+ {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	lstate->lineno += count_lines(yytext);
+	lstate->current_pos += yyleng;
+	return SEMICOLON;
+}
+
+<headervalue>\= {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	lstate->current_pos += yyleng;
+	return EQUAL;
+}
+
+<headervalue>\" {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	BC(tspecialvalue);
+	lstate->current_pos += yyleng;
+	return *yytext;
+}
+
+<headervalue>{STRING}+|{TSPECIAL_LITE}+ {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	yylval_param->string=strdup(yytext);
+	lstate->lineno += count_lines(yytext);
+	lstate->current_pos += yyleng;
+	return WORD;
+}
+
+<headervalue>[\ |\t]+	{
+	struct lexer_state *lstate = yyget_extra(yyscanner);	/* NOTE(review): the class above also matches a literal '|', not only blank/tab -- confirm this is intended */
+	lstate->current_pos += yyleng;	/* skipped: counted in the position, no token returned */
+}	
+
+<tspecialvalue>{TSPECIAL}+ {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	lstate->lineno += count_lines(yytext);
+	yylval_param->string=strdup(yytext);
+	lstate->current_pos += yyleng;
+	return TSPECIAL;
+}
+
+<tspecialvalue>\" {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	BC(headervalue);
+	lstate->current_pos += yyleng;
+	return *yytext;
+}
+
+<body>^\-\-{TSPECIAL}+\-\- {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	/**
+	 * Only text equal to lstate->endboundary_string counts as an end
+	 * boundary; any other "--...--" line is REJECTed so that other rules
+	 * can match it. On a hit with a body open, the body's offsets are
+	 * returned as BODY, yyless(0) re-queues the matched text, and the
+	 * endboundary condition then scans it as the actual token.
+	 */
+	if (lstate->endboundary_string != NULL) {
+		if (strcmp(lstate->endboundary_string, yytext)) {
+			/* dprintf2("YYTEXT != end_boundary: '%s'\n", yytext); */
+			REJECT;
+		} else {
+			/* current_pos is not advanced here: <endboundary> counts the re-queued text itself. */
+			/* dprintf2("YYTEXT == lstate->end_boundary: '%s'\n", yytext); */
+			if (lstate->body_start) {
+				yylval_param->position.opaque_start = 
+				    lstate->body_opaque_start;
+				yylval_param->position.start = lstate->body_start;
+				yylval_param->position.end = lstate->current_pos;
+				lstate->body_opaque_start = 0;
+				lstate->body_start = 0;
+				lstate->body_end = 0;
+				yyless(0);
+				BC(endboundary);
+				return BODY;
+			}	
+		}
+	} else {
+	}	
+
+	REJECT;
+}
+
+<body,preamble>^\-\-{TSPECIAL}+ {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	/**
+	 * Only text equal to lstate->boundary_string is treated as a part
+	 * boundary; anything else starting with "--" is REJECTed. An open body
+	 * or preamble is reported first and the text re-queued via yyless(0).
+	 */
+	if (lstate->boundary_string != NULL) {
+		if (strcmp(lstate->boundary_string, yytext)) {
+			/* dprintf2("YYTEXT != boundary: '%s'\n", yytext);*/
+			REJECT;
+		} else {
+			/* dprintf2("YYTEXT == boundary: '%s'\n", yytext);*/
+			if (lstate->body_start) {	/* a body is open: close it before the boundary token */
+				yylval_param->position.opaque_start = lstate->body_opaque_start;
+				yylval_param->position.start = lstate->body_start;
+				yylval_param->position.end = lstate->current_pos;	/* boundary text not counted yet */
+				lstate->body_opaque_start = 0;
+				lstate->body_start = 0;
+				lstate->body_end = 0;
+				yyless(0);	/* give the boundary text back; <boundary> scans it next */
+				BC(boundary);
+				return BODY;
+			} else if (lstate->preamble_start) {	/* a preamble is open: report it first */
+				yylval_param->position.start = lstate->preamble_start;
+				yylval_param->position.end = lstate->current_pos;
+				lstate->preamble_start = lstate->preamble_end = 0;
+				yyless(0);
+				BC(boundary);
+				return PREAMBLE;
+			} else {	/* nothing pending: the match itself is the boundary token */
+				BC(boundary);
+				yylval_param->string = strdup(yytext);
+				lstate->current_pos += yyleng;
+				return(BOUNDARY);
+			}
+		}
+	} else {
+	}	
+
+	REJECT;
+}
+
+<body>(\r\n|\n) {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	lstate->current_pos += yyleng;
+	lstate->lineno++;
+}
+
+<body>\r {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	lstate->current_pos += yyleng;
+	/* dprintf2("stray CR in body...\n"); */
+}
+
+<body>[^\r\n]+ {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	lstate->current_pos += yyleng;
+}
+
+<body><<EOF>> {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	if (lstate->boundary_string == NULL && lstate->body_start) {	/* no boundary set: input ended inside the body */
+		yylval_param->position.opaque_start = 0;
+		yylval_param->position.start = lstate->body_start;
+		yylval_param->position.end = lstate->current_pos;
+		lstate->body_start = 0;	/* cleared, so a later call reaches yyterminate() */
+		return BODY;
+	} else if (lstate->body_start) {
+		return POSTAMBLE;	/* NOTE(review): yylval is not set and body_start is not cleared on this path -- confirm the parser copes */
+	}	
+	yyterminate();
+}	
+
+<preamble,postamble>(\r\n|\n) {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	/* dprintf2("Preamble CR/LF at line %d\n", lineno); */
+	lstate->lineno++; 
+	lstate->current_pos += yyleng;
+}	
+
+<boundary>[^\r\n]+ {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	yylval_param->string = strdup(yytext);
+	lstate->current_pos += yyleng;
+	return BOUNDARY;
+}
+
+<endboundary>[^\r\n]+ {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	yylval_param->string = strdup(yytext);
+	lstate->current_pos += yyleng;
+	return ENDBOUNDARY;
+}
+
+<boundary>(\r\n|\n) {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	BC(headers);
+	lstate->lineno++;
+	lstate->current_pos += yyleng;
+	lstate->body_opaque_start = lstate->current_pos;
+	return EOL;
+}
+
+<endboundary>(\r\n|\n) {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	BC(postamble);
+	lstate->lineno++;
+	lstate->current_pos += yyleng;
+}
+
+<preamble>. {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	lstate->current_pos += yyleng;
+}
+
+
+<postamble>. {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	lstate->current_pos += yyleng;
+}
+
+(\r\n|\n) {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	lstate->lineno++;
+	lstate->current_pos += yyleng;
+	return EOL;
+}
+
+. {
+	struct lexer_state *lstate = yyget_extra(yyscanner);
+	lstate->current_pos += yyleng;
+	return((int)*yytext);
+}
+
+
+%%
+
+void reset_lexer_state(void *yyscanner, struct parser_state *pstate)
+{
+	/* yyg is not dead code: flex's reentrant BEGIN() macro expands to yyg->... */
+	struct yyguts_t *yyg = (struct yyguts_t *) yyscanner;
+	struct lexer_state *fresh = &pstate->lstate;
+
+	/* position and bookkeeping counters; offsets start at 1, not 0 */
+	fresh->current_pos = 1;
+	fresh->lineno = 0;
+	fresh->message_len = 0;
+	fresh->buffer_length = 0;
+	fresh->is_envelope = 1;
+	fresh->header_state = STATE_MAIL;
+	fresh->condition = 0;
+	/* start/end markers of the section currently being scanned */
+	fresh->postamble_end = 0;
+	fresh->postamble_start = 0;
+	fresh->preamble_end = 0;
+	fresh->preamble_start = 0;
+	fresh->body_end = 0;
+	fresh->body_start = 0;
+	fresh->body_opaque_start = 0;
+	/* attach the state to the scanner and return to start condition 0 */
+	yyset_extra((void *) fresh, yyscanner);
+	BEGIN(0);
+}
+
+void
+PARSER_setbuffer(const char *string, yyscan_t scanner)
+{
+	/* Record the caller's buffer in the lexer state, then feed the string to the scanner. */
+	((struct lexer_state *) yyget_extra(scanner))->message_buffer = string;
+	(void) yy_scan_string(string, scanner);
+}
+
+void
+PARSER_setfp(FILE *fp, yyscan_t scanner)
+{
+	/* Sets the scanner's input stream by writing yyin_r directly instead of calling yyset_in(); original note: looks like a bug in bison 2.2a -- the wrong code is generated for yyset_in !! */
+	struct yyguts_t * yyg = (struct yyguts_t*) scanner;
+	yyg->yyin_r = fp;
+	
+	if (0) {
+		/* Never executed: the call only references yyunput() to make a compiler warning go away. */
+		yyunput(0, NULL, scanner);
+	}
+}
+
+/**
+ * Returns the number of '\n' characters in the NUL-terminated string txt,
+ * i.e. how many line ends a folded header value or a body chunk contains.
+ */
+int
+count_lines(char *txt)
+{
+	int newlines = 0;
+	char *cursor = txt;
+
+	/* Hop from one '\n' to the next; every hit is one more line. */
+	while ((cursor = strchr(cursor, '\n')) != NULL) {
+		newlines++;
+		cursor++;
+	}
+
+	return newlines;
+}