aboutsummaryrefslogtreecommitdiffstats
path: root/trunk/main/minimime/mimeparser.l
diff options
context:
space:
mode:
Diffstat (limited to 'trunk/main/minimime/mimeparser.l')
-rw-r--r--trunk/main/minimime/mimeparser.l484
1 files changed, 484 insertions, 0 deletions
diff --git a/trunk/main/minimime/mimeparser.l b/trunk/main/minimime/mimeparser.l
new file mode 100644
index 000000000..19d42cf3a
--- /dev/null
+++ b/trunk/main/minimime/mimeparser.l
@@ -0,0 +1,484 @@
+%{
+/*
+ * Copyright (c) 2004 Jann Fischer. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/**
+ * This is a lexer file for parsing MIME compatible messages. It is intended
+ * to satisfy at least RFC 2045 (Format of Internet Message Bodies). It still
+ * has quite a few problems:
+ *
+ * - The parsing could probably be done in a more elegant way
+ * - I don't know what performance impact REJECT has on the parser
+ */
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+
+#include "mimeparser.h"
+#include "mimeparser.tab.h"
+
+/* NAMEOF() turns its argument into a string literal. */
+#define NAMEOF(v) #v
+/*
+ * BC() is a debug wrapper for lex' BEGIN() macro: it switches the scanner to
+ * start condition x and mirrors that value in lexer_state.condition.
+ *
+ * The expansion deliberately has no trailing semicolon, so that "BC(x);" is
+ * exactly one statement and stays safe in an unbraced if/else. The local is
+ * named bc_lstate so it does not shadow the lstate variable of the calling
+ * action.
+ */
+#define BC(x) do { \
+    struct lexer_state *bc_lstate = yyget_extra(yyscanner); \
+    BEGIN(x); \
+    bc_lstate->condition = (x); \
+} while (0)
+/* ZERO() clears an object in place; sizeof(x) is the whole object only when x is an array, not a pointer. */
+#define ZERO(x) memset(x, '\0', sizeof(x))
+/* PREALLOC_BUFFER is not referenced in this file. YY_BUF_SIZE is redefined below to 65536. */
+#define PREALLOC_BUFFER 100000
+#undef YY_BUF_SIZE
+#define YY_BUF_SIZE 65536
+/* Kind of header currently being scanned; the header-name rule stores one of these in lexer_state.header_state. */
+enum header_states
+{
+ STATE_MAIL = 0, /* MIME-Version and every header not listed below */
+ STATE_CTYPE, /* Content-Type */
+ STATE_CDISP, /* Content-Disposition */
+ STATE_CENC, /* Content-Transfer-Encoding */
+ STATE_MIME /* never assigned in this file */
+};
+
+
+
+%}
+/* Scanner options: reentrant scanner (state travels in yyscanner), flex line counting, and the bison-bridge interface that supplies yylval_param. */
+%option reentrant
+%option yylineno
+%option bison-bridge
+/* Start conditions, all declared with %s (inclusive). No rule in this file names `comment` or `endoffile`. */
+%s headers
+%s header
+%s headervalue
+%s tspecialvalue
+%s comment
+%s body
+%s postamble
+%s preamble
+%s boundary
+%s endboundary
+%s endoffile
+
+/* STRING: characters of a bare word in a header value. */
+STRING [a-zA-Z0-9\-\.\_]
+/* TSPECIAL: characters accepted inside a quoted value and in boundary lines (includes the space). */
+TSPECIAL [a-zA-Z0-9)(<>@,;:/\-.=_\+'? ]
+/*
+ * TSPECIAL_LITE: wider unquoted word characters. The '-' is escaped so that
+ * it is a literal instead of forming the range ','..'.' (which only happened
+ * to cover the same three characters ',', '-' and '.').
+ */
+TSPECIAL_LITE [a-zA-Z0-9)(<>@,\-._+'?\[\]]
+
+%%
+
+ /*
+  * A header field name at the start of a line. A copy of the name is passed
+  * to the parser, the scanner moves on to the `header` condition, and the
+  * kind of header is remembered in header_state so that later rules know how
+  * to tokenize its value.
+  */
+<INITIAL,headers>^[a-zA-Z]+[a-zA-Z0-9\-\_]* {
+    struct lexer_state *lstate = yyget_extra(yyscanner);
+    int token;
+
+    yylval_param->string = strdup(yytext);
+    lstate->current_pos += yyleng;
+    BC(header);
+
+    /* Default: an ordinary mail header; overridden for the MIME headers. */
+    lstate->header_state = STATE_MAIL;
+    token = MAIL_HEADER;
+
+    if (strcasecmp(yytext, "Content-Type") == 0) {
+        lstate->header_state = STATE_CTYPE;
+        token = CONTENTTYPE_HEADER;
+    } else if (strcasecmp(yytext, "Content-Transfer-Encoding") == 0) {
+        lstate->header_state = STATE_CENC;
+        token = CONTENTENCODING_HEADER;
+    } else if (strcasecmp(yytext, "Content-Disposition") == 0) {
+        lstate->header_state = STATE_CDISP;
+        token = CONTENTDISPOSITION_HEADER;
+    } else if (strcasecmp(yytext, "MIME-Version") == 0) {
+        /* keeps STATE_MAIL, only the token differs */
+        token = MIMEVERSION_HEADER;
+    }
+
+    return token;
+}
+ /* Any other single character (except a newline) in INITIAL or headers is passed to the parser as ANY. */
+<INITIAL,headers>. {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ /* dprintf2("Unknown header char: %c\n", *yytext); */
+ lstate->current_pos += yyleng;
+ return ANY;
+}
+
+ /*
+  * An empty line ends a header block and yields ENDOFHEADERS. What follows is
+  * the preamble if these were the envelope headers and a boundary string is
+  * set; in every other case it is a body. The position right after the empty
+  * line is recorded as the start of whichever section follows.
+  */
+<headers>^(\r\n|\n) {
+    struct lexer_state *lstate = yyget_extra(yyscanner);
+
+    lstate->current_pos += yyleng;
+    lstate->lineno++;
+
+    if (lstate->is_envelope != 0 && lstate->boundary_string != NULL) {
+        BC(preamble);
+        lstate->preamble_start = lstate->current_pos;
+        lstate->is_envelope = 0;
+    } else {
+        lstate->body_start = lstate->current_pos;
+        BC(body);
+    }
+
+    return ENDOFHEADERS;
+}
+ /* The colon after a header name: return COLON and switch to scanning the header's value. */
+<header>\: {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ BC(headervalue);
+ lstate->current_pos += yyleng;
+ return COLON;
+}
+
+ /*
+  * A line break directly after a header name, i.e. a header line without a
+  * colon. EOL is returned and the scanner goes back to expecting header
+  * names. The consumed newline is counted in lineno, as in every other rule
+  * that consumes a line break.
+  */
+<header>(\r\n|\n) {
+    struct lexer_state *lstate = yyget_extra(yyscanner);
+    /* dprintf2("Invalid header, returning EOL\n"); */
+    lstate->lineno++;
+    lstate->current_pos += yyleng;
+    BC(headers);
+    return EOL;
+}
+
+ /*
+  * A folded header line: a line break followed by indentation continues the
+  * current value. The fold is skipped without producing a token, but the
+  * newline it contains still has to be counted in lineno.
+  */
+<headervalue>(\n|\r\n)[\ \t]+ {
+    struct lexer_state *lstate = yyget_extra(yyscanner);
+    lstate->lineno++;
+    lstate->current_pos += yyleng;
+}
+
+ /*
+  * Free-form header value, including folded continuation lines. Only used
+  * for STATE_MAIL and STATE_CENC headers; for all other headers the match is
+  * REJECTed so that the structured rules below tokenize the value.
+  *
+  * The value is returned as WORD with leading whitespace removed (an empty
+  * string if nothing else is left).
+  */
+<headervalue>.+|(.+(\n|\r\n)[\ \t]+.+)+ {
+    struct lexer_state *lstate = yyget_extra(yyscanner);
+    char *value;
+
+    if (lstate->header_state != STATE_MAIL && lstate->header_state != STATE_CENC) {
+        REJECT;
+    }
+    lstate->current_pos += yyleng;
+
+    /* Count every newline of the match, including ones that sit in the
+     * leading whitespace that is trimmed below. */
+    lstate->lineno += count_lines(yytext);
+
+    /* Trim through a local pointer rather than advancing yytext itself; the
+     * cast keeps isspace() defined for bytes with the high bit set. */
+    value = yytext;
+    while (*value != '\0' && isspace((unsigned char)*value)) {
+        value++;
+    }
+
+    yylval_param->string = strdup(value);
+    return WORD;
+}
+ /* A line break in headervalue or tspecialvalue that was not consumed as a fold: return EOL and go back to expecting header names. */
+<headervalue,tspecialvalue>(\r\n|\n) {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ /* marks the end of one header line */
+ lstate->lineno++;
+ BC(headers);
+ lstate->current_pos += yyleng;
+ return EOL;
+}
+ /* A semicolon, optionally followed by a folded line break and its indentation, is returned as SEMICOLON. */
+<headervalue>;|;(\r\n|\n)[\ \t]+ {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ lstate->lineno += count_lines(yytext); /* counts the newline of the fold, if any */
+ lstate->current_pos += yyleng;
+ return SEMICOLON;
+}
+ /* An equals sign in a header value is returned as EQUAL. */
+<headervalue>\= {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ lstate->current_pos += yyleng;
+ return EQUAL;
+}
+ /* An opening double quote is returned as its own character code and switches to tspecialvalue for the quoted text. */
+<headervalue>\" {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ BC(tspecialvalue);
+ lstate->current_pos += yyleng;
+ return *yytext;
+}
+ /* An unquoted word made of STRING or TSPECIAL_LITE characters; a copy is returned to the parser as WORD. */
+<headervalue>{STRING}+|{TSPECIAL_LITE}+ {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ yylval_param->string=strdup(yytext);
+ lstate->lineno += count_lines(yytext); /* neither character class contains a newline, so this adds 0 */
+ lstate->current_pos += yyleng;
+ return WORD;
+}
+ /* Skips runs of blanks and tabs inside a header value without producing a token.
+  * NOTE(review): the character class also contains a literal '|', so pipe
+  * characters are silently dropped as well; presumably [\ \t]+ was meant --
+  * confirm against the parser before tightening the pattern. */
+<headervalue>[\ |\t]+ {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ lstate->current_pos += yyleng;
+}
+ /* Text inside a quoted value: a run of TSPECIAL characters is copied and returned as token TSPECIAL. */
+<tspecialvalue>{TSPECIAL}+ {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ lstate->lineno += count_lines(yytext); /* TSPECIAL contains no newline, so this adds 0 */
+ yylval_param->string=strdup(yytext);
+ lstate->current_pos += yyleng;
+ return TSPECIAL;
+}
+ /* The closing double quote is returned as its own character code and switches back to headervalue. */
+<tspecialvalue>\" {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ BC(headervalue);
+ lstate->current_pos += yyleng;
+ return *yytext;
+}
+
+ /*
+  * Candidate end boundary inside a body: a line that begins and ends with
+  * two dashes.
+  *
+  * Only text equal to endboundary_string counts, so other lines of that
+  * shape are REJECTed and fall through to the remaining body rules. When a
+  * valid end boundary has been caught and a body is pending, the body is
+  * reported first (token BODY, ending right before the boundary); the
+  * boundary text is put back on the input with yyless(0) and the
+  * `endboundary` condition scans the actual token.
+  *
+  * current_pos is deliberately not advanced here: the boundary text is
+  * always scanned again, either by the endboundary rule after yyless(0) or
+  * by whichever rule matches after REJECT, and that rule accounts for it.
+  * Advancing here as well would count the same bytes twice.
+  */
+<body>^\-\-{TSPECIAL}+\-\- {
+    struct lexer_state *lstate = yyget_extra(yyscanner);
+
+    if (lstate->endboundary_string != NULL
+        && strcmp(lstate->endboundary_string, yytext) == 0
+        && lstate->body_start) {
+        /* dprintf2("YYTEXT == lstate->end_boundary: '%s'\n", yytext); */
+        yylval_param->position.opaque_start = lstate->body_opaque_start;
+        yylval_param->position.start = lstate->body_start;
+        yylval_param->position.end = lstate->current_pos;
+        lstate->body_opaque_start = 0;
+        lstate->body_start = 0;
+        lstate->body_end = 0;
+        yyless(0);
+        BC(endboundary);
+        return BODY;
+    }
+
+    /* dprintf2("YYTEXT != end_boundary: '%s'\n", yytext); */
+    REJECT;
+}
+ /*
+  * Candidate part boundary: a line starting with two dashes in a body or in
+  * the preamble. If a body or preamble is pending it is reported first and
+  * the boundary text is pushed back with yyless(0) for the `boundary`
+  * condition; otherwise BOUNDARY is returned directly.
+  */
+<body,preamble>^\-\-{TSPECIAL}+ {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ /**
+ * Make sure we only catch matching boundaries, and not other lines
+ * that begin with two dashes.
+ */
+ if (lstate->boundary_string != NULL) {
+ if (strcmp(lstate->boundary_string, yytext)) {
+ /* dprintf2("YYTEXT != boundary: '%s'\n", yytext);*/
+ REJECT;
+ } else {
+ /* dprintf2("YYTEXT == boundary: '%s'\n", yytext);*/
+ if (lstate->body_start) { /* a body is pending: report it, ending at the current position */
+ yylval_param->position.opaque_start = lstate->body_opaque_start;
+ yylval_param->position.start = lstate->body_start;
+ yylval_param->position.end = lstate->current_pos;
+ lstate->body_opaque_start = 0;
+ lstate->body_start = 0;
+ lstate->body_end = 0;
+ yyless(0); /* put the boundary text back on the input */
+ BC(boundary);
+ return BODY;
+ } else if (lstate->preamble_start) { /* the preamble is pending: report it the same way */
+ yylval_param->position.start = lstate->preamble_start;
+ yylval_param->position.end = lstate->current_pos;
+ lstate->preamble_start = lstate->preamble_end = 0;
+ yyless(0);
+ BC(boundary);
+ return PREAMBLE;
+ } else { /* nothing pending: consume the text and return the boundary itself */
+ BC(boundary);
+ yylval_param->string = strdup(yytext);
+ lstate->current_pos += yyleng;
+ return(BOUNDARY);
+ }
+ }
+ } else {
+ }
+
+ REJECT; /* reached only when no boundary string is set */
+}
+ /* Body content is consumed without producing tokens; these three rules only keep current_pos and lineno up to date. */
+<body>(\r\n|\n) {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ lstate->current_pos += yyleng;
+ lstate->lineno++;
+}
+
+<body>\r {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ lstate->current_pos += yyleng;
+ /* dprintf2("stray CR in body...\n"); */
+}
+
+<body>[^\r\n]+ {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ lstate->current_pos += yyleng;
+}
+
+ /*
+  * End of input while scanning a body.
+  *   - no body pending:                 terminate the scanner
+  *   - body pending, boundary set:      return POSTAMBLE
+  *   - body pending, no boundary set:   return the body, which runs up to
+  *                                      the current position
+  */
+<body><<EOF>> {
+    struct lexer_state *lstate = yyget_extra(yyscanner);
+
+    if (!lstate->body_start) {
+        yyterminate();
+    }
+    if (lstate->boundary_string != NULL) {
+        return POSTAMBLE;
+    }
+
+    yylval_param->position.opaque_start = 0;
+    yylval_param->position.start = lstate->body_start;
+    yylval_param->position.end = lstate->current_pos;
+    lstate->body_start = 0;
+    return BODY;
+}
+ /* Line breaks in the preamble or postamble are skipped; only lineno and current_pos are updated. */
+<preamble,postamble>(\r\n|\n) {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ /* dprintf2("Preamble CR/LF at line %d\n", lineno); */
+ lstate->lineno++;
+ lstate->current_pos += yyleng;
+}
+ /* In boundary / endboundary the rest of the line is copied and returned as BOUNDARY or ENDBOUNDARY respectively. */
+<boundary>[^\r\n]+ {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ yylval_param->string = strdup(yytext);
+ lstate->current_pos += yyleng;
+ return BOUNDARY;
+}
+
+<endboundary>[^\r\n]+ {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ yylval_param->string = strdup(yytext);
+ lstate->current_pos += yyleng;
+ return ENDBOUNDARY;
+}
+ /* The line break after a boundary returns EOL and starts a new header block; after an end boundary it silently starts the postamble. */
+<boundary>(\r\n|\n) {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ BC(headers);
+ lstate->lineno++;
+ lstate->current_pos += yyleng;
+ lstate->body_opaque_start = lstate->current_pos; /* position right after the boundary line */
+ return EOL;
+}
+
+<endboundary>(\r\n|\n) {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ BC(postamble);
+ lstate->lineno++;
+ lstate->current_pos += yyleng;
+}
+ /* Every other character of the preamble or postamble is skipped; only current_pos advances. */
+<preamble>. {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ lstate->current_pos += yyleng;
+}
+
+
+<postamble>. {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ lstate->current_pos += yyleng;
+}
+ /* Fallback rules without a start condition: a line break is returned as EOL, any other character as its own character code. */
+(\r\n|\n) {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ lstate->lineno++;
+ lstate->current_pos += yyleng;
+ return EOL;
+}
+
+. {
+ struct lexer_state *lstate = yyget_extra(yyscanner);
+ lstate->current_pos += yyleng;
+ return((int)*yytext);
+}
+
+
+%%
+
+/**
+ * Attaches the lexer state embedded in pstate to the scanner as its extra
+ * data and puts scanner and lexer state back to their starting values:
+ * initial start condition, line 0, position 1, envelope mode on, and all
+ * length fields and section markers cleared.
+ */
+void reset_lexer_state(void *yyscanner, struct parser_state *pstate)
+{
+    /* BEGIN() below works on a scanner handle named yyg */
+    struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+    struct lexer_state *lstate = &pstate->lstate;
+
+    yyset_extra((void*)lstate, yyscanner);
+
+    /* start condition, for the scanner and mirrored in the lexer state */
+    BEGIN(0);
+    lstate->condition = 0;
+    lstate->header_state = STATE_MAIL;
+    lstate->is_envelope = 1;
+
+    /* counters */
+    lstate->lineno = 0;
+    lstate->current_pos = 1;
+    lstate->message_len = 0;
+    lstate->buffer_length = 0;
+
+    /* temporary marker variables */
+    lstate->preamble_start = 0;
+    lstate->preamble_end = 0;
+    lstate->body_opaque_start = 0;
+    lstate->body_start = 0;
+    lstate->body_end = 0;
+    lstate->postamble_start = 0;
+    lstate->postamble_end = 0;
+}
+/**
+ * Remembers the caller's message text in the lexer state and sets the
+ * scanner up to read from that text via yy_scan_string(). The buffer handle
+ * returned by yy_scan_string() is not kept.
+ */
+void
+PARSER_setbuffer(const char *string, yyscan_t scanner)
+{
+ struct lexer_state *lstate = yyget_extra(scanner);
+ lstate->message_buffer = string;
+ yy_scan_string(string, scanner);
+}
+/**
+ * Makes the scanner read from fp by storing it directly in the scanner's
+ * yyin_r field instead of calling yyset_in() (see the note in the body).
+ */
+void
+PARSER_setfp(FILE *fp, yyscan_t scanner)
+{
+ /* looks like a bug in bison 2.2a -- the wrong code is generated for yyset_in !! */
+ struct yyguts_t * yyg = (struct yyguts_t*) scanner;
+ yyg->yyin_r = fp;
+
+ if (0) {
+ /* This is just to make a compiler warning go away */
+ yyunput(0, NULL, scanner); /* never executed; it only references yyunput */
+ }
+}
+
+/**
+ * Returns the number of newline characters in txt, i.e. how many line breaks
+ * a piece of scanned text (a folded header value, for instance) spans.
+ * txt must be a NUL-terminated string.
+ */
+int
+count_lines(char *txt)
+{
+    const char *cursor = txt;
+    int newlines = 0;
+
+    /* hop from one '\n' to the next until none is left */
+    while ((cursor = strchr(cursor, '\n')) != NULL) {
+        newlines++;
+        cursor++;
+    }
+
+    return newlines;
+}