%{
/*
 * Copyright (c) 2004 Jann Fischer. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/**
 * This is a lexer file for parsing MIME compatible messages. It is intended
 * to satisfy at least RFC 2045 (Format of Internet Message Bodies). It still
 * has quite a few problems:
 *
 * - The parsing could probably be done in a more elegant way
 * - I don't know what performance impact REJECT has on the parser
 */

/*
 * NOTE(review): the four directives below carry no header name. The code in
 * this file uses memset/strdup/strcmp/strcasecmp, isspace and FILE, so these
 * were presumably system headers whose <...> part got lost -- restore them
 * from the upstream file before building.
 */
#include
#include
#include
#include

#include "mimeparser.h"
#include "mimeparser.tab.h"

#define NAMEOF(v) #v

/*
 * BC() is a debug wrapper for lex' BEGIN() macro: it switches the start
 * condition and mirrors the new condition into lexer_state.condition.
 * It expects `yyscanner` to be in scope. The expansion deliberately has no
 * trailing semicolon so that `BC(x);` is exactly one statement.
 */
#define BC(x) do { \
	struct lexer_state *lstate = yyget_extra(yyscanner); \
	BEGIN(x); \
	lstate->condition = x; \
} while(0)

/* Zero-fill an array object (sizeof is taken of the argument itself). */
#define ZERO(x) memset((x), '\0', sizeof(x))

#define PREALLOC_BUFFER 100000
#undef YY_BUF_SIZE
#define YY_BUF_SIZE 65536

/*
 * Which kind of header is currently being scanned; stored in
 * lexer_state.header_state by the header-name rule below.
 */
enum header_states
{
	STATE_MAIL = 0,
	STATE_CTYPE,
	STATE_CDISP,
	STATE_CENC,
	STATE_MIME
};
%}

%option reentrant
%option yylineno
%option bison-bridge

%s headers
%s header
%s headervalue
%s tspecialvalue
%s comment
%s body
%s postamble
%s preamble
%s boundary
%s endboundary
%s endoffile

STRING	[a-zA-Z0-9\-\.\_]
TSPECIAL	[a-zA-Z0-9)(<>@,;:/\-.=_\+'? ]
TSPECIAL_LITE	[a-zA-Z0-9)(<>@,-._+'?\[\]]

%%

^[a-zA-Z]+[a-zA-Z0-9\-\_]* {
	/*
	 * A header name at the start of a line. The name is duplicated into
	 * yylval (heap-allocated by strdup; not freed here), the scanner
	 * switches to the `header` condition, and the token returned depends
	 * on which header was recognised (compared case-insensitively).
	 *
	 * NOTE(review): no rule in this file carries a start-condition prefix
	 * although eleven conditions are declared above; the prefixes look
	 * stripped and should be restored from the upstream file.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	yylval_param->string = strdup(yytext);
	lstate->current_pos += yyleng;
	BC(header);

	/* Depending on what header we are processing, we enter a different
	 * state and return a different value.
	 */
	if (!strcasecmp(yytext, "Content-Type")) {
		lstate->header_state = STATE_CTYPE;
		return CONTENTTYPE_HEADER;
	} else if (!strcasecmp(yytext, "Content-Transfer-Encoding")) {
		lstate->header_state = STATE_CENC;
		return CONTENTENCODING_HEADER;
	} else if (!strcasecmp(yytext, "Content-Disposition")) {
		lstate->header_state = STATE_CDISP;
		return CONTENTDISPOSITION_HEADER;
	} else if (!strcasecmp(yytext, "MIME-Version")) {
		lstate->header_state = STATE_MAIL;
		return MIMEVERSION_HEADER;
	} else {
		lstate->header_state = STATE_MAIL;
		return MAIL_HEADER;
	}
}

. {
	/* Any other single character: count it and hand ANY to the parser. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
	return ANY;
}

^(\r\n|\n) {
	/*
	 * An empty line. This marks the end of headers. Depending on whether
	 * we are in the envelope currently we need to parse either a body or
	 * the preamble now.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->lineno++;
	lstate->current_pos += yyleng;

	if (lstate->is_envelope == 0 || lstate->boundary_string == NULL) {
		/* The body begins right after the blank line. */
		BC(body);
		lstate->body_start = lstate->current_pos;
	} else {
		/* Envelope with a boundary set: a preamble comes first. */
		lstate->is_envelope = 0;
		lstate->preamble_start = lstate->current_pos;
		BC(preamble);
	}

	return ENDOFHEADERS;
}
\: {
	/*
	 * A literal ':' -- account for the consumed text, switch the scanner
	 * to the `headervalue` condition and report COLON to the parser.
	 */
	struct lexer_state *state = yyget_extra(yyscanner);

	state->current_pos += yyleng;
	BC(headervalue);
	return COLON;
}
(\r\n|\n) {
	/*
	 * Line break: go back to the `headers` condition and return EOL.
	 * Unlike the other line-break rules, this one does not bump lineno.
	 *
	 * NOTE(review): several rules from here on share an identical pattern
	 * and none carries a start-condition prefix; as written, a later
	 * duplicate can only be reached through REJECT. The prefixes (and the
	 * `<>` pattern further down, presumably the end-of-file rule) look
	 * stripped -- restore them from the upstream file.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	BC(headers);
	lstate->current_pos += yyleng;
	return EOL;
}

(\n|\r\n)[\ \t]+ {
	/* Line break followed by blanks/tabs: consumed without a token. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
}

.+|(.+(\n|\r\n)[\ \t]+.+)+ {
	/*
	 * A whole header value, possibly spanning several folded lines. Only
	 * taken for STATE_MAIL and STATE_CENC headers; for every other header
	 * state the match is REJECTed so that other rules get a chance.
	 * Returns WORD with a strdup'ed copy of the value, leading whitespace
	 * removed.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);
	char *value;

	if (lstate->header_state != STATE_MAIL &&
	    lstate->header_state != STATE_CENC) {
		REJECT;
	}

	lstate->current_pos += yyleng;

	/*
	 * Skip leading whitespace using a local cursor instead of advancing
	 * yytext itself. The cast matters: a header byte >= 0x80 would
	 * otherwise reach isspace() as a negative value where char is signed,
	 * which is undefined behaviour.
	 */
	value = yytext;
	while (*value != '\0' && isspace((unsigned char)*value)) {
		value++;
	}

	/* Do we actually have a header value? */
	if (*value == '\0') {
		yylval_param->string = strdup("");
	} else {
		yylval_param->string = strdup(value);
		lstate->lineno += count_lines(value);
	}

	return WORD;
}

(\r\n|\n) {
	/* marks the end of one header line */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->lineno++;
	BC(headers);
	lstate->current_pos += yyleng;
	return EOL;
}

;|;(\r\n|\n)[\ \t]+ {
	/* Parameter separator, optionally followed by a folded line break. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->lineno += count_lines(yytext);
	lstate->current_pos += yyleng;
	return SEMICOLON;
}

\= {
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
	return EQUAL;
}

\" {
	/* Double quote: switch to `tspecialvalue`; the quote is the token. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	BC(tspecialvalue);
	lstate->current_pos += yyleng;
	return *yytext;
}

{STRING}+|{TSPECIAL_LITE}+ {
	/* Unquoted word; returned as WORD with a strdup'ed copy of the text. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	yylval_param->string = strdup(yytext);
	lstate->lineno += count_lines(yytext);
	lstate->current_pos += yyleng;
	return WORD;
}

[\ |\t]+ {
	/* Run of blanks, '|' or tabs: consumed without a token. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
}

{TSPECIAL}+ {
	/* Run of TSPECIAL characters; returned with a strdup'ed copy. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->lineno += count_lines(yytext);
	yylval_param->string = strdup(yytext);
	lstate->current_pos += yyleng;
	return TSPECIAL;
}

\" {
	/* Double quote: switch to `headervalue`; the quote is the token. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	BC(headervalue);
	lstate->current_pos += yyleng;
	return *yytext;
}

^\-\-{TSPECIAL}+\-\- {
	struct lexer_state *lstate = yyget_extra(yyscanner);

	/**
	 * Make sure we only catch matching boundaries, and not other lines
	 * that begin and end with two dashes. If we have caught a valid
	 * end boundary, which actually ends a body, we save the current
	 * position, put the token back on the input stream and let the
	 * endboundary condition parse the actual token.
	 */
	if (lstate->endboundary_string != NULL) {
		if (strcmp(lstate->endboundary_string, yytext)) {
			REJECT;
		} else {
			/*
			 * NOTE(review): current_pos is advanced here, yet the
			 * text is then either pushed back with yyless(0) or
			 * REJECTed below, so whichever rule rescans it adds
			 * its length again. The plain-boundary rule does not
			 * do this. Looks like a double count -- confirm
			 * against the parser before changing.
			 */
			lstate->current_pos += yyleng;
			if (lstate->body_start) {
				yylval_param->position.opaque_start =
				    lstate->body_opaque_start;
				yylval_param->position.start = lstate->body_start;
				/* End of body = position before this line. */
				yylval_param->position.end =
				    lstate->current_pos - yyleng;
				lstate->body_opaque_start = 0;
				lstate->body_start = 0;
				lstate->body_end = 0;
				yyless(0);
				BC(endboundary);
				return BODY;
			}
		}
	}

	REJECT;
}

^\-\-{TSPECIAL}+ {
	struct lexer_state *lstate = yyget_extra(yyscanner);

	/**
	 * Make sure we only catch matching boundaries, and not other lines
	 * that begin with two dashes.
	 */
	if (lstate->boundary_string != NULL) {
		if (strcmp(lstate->boundary_string, yytext)) {
			REJECT;
		} else {
			if (lstate->body_start) {
				/*
				 * A body was open: report it first, push the
				 * boundary back and rescan it in `boundary`.
				 */
				yylval_param->position.opaque_start =
				    lstate->body_opaque_start;
				yylval_param->position.start = lstate->body_start;
				yylval_param->position.end = lstate->current_pos;
				lstate->body_opaque_start = 0;
				lstate->body_start = 0;
				lstate->body_end = 0;
				yyless(0);
				BC(boundary);
				return BODY;
			} else if (lstate->preamble_start) {
				/* Same, but what ends here is the preamble. */
				yylval_param->position.start = lstate->preamble_start;
				yylval_param->position.end = lstate->current_pos;
				lstate->preamble_start = lstate->preamble_end = 0;
				yyless(0);
				BC(boundary);
				return PREAMBLE;
			} else {
				/* Nothing pending: consume the boundary itself. */
				BC(boundary);
				yylval_param->string = strdup(yytext);
				lstate->current_pos += yyleng;
				return(BOUNDARY);
			}
		}
	}

	REJECT;
}

(\r\n|\n) {
	/* Line break: counted, no token. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
	lstate->lineno++;
}

\r {
	/* Stray CR: counted, no token. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
}

[^\r\n]+ {
	/* Rest of a line: counted, no token. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
}

<> {
	/*
	 * With no boundary set and a body still open, the body runs up to the
	 * current position and is reported as BODY. With a boundary set and a
	 * body still open, POSTAMBLE is returned. Otherwise scanning ends.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	if (lstate->boundary_string == NULL && lstate->body_start) {
		yylval_param->position.opaque_start = 0;
		yylval_param->position.start = lstate->body_start;
		yylval_param->position.end = lstate->current_pos;
		lstate->body_start = 0;
		return BODY;
	} else if (lstate->body_start) {
		return POSTAMBLE;
	}

	yyterminate();
}

(\r\n|\n) {
	/* Line break: counted, no token. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->lineno++;
	lstate->current_pos += yyleng;
}

[^\r\n]+ {
	/* Rest of the line, returned as BOUNDARY with a strdup'ed copy. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	yylval_param->string = strdup(yytext);
	lstate->current_pos += yyleng;
	return BOUNDARY;
}

[^\r\n]+ {
	/* Rest of the line, returned as ENDBOUNDARY with a strdup'ed copy. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	yylval_param->string = strdup(yytext);
	lstate->current_pos += yyleng;
	return ENDBOUNDARY;
}

(\r\n|\n) {
	/*
	 * Line break: switch to `headers` and remember the position after it
	 * as body_opaque_start.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	BC(headers);
	lstate->lineno++;
	lstate->current_pos += yyleng;
	lstate->body_opaque_start = lstate->current_pos;
	return EOL;
}

(\r\n|\n) {
	/* Line break: switch to `postamble`, no token. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	BC(postamble);
	lstate->lineno++;
	lstate->current_pos += yyleng;
}

. {
	/* Single character: counted, no token. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
}

. {
	/* Single character: counted, no token. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
}

(\r\n|\n) {
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->lineno++;
	lstate->current_pos += yyleng;
	return EOL;
}

. {
	/* Single character, returned as its own token value. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
	return((int)*yytext);
}

%%

/**
 * Attaches the lexer state embedded in `pstate` to the scanner as its extra
 * data, resets the start condition to 0 and re-initialises the counters and
 * position markers. current_pos starts at 1, lineno at 0, and is_envelope is
 * set to 1. The boundary strings and message_buffer are not touched here.
 */
void
reset_lexer_state(void *yyscanner, struct parser_state *pstate)
{
	/* BEGIN() needs `yyg` in scope in a reentrant scanner. */
	struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
	struct lexer_state *lstate = &(pstate->lstate);

	yyset_extra((void*)lstate, yyscanner);
	BEGIN(0);

	lstate->header_state = STATE_MAIL;
	lstate->lineno = 0;
	lstate->current_pos = 1;
	lstate->condition = 0;

	lstate->is_envelope = 1;

	lstate->message_len = 0;
	lstate->buffer_length = 0;

	/* temporary marker variables */
	lstate->body_opaque_start = 0;
	lstate->body_start = 0;
	lstate->body_end = 0;
	lstate->preamble_start = 0;
	lstate->preamble_end = 0;
	lstate->postamble_start = 0;
	lstate->postamble_end = 0;
}

/**
 * Makes the scanner read from the NUL-terminated `string`. The pointer is
 * also stored in lexer_state.message_buffer, so the caller must keep the
 * string alive for as long as that field is used.
 */
void
PARSER_setbuffer(const char *string, yyscan_t scanner)
{
	struct lexer_state *lstate = yyget_extra(scanner);

	lstate->message_buffer = string;
	yy_scan_string(string, scanner);
}

/**
 * Makes the scanner read from the stdio stream `fp`.
 */
void
PARSER_setfp(FILE *fp, yyscan_t scanner)
{
	/* looks like a bug in bison 2.2a -- the wrong code is generated for
	 * yyset_in !! The input stream is therefore assigned directly.
	 */
	struct yyguts_t * yyg = (struct yyguts_t*) scanner;

	yyg->yyin_r = fp;

	if (0) {
		/* This is just to make a compiler warning go away */
		yyunput(0, NULL, scanner);
	}
}

/**
 * Counts how many lines a given string represents in the message (in case of
 * folded header values, for example, or a message body).
 *
 * Returns the number of '\n' characters in the NUL-terminated `txt`;
 * `txt` must not be NULL.
 */
int
count_lines(char *txt)
{
	char *o;
	int line;

	line = 0;

	for (o = txt; *o != '\0'; o++) {
		if (*o == '\n') {
			line++;
		}
	}

	return line;
}