Main Page | Modules | Class List | Directories | File List | Class Members | File Members | Related Pages

acsmx.c

Go to the documentation of this file.
00001 /*
00002 **
00003 ** $Id$
00004 **
00005 ** Multi-Pattern Search Engine
00006 **
00007 ** Aho-Corasick State Machine -  uses a Deterministic Finite Automata - DFA
00008 **
00009 ** Copyright (C) 2002 Sourcefire,Inc.
00010 ** Marc Norton
00011 **
00012 **  
00013 ** This program is free software; you can redistribute it and/or modify
00014 ** it under the terms of the GNU General Public License as published by
00015 ** the Free Software Foundation; either version 2 of the License, or
00016 ** (at your option) any later version.
00017 **
00018 ** This program is distributed in the hope that it will be useful,
00019 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
00020 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00021 ** GNU General Public License for more details.
00022 **
00023 ** You should have received a copy of the GNU General Public License
00024 ** along with this program; if not, write to the Free Software
00025 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
00026 **
00027 **
00028 **   Reference - Efficient String matching: An Aid to Bibliographic Search
00029 **               Alfred V Aho and Margaret J Corasick
00030 **               Bell Labratories 
00031 **               Copyright(C) 1975 Association for Computing Machinery,Inc
00032 **
00033 **   Implemented from the 4 algorithms in the paper by Aho & Corasick
00034 **   and some implementation ideas from 'Practical Algorithms in C'
00035 **
00036 **   Notes:
00037 **     1) This version uses about 1024 bytes per pattern character - heavy  on the memory. 
00038 **     2) This algorithm finds all occurrences of all patterns within a  
00039 **        body of text.
00040 **     3) Support is included to handle upper and lower case matching.     
00041 **     4) Some comopilers optimize the search routine well, others don't, this makes all the difference.
00042 **     5) Aho inspects all bytes of the search text, but only once so it's very efficient,
00043 **        if the patterns are all large than the Modified Wu-Manbar method is often faster.
00044 **     6) I don't subscribe to any one method is best for all searching needs,
00045 **        the data decides which method is best,
00046 **        and we don't know until after the search method has been tested on the specific data sets.
00047 **        
00048 **
00049 **  May 2002  : Marc Norton 1st Version  
00050 **  June 2002 : Modified interface for SNORT, added case support
00051 **  Aug 2002  : Cleaned up comments, and removed dead code.
00052 **  Nov 2,2002: Fixed queue_init() , added count=0
00053 **              
00054 ** 
00055 */  
00056   
00057 #include <stdio.h>
00058 #include <stdlib.h>
00059 #include <string.h>
00060 #include <ctype.h>
00061   
00062 #include "acsmx.h"
00063   
00064 #define MEMASSERT(p,s) if(!p){fprintf(stderr,"ACSM-No Memory: %s!\n",s);exit(0);}
00065 
00066 #ifdef DEBUG_AC
00067 static int max_memory = 0;
00068 #endif
00069 
00070 /*static void Print_DFA( ACSM_STRUCT * acsm );*/
00071 
00072 /*
00073 *
00074 */ 
00075 static void *
00076 AC_MALLOC (int n) 
00077 {
00078   void *p;
00079   p = malloc (n);
00080 #ifdef DEBUG_AC
00081   if (p)
00082     max_memory += n;
00083 #endif
00084   return p;
00085 }
00086 
00087 
00088 /*
00089 *
00090 */ 
00091 static void
00092 AC_FREE (void *p) 
00093 {
00094   if (p)
00095     free (p);
00096 }
00097 
00098 
00099 /*
00100 *    Simple QUEUE NODE
00101 */ 
00102 typedef struct _qnode
00103 {
00104   int state;
00105    struct _qnode *next;
00106 }
00107 QNODE;
00108 
00109 /*
00110 *    Simple QUEUE Structure
00111 */ 
00112 typedef struct _queue
00113 {
00114   QNODE * head, *tail;
00115   int count;
00116 }
00117 QUEUE;
00118 
00119 /*
00120 *
00121 */ 
00122 static void
00123 queue_init (QUEUE * s) 
00124 {
00125   s->head = s->tail = 0;
00126   s->count = 0;
00127 }
00128 
00129 
00130 /*
00131 *  Add Tail Item to queue
00132 */ 
00133 static void
00134 queue_add (QUEUE * s, int state) 
00135 {
00136   QNODE * q;
00137   if (!s->head)
00138     {
00139       q = s->tail = s->head = (QNODE *) AC_MALLOC (sizeof (QNODE));
00140       MEMASSERT (q, "queue_add");
00141       q->state = state;
00142       q->next = 0;
00143     }
00144   else
00145     {
00146       q = (QNODE *) AC_MALLOC (sizeof (QNODE));
00147       MEMASSERT (q, "queue_add");
00148       q->state = state;
00149       q->next = 0;
00150       s->tail->next = q;
00151       s->tail = q;
00152     }
00153   s->count++;
00154 }
00155 
00156 
00157 /*
00158 *  Remove Head Item from queue
00159 */ 
00160 static int
00161 queue_remove (QUEUE * s) 
00162 {
00163   int state = 0;
00164   QNODE * q;
00165   if (s->head)
00166     {
00167       q = s->head;
00168       state = q->state;
00169       s->head = s->head->next;
00170       s->count--;
00171       if (!s->head)
00172         {
00173           s->tail = 0;
00174           s->count = 0;
00175         }
00176       AC_FREE (q);
00177     }
00178   return state;
00179 }
00180 
00181 
00182 /*
00183 *
00184 */ 
00185 static int
00186 queue_count (QUEUE * s) 
00187 {
00188   return s->count;
00189 }
00190 
00191 
00192 /*
00193 *
00194 */ 
00195 static void
00196 queue_free (QUEUE * s) 
00197 {
00198   while (queue_count (s))
00199     {
00200       queue_remove (s);
00201     }
00202 }
00203 
00204 
00205 /*
00206 ** Case Translation Table 
00207 */ 
00208 static unsigned char xlatcase[256];
00209 
00210 /*
00211 *
00212 */ 
00213   static void
00214 init_xlatcase () 
00215 {
00216   int i;
00217   for (i = 0; i < 256; i++)
00218     {
00219       xlatcase[i] = toupper (i);
00220     }
00221 }
00222 
00223 
00224 /*
00225 *
00226 */ 
00227   static inline void
00228 ConvertCase (unsigned char *s, int m) 
00229 {
00230   int i;
00231   for (i = 0; i < m; i++)
00232     {
00233       s[i] = xlatcase[s[i]];
00234     }
00235 }
00236 
00237 
00238 /*
00239 *
00240 */ 
00241 static inline void
00242 ConvertCaseEx (unsigned char *d, unsigned char *s, int m) 
00243 {
00244   int i;
00245   for (i = 0; i < m; i++)
00246     {
00247       d[i] = xlatcase[s[i]];
00248     }
00249 }
00250 
00251 
00252 /*
00253 *
00254 */ 
00255 static ACSM_PATTERN *
00256 CopyMatchListEntry (ACSM_PATTERN * px) 
00257 {
00258   ACSM_PATTERN * p;
00259   p = (ACSM_PATTERN *) AC_MALLOC (sizeof (ACSM_PATTERN));
00260   MEMASSERT (p, "CopyMatchListEntry");
00261   memcpy (p, px, sizeof (ACSM_PATTERN));
00262   p->next = 0;
00263   return p;
00264 }
00265 
00266 
00267 /*
00268 *  Add a pattern to the list of patterns terminated at this state.
00269 *  Insert at front of list.
00270 */ 
00271 static void
00272 AddMatchListEntry (ACSM_STRUCT * acsm, int state, ACSM_PATTERN * px) 
00273 {
00274   ACSM_PATTERN * p;
00275   p = (ACSM_PATTERN *) AC_MALLOC (sizeof (ACSM_PATTERN));
00276   MEMASSERT (p, "AddMatchListEntry");
00277   memcpy (p, px, sizeof (ACSM_PATTERN));
00278   p->next = acsm->acsmStateTable[state].MatchList;
00279   acsm->acsmStateTable[state].MatchList = p;
00280 }
00281 
00282 
00283 /* 
00284    Add Pattern States
00285 */ 
00286 static void
00287 AddPatternStates (ACSM_STRUCT * acsm, ACSM_PATTERN * p) 
00288 {
00289   unsigned char *pattern;
00290   int state=0, next, n;
00291   n = p->n;
00292   pattern = p->patrn;
00293   
00294     /* 
00295      *  Match up pattern with existing states
00296      */ 
00297     for (; n > 0; pattern++, n--)
00298     {
00299       next = acsm->acsmStateTable[state].NextState[*pattern];
00300       if (next == ACSM_FAIL_STATE)
00301         break;
00302       state = next;
00303     }
00304   
00305     /*
00306      *   Add new states for the rest of the pattern bytes, 1 state per byte
00307      */ 
00308     for (; n > 0; pattern++, n--)
00309     {
00310       acsm->acsmNumStates++;
00311       acsm->acsmStateTable[state].NextState[*pattern] = acsm->acsmNumStates;
00312       state = acsm->acsmNumStates;
00313     }
00314     
00315   AddMatchListEntry (acsm, state, p);
00316 }
00317 
00318 
00319 /*
00320 *   Build Non-Deterministic Finite Automata
00321 */ 
00322 static void
00323 Build_NFA (ACSM_STRUCT * acsm) 
00324 {
00325   int r, s;
00326   int i;
00327   QUEUE q, *queue = &q;
00328   ACSM_PATTERN * mlist=0;
00329   ACSM_PATTERN * px=0;
00330   
00331     /* Init a Queue */ 
00332     queue_init (queue);
00333   
00334     /* Add the state 0 transitions 1st */ 
00335     for (i = 0; i < ALPHABET_SIZE; i++)
00336     {
00337       s = acsm->acsmStateTable[0].NextState[i];
00338       if (s)
00339         {
00340           queue_add (queue, s);
00341           acsm->acsmStateTable[s].FailState = 0;
00342         }
00343     }
00344   
00345     /* Build the fail state transitions for each valid state */ 
00346     while (queue_count (queue) > 0)
00347     {
00348       r = queue_remove (queue);
00349       
00350         /* Find Final States for any Failure */ 
00351         for (i = 0; i < ALPHABET_SIZE; i++)
00352         {
00353           int fs, next;
00354           if ((s = acsm->acsmStateTable[r].NextState[i]) != ACSM_FAIL_STATE)
00355             {
00356               queue_add (queue, s);
00357               fs = acsm->acsmStateTable[r].FailState;
00358               
00359                 /* 
00360                    *  Locate the next valid state for 'i' starting at s 
00361                  */ 
00362                 while ((next=acsm->acsmStateTable[fs].NextState[i]) ==
00363                        ACSM_FAIL_STATE)
00364                 {
00365                   fs = acsm->acsmStateTable[fs].FailState;
00366                 }
00367               
00368                 /*
00369                    *  Update 's' state failure state to point to the next valid state
00370                  */ 
00371                 acsm->acsmStateTable[s].FailState = next;
00372               
00373                 /*
00374                    *  Copy 'next'states MatchList to 's' states MatchList, 
00375                    *  we copy them so each list can be AC_FREE'd later,
00376                    *  else we could just manipulate pointers to fake the copy.
00377                  */ 
00378                 for (mlist  = acsm->acsmStateTable[next].MatchList; 
00379                      mlist != NULL ;
00380                      mlist  = mlist->next)
00381                 {
00382                     px = CopyMatchListEntry (mlist);
00383 
00384                     if( !px )
00385                     {
00386                     printf("*** Out of memory Initializing Aho Corasick in acsmx.c ****");
00387                     }
00388         
00389                     /* Insert at front of MatchList */ 
00390                     px->next = acsm->acsmStateTable[s].MatchList;
00391                     acsm->acsmStateTable[s].MatchList = px;
00392                 }
00393             }
00394         }
00395     }
00396   
00397     /* Clean up the queue */ 
00398     queue_free (queue);
00399 }
00400 
00401 
00402 /*
00403 *   Build Deterministic Finite Automata from NFA
00404 */ 
00405 static void
00406 Convert_NFA_To_DFA (ACSM_STRUCT * acsm) 
00407 {
00408   int r, s;
00409   int i;
00410   QUEUE q, *queue = &q;
00411   
00412     /* Init a Queue */ 
00413     queue_init (queue);
00414   
00415     /* Add the state 0 transitions 1st */ 
00416     for (i = 0; i < ALPHABET_SIZE; i++)
00417     {
00418       s = acsm->acsmStateTable[0].NextState[i];
00419       if (s)
00420         {
00421           queue_add (queue, s);
00422         }
00423     }
00424   
00425     /* Start building the next layer of transitions */ 
00426     while (queue_count (queue) > 0)
00427     {
00428       r = queue_remove (queue);
00429       
00430         /* State is a branch state */ 
00431         for (i = 0; i < ALPHABET_SIZE; i++)
00432         {
00433           if ((s = acsm->acsmStateTable[r].NextState[i]) != ACSM_FAIL_STATE)
00434             {
00435               queue_add (queue, s);
00436             }
00437           else
00438             {
00439               acsm->acsmStateTable[r].NextState[i] =
00440                 acsm->acsmStateTable[acsm->acsmStateTable[r].FailState].
00441                 NextState[i];
00442             }
00443         }
00444     }
00445   
00446     /* Clean up the queue */ 
00447     queue_free (queue);
00448 }
00449 
00450 
00451 /*
00452 *
00453 */ 
00454 ACSM_STRUCT * acsmNew () 
00455 {
00456   ACSM_STRUCT * p;
00457   init_xlatcase ();
00458   p = (ACSM_STRUCT *) AC_MALLOC (sizeof (ACSM_STRUCT));
00459   MEMASSERT (p, "acsmNew");
00460   if (p)
00461     memset (p, 0, sizeof (ACSM_STRUCT));
00462   return p;
00463 }
00464 
00465 
00466 /*
00467 *   Add a pattern to the list of patterns for this state machine
00468 */ 
00469 int
00470 acsmAddPattern (ACSM_STRUCT * p, unsigned char *pat, int n, int nocase,
00471                 int offset, int depth, void * id, int iid) 
00472 {
00473   ACSM_PATTERN * plist;
00474   plist = (ACSM_PATTERN *) AC_MALLOC (sizeof (ACSM_PATTERN));
00475   MEMASSERT (plist, "acsmAddPattern");
00476   plist->patrn = (unsigned char *) AC_MALLOC (n);
00477   ConvertCaseEx (plist->patrn, pat, n);
00478   plist->casepatrn = (unsigned char *) AC_MALLOC (n);
00479   memcpy (plist->casepatrn, pat, n);
00480   plist->n = n;
00481   plist->nocase = nocase;
00482   plist->offset = offset;
00483   plist->depth = depth;
00484   plist->id = id;
00485   plist->iid = iid;
00486   plist->next = p->acsmPatterns;
00487   p->acsmPatterns = plist;
00488   return 0;
00489 }
00490 
00491 
00492 /*
00493 *   Compile State Machine
00494 */ 
00495 int
00496 acsmCompile (ACSM_STRUCT * acsm) 
00497 {
00498   int i, k;
00499   ACSM_PATTERN * plist;
00500   
00501     /* Count number of states */ 
00502     acsm->acsmMaxStates = 1;
00503   for (plist = acsm->acsmPatterns; plist != NULL; plist = plist->next)
00504     {
00505       acsm->acsmMaxStates += plist->n;
00506     }
00507   acsm->acsmStateTable =
00508     (ACSM_STATETABLE *) AC_MALLOC (sizeof (ACSM_STATETABLE) *
00509                                    acsm->acsmMaxStates);
00510   MEMASSERT (acsm->acsmStateTable, "acsmCompile");
00511   memset (acsm->acsmStateTable, 0,
00512             sizeof (ACSM_STATETABLE) * acsm->acsmMaxStates);
00513   
00514     /* Initialize state zero as a branch */ 
00515     acsm->acsmNumStates = 0;
00516   
00517     /* Initialize all States NextStates to FAILED */ 
00518     for (k = 0; k < acsm->acsmMaxStates; k++)
00519     {
00520       for (i = 0; i < ALPHABET_SIZE; i++)
00521         {
00522           acsm->acsmStateTable[k].NextState[i] = ACSM_FAIL_STATE;
00523         }
00524     }
00525   
00526     /* Add each Pattern to the State Table */ 
00527     for (plist = acsm->acsmPatterns; plist != NULL; plist = plist->next)
00528     {
00529       AddPatternStates (acsm, plist);
00530     }
00531   
00532     /* Set all failed state transitions to return to the 0'th state */ 
00533     for (i = 0; i < ALPHABET_SIZE; i++)
00534     {
00535       if (acsm->acsmStateTable[0].NextState[i] == ACSM_FAIL_STATE)
00536         {
00537           acsm->acsmStateTable[0].NextState[i] = 0;
00538         }
00539     }
00540   
00541     /* Build the NFA  */ 
00542     Build_NFA (acsm);
00543   
00544     /* Convert the NFA to a DFA */ 
00545     Convert_NFA_To_DFA (acsm);
00546   
00547     /*
00548       printf ("ACSMX-Max Memory: %d bytes, %d states\n", max_memory,
00549              acsm->acsmMaxStates);
00550      */
00551 
00552   
00553     //Print_DFA( acsm );
00554 
00555     return 0;
00556 }
00557 
00558 
00559 static unsigned char Tc[64*1024];
00560 
00561 /*
00562 *   Search Text or Binary Data for Pattern matches
00563 */ 
00564   int
00565 acsmSearch (ACSM_STRUCT * acsm, unsigned char *Tx, int n,
00566             int (*Match) (void *  id, int index, void *data), void *data) 
00567 {
00568   int state;
00569   ACSM_PATTERN * mlist;
00570   unsigned char *Tend;
00571   ACSM_STATETABLE * StateTable = acsm->acsmStateTable;
00572   int nfound = 0;
00573   unsigned char *T;
00574   int index;
00575   
00576   /* Case conversion */ 
00577   ConvertCaseEx (Tc, Tx, n);
00578   T = Tc;
00579   Tend = T + n;
00580  
00581   for (state = 0; T < Tend; T++)
00582     {
00583       state = StateTable[state].NextState[*T];
00584 
00585       if( StateTable[state].MatchList != NULL )
00586         {
00587           for( mlist=StateTable[state].MatchList; mlist!=NULL;
00588                mlist=mlist->next )
00589             {
00590               index = T - mlist->n + 1 - Tc;
00591               if( mlist->nocase )
00592                 {
00593                   nfound++;
00594                   if (Match (mlist->id, index, data))
00595                     return nfound;
00596                 }
00597               else
00598                 {
00599                   if( memcmp (mlist->casepatrn, Tx + index, mlist->n) == 0 )
00600                     {
00601                       nfound++;
00602                       if (Match (mlist->id, index, data))
00603                         return nfound;
00604                     }
00605                 }
00606             }
00607         }
00608     }
00609   return nfound;
00610 }
00611 
00612 
00613 /*
00614 *   Free all memory
00615 */ 
00616   void
00617 acsmFree (ACSM_STRUCT * acsm) 
00618 {
00619   int i;
00620   ACSM_PATTERN * mlist, *ilist;
00621   for (i = 0; i < acsm->acsmMaxStates; i++)
00622     
00623     {
00624       if (acsm->acsmStateTable[i].MatchList != NULL)
00625         
00626         {
00627           mlist = acsm->acsmStateTable[i].MatchList;
00628           while (mlist)
00629             
00630             {
00631               ilist = mlist;
00632               mlist = mlist->next;
00633               AC_FREE (ilist);
00634             }
00635         }
00636     }
00637   AC_FREE (acsm->acsmStateTable);
00638 }
00639 
00640 /*
00641  * 
00642  */ 
00643 /*
00644 static void Print_DFA( ACSM_STRUCT * acsm )
00645 {
00646     int k;
00647     int i;
00648     int next;
00649     
00650     for (k = 0; k < acsm->acsmMaxStates; k++)
00651     {
00652       for (i = 0; i < ALPHABET_SIZE; i++)
00653         {
00654           next = acsm->acsmStateTable[k].NextState[i];
00655 
00656           if( next == 0 || next ==  ACSM_FAIL_STATE )
00657           {
00658            if( isprint(i) )
00659              printf("%3c->%-5d\t",i,next);
00660            else
00661              printf("%3d->%-5d\t",i,next);
00662           }
00663         }
00664       printf("\n");
00665     }
00666     
00667 } 
00668 */
00669         
00670 
00671 int acsmPrintDetailInfo(ACSM_STRUCT * p)
00672 {
00673     return 0;
00674 }
00675         
00676 int acsmPrintSummaryInfo(ACSM_STRUCT *acsm )
00677 {
00678 #ifdef XXXXX
00679     char * fsa[]={
00680       "TRIE",
00681       "NFA",
00682       "DFA",
00683     };
00684 
00685     ACSM_STRUCT2 * p = &summary.acsm;
00686 
00687     if( !summary.num_states )
00688             return;
00689     
00690     printf("+--[Pattern Matcher:Aho-Corasick Summary]----------------------\n");
00691     printf("| Alphabet Size    : %d Chars\n",p->acsmAlphabetSize);
00692     printf("| Sizeof State     : %d bytes\n",sizeof(acstate_t));
00693     printf("| Storage Format   : %s \n",sf[ p->acsmFormat ]);
00694     printf("| Num States       : %d\n",summary.num_states);
00695     printf("| Num Transitions  : %d\n",summary.num_transitions);
00696     printf("| State Density    : %.1f%%\n",100.0*(double)summary.num_transitions/(summary.num_states*p->acsmAlphabetSize));
00697     printf("| Finite Automatum : %s\n", fsa[p->acsmFSA]);
00698     if( max_memory < 1024*1024 )
00699     printf("| Memory           : %.2fKbytes\n", (float)max_memory/1024 );
00700     else
00701     printf("| Memory           : %.2fMbytes\n", (float)max_memory/(1024*1024) );
00702     printf("+-------------------------------------------------------------\n");
00703 
00704 #endif
00705     return 0;
00706 }
00707 
00708 
00709 #ifdef ACSMX_MAIN
00710   
00711 /*
00712 *  Text Data Buffer
00713 */ 
00714 unsigned char text[512];
00715 
00716 /* 
00717 *    A Match is found
00718 */ 
00719   int
00720 MatchFound (unsigned id, int index, void *data) 
00721 {
00722   fprintf (stdout, "%s\n", (char *) id);
00723   return 0;
00724 }
00725 
00726 
00727 /*
00728 *
00729 */ 
00730   int
00731 main (int argc, char **argv) 
00732 {
00733   int i, nocase = 0;
00734   ACSM_STRUCT * acsm;
00735   if (argc < 3)
00736     
00737     {
00738       fprintf (stderr,
00739                 "Usage: acsmx pattern word-1 word-2 ... word-n  -nocase\n");
00740       exit (0);
00741     }
00742   acsm = acsmNew ();
00743   strcpy (text, argv[1]);
00744   for (i = 1; i < argc; i++)
00745     if (strcmp (argv[i], "-nocase") == 0)
00746       nocase = 1;
00747   for (i = 2; i < argc; i++)
00748     
00749     {
00750       if (argv[i][0] == '-')
00751         continue;
00752       acsmAddPattern (acsm, argv[i], strlen (argv[i]), nocase, 0, 0,
00753                         argv[i], i - 2);
00754     }
00755   acsmCompile (acsm);
00756   acsmSearch (acsm, text, strlen (text), MatchFound, (void *) 0);
00757   acsmFree (acsm);
00758   printf ("normal pgm end\n");
00759   return (0);
00760 }
00761 #endif /*  */
00762 

Generated on Sun May 14 14:51:18 2006 by  doxygen 1.4.2