Main Page | Modules | Class List | Directories | File List | Class Members | File Members | Related Pages

sfksearch.c

Go to the documentation of this file.
00001 /*
00002 *  ksearch.c
00003 *  
00004 *  Basic Keyword Search Trie - uses linked lists to build the finite automata
00005 *
00006 *  Keyword-Match: Performs the equivalent of a multi-string strcmp() 
00007 *     - use for token testing after parsing the language tokens using lex or the like.
00008 *
00009 *  Keyword-Search: searches the input text for one of multiple keywords, 
00010 *  and supports case sensitivite and case insensitive patterns.
00011 *   
00012 *
00013 **  Copyright (C) 2001 Marc Norton
00014 ** Copyright (C) 2003 Sourcefire, Inc
00015 **
00016 ** This program is free software; you can redistribute it and/or modify
00017 ** it under the terms of the GNU General Public License as published by
00018 ** the Free Software Foundation; either version 2 of the License, or
00019 ** (at your option) any later version.
00020 **
00021 ** This program is distributed in the hope that it will be useful,
00022 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
00023 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00024 ** GNU General Public License for more details.
00025 **
00026 ** You should have received a copy of the GNU General Public License
00027 ** along with this program; if not, write to the Free Software
00028 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
00029 *
00030 *
00031 */
00032 #include <stdio.h>
00033 #include <stdlib.h>
00034 #include <string.h>
00035 
00036 #include <ctype.h>
00037 
00038 #include "sfksearch.h"
00039 
00040 /*
00041 *  Allocate Memory
00042 */
00043 static void * KTRIE_MALLOC( int n )
00044 {
00045    void * p;
00046 
00047    p = malloc( n );
00048 
00049    memset(p,0,n);
00050 
00051    return p;
00052 }
00053 
00054 /*
00055 *  Free Memory
00056 */
00057 /*
00058 static void KTRIE_FREE( void * p )
00059 {
00060    if( p ) free( p );
00061 }
00062 */
00063 
00064 /*
00065 *   Local/Tmp nocase array
00066 */
00067 static unsigned char Tnocase[65*1024];
00068 
00069 /*
00070 ** Case Translation Table 
00071 */
00072 static unsigned char xlatcase[256];
00073 
00074 /*
00075 *
00076 */
00077 static void init_xlatcase()
00078 {
00079    int i;
00080    static int first=1;
00081 
00082    if( !first ) return; /* thread safe */
00083    
00084    for(i=0;i<256;i++)
00085    {
00086      xlatcase[ i ] =  (unsigned char)tolower(i);
00087    }
00088 
00089    first=0;
00090 }
00091 
00092 /*
00093 *
00094 */
00095 static inline void ConvertCaseEx( unsigned char * d, unsigned char *s, int m )
00096 {
00097      int i;
00098      for( i=0; i < m; i++ )
00099      {
00100        d[i] = xlatcase[ s[i] ];
00101      }
00102 }
00103 
00104 
00105 /*
00106 *
00107 */
00108 KTRIE_STRUCT * KTrieNew()
00109 {
00110    KTRIE_STRUCT * ts = (KTRIE_STRUCT*) KTRIE_MALLOC( sizeof(KTRIE_STRUCT) );
00111 
00112    if( !ts ) return 0;
00113    
00114    memset(ts, 0, sizeof(KTRIE_STRUCT));  
00115 
00116    init_xlatcase();
00117 
00118    ts->memory = sizeof(KTRIE_STRUCT);
00119    ts->nchars = 0;
00120    ts->npats  = 0;
00121 
00122    return ts;
00123 }
00124 
00125 /*
00126 *
00127 */
00128 static KTRIEPATTERN * KTrieNewPattern(unsigned char * P, int n)
00129 {
00130    KTRIEPATTERN *p = (KTRIEPATTERN*) KTRIE_MALLOC( sizeof(KTRIEPATTERN) );
00131 
00132    if( !p ) return 0;
00133 
00134    /* Save as a nocase string */   
00135    p->P = (unsigned char*) KTRIE_MALLOC( n );
00136    if( !p->P ) 
00137        return 0;
00138 
00139    ConvertCaseEx( p->P, P, n );
00140 
00141    /* Save Case specific version */
00142    p->Pcase = (unsigned char*) KTRIE_MALLOC( n );
00143    if( !p->Pcase ) 
00144        return 0;
00145    memcpy( p->Pcase, P, n );
00146    
00147    p->n    = n;
00148    p->next = 0;
00149 
00150    return p;
00151 }
00152 
00153 /*
00154 *  Add Pattern info to the list of patterns
00155 */
00156 int KTrieAddPattern( KTRIE_STRUCT * ts, unsigned char * P, int n, 
00157                       int nocase, void * id )
00158 {
00159    KTRIEPATTERN  *new;
00160 
00161    if( !ts->patrn )
00162    {
00163        new = ts->patrn = KTrieNewPattern( P, n );
00164 
00165        if( !new ) return -1;
00166    }
00167    else
00168    {
00169        new = KTrieNewPattern(P, n );
00170 
00171        if( !new ) return -1;
00172 
00173        new->next = ts->patrn; /* insert at head of list */
00174 
00175        ts->patrn = new;
00176    }
00177 
00178    new->nocase = nocase;
00179    new->id     = id;
00180    new->mnext  = NULL;
00181 
00182    ts->npats++;
00183    ts->memory += sizeof(KTRIEPATTERN) + 2 * n ; /* Case and nocase */
00184    
00185    return 1;
00186 }
00187 
00188 
00189 /*
00190 *
00191 */
00192 static KTRIENODE * KTrieCreateNode(KTRIE_STRUCT * ts)
00193 {
00194    KTRIENODE * t=(KTRIENODE*)KTRIE_MALLOC( sizeof(KTRIENODE) );
00195 
00196    if(!t)
00197       return 0;
00198 
00199    memset(t,0,sizeof(KTRIENODE));
00200 
00201    ts->memory += sizeof(KTRIENODE);
00202    
00203    return t;
00204 }
00205 
00206 
00207 /*
00208 *  Insert a Pattern in the Trie
00209 */
00210 static int KTrieInsert( KTRIE_STRUCT *ts, KTRIEPATTERN * px  )
00211 {
00212    int            type = 0;
00213    int            n = px->n;
00214    unsigned char *P = px->P;
00215    KTRIENODE     *root;
00216    
00217    /* Make sure we at least have a root character for the tree */
00218    if( !ts->root[*P] )
00219    {
00220       ts->root[*P] = root = KTrieCreateNode(ts);
00221       if( !root ) return -1;
00222       root->edge = *P;
00223 
00224    }else{
00225 
00226       root = ts->root[*P];
00227    }
00228 
00229    /* Walk existing Patterns */   
00230    while( n )
00231    {
00232      if( root->edge == *P )
00233      {
00234          P++;
00235          n--;
00236 
00237          if( n && root->child )
00238          {
00239             root=root->child;   
00240          }
00241          else /* cannot continue */
00242          {
00243             type = 0; /* Expand the tree via the child */
00244             break; 
00245          }
00246      }
00247      else
00248      {
00249          if( root->sibling )
00250          {
00251             root=root->sibling;
00252          }
00253          else /* cannot continue */
00254          {
00255             type = 1; /* Expand the tree via the sibling */
00256             break; 
00257          }
00258      }
00259    }
00260 
00261    /* 
00262    * Add the next char of the Keyword, if any
00263    */
00264    if( n )
00265    {
00266      if( type == 0 )
00267      {
00268       /*
00269       *  Start with a new child to finish this Keyword 
00270       */
00271       root->child= KTrieCreateNode( ts );
00272       if( ! root->child ) return -1;
00273       root=root->child;
00274       root->edge  = *P;
00275       P++;
00276       n--;
00277       ts->nchars++;
00278 
00279      }
00280      else
00281      { 
00282       /*
00283       *  Start a new sibling bracnch to finish this Keyword 
00284       */
00285       root->sibling= KTrieCreateNode( ts );
00286       if( ! root->sibling ) return -1;
00287       root=root->sibling;
00288       root->edge  = *P;
00289       P++;
00290       n--;
00291       ts->nchars++;
00292      }
00293    }
00294 
00295    /*
00296    *    Finish the keyword as child nodes
00297    */
00298    while( n )
00299    {
00300       root->child = KTrieCreateNode(ts);
00301       if( ! root->child ) return -1;
00302       root=root->child;
00303       root->edge  = *P;
00304       P++;
00305       n--;
00306       ts->nchars++;
00307    }
00308 
00309    if( root->pkeyword )
00310    {
00311       px->mnext = root->pkeyword;  /* insert duplicates at front of list */
00312       root->pkeyword = px;
00313       ts->duplicates++;
00314    }
00315    else
00316    {
00317       root->pkeyword = px;
00318    }
00319 
00320    return 0;
00321 }
00322 
00323 
00324 /*
00325 *
00326 */
00327 static void Build_Bad_Character_Shifts( KTRIE_STRUCT * kt )
00328 {
00329     int           i,k;
00330     KTRIEPATTERN *plist; 
00331 
00332     /* Calc the min pattern size */
00333     kt->bcSize = 32000;
00334 
00335     for( plist=kt->patrn; plist!=NULL; plist=plist->next )
00336     { 
00337       if( plist->n < kt->bcSize )     
00338       {
00339           kt->bcSize = plist->n; /* smallest pattern size */
00340       }
00341     }
00342 
00343     /*
00344     *  Initialze the Bad Character shift table.  
00345     */
00346     for(i=0;i<256;i++)
00347     {
00348       kt->bcShift[i] = (unsigned short)kt->bcSize;  
00349     }
00350 
00351     /* 
00352     *  Finish the Bad character shift table
00353     */  
00354     for( plist=kt->patrn; plist!=NULL; plist=plist->next )
00355     {
00356        int shift, cindex;
00357 
00358        for( k=0; k<kt->bcSize; k++ )
00359        {
00360           shift = kt->bcSize - 1 - k;
00361 
00362           cindex = plist->P[ k ];
00363 
00364           if( shift < kt->bcShift[ cindex ] )
00365           {
00366               kt->bcShift[ cindex ] = (unsigned short)shift;
00367           }
00368        }
00369     }
00370 }
00371 
00372 
00373 /*
00374 *  Build the Keyword TRIE
00375 *  
00376 */
00377 int KTrieCompile(KTRIE_STRUCT * ts)
00378 {
00379   KTRIEPATTERN * p;
00380   /*
00381   static int  tmem=0; 
00382   */
00383 
00384   /* 
00385   *    Build the Keyword TRIE 
00386   */
00387   for( p=ts->patrn; p; p=p->next )
00388   {
00389        if( KTrieInsert( ts, p ) )
00390        return -1;
00391   }
00392 
00393   /*
00394   *    Build A Setwise Bad Character Shift Table
00395   */
00396   Build_Bad_Character_Shifts( ts );
00397 
00398   /*
00399   tmem += ts->memory;
00400   printf(" Compile stats: %d patterns, %d chars, %d duplicate patterns, %d bytes, %d total-bytes\n",ts->npats,ts->nchars,ts->duplicates,ts->memory,tmem);
00401   */
00402 
00403   return 0;
00404 }
00405 
00406 /*
00407 *   Search - Algorithm
00408 *
00409 *   This routine will log any substring of T that matches a keyword,
00410 *   and processes all prefix matches. This is used for generic
00411 *   pattern searching with a set of keywords and a body of text.
00412 *
00413 *   
00414 *
00415 *   kt- Trie Structure 
00416 *   T - nocase text
00417 *   Tc- case specific text
00418 *   n - text length 
00419 * 
00420 *   returns:
00421 *       # pattern matches
00422 */
00423 static inline int KTriePrefixMatch( KTRIE_STRUCT  * kt, 
00424                                     unsigned char * T, 
00425                                     unsigned char * Tc, 
00426                                     unsigned char * bT, 
00427                                     int n,
00428        int(*match)( void * id,  int index, void * data ),
00429        void * data )
00430 {
00431    KTRIENODE     * root   = kt->root[ *T ];
00432    int             nfound = 0;
00433    KTRIEPATTERN  * pk;
00434    int index ;
00435 
00436    /* Check if any keywords start with this character */
00437    if( !root ) return 0;
00438         
00439    while( n )
00440    {
00441      if( root->edge == *T )
00442      {
00443          T++;
00444          n--;
00445 
00446          for( pk = root->pkeyword; pk; pk= pk->mnext ) /* log each and every prefix match */
00447          {
00448             index = (int)(T - bT - pk->n );
00449 
00450             if( pk->nocase )
00451             {
00452                 nfound++;
00453                 if( match( pk->id, index, data ) )
00454                   return nfound;
00455             }
00456             else
00457             {   /* Retest with a Case Sensitive Test */
00458                 if( !memcmp(pk->Pcase,Tc,pk->n) )
00459                 {
00460                   nfound++;
00461                   if( match( pk->id, index, data ) )
00462                     return nfound;
00463                 }
00464             }
00465          }
00466 
00467          if( n && root->child )
00468          {
00469             root = root->child;   
00470          }
00471          else /* cannot continue -- match is over */
00472          {
00473             break; 
00474          }
00475      }
00476      else
00477      {
00478          if( root->sibling )
00479          {
00480             root = root->sibling;
00481          }
00482          else /* cannot continue */
00483          {
00484             break; 
00485          }
00486      }
00487    }
00488 
00489    return nfound;
00490 }
00491 
00492 /*
00493 *
00494 */
00495 static inline int KTrieSearchNoBC( KTRIE_STRUCT * ks, unsigned char * Tx, int n, 
00496               int(*match)( void *  id,  int index, void * data ), void * data )
00497 {
00498    int            nfound = 0;
00499    unsigned char *T, *bT;
00500 
00501    ConvertCaseEx( Tnocase, Tx, n );
00502 
00503    T  = Tnocase;
00504    bT = T;
00505 
00506    for( ; n>0 ; n--, T++, Tx++ )
00507    {
00508       nfound += KTriePrefixMatch( ks, T, Tx, bT, n, match, data );
00509    }
00510 
00511    return nfound;
00512 }
00513 
00514 /*
00515 *
00516 */
00517 static inline int KTrieSearchBC( KTRIE_STRUCT * ks, unsigned char * Tx, int n,
00518               int(*match)( void * id,  int index, void * data ), void * data )
00519 {
00520    int             tshift;
00521    unsigned char  *Tend;
00522    unsigned char  *T, *bT;
00523    int             nfound  = 0; 
00524    short          *bcShift = (short*)ks->bcShift;
00525    int             bcSize  = ks->bcSize;
00526 
00527    ConvertCaseEx( Tnocase, Tx, n );
00528 
00529    T  = Tnocase;
00530    bT = T;
00531 
00532    Tend = T + n - bcSize;
00533 
00534    bcSize--;
00535 
00536    for( ;T <= Tend; n--, T++, Tx++ )
00537    {
00538        while( (tshift = bcShift[ *( T + bcSize ) ]) > 0 ) 
00539        {
00540           T  += tshift;
00541           Tx += tshift;
00542           if( T > Tend ) return nfound;
00543        }
00544 
00545        nfound += KTriePrefixMatch( ks, T, Tx, bT, n, match, data );
00546    }
00547 
00548    return nfound;
00549 }
00550 
00551 /*
00552 *
00553 */
00554 int KTrieSearch( KTRIE_STRUCT * ks, unsigned char * T, int n, 
00555     int(*match)( void * id,  int index, void * data ), void * data )
00556 {  
00557     if( ks->bcSize < 3)
00558         return KTrieSearchNoBC( ks, T, n, match, data );
00559     else
00560         return KTrieSearchBC( ks, T, n, match, data );
00561 }
00562 
00563 /*
00564 *
00565 *    TEST DRIVER FOR KEYWORD TRIE
00566 *
00567 */
00568 #ifdef KTRIE_MAIN
00569 
00570 char ** gargv;
00571 
00572 int trie_nmatches = 0;
00573 
00574 int match( unsigned id, int index, void * data )
00575 {
00576    trie_nmatches++;
00577    data = data;
00578    printf("id=%d found at index=%d, %s\n",id,index,gargv[id]);
00579    return 0;
00580 }
00581 
00582 /*
00583 *
00584 */
00585 int main( int argc, char ** argv )
00586 {
00587     int i;
00588     KTRIE_STRUCT * ts;
00589     int nocase=1;  // don't care about case
00590 
00591     gargv = argv;
00592     
00593     ts = KTrieNew();
00594 
00595     if( argc < 3 )
00596     {
00597         printf("%s text pat1 pat2 ... patn [-c(ase-sensitive)\n",argv[0]);
00598         printf("search for keywords-default, or match keywords\n");
00599         exit(0); 
00600     }
00601 
00602     for(i=1;i<argc;i++)
00603     {    
00604        if( strcmp(argv[i],"-c")==0 ) nocase=0; /* ignore case */
00605     }
00606 
00607     printf("New TRIE created\n");
00608 
00609     for(i=2;i<argc;i++)
00610     {
00611        if( argv[i][0]=='-' ) 
00612            continue;
00613 
00614        KTrieAddPattern( ts, (unsigned char *)argv[i], strlen(argv[i]), nocase, i );
00615     }
00616     
00617     printf("Patterns added \n");
00618 
00619     KTrieCompile( ts );
00620 
00621     printf("Patterns compiled \n");
00622     printf("--> %d characters, %d patterns, %d bytes allocated\n",ts->nchars,ts->npats,ts->memory);
00623 
00624     printf("Searching...\n");
00625 
00626     KTrieSearch( ts, (unsigned char*)argv[1], strlen(argv[1]), match, 0 );
00627 
00628     printf("%d matches found\n",trie_nmatches);
00629 
00630     printf("normal pgm finish.\n");
00631      
00632     return 0;
00633 }
00634 
00635 #endif

Generated on Sun May 14 14:51:18 2006 by  doxygen 1.4.2