contrib/libpcap/optimize.c

   1 /*
   2  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that: (1) source code distributions
   7  * retain the above copyright notice and this paragraph in its entirety, (2)
   8  * distributions including binary code include the above copyright notice and
   9  * this paragraph in its entirety in the documentation or other materials
  10  * provided with the distribution, and (3) all advertising materials mentioning
  11  * features or use of this software display the following acknowledgement:
  12  * ``This product includes software developed by the University of California,
  13  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
  14  * the University nor the names of its contributors may be used to endorse
  15  * or promote products derived from this software without specific prior
  16  * written permission.
  17  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
  18  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
  19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  20  *
  21  *  Optimization module for BPF code intermediate representation.
  22  */
  23
  24 #ifdef HAVE_CONFIG_H
  25 #include <config.h>
  26 #endif
  27
  28 #include <pcap-types.h>
  29
  30 #include <stdio.h>
  31 #include <stdlib.h>
  32 #include <memory.h>
  33 #include <setjmp.h>
  34 #include <string.h>
  35
  36 #include <errno.h>
  37
  38 #include "pcap-int.h"
  39
  40 #include "gencode.h"
  41 #include "optimize.h"
  42
  43 #ifdef HAVE_OS_PROTO_H
  44 #include "os-proto.h"
  45 #endif
  46
  47 #ifdef BDEBUG
  48 /*
  49  * The internal "debug printout" flag for the filter expression optimizer.
  50  * The code to print that stuff is present only if BDEBUG is defined, so
  51  * the flag, and the routine to set it, are defined only if BDEBUG is
  52  * defined.
  53  */
  54 static int pcap_optimizer_debug;
  55
  56 /*
  57  * Routine to set that flag.
  58  *
  59  * This is intended for libpcap developers, not for general use.
  60  * If you want to set these in a program, you'll have to declare this
  61  * routine yourself, with the appropriate DLL import attribute on Windows;
  62  * it's not declared in any header file, and won't be declared in any
  63  * header file provided by libpcap.
  64  */
  65 PCAP_API void pcap_set_optimizer_debug(int value);
  66
  67 PCAP_API_DEF void
  68 pcap_set_optimizer_debug(int value)
  69 {
  70         pcap_optimizer_debug = value;
  71 }
  72
  73 /*
  74  * The internal "print dot graph" flag for the filter expression optimizer.
  75  * The code to print that stuff is present only if BDEBUG is defined, so
  76  * the flag, and the routine to set it, are defined only if BDEBUG is
  77  * defined.
  78  */
  79 static int pcap_print_dot_graph;
  80
  81 /*
  82  * Routine to set that flag.
  83  *
  84  * This is intended for libpcap developers, not for general use.
  85  * If you want to set these in a program, you'll have to declare this
  86  * routine yourself, with the appropriate DLL import attribute on Windows;
  87  * it's not declared in any header file, and won't be declared in any
  88  * header file provided by libpcap.
  89  */
  90 PCAP_API void pcap_set_print_dot_graph(int value);
  91
  92 PCAP_API_DEF void
  93 pcap_set_print_dot_graph(int value)
  94 {
  95         pcap_print_dot_graph = value;
  96 }
  97
  98 #endif
  99
 100 /*
 101  * lowest_set_bit().
 102  *
 103  * Takes a 32-bit integer as an argument.
 104  *
 105  * If handed a non-zero value, returns the index of the lowest set bit,
 106  * counting upwards fro zero.
 107  *
 108  * If handed zero, the results are platform- and compiler-dependent.
 109  * Keep it out of the light, don't give it any water, don't feed it
 110  * after midnight, and don't pass zero to it.
 111  *
 112  * This is the same as the count of trailing zeroes in the word.
 113  */
 114 #if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
 115   /*
 116    * GCC 3.4 and later; we have __builtin_ctz().
 117    */
 118   #define lowest_set_bit(mask) __builtin_ctz(mask)
 119 #elif defined(_MSC_VER)
 120   /*
 121    * Visual Studio; we support only 2005 and later, so use
 122    * _BitScanForward().
 123    */
 124 #include <intrin.h>
 125
 126 #ifndef __clang__
 127 #pragma intrinsic(_BitScanForward)
 128 #endif
 129
 130 static __forceinline int
 131 lowest_set_bit(int mask)
 132 {
 133         unsigned long bit;
 134
 135         /*
 136          * Don't sign-extend mask if long is longer than int.
 137          * (It's currently not, in MSVC, even on 64-bit platforms, but....)
 138          */
 139         if (_BitScanForward(&bit, (unsigned int)mask) == 0)
 140                 return -1;      /* mask is zero */
 141         return (int)bit;
 142 }
 143 #elif defined(MSDOS) && defined(__DJGPP__)
 144   /*
 145    * MS-DOS with DJGPP, which declares ffs() in <string.h>, which
 146    * we've already included.
 147    */
 148   #define lowest_set_bit(mask)  (ffs((mask)) - 1)
 149 #elif (defined(MSDOS) && defined(__WATCOMC__)) || defined(STRINGS_H_DECLARES_FFS)
 150   /*
 151    * MS-DOS with Watcom C, which has <strings.h> and declares ffs() there,
 152    * or some other platform (UN*X conforming to a sufficient recent version
 153    * of the Single UNIX Specification).
 154    */
 155   #include <strings.h>
 156   #define lowest_set_bit(mask)  (ffs((mask)) - 1)
 157 #else
 158 /*
 159  * None of the above.
 160  * Use a perfect-hash-function-based function.
 161  */
 162 static int
 163 lowest_set_bit(int mask)
 164 {
 165         unsigned int v = (unsigned int)mask;
 166
 167         static const int MultiplyDeBruijnBitPosition[32] = {
 168                 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
 169                 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
 170         };
 171
 172         /*
 173          * We strip off all but the lowermost set bit (v & ~v),
 174          * and perform a minimal perfect hash on it to look up the
 175          * number of low-order zero bits in a table.
 176          *
 177          * See:
 178          *
 179          *      http://7ooo.mooo.com/text/ComputingTrailingZerosHOWTO.pdf
 180          *
 181          *      http://supertech.csail.mit.edu/papers/debruijn.pdf
 182          */
 183         return (MultiplyDeBruijnBitPosition[((v & -v) * 0x077CB531U) >> 27]);
 184 }
 185 #endif
 186
 187 /*
 188  * Represents a deleted instruction.
 189  */
 190 #define NOP -1
 191
 192 /*
 193  * Register numbers for use-def values.
 194  * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
 195  * location.  A_ATOM is the accumulator and X_ATOM is the index
 196  * register.
 197  */
 198 #define A_ATOM BPF_MEMWORDS
 199 #define X_ATOM (BPF_MEMWORDS+1)
 200
 201 /*
 202  * This define is used to represent *both* the accumulator and
 203  * x register in use-def computations.
 204  * Currently, the use-def code assumes only one definition per instruction.
 205  */
 206 #define AX_ATOM N_ATOMS
 207
 208 /*
 209  * These data structures are used in a Cocke and Shwarz style
 210  * value numbering scheme.  Since the flowgraph is acyclic,
 211  * exit values can be propagated from a node's predecessors
 212  * provided it is uniquely defined.
 213  */
 214 struct valnode {
 215         int code;
 216         int v0, v1;
 217         int val;
 218         struct valnode *next;
 219 };
 220
 221 /* Integer constants mapped with the load immediate opcode. */
 222 #define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0L)
 223
 224 struct vmapinfo {
 225         int is_const;
 226         bpf_int32 const_val;
 227 };
 228
 229 typedef struct {
 230         /*
 231          * Place to longjmp to on an error.
 232          */
 233         jmp_buf top_ctx;
 234
 235         /*
 236          * The buffer into which to put error message.
 237          */
 238         char *errbuf;
 239
 240         /*
 241          * A flag to indicate that further optimization is needed.
 242          * Iterative passes are continued until a given pass yields no
 243          * branch movement.
 244          */
 245         int done;
 246
 247         int n_blocks;
 248         struct block **blocks;
 249         int n_edges;
 250         struct edge **edges;
 251
 252         /*
 253          * A bit vector set representation of the dominators.
 254          * We round up the set size to the next power of two.
 255          */
 256         int nodewords;
 257         int edgewords;
 258         struct block **levels;
 259         bpf_u_int32 *space;
 260
 261 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
 262 /*
 263  * True if a is in uset {p}
 264  */
 265 #define SET_MEMBER(p, a) \
 266 ((p)[(unsigned)(a) / BITS_PER_WORD] & ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD)))
 267
 268 /*
 269  * Add 'a' to uset p.
 270  */
 271 #define SET_INSERT(p, a) \
 272 (p)[(unsigned)(a) / BITS_PER_WORD] |= ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
 273
 274 /*
 275  * Delete 'a' from uset p.
 276  */
 277 #define SET_DELETE(p, a) \
 278 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
 279
 280 /*
 281  * a := a intersect b
 282  */
 283 #define SET_INTERSECT(a, b, n)\
 284 {\
 285         register bpf_u_int32 *_x = a, *_y = b;\
 286         register int _n = n;\
 287         while (--_n >= 0) *_x++ &= *_y++;\
 288 }
 289
 290 /*
 291  * a := a - b
 292  */
 293 #define SET_SUBTRACT(a, b, n)\
 294 {\
 295         register bpf_u_int32 *_x = a, *_y = b;\
 296         register int _n = n;\
 297         while (--_n >= 0) *_x++ &=~ *_y++;\
 298 }
 299
 300 /*
 301  * a := a union b
 302  */
 303 #define SET_UNION(a, b, n)\
 304 {\
 305         register bpf_u_int32 *_x = a, *_y = b;\
 306         register int _n = n;\
 307         while (--_n >= 0) *_x++ |= *_y++;\
 308 }
 309
 310         uset all_dom_sets;
 311         uset all_closure_sets;
 312         uset all_edge_sets;
 313
 314 #define MODULUS 213
 315         struct valnode *hashtbl[MODULUS];
 316         int curval;
 317         int maxval;
 318
 319         struct vmapinfo *vmap;
 320         struct valnode *vnode_base;
 321         struct valnode *next_vnode;
 322 } opt_state_t;
 323
 324 typedef struct {
 325         /*
 326          * Place to longjmp to on an error.
 327          */
 328         jmp_buf top_ctx;
 329
 330         /*
 331          * The buffer into which to put error message.
 332          */
 333         char *errbuf;
 334
 335         /*
 336          * Some pointers used to convert the basic block form of the code,
 337          * into the array form that BPF requires.  'fstart' will point to
 338          * the malloc'd array while 'ftail' is used during the recursive
 339          * traversal.
 340          */
 341         struct bpf_insn *fstart;
 342         struct bpf_insn *ftail;
 343 } conv_state_t;
 344
 345 static void opt_init(opt_state_t *, struct icode *);
 346 static void opt_cleanup(opt_state_t *);
 347 static void PCAP_NORETURN opt_error(opt_state_t *, const char *, ...)
 348     PCAP_PRINTFLIKE(2, 3);
 349
 350 static void intern_blocks(opt_state_t *, struct icode *);
 351
 352 static void find_inedges(opt_state_t *, struct block *);
 353 #ifdef BDEBUG
 354 static void opt_dump(opt_state_t *, struct icode *);
 355 #endif
 356
 357 #ifndef MAX
 358 #define MAX(a,b) ((a)>(b)?(a):(b))
 359 #endif
 360
 361 static void
 362 find_levels_r(opt_state_t *opt_state, struct icode *ic, struct block *b)
 363 {
 364         int level;
 365
 366         if (isMarked(ic, b))
 367                 return;
 368
 369         Mark(ic, b);
 370         b->link = 0;
 371
 372         if (JT(b)) {
 373                 find_levels_r(opt_state, ic, JT(b));
 374                 find_levels_r(opt_state, ic, JF(b));
 375                 level = MAX(JT(b)->level, JF(b)->level) + 1;
 376         } else
 377                 level = 0;
 378         b->level = level;
 379         b->link = opt_state->levels[level];
 380         opt_state->levels[level] = b;
 381 }
 382
 383 /*
 384  * Level graph.  The levels go from 0 at the leaves to
 385  * N_LEVELS at the root.  The opt_state->levels[] array points to the
 386  * first node of the level list, whose elements are linked
 387  * with the 'link' field of the struct block.
 388  */
 389 static void
 390 find_levels(opt_state_t *opt_state, struct icode *ic)
 391 {
 392         memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
 393         unMarkAll(ic);
 394         find_levels_r(opt_state, ic, ic->root);
 395 }
 396
 397 /*
 398  * Find dominator relationships.
 399  * Assumes graph has been leveled.
 400  */
 401 static void
 402 find_dom(opt_state_t *opt_state, struct block *root)
 403 {
 404         int i;
 405         struct block *b;
 406         bpf_u_int32 *x;
 407
 408         /*
 409          * Initialize sets to contain all nodes.
 410          */
 411         x = opt_state->all_dom_sets;
 412         i = opt_state->n_blocks * opt_state->nodewords;
 413         while (--i >= 0)
 414                 *x++ = 0xFFFFFFFFU;
 415         /* Root starts off empty. */
 416         for (i = opt_state->nodewords; --i >= 0;)
 417                 root->dom[i] = 0;
 418
 419         /* root->level is the highest level no found. */
 420         for (i = root->level; i >= 0; --i) {
 421                 for (b = opt_state->levels[i]; b; b = b->link) {
 422                         SET_INSERT(b->dom, b->id);
 423                         if (JT(b) == 0)
 424                                 continue;
 425                         SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords);
 426                         SET_INTERSECT(JF(b)->dom, b->dom, opt_state->nodewords);
 427                 }
 428         }
 429 }
 430
 431 static void
 432 propedom(opt_state_t *opt_state, struct edge *ep)
 433 {
 434         SET_INSERT(ep->edom, ep->id);
 435         if (ep->succ) {
 436                 SET_INTERSECT(ep->succ->et.edom, ep->edom, opt_state->edgewords);
 437                 SET_INTERSECT(ep->succ->ef.edom, ep->edom, opt_state->edgewords);
 438         }
 439 }
 440
 441 /*
 442  * Compute edge dominators.
 443  * Assumes graph has been leveled and predecessors established.
 444  */
 445 static void
 446 find_edom(opt_state_t *opt_state, struct block *root)
 447 {
 448         int i;
 449         uset x;
 450         struct block *b;
 451
 452         x = opt_state->all_edge_sets;
 453         for (i = opt_state->n_edges * opt_state->edgewords; --i >= 0; )
 454                 x[i] = 0xFFFFFFFFU;
 455
 456         /* root->level is the highest level no found. */
 457         memset(root->et.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
 458         memset(root->ef.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
 459         for (i = root->level; i >= 0; --i) {
 460                 for (b = opt_state->levels[i]; b != 0; b = b->link) {
 461                         propedom(opt_state, &b->et);
 462                         propedom(opt_state, &b->ef);
 463                 }
 464         }
 465 }
 466
 467 /*
 468  * Find the backwards transitive closure of the flow graph.  These sets
 469  * are backwards in the sense that we find the set of nodes that reach
 470  * a given node, not the set of nodes that can be reached by a node.
 471  *
 472  * Assumes graph has been leveled.
 473  */
 474 static void
 475 find_closure(opt_state_t *opt_state, struct block *root)
 476 {
 477         int i;
 478         struct block *b;
 479
 480         /*
 481          * Initialize sets to contain no nodes.
 482          */
 483         memset((char *)opt_state->all_closure_sets, 0,
 484               opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->all_closure_sets));
 485
 486         /* root->level is the highest level no found. */
 487         for (i = root->level; i >= 0; --i) {
 488                 for (b = opt_state->levels[i]; b; b = b->link) {
 489                         SET_INSERT(b->closure, b->id);
 490                         if (JT(b) == 0)
 491                                 continue;
 492                         SET_UNION(JT(b)->closure, b->closure, opt_state->nodewords);
 493                         SET_UNION(JF(b)->closure, b->closure, opt_state->nodewords);
 494                 }
 495         }
 496 }
 497
 498 /*
 499  * Return the register number that is used by s.  If A and X are both
 500  * used, return AX_ATOM.  If no register is used, return -1.
 501  *
 502  * The implementation should probably change to an array access.
 503  */
 504 static int
 505 atomuse(struct stmt *s)
 506 {
 507         register int c = s->code;
 508
 509         if (c == NOP)
 510                 return -1;
 511
 512         switch (BPF_CLASS(c)) {
 513
 514         case BPF_RET:
 515                 return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
 516                         (BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
 517
 518         case BPF_LD:
 519         case BPF_LDX:
 520                 return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
 521                         (BPF_MODE(c) == BPF_MEM) ? s->k : -1;
 522
 523         case BPF_ST:
 524                 return A_ATOM;
 525
 526         case BPF_STX:
 527                 return X_ATOM;
 528
 529         case BPF_JMP:
 530         case BPF_ALU:
 531                 if (BPF_SRC(c) == BPF_X)
 532                         return AX_ATOM;
 533                 return A_ATOM;
 534
 535         case BPF_MISC:
 536                 return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
 537         }
 538         abort();
 539         /* NOTREACHED */
 540 }
 541
 542 /*
 543  * Return the register number that is defined by 's'.  We assume that
 544  * a single stmt cannot define more than one register.  If no register
 545  * is defined, return -1.
 546  *
 547  * The implementation should probably change to an array access.
 548  */
 549 static int
 550 atomdef(struct stmt *s)
 551 {
 552         if (s->code == NOP)
 553                 return -1;
 554
 555         switch (BPF_CLASS(s->code)) {
 556
 557         case BPF_LD:
 558         case BPF_ALU:
 559                 return A_ATOM;
 560
 561         case BPF_LDX:
 562                 return X_ATOM;
 563
 564         case BPF_ST:
 565         case BPF_STX:
 566                 return s->k;
 567
 568         case BPF_MISC:
 569                 return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
 570         }
 571         return -1;
 572 }
 573
 574 /*
 575  * Compute the sets of registers used, defined, and killed by 'b'.
 576  *
 577  * "Used" means that a statement in 'b' uses the register before any
 578  * statement in 'b' defines it, i.e. it uses the value left in
 579  * that register by a predecessor block of this block.
 580  * "Defined" means that a statement in 'b' defines it.
 581  * "Killed" means that a statement in 'b' defines it before any
 582  * statement in 'b' uses it, i.e. it kills the value left in that
 583  * register by a predecessor block of this block.
 584  */
 585 static void
 586 compute_local_ud(struct block *b)
 587 {
 588         struct slist *s;
 589         atomset def = 0, use = 0, killed = 0;
 590         int atom;
 591
 592         for (s = b->stmts; s; s = s->next) {
 593                 if (s->s.code == NOP)
 594                         continue;
 595                 atom = atomuse(&s->s);
 596                 if (atom >= 0) {
 597                         if (atom == AX_ATOM) {
 598                                 if (!ATOMELEM(def, X_ATOM))
 599                                         use |= ATOMMASK(X_ATOM);
 600                                 if (!ATOMELEM(def, A_ATOM))
 601                                         use |= ATOMMASK(A_ATOM);
 602                         }
 603                         else if (atom < N_ATOMS) {
 604                                 if (!ATOMELEM(def, atom))
 605                                         use |= ATOMMASK(atom);
 606                         }
 607                         else
 608                                 abort();
 609                 }
 610                 atom = atomdef(&s->s);
 611                 if (atom >= 0) {
 612                         if (!ATOMELEM(use, atom))
 613                                 killed |= ATOMMASK(atom);
 614                         def |= ATOMMASK(atom);
 615                 }
 616         }
 617         if (BPF_CLASS(b->s.code) == BPF_JMP) {
 618                 /*
 619                  * XXX - what about RET?
 620                  */
 621                 atom = atomuse(&b->s);
 622                 if (atom >= 0) {
 623                         if (atom == AX_ATOM) {
 624                                 if (!ATOMELEM(def, X_ATOM))
 625                                         use |= ATOMMASK(X_ATOM);
 626                                 if (!ATOMELEM(def, A_ATOM))
 627                                         use |= ATOMMASK(A_ATOM);
 628                         }
 629                         else if (atom < N_ATOMS) {
 630                                 if (!ATOMELEM(def, atom))
 631                                         use |= ATOMMASK(atom);
 632                         }
 633                         else
 634                                 abort();
 635                 }
 636         }
 637
 638         b->def = def;
 639         b->kill = killed;
 640         b->in_use = use;
 641 }
 642
 643 /*
 644  * Assume graph is already leveled.
 645  */
 646 static void
 647 find_ud(opt_state_t *opt_state, struct block *root)
 648 {
 649         int i, maxlevel;
 650         struct block *p;
 651
 652         /*
 653          * root->level is the highest level no found;
 654          * count down from there.
 655          */
 656         maxlevel = root->level;
 657         for (i = maxlevel; i >= 0; --i)
 658                 for (p = opt_state->levels[i]; p; p = p->link) {
 659                         compute_local_ud(p);
 660                         p->out_use = 0;
 661                 }
 662
 663         for (i = 1; i <= maxlevel; ++i) {
 664                 for (p = opt_state->levels[i]; p; p = p->link) {
 665                         p->out_use |= JT(p)->in_use | JF(p)->in_use;
 666                         p->in_use |= p->out_use &~ p->kill;
 667                 }
 668         }
 669 }
 670 static void
 671 init_val(opt_state_t *opt_state)
 672 {
 673         opt_state->curval = 0;
 674         opt_state->next_vnode = opt_state->vnode_base;
 675         memset((char *)opt_state->vmap, 0, opt_state->maxval * sizeof(*opt_state->vmap));
 676         memset((char *)opt_state->hashtbl, 0, sizeof opt_state->hashtbl);
 677 }
 678
 679 /* Because we really don't have an IR, this stuff is a little messy. */
 680 static int
 681 F(opt_state_t *opt_state, int code, int v0, int v1)
 682 {
 683         u_int hash;
 684         int val;
 685         struct valnode *p;
 686
 687         hash = (u_int)code ^ ((u_int)v0 << 4) ^ ((u_int)v1 << 8);
 688         hash %= MODULUS;
 689
 690         for (p = opt_state->hashtbl[hash]; p; p = p->next)
 691                 if (p->code == code && p->v0 == v0 && p->v1 == v1)
 692                         return p->val;
 693
 694         val = ++opt_state->curval;
 695         if (BPF_MODE(code) == BPF_IMM &&
 696             (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
 697                 opt_state->vmap[val].const_val = v0;
 698                 opt_state->vmap[val].is_const = 1;
 699         }
 700         p = opt_state->next_vnode++;
 701         p->val = val;
 702         p->code = code;
 703         p->v0 = v0;
 704         p->v1 = v1;
 705         p->next = opt_state->hashtbl[hash];
 706         opt_state->hashtbl[hash] = p;
 707
 708         return val;
 709 }
 710
 711 static inline void
 712 vstore(struct stmt *s, int *valp, int newval, int alter)
 713 {
 714         if (alter && newval != VAL_UNKNOWN && *valp == newval)
 715                 s->code = NOP;
 716         else
 717                 *valp = newval;
 718 }
 719
 720 /*
 721  * Do constant-folding on binary operators.
 722  * (Unary operators are handled elsewhere.)
 723  */
 724 static void
 725 fold_op(opt_state_t *opt_state, struct stmt *s, int v0, int v1)
 726 {
 727         bpf_u_int32 a, b;
 728
 729         a = opt_state->vmap[v0].const_val;
 730         b = opt_state->vmap[v1].const_val;
 731
 732         switch (BPF_OP(s->code)) {
 733         case BPF_ADD:
 734                 a += b;
 735                 break;
 736
 737         case BPF_SUB:
 738                 a -= b;
 739                 break;
 740
 741         case BPF_MUL:
 742                 a *= b;
 743                 break;
 744
 745         case BPF_DIV:
 746                 if (b == 0)
 747                         opt_error(opt_state, "division by zero");
 748                 a /= b;
 749                 break;
 750
 751         case BPF_MOD:
 752                 if (b == 0)
 753                         opt_error(opt_state, "modulus by zero");
 754                 a %= b;
 755                 break;
 756
 757         case BPF_AND:
 758                 a &= b;
 759                 break;
 760
 761         case BPF_OR:
 762                 a |= b;
 763                 break;
 764
 765         case BPF_XOR:
 766                 a ^= b;
 767                 break;
 768
 769         case BPF_LSH:
 770                 /*
 771                  * A left shift of more than the width of the type
 772                  * is undefined in C; we'll just treat it as shifting
 773                  * all the bits out.
 774                  *
 775                  * XXX - the BPF interpreter doesn't check for this,
 776                  * so its behavior is dependent on the behavior of
 777                  * the processor on which it's running.  There are
 778                  * processors on which it shifts all the bits out
 779                  * and processors on which it does no shift.
 780                  */
 781                 if (b < 32)
 782                         a <<= b;
 783                 else
 784                         a = 0;
 785                 break;
 786
 787         case BPF_RSH:
 788                 /*
 789                  * A right shift of more than the width of the type
 790                  * is undefined in C; we'll just treat it as shifting
 791                  * all the bits out.
 792                  *
 793                  * XXX - the BPF interpreter doesn't check for this,
 794                  * so its behavior is dependent on the behavior of
 795                  * the processor on which it's running.  There are
 796                  * processors on which it shifts all the bits out
 797                  * and processors on which it does no shift.
 798                  */
 799                 if (b < 32)
 800                         a >>= b;
 801                 else
 802                         a = 0;
 803                 break;
 804
 805         default:
 806                 abort();
 807         }
 808         s->k = a;
 809         s->code = BPF_LD|BPF_IMM;
 810         opt_state->done = 0;
 811 }
 812
 813 static inline struct slist *
 814 this_op(struct slist *s)
 815 {
 816         while (s != 0 && s->s.code == NOP)
 817                 s = s->next;
 818         return s;
 819 }
 820
 821 static void
 822 opt_not(struct block *b)
 823 {
 824         struct block *tmp = JT(b);
 825
 826         JT(b) = JF(b);
 827         JF(b) = tmp;
 828 }
 829
 830 static void
 831 opt_peep(opt_state_t *opt_state, struct block *b)
 832 {
 833         struct slist *s;
 834         struct slist *next, *last;
 835         int val;
 836
 837         s = b->stmts;
 838         if (s == 0)
 839                 return;
 840
 841         last = s;
 842         for (/*empty*/; /*empty*/; s = next) {
 843                 /*
 844                  * Skip over nops.
 845                  */
 846                 s = this_op(s);
 847                 if (s == 0)
 848                         break;  /* nothing left in the block */
 849
 850                 /*
 851                  * Find the next real instruction after that one
 852                  * (skipping nops).
 853                  */
 854                 next = this_op(s->next);
 855                 if (next == 0)
 856                         break;  /* no next instruction */
 857                 last = next;
 858
 859                 /*
 860                  * st  M[k]     -->     st  M[k]
 861                  * ldx M[k]             tax
 862                  */
 863                 if (s->s.code == BPF_ST &&
 864                     next->s.code == (BPF_LDX|BPF_MEM) &&
 865                     s->s.k == next->s.k) {
 866                         opt_state->done = 0;
 867                         next->s.code = BPF_MISC|BPF_TAX;
 868                 }
 869                 /*
 870                  * ld  #k       -->     ldx  #k
 871                  * tax                  txa
 872                  */
 873                 if (s->s.code == (BPF_LD|BPF_IMM) &&
 874                     next->s.code == (BPF_MISC|BPF_TAX)) {
 875                         s->s.code = BPF_LDX|BPF_IMM;
 876                         next->s.code = BPF_MISC|BPF_TXA;
 877                         opt_state->done = 0;
 878                 }
 879                 /*
 880                  * This is an ugly special case, but it happens
 881                  * when you say tcp[k] or udp[k] where k is a constant.
 882                  */
 883                 if (s->s.code == (BPF_LD|BPF_IMM)) {
 884                         struct slist *add, *tax, *ild;
 885
 886                         /*
 887                          * Check that X isn't used on exit from this
 888                          * block (which the optimizer might cause).
 889                          * We know the code generator won't generate
 890                          * any local dependencies.
 891                          */
 892                         if (ATOMELEM(b->out_use, X_ATOM))
 893                                 continue;
 894
 895                         /*
 896                          * Check that the instruction following the ldi
 897                          * is an addx, or it's an ldxms with an addx
 898                          * following it (with 0 or more nops between the
 899                          * ldxms and addx).
 900                          */
 901                         if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
 902                                 add = next;
 903                         else
 904                                 add = this_op(next->next);
 905                         if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
 906                                 continue;
 907
 908                         /*
 909                          * Check that a tax follows that (with 0 or more
 910                          * nops between them).
 911                          */
 912                         tax = this_op(add->next);
 913                         if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
 914                                 continue;
 915
 916                         /*
 917                          * Check that an ild follows that (with 0 or more
 918                          * nops between them).
 919                          */
 920                         ild = this_op(tax->next);
 921                         if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
 922                             BPF_MODE(ild->s.code) != BPF_IND)
 923                                 continue;
 924                         /*
 925                          * We want to turn this sequence:
 926                          *
 927                          * (004) ldi     #0x2           {s}
 928                          * (005) ldxms   [14]           {next}  -- optional
 929                          * (006) addx                   {add}
 930                          * (007) tax                    {tax}
 931                          * (008) ild     [x+0]          {ild}
 932                          *
 933                          * into this sequence:
 934                          *
 935                          * (004) nop
 936                          * (005) ldxms   [14]
 937                          * (006) nop
 938                          * (007) nop
 939                          * (008) ild     [x+2]
 940                          *
 941                          * XXX We need to check that X is not
 942                          * subsequently used, because we want to change
 943                          * what'll be in it after this sequence.
 944                          *
 945                          * We know we can eliminate the accumulator
 946                          * modifications earlier in the sequence since
 947                          * it is defined by the last stmt of this sequence
 948                          * (i.e., the last statement of the sequence loads
 949                          * a value into the accumulator, so we can eliminate
 950                          * earlier operations on the accumulator).
 951                          */
 952                         ild->s.k += s->s.k;
 953                         s->s.code = NOP;
 954                         add->s.code = NOP;
 955                         tax->s.code = NOP;
 956                         opt_state->done = 0;
 957                 }
 958         }
 959         /*
 960          * If the comparison at the end of a block is an equality
 961          * comparison against a constant, and nobody uses the value
 962          * we leave in the A register at the end of a block, and
 963          * the operation preceding the comparison is an arithmetic
 964          * operation, we can sometime optimize it away.
 965          */
 966         if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
 967             !ATOMELEM(b->out_use, A_ATOM)) {
 968                 /*
 969                  * We can optimize away certain subtractions of the
 970                  * X register.
 971                  */
 972                 if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
 973                         val = b->val[X_ATOM];
 974                         if (opt_state->vmap[val].is_const) {
 975                                 /*
 976                                  * If we have a subtract to do a comparison,
 977                                  * and the X register is a known constant,
 978                                  * we can merge this value into the
 979                                  * comparison:
 980                                  *
 981                                  * sub x  ->    nop
 982                                  * jeq #y       jeq #(x+y)
 983                                  */
 984                                 b->s.k += opt_state->vmap[val].const_val;
 985                                 last->s.code = NOP;
 986                                 opt_state->done = 0;
 987                         } else if (b->s.k == 0) {
 988                                 /*
 989                                  * If the X register isn't a constant,
 990                                  * and the comparison in the test is
 991                                  * against 0, we can compare with the
 992                                  * X register, instead:
 993                                  *
 994                                  * sub x  ->    nop
 995                                  * jeq #0       jeq x
 996                                  */
 997                                 last->s.code = NOP;
 998                                 b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
 999                                 opt_state->done = 0;
1000                         }
1001                 }
1002                 /*
1003                  * Likewise, a constant subtract can be simplified:
1004                  *
1005                  * sub #x ->    nop
1006                  * jeq #y ->    jeq #(x+y)
1007                  */
1008                 else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
1009                         last->s.code = NOP;
1010                         b->s.k += last->s.k;
1011                         opt_state->done = 0;
1012                 }
1013                 /*
1014                  * And, similarly, a constant AND can be simplified
1015                  * if we're testing against 0, i.e.:
1016                  *
1017                  * and #k       nop
1018                  * jeq #0  ->   jset #k
1019                  */
1020                 else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
1021                     b->s.k == 0) {
1022                         b->s.k = last->s.k;
1023                         b->s.code = BPF_JMP|BPF_K|BPF_JSET;
1024                         last->s.code = NOP;
1025                         opt_state->done = 0;
1026                         opt_not(b);
1027                 }
1028         }
1029         /*
1030          * jset #0        ->   never
1031          * jset #ffffffff ->   always
1032          */
1033         if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
1034                 if (b->s.k == 0)
1035                         JT(b) = JF(b);
1036                 if ((u_int)b->s.k == 0xffffffffU)
1037                         JF(b) = JT(b);
1038         }
1039         /*
1040          * If we're comparing against the index register, and the index
1041          * register is a known constant, we can just compare against that
1042          * constant.
1043          */
1044         val = b->val[X_ATOM];
1045         if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X) {
1046                 bpf_int32 v = opt_state->vmap[val].const_val;
1047                 b->s.code &= ~BPF_X;
1048                 b->s.k = v;
1049         }
1050         /*
1051          * If the accumulator is a known constant, we can compute the
1052          * comparison result.
1053          */
1054         val = b->val[A_ATOM];
1055         if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
1056                 bpf_int32 v = opt_state->vmap[val].const_val;
1057                 switch (BPF_OP(b->s.code)) {
1058
1059                 case BPF_JEQ:
1060                         v = v == b->s.k;
1061                         break;
1062
1063                 case BPF_JGT:
1064                         v = (unsigned)v > (unsigned)b->s.k;
1065                         break;
1066
1067                 case BPF_JGE:
1068                         v = (unsigned)v >= (unsigned)b->s.k;
1069                         break;
1070
1071                 case BPF_JSET:
1072                         v &= b->s.k;
1073                         break;
1074
1075                 default:
1076                         abort();
1077                 }
1078                 if (JF(b) != JT(b))
1079                         opt_state->done = 0;
1080                 if (v)
1081                         JF(b) = JT(b);
1082                 else
1083                         JT(b) = JF(b);
1084         }
1085 }
1086
1087 /*
1088  * Compute the symbolic value of expression of 's', and update
1089  * anything it defines in the value table 'val'.  If 'alter' is true,
1090  * do various optimizations.  This code would be cleaner if symbolic
1091  * evaluation and code transformations weren't folded together.
1092  */
1093 static void
1094 opt_stmt(opt_state_t *opt_state, struct stmt *s, int val[], int alter)
1095 {
1096         int op;
1097         int v;
1098
1099         switch (s->code) {
1100
1101         case BPF_LD|BPF_ABS|BPF_W:
1102         case BPF_LD|BPF_ABS|BPF_H:
1103         case BPF_LD|BPF_ABS|BPF_B:
1104                 v = F(opt_state, s->code, s->k, 0L);
1105                 vstore(s, &val[A_ATOM], v, alter);
1106                 break;
1107
1108         case BPF_LD|BPF_IND|BPF_W:
1109         case BPF_LD|BPF_IND|BPF_H:
1110         case BPF_LD|BPF_IND|BPF_B:
1111                 v = val[X_ATOM];
1112                 if (alter && opt_state->vmap[v].is_const) {
1113                         s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
1114                         s->k += opt_state->vmap[v].const_val;
1115                         v = F(opt_state, s->code, s->k, 0L);
1116                         opt_state->done = 0;
1117                 }
1118                 else
1119                         v = F(opt_state, s->code, s->k, v);
1120                 vstore(s, &val[A_ATOM], v, alter);
1121                 break;
1122
1123         case BPF_LD|BPF_LEN:
1124                 v = F(opt_state, s->code, 0L, 0L);
1125                 vstore(s, &val[A_ATOM], v, alter);
1126                 break;
1127
1128         case BPF_LD|BPF_IMM:
1129                 v = K(s->k);
1130                 vstore(s, &val[A_ATOM], v, alter);
1131                 break;
1132
1133         case BPF_LDX|BPF_IMM:
1134                 v = K(s->k);
1135                 vstore(s, &val[X_ATOM], v, alter);
1136                 break;
1137
1138         case BPF_LDX|BPF_MSH|BPF_B:
1139                 v = F(opt_state, s->code, s->k, 0L);
1140                 vstore(s, &val[X_ATOM], v, alter);
1141                 break;
1142
1143         case BPF_ALU|BPF_NEG:
1144                 if (alter && opt_state->vmap[val[A_ATOM]].is_const) {
1145                         s->code = BPF_LD|BPF_IMM;
1146                         /*
1147                          * Do this negation as unsigned arithmetic; that's
1148                          * what modern BPF engines do, and it guarantees
1149                          * that all possible values can be negated.  (Yeah,
1150                          * negating 0x80000000, the minimum signed 32-bit
1151                          * two's-complement value, results in 0x80000000,
1152                          * so it's still negative, but we *should* be doing
1153                          * all unsigned arithmetic here, to match what
1154                          * modern BPF engines do.)
1155                          *
1156                          * Express it as 0U - (unsigned value) so that we
1157                          * don't get compiler warnings about negating an
1158                          * unsigned value and don't get UBSan warnings
1159                          * about the result of negating 0x80000000 being
1160                          * undefined.
1161                          */
1162                         s->k = 0U - (bpf_u_int32)(opt_state->vmap[val[A_ATOM]].const_val);
1163                         val[A_ATOM] = K(s->k);
1164                 }
1165                 else
1166                         val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], 0L);
1167                 break;
1168
1169         case BPF_ALU|BPF_ADD|BPF_K:
1170         case BPF_ALU|BPF_SUB|BPF_K:
1171         case BPF_ALU|BPF_MUL|BPF_K:
1172         case BPF_ALU|BPF_DIV|BPF_K:
1173         case BPF_ALU|BPF_MOD|BPF_K:
1174         case BPF_ALU|BPF_AND|BPF_K:
1175         case BPF_ALU|BPF_OR|BPF_K:
1176         case BPF_ALU|BPF_XOR|BPF_K:
1177         case BPF_ALU|BPF_LSH|BPF_K:
1178         case BPF_ALU|BPF_RSH|BPF_K:
1179                 op = BPF_OP(s->code);
1180                 if (alter) {
1181                         if (s->k == 0) {
1182                                 /*
1183                                  * Optimize operations where the constant
1184                                  * is zero.
1185                                  *
1186                                  * Don't optimize away "sub #0"
1187                                  * as it may be needed later to
1188                                  * fixup the generated math code.
1189                                  *
1190                                  * Fail if we're dividing by zero or taking
1191                                  * a modulus by zero.
1192                                  */
1193                                 if (op == BPF_ADD ||
1194                                     op == BPF_LSH || op == BPF_RSH ||
1195                                     op == BPF_OR || op == BPF_XOR) {
1196                                         s->code = NOP;
1197                                         break;
1198                                 }
1199                                 if (op == BPF_MUL || op == BPF_AND) {
1200                                         s->code = BPF_LD|BPF_IMM;
1201                                         val[A_ATOM] = K(s->k);
1202                                         break;
1203                                 }
1204                                 if (op == BPF_DIV)
1205                                         opt_error(opt_state,
1206                                             "division by zero");
1207                                 if (op == BPF_MOD)
1208                                         opt_error(opt_state,
1209                                             "modulus by zero");
1210                         }
1211                         if (opt_state->vmap[val[A_ATOM]].is_const) {
1212                                 fold_op(opt_state, s, val[A_ATOM], K(s->k));
1213                                 val[A_ATOM] = K(s->k);
1214                                 break;
1215                         }
1216                 }
1217                 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], K(s->k));
1218                 break;
1219
1220         case BPF_ALU|BPF_ADD|BPF_X:
1221         case BPF_ALU|BPF_SUB|BPF_X:
1222         case BPF_ALU|BPF_MUL|BPF_X:
1223         case BPF_ALU|BPF_DIV|BPF_X:
1224         case BPF_ALU|BPF_MOD|BPF_X:
1225         case BPF_ALU|BPF_AND|BPF_X:
1226         case BPF_ALU|BPF_OR|BPF_X:
1227         case BPF_ALU|BPF_XOR|BPF_X:
1228         case BPF_ALU|BPF_LSH|BPF_X:
1229         case BPF_ALU|BPF_RSH|BPF_X:
1230                 op = BPF_OP(s->code);
1231                 if (alter && opt_state->vmap[val[X_ATOM]].is_const) {
1232                         if (opt_state->vmap[val[A_ATOM]].is_const) {
1233                                 fold_op(opt_state, s, val[A_ATOM], val[X_ATOM]);
1234                                 val[A_ATOM] = K(s->k);
1235                         }
1236                         else {
1237                                 s->code = BPF_ALU|BPF_K|op;
1238                                 s->k = opt_state->vmap[val[X_ATOM]].const_val;
1239                                 /*
1240                                  * XXX - we need to make up our minds
1241                                  * as to what integers are signed and
1242                                  * what integers are unsigned in BPF
1243                                  * programs and in our IR.
1244                                  */
1245                                 if ((op == BPF_LSH || op == BPF_RSH) &&
1246                                     (s->k < 0 || s->k > 31))
1247                                         opt_error(opt_state,
1248                                             "shift by more than 31 bits");
1249                                 opt_state->done = 0;
1250                                 val[A_ATOM] =
1251                                         F(opt_state, s->code, val[A_ATOM], K(s->k));
1252                         }
1253                         break;
1254                 }
1255                 /*
1256                  * Check if we're doing something to an accumulator
1257                  * that is 0, and simplify.  This may not seem like
1258                  * much of a simplification but it could open up further
1259                  * optimizations.
1260                  * XXX We could also check for mul by 1, etc.
1261                  */
1262                 if (alter && opt_state->vmap[val[A_ATOM]].is_const
1263                     && opt_state->vmap[val[A_ATOM]].const_val == 0) {
1264                         if (op == BPF_ADD || op == BPF_OR || op == BPF_XOR) {
1265                                 s->code = BPF_MISC|BPF_TXA;
1266                                 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1267                                 break;
1268                         }
1269                         else if (op == BPF_MUL || op == BPF_DIV || op == BPF_MOD ||
1270                                  op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
1271                                 s->code = BPF_LD|BPF_IMM;
1272                                 s->k = 0;
1273                                 vstore(s, &val[A_ATOM], K(s->k), alter);
1274                                 break;
1275                         }
1276                         else if (op == BPF_NEG) {
1277                                 s->code = NOP;
1278                                 break;
1279                         }
1280                 }
1281                 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], val[X_ATOM]);
1282                 break;
1283
1284         case BPF_MISC|BPF_TXA:
1285                 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1286                 break;
1287
1288         case BPF_LD|BPF_MEM:
1289                 v = val[s->k];
1290                 if (alter && opt_state->vmap[v].is_const) {
1291                         s->code = BPF_LD|BPF_IMM;
1292                         s->k = opt_state->vmap[v].const_val;
1293                         opt_state->done = 0;
1294                 }
1295                 vstore(s, &val[A_ATOM], v, alter);
1296                 break;
1297
1298         case BPF_MISC|BPF_TAX:
1299                 vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1300                 break;
1301
1302         case BPF_LDX|BPF_MEM:
1303                 v = val[s->k];
1304                 if (alter && opt_state->vmap[v].is_const) {
1305                         s->code = BPF_LDX|BPF_IMM;
1306                         s->k = opt_state->vmap[v].const_val;
1307                         opt_state->done = 0;
1308                 }
1309                 vstore(s, &val[X_ATOM], v, alter);
1310                 break;
1311
1312         case BPF_ST:
1313                 vstore(s, &val[s->k], val[A_ATOM], alter);
1314                 break;
1315
1316         case BPF_STX:
1317                 vstore(s, &val[s->k], val[X_ATOM], alter);
1318                 break;
1319         }
1320 }
1321
1322 static void
1323 deadstmt(opt_state_t *opt_state, register struct stmt *s, register struct stmt *last[])
1324 {
1325         register int atom;
1326
1327         atom = atomuse(s);
1328         if (atom >= 0) {
1329                 if (atom == AX_ATOM) {
1330                         last[X_ATOM] = 0;
1331                         last[A_ATOM] = 0;
1332                 }
1333                 else
1334                         last[atom] = 0;
1335         }
1336         atom = atomdef(s);
1337         if (atom >= 0) {
1338                 if (last[atom]) {
1339                         opt_state->done = 0;
1340                         last[atom]->code = NOP;
1341                 }
1342                 last[atom] = s;
1343         }
1344 }
1345
1346 static void
1347 opt_deadstores(opt_state_t *opt_state, register struct block *b)
1348 {
1349         register struct slist *s;
1350         register int atom;
1351         struct stmt *last[N_ATOMS];
1352
1353         memset((char *)last, 0, sizeof last);
1354
1355         for (s = b->stmts; s != 0; s = s->next)
1356                 deadstmt(opt_state, &s->s, last);
1357         deadstmt(opt_state, &b->s, last);
1358
1359         for (atom = 0; atom < N_ATOMS; ++atom)
1360                 if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1361                         last[atom]->code = NOP;
1362                         opt_state->done = 0;
1363                 }
1364 }
1365
1366 static void
1367 opt_blk(opt_state_t *opt_state, struct block *b, int do_stmts)
1368 {
1369         struct slist *s;
1370         struct edge *p;
1371         int i;
1372         bpf_int32 aval, xval;
1373
1374 #if 0
1375         for (s = b->stmts; s && s->next; s = s->next)
1376                 if (BPF_CLASS(s->s.code) == BPF_JMP) {
1377                         do_stmts = 0;
1378                         break;
1379                 }
1380 #endif
1381
1382         /*
1383          * Initialize the atom values.
1384          */
1385         p = b->in_edges;
1386         if (p == 0) {
1387                 /*
1388                  * We have no predecessors, so everything is undefined
1389                  * upon entry to this block.
1390                  */
1391                 memset((char *)b->val, 0, sizeof(b->val));
1392         } else {
1393                 /*
1394                  * Inherit values from our predecessors.
1395                  *
1396                  * First, get the values from the predecessor along the
1397                  * first edge leading to this node.
1398                  */
1399                 memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1400                 /*
1401                  * Now look at all the other nodes leading to this node.
1402                  * If, for the predecessor along that edge, a register
1403                  * has a different value from the one we have (i.e.,
1404                  * control paths are merging, and the merging paths
1405                  * assign different values to that register), give the
1406                  * register the undefined value of 0.
1407                  */
1408                 while ((p = p->next) != NULL) {
1409                         for (i = 0; i < N_ATOMS; ++i)
1410                                 if (b->val[i] != p->pred->val[i])
1411                                         b->val[i] = 0;
1412                 }
1413         }
1414         aval = b->val[A_ATOM];
1415         xval = b->val[X_ATOM];
1416         for (s = b->stmts; s; s = s->next)
1417                 opt_stmt(opt_state, &s->s, b->val, do_stmts);
1418
1419         /*
1420          * This is a special case: if we don't use anything from this
1421          * block, and we load the accumulator or index register with a
1422          * value that is already there, or if this block is a return,
1423          * eliminate all the statements.
1424          *
1425          * XXX - what if it does a store?
1426          *
1427          * XXX - why does it matter whether we use anything from this
1428          * block?  If the accumulator or index register doesn't change
1429          * its value, isn't that OK even if we use that value?
1430          *
1431          * XXX - if we load the accumulator with a different value,
1432          * and the block ends with a conditional branch, we obviously
1433          * can't eliminate it, as the branch depends on that value.
1434          * For the index register, the conditional branch only depends
1435          * on the index register value if the test is against the index
1436          * register value rather than a constant; if nothing uses the
1437          * value we put into the index register, and we're not testing
1438          * against the index register's value, and there aren't any
1439          * other problems that would keep us from eliminating this
1440          * block, can we eliminate it?
1441          */
1442         if (do_stmts &&
1443             ((b->out_use == 0 &&
1444               aval != VAL_UNKNOWN && b->val[A_ATOM] == aval &&
1445               xval != VAL_UNKNOWN && b->val[X_ATOM] == xval) ||
1446              BPF_CLASS(b->s.code) == BPF_RET)) {
1447                 if (b->stmts != 0) {
1448                         b->stmts = 0;
1449                         opt_state->done = 0;
1450                 }
1451         } else {
1452                 opt_peep(opt_state, b);
1453                 opt_deadstores(opt_state, b);
1454         }
1455         /*
1456          * Set up values for branch optimizer.
1457          */
1458         if (BPF_SRC(b->s.code) == BPF_K)
1459                 b->oval = K(b->s.k);
1460         else
1461                 b->oval = b->val[X_ATOM];
1462         b->et.code = b->s.code;
1463         b->ef.code = -b->s.code;
1464 }
1465
1466 /*
1467  * Return true if any register that is used on exit from 'succ', has
1468  * an exit value that is different from the corresponding exit value
1469  * from 'b'.
1470  */
1471 static int
1472 use_conflict(struct block *b, struct block *succ)
1473 {
1474         int atom;
1475         atomset use = succ->out_use;
1476
1477         if (use == 0)
1478                 return 0;
1479
1480         for (atom = 0; atom < N_ATOMS; ++atom)
1481                 if (ATOMELEM(use, atom))
1482                         if (b->val[atom] != succ->val[atom])
1483                                 return 1;
1484         return 0;
1485 }
1486
1487 static struct block *
1488 fold_edge(struct block *child, struct edge *ep)
1489 {
1490         int sense;
1491         int aval0, aval1, oval0, oval1;
1492         int code = ep->code;
1493
1494         if (code < 0) {
1495                 code = -code;
1496                 sense = 0;
1497         } else
1498                 sense = 1;
1499
1500         if (child->s.code != code)
1501                 return 0;
1502
1503         aval0 = child->val[A_ATOM];
1504         oval0 = child->oval;
1505         aval1 = ep->pred->val[A_ATOM];
1506         oval1 = ep->pred->oval;
1507
1508         if (aval0 != aval1)
1509                 return 0;
1510
1511         if (oval0 == oval1)
1512                 /*
1513                  * The operands of the branch instructions are
1514                  * identical, so the result is true if a true
1515                  * branch was taken to get here, otherwise false.
1516                  */
1517                 return sense ? JT(child) : JF(child);
1518
1519         if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1520                 /*
1521                  * At this point, we only know the comparison if we
1522                  * came down the true branch, and it was an equality
1523                  * comparison with a constant.
1524                  *
1525                  * I.e., if we came down the true branch, and the branch
1526                  * was an equality comparison with a constant, we know the
1527                  * accumulator contains that constant.  If we came down
1528                  * the false branch, or the comparison wasn't with a
1529                  * constant, we don't know what was in the accumulator.
1530                  *
1531                  * We rely on the fact that distinct constants have distinct
1532                  * value numbers.
1533                  */
1534                 return JF(child);
1535
1536         return 0;
1537 }
1538
1539 static void
1540 opt_j(opt_state_t *opt_state, struct edge *ep)
1541 {
1542         register int i, k;
1543         register struct block *target;
1544
1545         if (JT(ep->succ) == 0)
1546                 return;
1547
1548         if (JT(ep->succ) == JF(ep->succ)) {
1549                 /*
1550                  * Common branch targets can be eliminated, provided
1551                  * there is no data dependency.
1552                  */
1553                 if (!use_conflict(ep->pred, ep->succ->et.succ)) {
1554                         opt_state->done = 0;
1555                         ep->succ = JT(ep->succ);
1556                 }
1557         }
1558         /*
1559          * For each edge dominator that matches the successor of this
1560          * edge, promote the edge successor to the its grandchild.
1561          *
1562          * XXX We violate the set abstraction here in favor a reasonably
1563          * efficient loop.
1564          */
1565  top:
1566         for (i = 0; i < opt_state->edgewords; ++i) {
1567                 register bpf_u_int32 x = ep->edom[i];
1568
1569                 while (x != 0) {
1570                         k = lowest_set_bit(x);
1571                         x &=~ ((bpf_u_int32)1 << k);
1572                         k += i * BITS_PER_WORD;
1573
1574                         target = fold_edge(ep->succ, opt_state->edges[k]);
1575                         /*
1576                          * Check that there is no data dependency between
1577                          * nodes that will be violated if we move the edge.
1578                          */
1579                         if (target != 0 && !use_conflict(ep->pred, target)) {
1580                                 opt_state->done = 0;
1581                                 ep->succ = target;
1582                                 if (JT(target) != 0)
1583                                         /*
1584                                          * Start over unless we hit a leaf.
1585                                          */
1586                                         goto top;
1587                                 return;
1588                         }
1589                 }
1590         }
1591 }
1592
1593
1594 static void
1595 or_pullup(opt_state_t *opt_state, struct block *b)
1596 {
1597         int val, at_top;
1598         struct block *pull;
1599         struct block **diffp, **samep;
1600         struct edge *ep;
1601
1602         ep = b->in_edges;
1603         if (ep == 0)
1604                 return;
1605
1606         /*
1607          * Make sure each predecessor loads the same value.
1608          * XXX why?
1609          */
1610         val = ep->pred->val[A_ATOM];
1611         for (ep = ep->next; ep != 0; ep = ep->next)
1612                 if (val != ep->pred->val[A_ATOM])
1613                         return;
1614
1615         if (JT(b->in_edges->pred) == b)
1616                 diffp = &JT(b->in_edges->pred);
1617         else
1618                 diffp = &JF(b->in_edges->pred);
1619
1620         at_top = 1;
1621         for (;;) {
1622                 if (*diffp == 0)
1623                         return;
1624
1625                 if (JT(*diffp) != JT(b))
1626                         return;
1627
1628                 if (!SET_MEMBER((*diffp)->dom, b->id))
1629                         return;
1630
1631                 if ((*diffp)->val[A_ATOM] != val)
1632                         break;
1633
1634                 diffp = &JF(*diffp);
1635                 at_top = 0;
1636         }
1637         samep = &JF(*diffp);
1638         for (;;) {
1639                 if (*samep == 0)
1640                         return;
1641
1642                 if (JT(*samep) != JT(b))
1643                         return;
1644
1645                 if (!SET_MEMBER((*samep)->dom, b->id))
1646                         return;
1647
1648                 if ((*samep)->val[A_ATOM] == val)
1649                         break;
1650
1651                 /* XXX Need to check that there are no data dependencies
1652                    between dp0 and dp1.  Currently, the code generator
1653                    will not produce such dependencies. */
1654                 samep = &JF(*samep);
1655         }
1656 #ifdef notdef
1657         /* XXX This doesn't cover everything. */
1658         for (i = 0; i < N_ATOMS; ++i)
1659                 if ((*samep)->val[i] != pred->val[i])
1660                         return;
1661 #endif
1662         /* Pull up the node. */
1663         pull = *samep;
1664         *samep = JF(pull);
1665         JF(pull) = *diffp;
1666
1667         /*
1668          * At the top of the chain, each predecessor needs to point at the
1669          * pulled up node.  Inside the chain, there is only one predecessor
1670          * to worry about.
1671          */
1672         if (at_top) {
1673                 for (ep = b->in_edges; ep != 0; ep = ep->next) {
1674                         if (JT(ep->pred) == b)
1675                                 JT(ep->pred) = pull;
1676                         else
1677                                 JF(ep->pred) = pull;
1678                 }
1679         }
1680         else
1681                 *diffp = pull;
1682
1683         opt_state->done = 0;
1684 }
1685
1686 static void
1687 and_pullup(opt_state_t *opt_state, struct block *b)
1688 {
1689         int val, at_top;
1690         struct block *pull;
1691         struct block **diffp, **samep;
1692         struct edge *ep;
1693
1694         ep = b->in_edges;
1695         if (ep == 0)
1696                 return;
1697
1698         /*
1699          * Make sure each predecessor loads the same value.
1700          */
1701         val = ep->pred->val[A_ATOM];
1702         for (ep = ep->next; ep != 0; ep = ep->next)
1703                 if (val != ep->pred->val[A_ATOM])
1704                         return;
1705
1706         if (JT(b->in_edges->pred) == b)
1707                 diffp = &JT(b->in_edges->pred);
1708         else
1709                 diffp = &JF(b->in_edges->pred);
1710
1711         at_top = 1;
1712         for (;;) {
1713                 if (*diffp == 0)
1714                         return;
1715
1716                 if (JF(*diffp) != JF(b))
1717                         return;
1718
1719                 if (!SET_MEMBER((*diffp)->dom, b->id))
1720                         return;
1721
1722                 if ((*diffp)->val[A_ATOM] != val)
1723                         break;
1724
1725                 diffp = &JT(*diffp);
1726                 at_top = 0;
1727         }
1728         samep = &JT(*diffp);
1729         for (;;) {
1730                 if (*samep == 0)
1731                         return;
1732
1733                 if (JF(*samep) != JF(b))
1734                         return;
1735
1736                 if (!SET_MEMBER((*samep)->dom, b->id))
1737                         return;
1738
1739                 if ((*samep)->val[A_ATOM] == val)
1740                         break;
1741
1742                 /* XXX Need to check that there are no data dependencies
1743                    between diffp and samep.  Currently, the code generator
1744                    will not produce such dependencies. */
1745                 samep = &JT(*samep);
1746         }
1747 #ifdef notdef
1748         /* XXX This doesn't cover everything. */
1749         for (i = 0; i < N_ATOMS; ++i)
1750                 if ((*samep)->val[i] != pred->val[i])
1751                         return;
1752 #endif
1753         /* Pull up the node. */
1754         pull = *samep;
1755         *samep = JT(pull);
1756         JT(pull) = *diffp;
1757
1758         /*
1759          * At the top of the chain, each predecessor needs to point at the
1760          * pulled up node.  Inside the chain, there is only one predecessor
1761          * to worry about.
1762          */
1763         if (at_top) {
1764                 for (ep = b->in_edges; ep != 0; ep = ep->next) {
1765                         if (JT(ep->pred) == b)
1766                                 JT(ep->pred) = pull;
1767                         else
1768                                 JF(ep->pred) = pull;
1769                 }
1770         }
1771         else
1772                 *diffp = pull;
1773
1774         opt_state->done = 0;
1775 }
1776
1777 static void
1778 opt_blks(opt_state_t *opt_state, struct icode *ic, int do_stmts)
1779 {
1780         int i, maxlevel;
1781         struct block *p;
1782
1783         init_val(opt_state);
1784         maxlevel = ic->root->level;
1785
1786         find_inedges(opt_state, ic->root);
1787         for (i = maxlevel; i >= 0; --i)
1788                 for (p = opt_state->levels[i]; p; p = p->link)
1789                         opt_blk(opt_state, p, do_stmts);
1790
1791         if (do_stmts)
1792                 /*
1793                  * No point trying to move branches; it can't possibly
1794                  * make a difference at this point.
1795                  */
1796                 return;
1797
1798         for (i = 1; i <= maxlevel; ++i) {
1799                 for (p = opt_state->levels[i]; p; p = p->link) {
1800                         opt_j(opt_state, &p->et);
1801                         opt_j(opt_state, &p->ef);
1802                 }
1803         }
1804
1805         find_inedges(opt_state, ic->root);
1806         for (i = 1; i <= maxlevel; ++i) {
1807                 for (p = opt_state->levels[i]; p; p = p->link) {
1808                         or_pullup(opt_state, p);
1809                         and_pullup(opt_state, p);
1810                 }
1811         }
1812 }
1813
1814 static inline void
1815 link_inedge(struct edge *parent, struct block *child)
1816 {
1817         parent->next = child->in_edges;
1818         child->in_edges = parent;
1819 }
1820
1821 static void
1822 find_inedges(opt_state_t *opt_state, struct block *root)
1823 {
1824         int i;
1825         struct block *b;
1826
1827         for (i = 0; i < opt_state->n_blocks; ++i)
1828                 opt_state->blocks[i]->in_edges = 0;
1829
1830         /*
1831          * Traverse the graph, adding each edge to the predecessor
1832          * list of its successors.  Skip the leaves (i.e. level 0).
1833          */
1834         for (i = root->level; i > 0; --i) {
1835                 for (b = opt_state->levels[i]; b != 0; b = b->link) {
1836                         link_inedge(&b->et, JT(b));
1837                         link_inedge(&b->ef, JF(b));
1838                 }
1839         }
1840 }
1841
1842 static void
1843 opt_root(struct block **b)
1844 {
1845         struct slist *tmp, *s;
1846
1847         s = (*b)->stmts;
1848         (*b)->stmts = 0;
1849         while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
1850                 *b = JT(*b);
1851
1852         tmp = (*b)->stmts;
1853         if (tmp != 0)
1854                 sappend(s, tmp);
1855         (*b)->stmts = s;
1856
1857         /*
1858          * If the root node is a return, then there is no
1859          * point executing any statements (since the bpf machine
1860          * has no side effects).
1861          */
1862         if (BPF_CLASS((*b)->s.code) == BPF_RET)
1863                 (*b)->stmts = 0;
1864 }
1865
1866 static void
1867 opt_loop(opt_state_t *opt_state, struct icode *ic, int do_stmts)
1868 {
1869
1870 #ifdef BDEBUG
1871         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1872                 printf("opt_loop(root, %d) begin\n", do_stmts);
1873                 opt_dump(opt_state, ic);
1874         }
1875 #endif
1876         do {
1877                 opt_state->done = 1;
1878                 find_levels(opt_state, ic);
1879                 find_dom(opt_state, ic->root);
1880                 find_closure(opt_state, ic->root);
1881                 find_ud(opt_state, ic->root);
1882                 find_edom(opt_state, ic->root);
1883                 opt_blks(opt_state, ic, do_stmts);
1884 #ifdef BDEBUG
1885                 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1886                         printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, opt_state->done);
1887                         opt_dump(opt_state, ic);
1888                 }
1889 #endif
1890         } while (!opt_state->done);
1891 }
1892
1893 /*
1894  * Optimize the filter code in its dag representation.
1895  * Return 0 on success, -1 on error.
1896  */
1897 int
1898 bpf_optimize(struct icode *ic, char *errbuf)
1899 {
1900         opt_state_t opt_state;
1901
1902         memset(&opt_state, 0, sizeof(opt_state));
1903         opt_state.errbuf = errbuf;
1904         if (setjmp(opt_state.top_ctx)) {
1905                 opt_cleanup(&opt_state);
1906                 return -1;
1907         }
1908         opt_init(&opt_state, ic);
1909         opt_loop(&opt_state, ic, 0);
1910         opt_loop(&opt_state, ic, 1);
1911         intern_blocks(&opt_state, ic);
1912 #ifdef BDEBUG
1913         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1914                 printf("after intern_blocks()\n");
1915                 opt_dump(&opt_state, ic);
1916         }
1917 #endif
1918         opt_root(&ic->root);
1919 #ifdef BDEBUG
1920         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1921                 printf("after opt_root()\n");
1922                 opt_dump(&opt_state, ic);
1923         }
1924 #endif
1925         opt_cleanup(&opt_state);
1926         return 0;
1927 }
1928
1929 static void
1930 make_marks(struct icode *ic, struct block *p)
1931 {
1932         if (!isMarked(ic, p)) {
1933                 Mark(ic, p);
1934                 if (BPF_CLASS(p->s.code) != BPF_RET) {
1935                         make_marks(ic, JT(p));
1936                         make_marks(ic, JF(p));
1937                 }
1938         }
1939 }
1940
1941 /*
1942  * Mark code array such that isMarked(ic->cur_mark, i) is true
1943  * only for nodes that are alive.
1944  */
1945 static void
1946 mark_code(struct icode *ic)
1947 {
1948         ic->cur_mark += 1;
1949         make_marks(ic, ic->root);
1950 }
1951
1952 /*
1953  * True iff the two stmt lists load the same value from the packet into
1954  * the accumulator.
1955  */
1956 static int
1957 eq_slist(struct slist *x, struct slist *y)
1958 {
1959         for (;;) {
1960                 while (x && x->s.code == NOP)
1961                         x = x->next;
1962                 while (y && y->s.code == NOP)
1963                         y = y->next;
1964                 if (x == 0)
1965                         return y == 0;
1966                 if (y == 0)
1967                         return x == 0;
1968                 if (x->s.code != y->s.code || x->s.k != y->s.k)
1969                         return 0;
1970                 x = x->next;
1971                 y = y->next;
1972         }
1973 }
1974
1975 static inline int
1976 eq_blk(struct block *b0, struct block *b1)
1977 {
1978         if (b0->s.code == b1->s.code &&
1979             b0->s.k == b1->s.k &&
1980             b0->et.succ == b1->et.succ &&
1981             b0->ef.succ == b1->ef.succ)
1982                 return eq_slist(b0->stmts, b1->stmts);
1983         return 0;
1984 }
1985
1986 static void
1987 intern_blocks(opt_state_t *opt_state, struct icode *ic)
1988 {
1989         struct block *p;
1990         int i, j;
1991         int done1; /* don't shadow global */
1992  top:
1993         done1 = 1;
1994         for (i = 0; i < opt_state->n_blocks; ++i)
1995                 opt_state->blocks[i]->link = 0;
1996
1997         mark_code(ic);
1998
1999         for (i = opt_state->n_blocks - 1; --i >= 0; ) {
2000                 if (!isMarked(ic, opt_state->blocks[i]))
2001                         continue;
2002                 for (j = i + 1; j < opt_state->n_blocks; ++j) {
2003                         if (!isMarked(ic, opt_state->blocks[j]))
2004                                 continue;
2005                         if (eq_blk(opt_state->blocks[i], opt_state->blocks[j])) {
2006                                 opt_state->blocks[i]->link = opt_state->blocks[j]->link ?
2007                                         opt_state->blocks[j]->link : opt_state->blocks[j];
2008                                 break;
2009                         }
2010                 }
2011         }
2012         for (i = 0; i < opt_state->n_blocks; ++i) {
2013                 p = opt_state->blocks[i];
2014                 if (JT(p) == 0)
2015                         continue;
2016                 if (JT(p)->link) {
2017                         done1 = 0;
2018                         JT(p) = JT(p)->link;
2019                 }
2020                 if (JF(p)->link) {
2021                         done1 = 0;
2022                         JF(p) = JF(p)->link;
2023                 }
2024         }
2025         if (!done1)
2026                 goto top;
2027 }
2028
2029 static void
2030 opt_cleanup(opt_state_t *opt_state)
2031 {
2032         free((void *)opt_state->vnode_base);
2033         free((void *)opt_state->vmap);
2034         free((void *)opt_state->edges);
2035         free((void *)opt_state->space);
2036         free((void *)opt_state->levels);
2037         free((void *)opt_state->blocks);
2038 }
2039
2040 /*
2041  * For optimizer errors.
2042  */
2043 static void PCAP_NORETURN
2044 opt_error(opt_state_t *opt_state, const char *fmt, ...)
2045 {
2046         va_list ap;
2047
2048         if (opt_state->errbuf != NULL) {
2049                 va_start(ap, fmt);
2050                 (void)pcap_vsnprintf(opt_state->errbuf,
2051                     PCAP_ERRBUF_SIZE, fmt, ap);
2052                 va_end(ap);
2053         }
2054         longjmp(opt_state->top_ctx, 1);
2055         /* NOTREACHED */
2056 }
2057
2058 /*
2059  * Return the number of stmts in 's'.
2060  */
2061 static u_int
2062 slength(struct slist *s)
2063 {
2064         u_int n = 0;
2065
2066         for (; s; s = s->next)
2067                 if (s->s.code != NOP)
2068                         ++n;
2069         return n;
2070 }
2071
2072 /*
2073  * Return the number of nodes reachable by 'p'.
2074  * All nodes should be initially unmarked.
2075  */
2076 static int
2077 count_blocks(struct icode *ic, struct block *p)
2078 {
2079         if (p == 0 || isMarked(ic, p))
2080                 return 0;
2081         Mark(ic, p);
2082         return count_blocks(ic, JT(p)) + count_blocks(ic, JF(p)) + 1;
2083 }
2084
2085 /*
2086  * Do a depth first search on the flow graph, numbering the
2087  * the basic blocks, and entering them into the 'blocks' array.`
2088  */
2089 static void
2090 number_blks_r(opt_state_t *opt_state, struct icode *ic, struct block *p)
2091 {
2092         int n;
2093
2094         if (p == 0 || isMarked(ic, p))
2095                 return;
2096
2097         Mark(ic, p);
2098         n = opt_state->n_blocks++;
2099         p->id = n;
2100         opt_state->blocks[n] = p;
2101
2102         number_blks_r(opt_state, ic, JT(p));
2103         number_blks_r(opt_state, ic, JF(p));
2104 }
2105
2106 /*
2107  * Return the number of stmts in the flowgraph reachable by 'p'.
2108  * The nodes should be unmarked before calling.
2109  *
2110  * Note that "stmts" means "instructions", and that this includes
2111  *
2112  *      side-effect statements in 'p' (slength(p->stmts));
2113  *
2114  *      statements in the true branch from 'p' (count_stmts(JT(p)));
2115  *
2116  *      statements in the false branch from 'p' (count_stmts(JF(p)));
2117  *
2118  *      the conditional jump itself (1);
2119  *
2120  *      an extra long jump if the true branch requires it (p->longjt);
2121  *
2122  *      an extra long jump if the false branch requires it (p->longjf).
2123  */
2124 static u_int
2125 count_stmts(struct icode *ic, struct block *p)
2126 {
2127         u_int n;
2128
2129         if (p == 0 || isMarked(ic, p))
2130                 return 0;
2131         Mark(ic, p);
2132         n = count_stmts(ic, JT(p)) + count_stmts(ic, JF(p));
2133         return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
2134 }
2135
2136 /*
2137  * Allocate memory.  All allocation is done before optimization
2138  * is begun.  A linear bound on the size of all data structures is computed
2139  * from the total number of blocks and/or statements.
2140  */
2141 static void
2142 opt_init(opt_state_t *opt_state, struct icode *ic)
2143 {
2144         bpf_u_int32 *p;
2145         int i, n, max_stmts;
2146
2147         /*
2148          * First, count the blocks, so we can malloc an array to map
2149          * block number to block.  Then, put the blocks into the array.
2150          */
2151         unMarkAll(ic);
2152         n = count_blocks(ic, ic->root);
2153         opt_state->blocks = (struct block **)calloc(n, sizeof(*opt_state->blocks));
2154         if (opt_state->blocks == NULL)
2155                 opt_error(opt_state, "malloc");
2156         unMarkAll(ic);
2157         opt_state->n_blocks = 0;
2158         number_blks_r(opt_state, ic, ic->root);
2159
2160         opt_state->n_edges = 2 * opt_state->n_blocks;
2161         opt_state->edges = (struct edge **)calloc(opt_state->n_edges, sizeof(*opt_state->edges));
2162         if (opt_state->edges == NULL) {
2163                 opt_error(opt_state, "malloc");
2164         }
2165
2166         /*
2167          * The number of levels is bounded by the number of nodes.
2168          */
2169         opt_state->levels = (struct block **)calloc(opt_state->n_blocks, sizeof(*opt_state->levels));
2170         if (opt_state->levels == NULL) {
2171                 opt_error(opt_state, "malloc");
2172         }
2173
2174         opt_state->edgewords = opt_state->n_edges / (8 * sizeof(bpf_u_int32)) + 1;
2175         opt_state->nodewords = opt_state->n_blocks / (8 * sizeof(bpf_u_int32)) + 1;
2176
2177         /* XXX */
2178         opt_state->space = (bpf_u_int32 *)malloc(2 * opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->space)
2179                                  + opt_state->n_edges * opt_state->edgewords * sizeof(*opt_state->space));
2180         if (opt_state->space == NULL) {
2181                 opt_error(opt_state, "malloc");
2182         }
2183         p = opt_state->space;
2184         opt_state->all_dom_sets = p;
2185         for (i = 0; i < n; ++i) {
2186                 opt_state->blocks[i]->dom = p;
2187                 p += opt_state->nodewords;
2188         }
2189         opt_state->all_closure_sets = p;
2190         for (i = 0; i < n; ++i) {
2191                 opt_state->blocks[i]->closure = p;
2192                 p += opt_state->nodewords;
2193         }
2194         opt_state->all_edge_sets = p;
2195         for (i = 0; i < n; ++i) {
2196                 register struct block *b = opt_state->blocks[i];
2197
2198                 b->et.edom = p;
2199                 p += opt_state->edgewords;
2200                 b->ef.edom = p;
2201                 p += opt_state->edgewords;
2202                 b->et.id = i;
2203                 opt_state->edges[i] = &b->et;
2204                 b->ef.id = opt_state->n_blocks + i;
2205                 opt_state->edges[opt_state->n_blocks + i] = &b->ef;
2206                 b->et.pred = b;
2207                 b->ef.pred = b;
2208         }
2209         max_stmts = 0;
2210         for (i = 0; i < n; ++i)
2211                 max_stmts += slength(opt_state->blocks[i]->stmts) + 1;
2212         /*
2213          * We allocate at most 3 value numbers per statement,
2214          * so this is an upper bound on the number of valnodes
2215          * we'll need.
2216          */
2217         opt_state->maxval = 3 * max_stmts;
2218         opt_state->vmap = (struct vmapinfo *)calloc(opt_state->maxval, sizeof(*opt_state->vmap));
2219         if (opt_state->vmap == NULL) {
2220                 opt_error(opt_state, "malloc");
2221         }
2222         opt_state->vnode_base = (struct valnode *)calloc(opt_state->maxval, sizeof(*opt_state->vnode_base));
2223         if (opt_state->vnode_base == NULL) {
2224                 opt_error(opt_state, "malloc");
2225         }
2226 }
2227
2228 /*
2229  * This is only used when supporting optimizer debugging.  It is
2230  * global state, so do *not* do more than one compile in parallel
2231  * and expect it to provide meaningful information.
2232  */
2233 #ifdef BDEBUG
2234 int bids[NBIDS];
2235 #endif
2236
2237 static void PCAP_NORETURN conv_error(conv_state_t *, const char *, ...)
2238     PCAP_PRINTFLIKE(2, 3);
2239
2240 /*
2241  * Returns true if successful.  Returns false if a branch has
2242  * an offset that is too large.  If so, we have marked that
2243  * branch so that on a subsequent iteration, it will be treated
2244  * properly.
2245  */
2246 static int
2247 convert_code_r(conv_state_t *conv_state, struct icode *ic, struct block *p)
2248 {
2249         struct bpf_insn *dst;
2250         struct slist *src;
2251         u_int slen;
2252         u_int off;
2253         u_int extrajmps;        /* number of extra jumps inserted */
2254         struct slist **offset = NULL;
2255
2256         if (p == 0 || isMarked(ic, p))
2257                 return (1);
2258         Mark(ic, p);
2259
2260         if (convert_code_r(conv_state, ic, JF(p)) == 0)
2261                 return (0);
2262         if (convert_code_r(conv_state, ic, JT(p)) == 0)
2263                 return (0);
2264
2265         slen = slength(p->stmts);
2266         dst = conv_state->ftail -= (slen + 1 + p->longjt + p->longjf);
2267                 /* inflate length by any extra jumps */
2268
2269         p->offset = (int)(dst - conv_state->fstart);
2270
2271         /* generate offset[] for convenience  */
2272         if (slen) {
2273                 offset = (struct slist **)calloc(slen, sizeof(struct slist *));
2274                 if (!offset) {
2275                         conv_error(conv_state, "not enough core");
2276                         /*NOTREACHED*/
2277                 }
2278         }
2279         src = p->stmts;
2280         for (off = 0; off < slen && src; off++) {
2281 #if 0
2282                 printf("off=%d src=%x\n", off, src);
2283 #endif
2284                 offset[off] = src;
2285                 src = src->next;
2286         }
2287
2288         off = 0;
2289         for (src = p->stmts; src; src = src->next) {
2290                 if (src->s.code == NOP)
2291                         continue;
2292                 dst->code = (u_short)src->s.code;
2293                 dst->k = src->s.k;
2294
2295                 /* fill block-local relative jump */
2296                 if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
2297 #if 0
2298                         if (src->s.jt || src->s.jf) {
2299                                 free(offset);
2300                                 conv_error(conv_state, "illegal jmp destination");
2301                                 /*NOTREACHED*/
2302                         }
2303 #endif
2304                         goto filled;
2305                 }
2306                 if (off == slen - 2)    /*???*/
2307                         goto filled;
2308
2309             {
2310                 u_int i;
2311                 int jt, jf;
2312                 const char ljerr[] = "%s for block-local relative jump: off=%d";
2313
2314 #if 0
2315                 printf("code=%x off=%d %x %x\n", src->s.code,
2316                         off, src->s.jt, src->s.jf);
2317 #endif
2318
2319                 if (!src->s.jt || !src->s.jf) {
2320                         free(offset);
2321                         conv_error(conv_state, ljerr, "no jmp destination", off);
2322                         /*NOTREACHED*/
2323                 }
2324
2325                 jt = jf = 0;
2326                 for (i = 0; i < slen; i++) {
2327                         if (offset[i] == src->s.jt) {
2328                                 if (jt) {
2329                                         free(offset);
2330                                         conv_error(conv_state, ljerr, "multiple matches", off);
2331                                         /*NOTREACHED*/
2332                                 }
2333
2334                                 if (i - off - 1 >= 256) {
2335                                         free(offset);
2336                                         conv_error(conv_state, ljerr, "out-of-range jump", off);
2337                                         /*NOTREACHED*/
2338                                 }
2339                                 dst->jt = (u_char)(i - off - 1);
2340                                 jt++;
2341                         }
2342                         if (offset[i] == src->s.jf) {
2343                                 if (jf) {
2344                                         free(offset);
2345                                         conv_error(conv_state, ljerr, "multiple matches", off);
2346                                         /*NOTREACHED*/
2347                                 }
2348                                 if (i - off - 1 >= 256) {
2349                                         free(offset);
2350                                         conv_error(conv_state, ljerr, "out-of-range jump", off);
2351                                         /*NOTREACHED*/
2352                                 }
2353                                 dst->jf = (u_char)(i - off - 1);
2354                                 jf++;
2355                         }
2356                 }
2357                 if (!jt || !jf) {
2358                         free(offset);
2359                         conv_error(conv_state, ljerr, "no destination found", off);
2360                         /*NOTREACHED*/
2361                 }
2362             }
2363 filled:
2364                 ++dst;
2365                 ++off;
2366         }
2367         if (offset)
2368                 free(offset);
2369
2370 #ifdef BDEBUG
2371         if (dst - conv_state->fstart < NBIDS)
2372                 bids[dst - conv_state->fstart] = p->id + 1;
2373 #endif
2374         dst->code = (u_short)p->s.code;
2375         dst->k = p->s.k;
2376         if (JT(p)) {
2377                 extrajmps = 0;
2378                 off = JT(p)->offset - (p->offset + slen) - 1;
2379                 if (off >= 256) {
2380                     /* offset too large for branch, must add a jump */
2381                     if (p->longjt == 0) {
2382                         /* mark this instruction and retry */
2383                         p->longjt++;
2384                         return(0);
2385                     }
2386                     /* branch if T to following jump */
2387                     if (extrajmps >= 256) {
2388                         conv_error(conv_state, "too many extra jumps");
2389                         /*NOTREACHED*/
2390                     }
2391                     dst->jt = (u_char)extrajmps;
2392                     extrajmps++;
2393                     dst[extrajmps].code = BPF_JMP|BPF_JA;
2394                     dst[extrajmps].k = off - extrajmps;
2395                 }
2396                 else
2397                     dst->jt = (u_char)off;
2398                 off = JF(p)->offset - (p->offset + slen) - 1;
2399                 if (off >= 256) {
2400                     /* offset too large for branch, must add a jump */
2401                     if (p->longjf == 0) {
2402                         /* mark this instruction and retry */
2403                         p->longjf++;
2404                         return(0);
2405                     }
2406                     /* branch if F to following jump */
2407                     /* if two jumps are inserted, F goes to second one */
2408                     if (extrajmps >= 256) {
2409                         conv_error(conv_state, "too many extra jumps");
2410                         /*NOTREACHED*/
2411                     }
2412                     dst->jf = (u_char)extrajmps;
2413                     extrajmps++;
2414                     dst[extrajmps].code = BPF_JMP|BPF_JA;
2415                     dst[extrajmps].k = off - extrajmps;
2416                 }
2417                 else
2418                     dst->jf = (u_char)off;
2419         }
2420         return (1);
2421 }
2422
2423
2424 /*
2425  * Convert flowgraph intermediate representation to the
2426  * BPF array representation.  Set *lenp to the number of instructions.
2427  *
2428  * This routine does *NOT* leak the memory pointed to by fp.  It *must
2429  * not* do free(fp) before returning fp; doing so would make no sense,
2430  * as the BPF array pointed to by the return value of icode_to_fcode()
2431  * must be valid - it's being returned for use in a bpf_program structure.
2432  *
2433  * If it appears that icode_to_fcode() is leaking, the problem is that
2434  * the program using pcap_compile() is failing to free the memory in
2435  * the BPF program when it's done - the leak is in the program, not in
2436  * the routine that happens to be allocating the memory.  (By analogy, if
2437  * a program calls fopen() without ever calling fclose() on the FILE *,
2438  * it will leak the FILE structure; the leak is not in fopen(), it's in
2439  * the program.)  Change the program to use pcap_freecode() when it's
2440  * done with the filter program.  See the pcap man page.
2441  */
2442 struct bpf_insn *
2443 icode_to_fcode(struct icode *ic, struct block *root, u_int *lenp,
2444     char *errbuf)
2445 {
2446         u_int n;
2447         struct bpf_insn *fp;
2448         conv_state_t conv_state;
2449
2450         conv_state.fstart = NULL;
2451         conv_state.errbuf = errbuf;
2452         if (setjmp(conv_state.top_ctx) != 0) {
2453                 free(conv_state.fstart);
2454                 return NULL;
2455         }
2456
2457         /*
2458          * Loop doing convert_code_r() until no branches remain
2459          * with too-large offsets.
2460          */
2461         for (;;) {
2462             unMarkAll(ic);
2463             n = *lenp = count_stmts(ic, root);
2464
2465             fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
2466             if (fp == NULL) {
2467                 (void)pcap_snprintf(errbuf, PCAP_ERRBUF_SIZE,
2468                     "malloc");
2469                 free(fp);
2470                 return NULL;
2471             }
2472             memset((char *)fp, 0, sizeof(*fp) * n);
2473             conv_state.fstart = fp;
2474             conv_state.ftail = fp + n;
2475
2476             unMarkAll(ic);
2477             if (convert_code_r(&conv_state, ic, root))
2478                 break;
2479             free(fp);
2480         }
2481
2482         return fp;
2483 }
2484
2485 /*
2486  * For iconv_to_fconv() errors.
2487  */
2488 static void PCAP_NORETURN
2489 conv_error(conv_state_t *conv_state, const char *fmt, ...)
2490 {
2491         va_list ap;
2492
2493         va_start(ap, fmt);
2494         (void)pcap_vsnprintf(conv_state->errbuf,
2495             PCAP_ERRBUF_SIZE, fmt, ap);
2496         va_end(ap);
2497         longjmp(conv_state->top_ctx, 1);
2498         /* NOTREACHED */
2499 }
2500
2501 /*
2502  * Make a copy of a BPF program and put it in the "fcode" member of
2503  * a "pcap_t".
2504  *
2505  * If we fail to allocate memory for the copy, fill in the "errbuf"
2506  * member of the "pcap_t" with an error message, and return -1;
2507  * otherwise, return 0.
2508  */
2509 int
2510 install_bpf_program(pcap_t *p, struct bpf_program *fp)
2511 {
2512         size_t prog_size;
2513
2514         /*
2515          * Validate the program.
2516          */
2517         if (!bpf_validate(fp->bf_insns, fp->bf_len)) {
2518                 pcap_snprintf(p->errbuf, sizeof(p->errbuf),
2519                         "BPF program is not valid");
2520                 return (-1);
2521         }
2522
2523         /*
2524          * Free up any already installed program.
2525          */
2526         pcap_freecode(&p->fcode);
2527
2528         prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
2529         p->fcode.bf_len = fp->bf_len;
2530         p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
2531         if (p->fcode.bf_insns == NULL) {
2532                 pcap_fmt_errmsg_for_errno(p->errbuf, sizeof(p->errbuf),
2533                     errno, "malloc");
2534                 return (-1);
2535         }
2536         memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
2537         return (0);
2538 }
2539
2540 #ifdef BDEBUG
2541 static void
2542 dot_dump_node(struct icode *ic, struct block *block, struct bpf_program *prog,
2543     FILE *out)
2544 {
2545         int icount, noffset;
2546         int i;
2547
2548         if (block == NULL || isMarked(ic, block))
2549                 return;
2550         Mark(ic, block);
2551
2552         icount = slength(block->stmts) + 1 + block->longjt + block->longjf;
2553         noffset = min(block->offset + icount, (int)prog->bf_len);
2554
2555         fprintf(out, "\tblock%d [shape=ellipse, id=\"block-%d\" label=\"BLOCK%d\\n", block->id, block->id, block->id);
2556         for (i = block->offset; i < noffset; i++) {
2557                 fprintf(out, "\\n%s", bpf_image(prog->bf_insns + i, i));
2558         }
2559         fprintf(out, "\" tooltip=\"");
2560         for (i = 0; i < BPF_MEMWORDS; i++)
2561                 if (block->val[i] != VAL_UNKNOWN)
2562                         fprintf(out, "val[%d]=%d ", i, block->val[i]);
2563         fprintf(out, "val[A]=%d ", block->val[A_ATOM]);
2564         fprintf(out, "val[X]=%d", block->val[X_ATOM]);
2565         fprintf(out, "\"");
2566         if (JT(block) == NULL)
2567                 fprintf(out, ", peripheries=2");
2568         fprintf(out, "];\n");
2569
2570         dot_dump_node(ic, JT(block), prog, out);
2571         dot_dump_node(ic, JF(block), prog, out);
2572 }
2573
2574 static void
2575 dot_dump_edge(struct icode *ic, struct block *block, FILE *out)
2576 {
2577         if (block == NULL || isMarked(ic, block))
2578                 return;
2579         Mark(ic, block);
2580
2581         if (JT(block)) {
2582                 fprintf(out, "\t\"block%d\":se -> \"block%d\":n [label=\"T\"]; \n",
2583                                 block->id, JT(block)->id);
2584                 fprintf(out, "\t\"block%d\":sw -> \"block%d\":n [label=\"F\"]; \n",
2585                            block->id, JF(block)->id);
2586         }
2587         dot_dump_edge(ic, JT(block), out);
2588         dot_dump_edge(ic, JF(block), out);
2589 }
2590
2591 /* Output the block CFG using graphviz/DOT language
2592  * In the CFG, block's code, value index for each registers at EXIT,
2593  * and the jump relationship is show.
2594  *
2595  * example DOT for BPF `ip src host 1.1.1.1' is:
2596     digraph BPF {
2597         block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh      [12]\n(001) jeq      #0x800           jt 2  jf 5" tooltip="val[A]=0 val[X]=0"];
2598         block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld       [26]\n(003) jeq      #0x1010101       jt 4  jf 5" tooltip="val[A]=0 val[X]=0"];
2599         block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret      #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
2600         block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret      #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
2601         "block0":se -> "block1":n [label="T"];
2602         "block0":sw -> "block3":n [label="F"];
2603         "block1":se -> "block2":n [label="T"];
2604         "block1":sw -> "block3":n [label="F"];
2605     }
2606  *
2607  *  After install graphviz on http://www.graphviz.org/, save it as bpf.dot
2608  *  and run `dot -Tpng -O bpf.dot' to draw the graph.
2609  */
2610 static int
2611 dot_dump(struct icode *ic, char *errbuf)
2612 {
2613         struct bpf_program f;
2614         FILE *out = stdout;
2615
2616         memset(bids, 0, sizeof bids);
2617         f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
2618         if (f.bf_insns == NULL)
2619                 return -1;
2620
2621         fprintf(out, "digraph BPF {\n");
2622         unMarkAll(ic);
2623         dot_dump_node(ic, ic->root, &f, out);
2624         unMarkAll(ic);
2625         dot_dump_edge(ic, ic->root, out);
2626         fprintf(out, "}\n");
2627
2628         free((char *)f.bf_insns);
2629         return 0;
2630 }
2631
2632 static int
2633 plain_dump(struct icode *ic, char *errbuf)
2634 {
2635         struct bpf_program f;
2636
2637         memset(bids, 0, sizeof bids);
2638         f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
2639         if (f.bf_insns == NULL)
2640                 return -1;
2641         bpf_dump(&f, 1);
2642         putchar('\n');
2643         free((char *)f.bf_insns);
2644         return 0;
2645 }
2646
2647 static void
2648 opt_dump(opt_state_t *opt_state, struct icode *ic)
2649 {
2650         int status;
2651         char errbuf[PCAP_ERRBUF_SIZE];
2652
2653         /*
2654          * If the CFG, in DOT format, is requested, output it rather than
2655          * the code that would be generated from that graph.
2656          */
2657         if (pcap_print_dot_graph)
2658                 status = dot_dump(ic, errbuf);
2659         else
2660                 status = plain_dump(ic, errbuf);
2661         if (status == -1)
2662                 opt_error(opt_state, "opt_dump: icode_to_fcode failed: %s", errbuf);
2663 }
2664 #endif