contrib/libpcap/optimize.c

   1 /*
   2  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that: (1) source code distributions
   7  * retain the above copyright notice and this paragraph in its entirety, (2)
   8  * distributions including binary code include the above copyright notice and
   9  * this paragraph in its entirety in the documentation or other materials
  10  * provided with the distribution, and (3) all advertising materials mentioning
  11  * features or use of this software display the following acknowledgement:
  12  * ``This product includes software developed by the University of California,
  13  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
  14  * the University nor the names of its contributors may be used to endorse
  15  * or promote products derived from this software without specific prior
  16  * written permission.
  17  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
  18  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
  19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  20  *
  21  *  Optimization module for BPF code intermediate representation.
  22  */
  23
  24 #ifdef HAVE_CONFIG_H
  25 #include <config.h>
  26 #endif
  27
  28 #include <pcap-types.h>
  29
  30 #include <stdio.h>
  31 #include <stdlib.h>
  32 #include <memory.h>
  33 #include <string.h>
  34
  35 #include <errno.h>
  36
  37 #include "pcap-int.h"
  38
  39 #include "gencode.h"
  40 #include "optimize.h"
  41
  42 #ifdef HAVE_OS_PROTO_H
  43 #include "os-proto.h"
  44 #endif
  45
  46 #ifdef BDEBUG
  47 /*
  48  * The internal "debug printout" flag for the filter expression optimizer.
  49  * The code to print that stuff is present only if BDEBUG is defined, so
  50  * the flag, and the routine to set it, are defined only if BDEBUG is
  51  * defined.
  52  */
  53 static int pcap_optimizer_debug;
  54
  55 /*
  56  * Routine to set that flag.
  57  *
  58  * This is intended for libpcap developers, not for general use.
  59  * If you want to set these in a program, you'll have to declare this
  60  * routine yourself, with the appropriate DLL import attribute on Windows;
  61  * it's not declared in any header file, and won't be declared in any
  62  * header file provided by libpcap.
  63  */
  64 PCAP_API void pcap_set_optimizer_debug(int value);
  65
  66 PCAP_API_DEF void
  67 pcap_set_optimizer_debug(int value)
  68 {
  69         pcap_optimizer_debug = value;
  70 }
  71
  72 /*
  73  * The internal "print dot graph" flag for the filter expression optimizer.
  74  * The code to print that stuff is present only if BDEBUG is defined, so
  75  * the flag, and the routine to set it, are defined only if BDEBUG is
  76  * defined.
  77  */
  78 static int pcap_print_dot_graph;
  79
  80 /*
  81  * Routine to set that flag.
  82  *
  83  * This is intended for libpcap developers, not for general use.
  84  * If you want to set these in a program, you'll have to declare this
  85  * routine yourself, with the appropriate DLL import attribute on Windows;
  86  * it's not declared in any header file, and won't be declared in any
  87  * header file provided by libpcap.
  88  */
  89 PCAP_API void pcap_set_print_dot_graph(int value);
  90
  91 PCAP_API_DEF void
  92 pcap_set_print_dot_graph(int value)
  93 {
  94         pcap_print_dot_graph = value;
  95 }
  96
  97 #endif
  98
  99 /*
 100  * lowest_set_bit().
 101  *
 102  * Takes a 32-bit integer as an argument.
 103  *
 104  * If handed a non-zero value, returns the index of the lowest set bit,
  105  * counting upwards from zero.
 106  *
 107  * If handed zero, the results are platform- and compiler-dependent.
 108  * Keep it out of the light, don't give it any water, don't feed it
 109  * after midnight, and don't pass zero to it.
 110  *
 111  * This is the same as the count of trailing zeroes in the word.
 112  */
 113 #if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
 114   /*
 115    * GCC 3.4 and later; we have __builtin_ctz().
 116    */
 117   #define lowest_set_bit(mask) __builtin_ctz(mask)
 118 #elif defined(_MSC_VER)
 119   /*
 120    * Visual Studio; we support only 2005 and later, so use
 121    * _BitScanForward().
 122    */
 123 #include <intrin.h>
 124
 125 #ifndef __clang__
 126 #pragma intrinsic(_BitScanForward)
 127 #endif
 128
 129 static __forceinline int
 130 lowest_set_bit(int mask)
 131 {
 132         unsigned long bit;
 133
 134         /*
 135          * Don't sign-extend mask if long is longer than int.
 136          * (It's currently not, in MSVC, even on 64-bit platforms, but....)
 137          */
 138         if (_BitScanForward(&bit, (unsigned int)mask) == 0)
 139                 return -1;      /* mask is zero */
 140         return (int)bit;
 141 }
 142 #elif defined(MSDOS) && defined(__DJGPP__)
 143   /*
 144    * MS-DOS with DJGPP, which declares ffs() in <string.h>, which
 145    * we've already included.
 146    */
 147   #define lowest_set_bit(mask)  (ffs((mask)) - 1)
 148 #elif (defined(MSDOS) && defined(__WATCOMC__)) || defined(STRINGS_H_DECLARES_FFS)
 149   /*
 150    * MS-DOS with Watcom C, which has <strings.h> and declares ffs() there,
 151    * or some other platform (UN*X conforming to a sufficient recent version
 152    * of the Single UNIX Specification).
 153    */
 154   #include <strings.h>
 155   #define lowest_set_bit(mask)  (ffs((mask)) - 1)
 156 #else
 157 /*
 158  * None of the above.
 159  * Use a perfect-hash-function-based function.
 160  */
 161 static int
 162 lowest_set_bit(int mask)
 163 {
 164         unsigned int v = (unsigned int)mask;
 165
 166         static const int MultiplyDeBruijnBitPosition[32] = {
 167                 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
 168                 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
 169         };
 170
 171         /*
 172          * We strip off all but the lowermost set bit (v & ~v),
 173          * and perform a minimal perfect hash on it to look up the
 174          * number of low-order zero bits in a table.
 175          *
 176          * See:
 177          *
 178          *      http://7ooo.mooo.com/text/ComputingTrailingZerosHOWTO.pdf
 179          *
 180          *      http://supertech.csail.mit.edu/papers/debruijn.pdf
 181          */
 182         return (MultiplyDeBruijnBitPosition[((v & -v) * 0x077CB531U) >> 27]);
 183 }
 184 #endif
 185
 186 /*
 187  * Represents a deleted instruction.
 188  */
 189 #define NOP -1
 190
 191 /*
 192  * Register numbers for use-def values.
 193  * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
 194  * location.  A_ATOM is the accumulator and X_ATOM is the index
 195  * register.
 196  */
 197 #define A_ATOM BPF_MEMWORDS
 198 #define X_ATOM (BPF_MEMWORDS+1)
 199
 200 /*
 201  * This define is used to represent *both* the accumulator and
 202  * x register in use-def computations.
 203  * Currently, the use-def code assumes only one definition per instruction.
 204  */
 205 #define AX_ATOM N_ATOMS
 206
 207 /*
  208  * These data structures are used in a Cocke and Schwartz style
 209  * value numbering scheme.  Since the flowgraph is acyclic,
 210  * exit values can be propagated from a node's predecessors
 211  * provided it is uniquely defined.
 212  */
 213 struct valnode {
 214         int code;
 215         int v0, v1;
 216         int val;
 217         struct valnode *next;
 218 };
 219
 220 /* Integer constants mapped with the load immediate opcode. */
 221 #define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0L)
 222
 223 struct vmapinfo {
 224         int is_const;
 225         bpf_int32 const_val;
 226 };
 227
 228 typedef struct {
 229         /*
 230          * A flag to indicate that further optimization is needed.
 231          * Iterative passes are continued until a given pass yields no
 232          * branch movement.
 233          */
 234         int done;
 235
 236         int n_blocks;
 237         struct block **blocks;
 238         int n_edges;
 239         struct edge **edges;
 240
 241         /*
 242          * A bit vector set representation of the dominators.
 243          * We round up the set size to the next power of two.
 244          */
 245         int nodewords;
 246         int edgewords;
 247         struct block **levels;
 248         bpf_u_int32 *space;
 249
 250 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
 251 /*
 252  * True if a is in uset {p}
 253  */
 254 #define SET_MEMBER(p, a) \
 255 ((p)[(unsigned)(a) / BITS_PER_WORD] & (1 << ((unsigned)(a) % BITS_PER_WORD)))
 256
 257 /*
 258  * Add 'a' to uset p.
 259  */
 260 #define SET_INSERT(p, a) \
 261 (p)[(unsigned)(a) / BITS_PER_WORD] |= (1 << ((unsigned)(a) % BITS_PER_WORD))
 262
 263 /*
 264  * Delete 'a' from uset p.
 265  */
 266 #define SET_DELETE(p, a) \
 267 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~(1 << ((unsigned)(a) % BITS_PER_WORD))
 268
 269 /*
 270  * a := a intersect b
 271  */
 272 #define SET_INTERSECT(a, b, n)\
 273 {\
 274         register bpf_u_int32 *_x = a, *_y = b;\
 275         register int _n = n;\
 276         while (--_n >= 0) *_x++ &= *_y++;\
 277 }
 278
 279 /*
 280  * a := a - b
 281  */
 282 #define SET_SUBTRACT(a, b, n)\
 283 {\
 284         register bpf_u_int32 *_x = a, *_y = b;\
 285         register int _n = n;\
 286         while (--_n >= 0) *_x++ &=~ *_y++;\
 287 }
 288
 289 /*
 290  * a := a union b
 291  */
 292 #define SET_UNION(a, b, n)\
 293 {\
 294         register bpf_u_int32 *_x = a, *_y = b;\
 295         register int _n = n;\
 296         while (--_n >= 0) *_x++ |= *_y++;\
 297 }
 298
 299         uset all_dom_sets;
 300         uset all_closure_sets;
 301         uset all_edge_sets;
 302
 303 #define MODULUS 213
 304         struct valnode *hashtbl[MODULUS];
 305         int curval;
 306         int maxval;
 307
 308         struct vmapinfo *vmap;
 309         struct valnode *vnode_base;
 310         struct valnode *next_vnode;
 311 } opt_state_t;
 312
 313 typedef struct {
 314         /*
 315          * Some pointers used to convert the basic block form of the code,
 316          * into the array form that BPF requires.  'fstart' will point to
 317          * the malloc'd array while 'ftail' is used during the recursive
 318          * traversal.
 319          */
 320         struct bpf_insn *fstart;
 321         struct bpf_insn *ftail;
 322 } conv_state_t;
 323
 324 static void opt_init(compiler_state_t *, opt_state_t *, struct icode *);
 325 static void opt_cleanup(opt_state_t *);
 326
 327 static void intern_blocks(opt_state_t *, struct icode *);
 328
 329 static void find_inedges(opt_state_t *, struct block *);
 330 #ifdef BDEBUG
 331 static void opt_dump(compiler_state_t *, struct icode *);
 332 #endif
 333
 334 #ifndef MAX
 335 #define MAX(a,b) ((a)>(b)?(a):(b))
 336 #endif
 337
 338 static void
 339 find_levels_r(opt_state_t *opt_state, struct icode *ic, struct block *b)
 340 {
 341         int level;
 342
 343         if (isMarked(ic, b))
 344                 return;
 345
 346         Mark(ic, b);
 347         b->link = 0;
 348
 349         if (JT(b)) {
 350                 find_levels_r(opt_state, ic, JT(b));
 351                 find_levels_r(opt_state, ic, JF(b));
 352                 level = MAX(JT(b)->level, JF(b)->level) + 1;
 353         } else
 354                 level = 0;
 355         b->level = level;
 356         b->link = opt_state->levels[level];
 357         opt_state->levels[level] = b;
 358 }
 359
 360 /*
 361  * Level graph.  The levels go from 0 at the leaves to
 362  * N_LEVELS at the root.  The opt_state->levels[] array points to the
 363  * first node of the level list, whose elements are linked
 364  * with the 'link' field of the struct block.
 365  */
 366 static void
 367 find_levels(opt_state_t *opt_state, struct icode *ic)
 368 {
 369         memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
 370         unMarkAll(ic);
 371         find_levels_r(opt_state, ic, ic->root);
 372 }
 373
 374 /*
 375  * Find dominator relationships.
 376  * Assumes graph has been leveled.
 377  */
 378 static void
 379 find_dom(opt_state_t *opt_state, struct block *root)
 380 {
 381         int i;
 382         struct block *b;
 383         bpf_u_int32 *x;
 384
 385         /*
 386          * Initialize sets to contain all nodes.
 387          */
 388         x = opt_state->all_dom_sets;
 389         i = opt_state->n_blocks * opt_state->nodewords;
 390         while (--i >= 0)
 391                 *x++ = 0xFFFFFFFFU;
 392         /* Root starts off empty. */
 393         for (i = opt_state->nodewords; --i >= 0;)
 394                 root->dom[i] = 0;
 395
 396         /* root->level is the highest level no found. */
 397         for (i = root->level; i >= 0; --i) {
 398                 for (b = opt_state->levels[i]; b; b = b->link) {
 399                         SET_INSERT(b->dom, b->id);
 400                         if (JT(b) == 0)
 401                                 continue;
 402                         SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords);
 403                         SET_INTERSECT(JF(b)->dom, b->dom, opt_state->nodewords);
 404                 }
 405         }
 406 }
 407
 408 static void
 409 propedom(opt_state_t *opt_state, struct edge *ep)
 410 {
 411         SET_INSERT(ep->edom, ep->id);
 412         if (ep->succ) {
 413                 SET_INTERSECT(ep->succ->et.edom, ep->edom, opt_state->edgewords);
 414                 SET_INTERSECT(ep->succ->ef.edom, ep->edom, opt_state->edgewords);
 415         }
 416 }
 417
 418 /*
 419  * Compute edge dominators.
 420  * Assumes graph has been leveled and predecessors established.
 421  */
 422 static void
 423 find_edom(opt_state_t *opt_state, struct block *root)
 424 {
 425         int i;
 426         uset x;
 427         struct block *b;
 428
 429         x = opt_state->all_edge_sets;
 430         for (i = opt_state->n_edges * opt_state->edgewords; --i >= 0; )
 431                 x[i] = 0xFFFFFFFFU;
 432
 433         /* root->level is the highest level no found. */
 434         memset(root->et.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
 435         memset(root->ef.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
 436         for (i = root->level; i >= 0; --i) {
 437                 for (b = opt_state->levels[i]; b != 0; b = b->link) {
 438                         propedom(opt_state, &b->et);
 439                         propedom(opt_state, &b->ef);
 440                 }
 441         }
 442 }
 443
 444 /*
 445  * Find the backwards transitive closure of the flow graph.  These sets
 446  * are backwards in the sense that we find the set of nodes that reach
 447  * a given node, not the set of nodes that can be reached by a node.
 448  *
 449  * Assumes graph has been leveled.
 450  */
 451 static void
 452 find_closure(opt_state_t *opt_state, struct block *root)
 453 {
 454         int i;
 455         struct block *b;
 456
 457         /*
 458          * Initialize sets to contain no nodes.
 459          */
 460         memset((char *)opt_state->all_closure_sets, 0,
 461               opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->all_closure_sets));
 462
 463         /* root->level is the highest level no found. */
 464         for (i = root->level; i >= 0; --i) {
 465                 for (b = opt_state->levels[i]; b; b = b->link) {
 466                         SET_INSERT(b->closure, b->id);
 467                         if (JT(b) == 0)
 468                                 continue;
 469                         SET_UNION(JT(b)->closure, b->closure, opt_state->nodewords);
 470                         SET_UNION(JF(b)->closure, b->closure, opt_state->nodewords);
 471                 }
 472         }
 473 }
 474
 475 /*
 476  * Return the register number that is used by s.  If A and X are both
 477  * used, return AX_ATOM.  If no register is used, return -1.
 478  *
 479  * The implementation should probably change to an array access.
 480  */
 481 static int
 482 atomuse(struct stmt *s)
 483 {
 484         register int c = s->code;
 485
 486         if (c == NOP)
 487                 return -1;
 488
 489         switch (BPF_CLASS(c)) {
 490
 491         case BPF_RET:
 492                 return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
 493                         (BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
 494
 495         case BPF_LD:
 496         case BPF_LDX:
 497                 return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
 498                         (BPF_MODE(c) == BPF_MEM) ? s->k : -1;
 499
 500         case BPF_ST:
 501                 return A_ATOM;
 502
 503         case BPF_STX:
 504                 return X_ATOM;
 505
 506         case BPF_JMP:
 507         case BPF_ALU:
 508                 if (BPF_SRC(c) == BPF_X)
 509                         return AX_ATOM;
 510                 return A_ATOM;
 511
 512         case BPF_MISC:
 513                 return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
 514         }
 515         abort();
 516         /* NOTREACHED */
 517 }
 518
 519 /*
 520  * Return the register number that is defined by 's'.  We assume that
 521  * a single stmt cannot define more than one register.  If no register
 522  * is defined, return -1.
 523  *
 524  * The implementation should probably change to an array access.
 525  */
 526 static int
 527 atomdef(struct stmt *s)
 528 {
 529         if (s->code == NOP)
 530                 return -1;
 531
 532         switch (BPF_CLASS(s->code)) {
 533
 534         case BPF_LD:
 535         case BPF_ALU:
 536                 return A_ATOM;
 537
 538         case BPF_LDX:
 539                 return X_ATOM;
 540
 541         case BPF_ST:
 542         case BPF_STX:
 543                 return s->k;
 544
 545         case BPF_MISC:
 546                 return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
 547         }
 548         return -1;
 549 }
 550
 551 /*
 552  * Compute the sets of registers used, defined, and killed by 'b'.
 553  *
 554  * "Used" means that a statement in 'b' uses the register before any
 555  * statement in 'b' defines it, i.e. it uses the value left in
 556  * that register by a predecessor block of this block.
 557  * "Defined" means that a statement in 'b' defines it.
 558  * "Killed" means that a statement in 'b' defines it before any
 559  * statement in 'b' uses it, i.e. it kills the value left in that
 560  * register by a predecessor block of this block.
 561  */
 562 static void
 563 compute_local_ud(struct block *b)
 564 {
 565         struct slist *s;
 566         atomset def = 0, use = 0, killed = 0;
 567         int atom;
 568
 569         for (s = b->stmts; s; s = s->next) {
 570                 if (s->s.code == NOP)
 571                         continue;
 572                 atom = atomuse(&s->s);
 573                 if (atom >= 0) {
 574                         if (atom == AX_ATOM) {
 575                                 if (!ATOMELEM(def, X_ATOM))
 576                                         use |= ATOMMASK(X_ATOM);
 577                                 if (!ATOMELEM(def, A_ATOM))
 578                                         use |= ATOMMASK(A_ATOM);
 579                         }
 580                         else if (atom < N_ATOMS) {
 581                                 if (!ATOMELEM(def, atom))
 582                                         use |= ATOMMASK(atom);
 583                         }
 584                         else
 585                                 abort();
 586                 }
 587                 atom = atomdef(&s->s);
 588                 if (atom >= 0) {
 589                         if (!ATOMELEM(use, atom))
 590                                 killed |= ATOMMASK(atom);
 591                         def |= ATOMMASK(atom);
 592                 }
 593         }
 594         if (BPF_CLASS(b->s.code) == BPF_JMP) {
 595                 /*
 596                  * XXX - what about RET?
 597                  */
 598                 atom = atomuse(&b->s);
 599                 if (atom >= 0) {
 600                         if (atom == AX_ATOM) {
 601                                 if (!ATOMELEM(def, X_ATOM))
 602                                         use |= ATOMMASK(X_ATOM);
 603                                 if (!ATOMELEM(def, A_ATOM))
 604                                         use |= ATOMMASK(A_ATOM);
 605                         }
 606                         else if (atom < N_ATOMS) {
 607                                 if (!ATOMELEM(def, atom))
 608                                         use |= ATOMMASK(atom);
 609                         }
 610                         else
 611                                 abort();
 612                 }
 613         }
 614
 615         b->def = def;
 616         b->kill = killed;
 617         b->in_use = use;
 618 }
 619
 620 /*
 621  * Assume graph is already leveled.
 622  */
 623 static void
 624 find_ud(opt_state_t *opt_state, struct block *root)
 625 {
 626         int i, maxlevel;
 627         struct block *p;
 628
 629         /*
 630          * root->level is the highest level no found;
 631          * count down from there.
 632          */
 633         maxlevel = root->level;
 634         for (i = maxlevel; i >= 0; --i)
 635                 for (p = opt_state->levels[i]; p; p = p->link) {
 636                         compute_local_ud(p);
 637                         p->out_use = 0;
 638                 }
 639
 640         for (i = 1; i <= maxlevel; ++i) {
 641                 for (p = opt_state->levels[i]; p; p = p->link) {
 642                         p->out_use |= JT(p)->in_use | JF(p)->in_use;
 643                         p->in_use |= p->out_use &~ p->kill;
 644                 }
 645         }
 646 }
 647 static void
 648 init_val(opt_state_t *opt_state)
 649 {
 650         opt_state->curval = 0;
 651         opt_state->next_vnode = opt_state->vnode_base;
 652         memset((char *)opt_state->vmap, 0, opt_state->maxval * sizeof(*opt_state->vmap));
 653         memset((char *)opt_state->hashtbl, 0, sizeof opt_state->hashtbl);
 654 }
 655
 656 /* Because we really don't have an IR, this stuff is a little messy. */
 657 static int
 658 F(opt_state_t *opt_state, int code, int v0, int v1)
 659 {
 660         u_int hash;
 661         int val;
 662         struct valnode *p;
 663
 664         hash = (u_int)code ^ (v0 << 4) ^ (v1 << 8);
 665         hash %= MODULUS;
 666
 667         for (p = opt_state->hashtbl[hash]; p; p = p->next)
 668                 if (p->code == code && p->v0 == v0 && p->v1 == v1)
 669                         return p->val;
 670
 671         val = ++opt_state->curval;
 672         if (BPF_MODE(code) == BPF_IMM &&
 673             (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
 674                 opt_state->vmap[val].const_val = v0;
 675                 opt_state->vmap[val].is_const = 1;
 676         }
 677         p = opt_state->next_vnode++;
 678         p->val = val;
 679         p->code = code;
 680         p->v0 = v0;
 681         p->v1 = v1;
 682         p->next = opt_state->hashtbl[hash];
 683         opt_state->hashtbl[hash] = p;
 684
 685         return val;
 686 }
 687
 688 static inline void
 689 vstore(struct stmt *s, int *valp, int newval, int alter)
 690 {
 691         if (alter && newval != VAL_UNKNOWN && *valp == newval)
 692                 s->code = NOP;
 693         else
 694                 *valp = newval;
 695 }
 696
 697 /*
 698  * Do constant-folding on binary operators.
 699  * (Unary operators are handled elsewhere.)
 700  */
 701 static void
 702 fold_op(compiler_state_t *cstate, opt_state_t *opt_state,
 703     struct stmt *s, int v0, int v1)
 704 {
 705         bpf_u_int32 a, b;
 706
 707         a = opt_state->vmap[v0].const_val;
 708         b = opt_state->vmap[v1].const_val;
 709
 710         switch (BPF_OP(s->code)) {
 711         case BPF_ADD:
 712                 a += b;
 713                 break;
 714
 715         case BPF_SUB:
 716                 a -= b;
 717                 break;
 718
 719         case BPF_MUL:
 720                 a *= b;
 721                 break;
 722
 723         case BPF_DIV:
 724                 if (b == 0)
 725                         bpf_error(cstate, "division by zero");
 726                 a /= b;
 727                 break;
 728
 729         case BPF_MOD:
 730                 if (b == 0)
 731                         bpf_error(cstate, "modulus by zero");
 732                 a %= b;
 733                 break;
 734
 735         case BPF_AND:
 736                 a &= b;
 737                 break;
 738
 739         case BPF_OR:
 740                 a |= b;
 741                 break;
 742
 743         case BPF_XOR:
 744                 a ^= b;
 745                 break;
 746
 747         case BPF_LSH:
 748                 a <<= b;
 749                 break;
 750
 751         case BPF_RSH:
 752                 a >>= b;
 753                 break;
 754
 755         default:
 756                 abort();
 757         }
 758         s->k = a;
 759         s->code = BPF_LD|BPF_IMM;
 760         opt_state->done = 0;
 761 }
 762
 763 static inline struct slist *
 764 this_op(struct slist *s)
 765 {
 766         while (s != 0 && s->s.code == NOP)
 767                 s = s->next;
 768         return s;
 769 }
 770
 771 static void
 772 opt_not(struct block *b)
 773 {
 774         struct block *tmp = JT(b);
 775
 776         JT(b) = JF(b);
 777         JF(b) = tmp;
 778 }
 779
 780 static void
 781 opt_peep(opt_state_t *opt_state, struct block *b)
 782 {
 783         struct slist *s;
 784         struct slist *next, *last;
 785         int val;
 786
 787         s = b->stmts;
 788         if (s == 0)
 789                 return;
 790
 791         last = s;
 792         for (/*empty*/; /*empty*/; s = next) {
 793                 /*
 794                  * Skip over nops.
 795                  */
 796                 s = this_op(s);
 797                 if (s == 0)
 798                         break;  /* nothing left in the block */
 799
 800                 /*
 801                  * Find the next real instruction after that one
 802                  * (skipping nops).
 803                  */
 804                 next = this_op(s->next);
 805                 if (next == 0)
 806                         break;  /* no next instruction */
 807                 last = next;
 808
 809                 /*
 810                  * st  M[k]     -->     st  M[k]
 811                  * ldx M[k]             tax
 812                  */
 813                 if (s->s.code == BPF_ST &&
 814                     next->s.code == (BPF_LDX|BPF_MEM) &&
 815                     s->s.k == next->s.k) {
 816                         opt_state->done = 0;
 817                         next->s.code = BPF_MISC|BPF_TAX;
 818                 }
 819                 /*
 820                  * ld  #k       -->     ldx  #k
 821                  * tax                  txa
 822                  */
 823                 if (s->s.code == (BPF_LD|BPF_IMM) &&
 824                     next->s.code == (BPF_MISC|BPF_TAX)) {
 825                         s->s.code = BPF_LDX|BPF_IMM;
 826                         next->s.code = BPF_MISC|BPF_TXA;
 827                         opt_state->done = 0;
 828                 }
 829                 /*
 830                  * This is an ugly special case, but it happens
 831                  * when you say tcp[k] or udp[k] where k is a constant.
 832                  */
 833                 if (s->s.code == (BPF_LD|BPF_IMM)) {
 834                         struct slist *add, *tax, *ild;
 835
 836                         /*
 837                          * Check that X isn't used on exit from this
 838                          * block (which the optimizer might cause).
 839                          * We know the code generator won't generate
 840                          * any local dependencies.
 841                          */
 842                         if (ATOMELEM(b->out_use, X_ATOM))
 843                                 continue;
 844
 845                         /*
 846                          * Check that the instruction following the ldi
 847                          * is an addx, or it's an ldxms with an addx
 848                          * following it (with 0 or more nops between the
 849                          * ldxms and addx).
 850                          */
 851                         if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
 852                                 add = next;
 853                         else
 854                                 add = this_op(next->next);
 855                         if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
 856                                 continue;
 857
 858                         /*
 859                          * Check that a tax follows that (with 0 or more
 860                          * nops between them).
 861                          */
 862                         tax = this_op(add->next);
 863                         if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
 864                                 continue;
 865
 866                         /*
 867                          * Check that an ild follows that (with 0 or more
 868                          * nops between them).
 869                          */
 870                         ild = this_op(tax->next);
 871                         if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
 872                             BPF_MODE(ild->s.code) != BPF_IND)
 873                                 continue;
 874                         /*
 875                          * We want to turn this sequence:
 876                          *
 877                          * (004) ldi     #0x2           {s}
 878                          * (005) ldxms   [14]           {next}  -- optional
 879                          * (006) addx                   {add}
 880                          * (007) tax                    {tax}
 881                          * (008) ild     [x+0]          {ild}
 882                          *
 883                          * into this sequence:
 884                          *
 885                          * (004) nop
 886                          * (005) ldxms   [14]
 887                          * (006) nop
 888                          * (007) nop
 889                          * (008) ild     [x+2]
 890                          *
 891                          * XXX We need to check that X is not
 892                          * subsequently used, because we want to change
 893                          * what'll be in it after this sequence.
 894                          *
 895                          * We know we can eliminate the accumulator
 896                          * modifications earlier in the sequence since
 897                          * it is defined by the last stmt of this sequence
 898                          * (i.e., the last statement of the sequence loads
 899                          * a value into the accumulator, so we can eliminate
 900                          * earlier operations on the accumulator).
 901                          */
 902                         ild->s.k += s->s.k;
 903                         s->s.code = NOP;
 904                         add->s.code = NOP;
 905                         tax->s.code = NOP;
 906                         opt_state->done = 0;
 907                 }
 908         }
 909         /*
 910          * If the comparison at the end of a block is an equality
 911          * comparison against a constant, and nobody uses the value
 912          * we leave in the A register at the end of a block, and
 913          * the operation preceding the comparison is an arithmetic
 914          * operation, we can sometimes optimize it away.
 915          */
 916         if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
 917             !ATOMELEM(b->out_use, A_ATOM)) {
 918                 /*
 919                  * We can optimize away certain subtractions of the
 920                  * X register.
 921                  */
 922                 if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
 923                         val = b->val[X_ATOM];
 924                         if (opt_state->vmap[val].is_const) {
 925                                 /*
 926                                  * If we have a subtract to do a comparison,
 927                                  * and the X register is a known constant,
 928                                  * we can merge this value into the
 929                                  * comparison:
 930                                  *
 931                                  * sub x  ->    nop
 932                                  * jeq #y       jeq #(x+y)
 933                                  */
 934                                 b->s.k += opt_state->vmap[val].const_val;
 935                                 last->s.code = NOP;
 936                                 opt_state->done = 0;
 937                         } else if (b->s.k == 0) {
 938                                 /*
 939                                  * If the X register isn't a constant,
 940                                  * and the comparison in the test is
 941                                  * against 0, we can compare with the
 942                                  * X register, instead:
 943                                  *
 944                                  * sub x  ->    nop
 945                                  * jeq #0       jeq x
 946                                  */
 947                                 last->s.code = NOP;
 948                                 b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
 949                                 opt_state->done = 0;
 950                         }
 951                 }
 952                 /*
 953                  * Likewise, a constant subtract can be simplified:
 954                  *
 955                  * sub #x ->    nop
 956                  * jeq #y ->    jeq #(x+y)
 957                  */
 958                 else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
 959                         last->s.code = NOP;
 960                         b->s.k += last->s.k;
 961                         opt_state->done = 0;
 962                 }
 963                 /*
 964                  * And, similarly, a constant AND can be simplified
 965                  * if we're testing against 0, i.e.:
 966                  *
 967                  * and #k       nop
 968                  * jeq #0  ->   jset #k
 969                  */
 970                 else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
 971                     b->s.k == 0) {
 972                         b->s.k = last->s.k;
 973                         b->s.code = BPF_JMP|BPF_K|BPF_JSET;
 974                         last->s.code = NOP;
 975                         opt_state->done = 0;
 976                         opt_not(b);
 977                 }
 978         }
 979         /*
 980          * jset #0        ->   never
 981          * jset #ffffffff ->   always
 982          */
 983         if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
 984                 if (b->s.k == 0)
 985                         JT(b) = JF(b);
 986                 if ((u_int)b->s.k == 0xffffffffU)
 987                         JF(b) = JT(b);
 988         }
 989         /*
 990          * If we're comparing against the index register, and the index
 991          * register is a known constant, we can just compare against that
 992          * constant.
 993          */
 994         val = b->val[X_ATOM];
 995         if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X) {
 996                 bpf_int32 v = opt_state->vmap[val].const_val;
 997                 b->s.code &= ~BPF_X;
 998                 b->s.k = v;
 999         }
1000         /*
1001          * If the accumulator is a known constant, we can compute the
1002          * comparison result.
1003          */
1004         val = b->val[A_ATOM];
1005         if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
1006                 bpf_int32 v = opt_state->vmap[val].const_val;
1007                 switch (BPF_OP(b->s.code)) {
1008
1009                 case BPF_JEQ:
1010                         v = v == b->s.k;
1011                         break;
1012
1013                 case BPF_JGT:
1014                         v = (unsigned)v > (unsigned)b->s.k;
1015                         break;
1016
1017                 case BPF_JGE:
1018                         v = (unsigned)v >= (unsigned)b->s.k;
1019                         break;
1020
1021                 case BPF_JSET:
1022                         v &= b->s.k;
1023                         break;
1024
1025                 default:
1026                         abort();
1027                 }
1028                 if (JF(b) != JT(b))
1029                         opt_state->done = 0;
1030                 if (v)
1031                         JF(b) = JT(b);
1032                 else
1033                         JT(b) = JF(b);
1034         }
1035 }
1036
1037 /*
1038  * Compute the symbolic value of expression of 's', and update
1039  * anything it defines in the value table 'val'.  If 'alter' is true,
1040  * do various optimizations.  This code would be cleaner if symbolic
1041  * evaluation and code transformations weren't folded together.
1042  */
1043 static void
1044 opt_stmt(compiler_state_t *cstate, opt_state_t *opt_state,
1045     struct stmt *s, int val[], int alter)
1046 {
1047         int op;
1048         int v;
1049
1050         switch (s->code) {
1051
1052         case BPF_LD|BPF_ABS|BPF_W:
1053         case BPF_LD|BPF_ABS|BPF_H:
1054         case BPF_LD|BPF_ABS|BPF_B:
1055                 v = F(opt_state, s->code, s->k, 0L);
1056                 vstore(s, &val[A_ATOM], v, alter);
1057                 break;
1058
1059         case BPF_LD|BPF_IND|BPF_W:
1060         case BPF_LD|BPF_IND|BPF_H:
1061         case BPF_LD|BPF_IND|BPF_B:
1062                 v = val[X_ATOM];
1063                 if (alter && opt_state->vmap[v].is_const) {
1064                         s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
1065                         s->k += opt_state->vmap[v].const_val;
1066                         v = F(opt_state, s->code, s->k, 0L);
1067                         opt_state->done = 0;
1068                 }
1069                 else
1070                         v = F(opt_state, s->code, s->k, v);
1071                 vstore(s, &val[A_ATOM], v, alter);
1072                 break;
1073
1074         case BPF_LD|BPF_LEN:
1075                 v = F(opt_state, s->code, 0L, 0L);
1076                 vstore(s, &val[A_ATOM], v, alter);
1077                 break;
1078
1079         case BPF_LD|BPF_IMM:
1080                 v = K(s->k);
1081                 vstore(s, &val[A_ATOM], v, alter);
1082                 break;
1083
1084         case BPF_LDX|BPF_IMM:
1085                 v = K(s->k);
1086                 vstore(s, &val[X_ATOM], v, alter);
1087                 break;
1088
1089         case BPF_LDX|BPF_MSH|BPF_B:
1090                 v = F(opt_state, s->code, s->k, 0L);
1091                 vstore(s, &val[X_ATOM], v, alter);
1092                 break;
1093
1094         case BPF_ALU|BPF_NEG:
1095                 if (alter && opt_state->vmap[val[A_ATOM]].is_const) {
1096                         s->code = BPF_LD|BPF_IMM;
1097                         s->k = -opt_state->vmap[val[A_ATOM]].const_val;
1098                         val[A_ATOM] = K(s->k);
1099                 }
1100                 else
1101                         val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], 0L);
1102                 break;
1103
1104         case BPF_ALU|BPF_ADD|BPF_K:
1105         case BPF_ALU|BPF_SUB|BPF_K:
1106         case BPF_ALU|BPF_MUL|BPF_K:
1107         case BPF_ALU|BPF_DIV|BPF_K:
1108         case BPF_ALU|BPF_MOD|BPF_K:
1109         case BPF_ALU|BPF_AND|BPF_K:
1110         case BPF_ALU|BPF_OR|BPF_K:
1111         case BPF_ALU|BPF_XOR|BPF_K:
1112         case BPF_ALU|BPF_LSH|BPF_K:
1113         case BPF_ALU|BPF_RSH|BPF_K:
1114                 op = BPF_OP(s->code);
1115                 if (alter) {
1116                         if (s->k == 0) {
1117                                 /* don't optimize away "sub #0"
1118                                  * as it may be needed later to
1119                                  * fixup the generated math code */
1120                                 if (op == BPF_ADD ||
1121                                     op == BPF_LSH || op == BPF_RSH ||
1122                                     op == BPF_OR || op == BPF_XOR) {
1123                                         s->code = NOP;
1124                                         break;
1125                                 }
1126                                 if (op == BPF_MUL || op == BPF_AND) {
1127                                         s->code = BPF_LD|BPF_IMM;
1128                                         val[A_ATOM] = K(s->k);
1129                                         break;
1130                                 }
1131                         }
1132                         if (opt_state->vmap[val[A_ATOM]].is_const) {
1133                                 fold_op(cstate, opt_state, s, val[A_ATOM], K(s->k));
1134                                 val[A_ATOM] = K(s->k);
1135                                 break;
1136                         }
1137                 }
1138                 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], K(s->k));
1139                 break;
1140
1141         case BPF_ALU|BPF_ADD|BPF_X:
1142         case BPF_ALU|BPF_SUB|BPF_X:
1143         case BPF_ALU|BPF_MUL|BPF_X:
1144         case BPF_ALU|BPF_DIV|BPF_X:
1145         case BPF_ALU|BPF_MOD|BPF_X:
1146         case BPF_ALU|BPF_AND|BPF_X:
1147         case BPF_ALU|BPF_OR|BPF_X:
1148         case BPF_ALU|BPF_XOR|BPF_X:
1149         case BPF_ALU|BPF_LSH|BPF_X:
1150         case BPF_ALU|BPF_RSH|BPF_X:
1151                 op = BPF_OP(s->code);
1152                 if (alter && opt_state->vmap[val[X_ATOM]].is_const) {
1153                         if (opt_state->vmap[val[A_ATOM]].is_const) {
1154                                 fold_op(cstate, opt_state, s, val[A_ATOM], val[X_ATOM]);
1155                                 val[A_ATOM] = K(s->k);
1156                         }
1157                         else {
1158                                 s->code = BPF_ALU|BPF_K|op;
1159                                 s->k = opt_state->vmap[val[X_ATOM]].const_val;
1160                                 opt_state->done = 0;
1161                                 val[A_ATOM] =
1162                                         F(opt_state, s->code, val[A_ATOM], K(s->k));
1163                         }
1164                         break;
1165                 }
1166                 /*
1167                  * Check if we're doing something to an accumulator
1168                  * that is 0, and simplify.  This may not seem like
1169                  * much of a simplification but it could open up further
1170                  * optimizations.
1171                  * XXX We could also check for mul by 1, etc.
1172                  */
1173                 if (alter && opt_state->vmap[val[A_ATOM]].is_const
1174                     && opt_state->vmap[val[A_ATOM]].const_val == 0) {
1175                         if (op == BPF_ADD || op == BPF_OR || op == BPF_XOR) {
1176                                 s->code = BPF_MISC|BPF_TXA;
1177                                 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1178                                 break;
1179                         }
1180                         else if (op == BPF_MUL || op == BPF_DIV || op == BPF_MOD ||
1181                                  op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
1182                                 s->code = BPF_LD|BPF_IMM;
1183                                 s->k = 0;
1184                                 vstore(s, &val[A_ATOM], K(s->k), alter);
1185                                 break;
1186                         }
1187                         else if (op == BPF_NEG) {
1188                                 s->code = NOP;
1189                                 break;
1190                         }
1191                 }
1192                 val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], val[X_ATOM]);
1193                 break;
1194
1195         case BPF_MISC|BPF_TXA:
1196                 vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1197                 break;
1198
1199         case BPF_LD|BPF_MEM:
1200                 v = val[s->k];
1201                 if (alter && opt_state->vmap[v].is_const) {
1202                         s->code = BPF_LD|BPF_IMM;
1203                         s->k = opt_state->vmap[v].const_val;
1204                         opt_state->done = 0;
1205                 }
1206                 vstore(s, &val[A_ATOM], v, alter);
1207                 break;
1208
1209         case BPF_MISC|BPF_TAX:
1210                 vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1211                 break;
1212
1213         case BPF_LDX|BPF_MEM:
1214                 v = val[s->k];
1215                 if (alter && opt_state->vmap[v].is_const) {
1216                         s->code = BPF_LDX|BPF_IMM;
1217                         s->k = opt_state->vmap[v].const_val;
1218                         opt_state->done = 0;
1219                 }
1220                 vstore(s, &val[X_ATOM], v, alter);
1221                 break;
1222
1223         case BPF_ST:
1224                 vstore(s, &val[s->k], val[A_ATOM], alter);
1225                 break;
1226
1227         case BPF_STX:
1228                 vstore(s, &val[s->k], val[X_ATOM], alter);
1229                 break;
1230         }
1231 }
1232
1233 static void
1234 deadstmt(opt_state_t *opt_state, register struct stmt *s, register struct stmt *last[])
1235 {
1236         register int atom;
1237
1238         atom = atomuse(s);
1239         if (atom >= 0) {
1240                 if (atom == AX_ATOM) {
1241                         last[X_ATOM] = 0;
1242                         last[A_ATOM] = 0;
1243                 }
1244                 else
1245                         last[atom] = 0;
1246         }
1247         atom = atomdef(s);
1248         if (atom >= 0) {
1249                 if (last[atom]) {
1250                         opt_state->done = 0;
1251                         last[atom]->code = NOP;
1252                 }
1253                 last[atom] = s;
1254         }
1255 }
1256
1257 static void
1258 opt_deadstores(opt_state_t *opt_state, register struct block *b)
1259 {
1260         register struct slist *s;
1261         register int atom;
1262         struct stmt *last[N_ATOMS];
1263
1264         memset((char *)last, 0, sizeof last);
1265
1266         for (s = b->stmts; s != 0; s = s->next)
1267                 deadstmt(opt_state, &s->s, last);
1268         deadstmt(opt_state, &b->s, last);
1269
1270         for (atom = 0; atom < N_ATOMS; ++atom)
1271                 if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1272                         last[atom]->code = NOP;
1273                         opt_state->done = 0;
1274                 }
1275 }
1276
1277 static void
1278 opt_blk(compiler_state_t *cstate, opt_state_t *opt_state,
1279     struct block *b, int do_stmts)
1280 {
1281         struct slist *s;
1282         struct edge *p;
1283         int i;
1284         bpf_int32 aval, xval;
1285
1286 #if 0
1287         for (s = b->stmts; s && s->next; s = s->next)
1288                 if (BPF_CLASS(s->s.code) == BPF_JMP) {
1289                         do_stmts = 0;
1290                         break;
1291                 }
1292 #endif
1293
1294         /*
1295          * Initialize the atom values.
1296          */
1297         p = b->in_edges;
1298         if (p == 0) {
1299                 /*
1300                  * We have no predecessors, so everything is undefined
1301                  * upon entry to this block.
1302                  */
1303                 memset((char *)b->val, 0, sizeof(b->val));
1304         } else {
1305                 /*
1306                  * Inherit values from our predecessors.
1307                  *
1308                  * First, get the values from the predecessor along the
1309                  * first edge leading to this node.
1310                  */
1311                 memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1312                 /*
1313                  * Now look at all the other nodes leading to this node.
1314                  * If, for the predecessor along that edge, a register
1315                  * has a different value from the one we have (i.e.,
1316                  * control paths are merging, and the merging paths
1317                  * assign different values to that register), give the
1318                  * register the undefined value of 0.
1319                  */
1320                 while ((p = p->next) != NULL) {
1321                         for (i = 0; i < N_ATOMS; ++i)
1322                                 if (b->val[i] != p->pred->val[i])
1323                                         b->val[i] = 0;
1324                 }
1325         }
1326         aval = b->val[A_ATOM];
1327         xval = b->val[X_ATOM];
1328         for (s = b->stmts; s; s = s->next)
1329                 opt_stmt(cstate, opt_state, &s->s, b->val, do_stmts);
1330
1331         /*
1332          * This is a special case: if we don't use anything from this
1333          * block, and we load the accumulator or index register with a
1334          * value that is already there, or if this block is a return,
1335          * eliminate all the statements.
1336          *
1337          * XXX - what if it does a store?
1338          *
1339          * XXX - why does it matter whether we use anything from this
1340          * block?  If the accumulator or index register doesn't change
1341          * its value, isn't that OK even if we use that value?
1342          *
1343          * XXX - if we load the accumulator with a different value,
1344          * and the block ends with a conditional branch, we obviously
1345          * can't eliminate it, as the branch depends on that value.
1346          * For the index register, the conditional branch only depends
1347          * on the index register value if the test is against the index
1348          * register value rather than a constant; if nothing uses the
1349          * value we put into the index register, and we're not testing
1350          * against the index register's value, and there aren't any
1351          * other problems that would keep us from eliminating this
1352          * block, can we eliminate it?
1353          */
1354         if (do_stmts &&
1355             ((b->out_use == 0 &&
1356               aval != VAL_UNKNOWN && b->val[A_ATOM] == aval &&
1357               xval != VAL_UNKNOWN && b->val[X_ATOM] == xval) ||
1358              BPF_CLASS(b->s.code) == BPF_RET)) {
1359                 if (b->stmts != 0) {
1360                         b->stmts = 0;
1361                         opt_state->done = 0;
1362                 }
1363         } else {
1364                 opt_peep(opt_state, b);
1365                 opt_deadstores(opt_state, b);
1366         }
1367         /*
1368          * Set up values for branch optimizer.
1369          */
1370         if (BPF_SRC(b->s.code) == BPF_K)
1371                 b->oval = K(b->s.k);
1372         else
1373                 b->oval = b->val[X_ATOM];
1374         b->et.code = b->s.code;
1375         b->ef.code = -b->s.code;
1376 }
1377
1378 /*
1379  * Return true if any register that is used on exit from 'succ', has
1380  * an exit value that is different from the corresponding exit value
1381  * from 'b'.
1382  */
1383 static int
1384 use_conflict(struct block *b, struct block *succ)
1385 {
1386         int atom;
1387         atomset use = succ->out_use;
1388
1389         if (use == 0)
1390                 return 0;
1391
1392         for (atom = 0; atom < N_ATOMS; ++atom)
1393                 if (ATOMELEM(use, atom))
1394                         if (b->val[atom] != succ->val[atom])
1395                                 return 1;
1396         return 0;
1397 }
1398
1399 static struct block *
1400 fold_edge(struct block *child, struct edge *ep)
1401 {
1402         int sense;
1403         int aval0, aval1, oval0, oval1;
1404         int code = ep->code;
1405
1406         if (code < 0) {
1407                 code = -code;
1408                 sense = 0;
1409         } else
1410                 sense = 1;
1411
1412         if (child->s.code != code)
1413                 return 0;
1414
1415         aval0 = child->val[A_ATOM];
1416         oval0 = child->oval;
1417         aval1 = ep->pred->val[A_ATOM];
1418         oval1 = ep->pred->oval;
1419
1420         if (aval0 != aval1)
1421                 return 0;
1422
1423         if (oval0 == oval1)
1424                 /*
1425                  * The operands of the branch instructions are
1426                  * identical, so the result is true if a true
1427                  * branch was taken to get here, otherwise false.
1428                  */
1429                 return sense ? JT(child) : JF(child);
1430
1431         if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1432                 /*
1433                  * At this point, we only know the comparison if we
1434                  * came down the true branch, and it was an equality
1435                  * comparison with a constant.
1436                  *
1437                  * I.e., if we came down the true branch, and the branch
1438                  * was an equality comparison with a constant, we know the
1439                  * accumulator contains that constant.  If we came down
1440                  * the false branch, or the comparison wasn't with a
1441                  * constant, we don't know what was in the accumulator.
1442                  *
1443                  * We rely on the fact that distinct constants have distinct
1444                  * value numbers.
1445                  */
1446                 return JF(child);
1447
1448         return 0;
1449 }
1450
1451 static void
1452 opt_j(opt_state_t *opt_state, struct edge *ep)
1453 {
1454         register int i, k;
1455         register struct block *target;
1456
1457         if (JT(ep->succ) == 0)
1458                 return;
1459
1460         if (JT(ep->succ) == JF(ep->succ)) {
1461                 /*
1462                  * Common branch targets can be eliminated, provided
1463                  * there is no data dependency.
1464                  */
1465                 if (!use_conflict(ep->pred, ep->succ->et.succ)) {
1466                         opt_state->done = 0;
1467                         ep->succ = JT(ep->succ);
1468                 }
1469         }
1470         /*
1471          * For each edge dominator that matches the successor of this
1472          * edge, promote the edge successor to the its grandchild.
1473          *
1474          * XXX We violate the set abstraction here in favor a reasonably
1475          * efficient loop.
1476          */
1477  top:
1478         for (i = 0; i < opt_state->edgewords; ++i) {
1479                 register bpf_u_int32 x = ep->edom[i];
1480
1481                 while (x != 0) {
1482                         k = lowest_set_bit(x);
1483                         x &=~ (1 << k);
1484                         k += i * BITS_PER_WORD;
1485
1486                         target = fold_edge(ep->succ, opt_state->edges[k]);
1487                         /*
1488                          * Check that there is no data dependency between
1489                          * nodes that will be violated if we move the edge.
1490                          */
1491                         if (target != 0 && !use_conflict(ep->pred, target)) {
1492                                 opt_state->done = 0;
1493                                 ep->succ = target;
1494                                 if (JT(target) != 0)
1495                                         /*
1496                                          * Start over unless we hit a leaf.
1497                                          */
1498                                         goto top;
1499                                 return;
1500                         }
1501                 }
1502         }
1503 }
1504
1505
1506 static void
1507 or_pullup(opt_state_t *opt_state, struct block *b)
1508 {
1509         int val, at_top;
1510         struct block *pull;
1511         struct block **diffp, **samep;
1512         struct edge *ep;
1513
1514         ep = b->in_edges;
1515         if (ep == 0)
1516                 return;
1517
1518         /*
1519          * Make sure each predecessor loads the same value.
1520          * XXX why?
1521          */
1522         val = ep->pred->val[A_ATOM];
1523         for (ep = ep->next; ep != 0; ep = ep->next)
1524                 if (val != ep->pred->val[A_ATOM])
1525                         return;
1526
1527         if (JT(b->in_edges->pred) == b)
1528                 diffp = &JT(b->in_edges->pred);
1529         else
1530                 diffp = &JF(b->in_edges->pred);
1531
1532         at_top = 1;
1533         for (;;) {
1534                 if (*diffp == 0)
1535                         return;
1536
1537                 if (JT(*diffp) != JT(b))
1538                         return;
1539
1540                 if (!SET_MEMBER((*diffp)->dom, b->id))
1541                         return;
1542
1543                 if ((*diffp)->val[A_ATOM] != val)
1544                         break;
1545
1546                 diffp = &JF(*diffp);
1547                 at_top = 0;
1548         }
1549         samep = &JF(*diffp);
1550         for (;;) {
1551                 if (*samep == 0)
1552                         return;
1553
1554                 if (JT(*samep) != JT(b))
1555                         return;
1556
1557                 if (!SET_MEMBER((*samep)->dom, b->id))
1558                         return;
1559
1560                 if ((*samep)->val[A_ATOM] == val)
1561                         break;
1562
1563                 /* XXX Need to check that there are no data dependencies
1564                    between dp0 and dp1.  Currently, the code generator
1565                    will not produce such dependencies. */
1566                 samep = &JF(*samep);
1567         }
1568 #ifdef notdef
1569         /* XXX This doesn't cover everything. */
1570         for (i = 0; i < N_ATOMS; ++i)
1571                 if ((*samep)->val[i] != pred->val[i])
1572                         return;
1573 #endif
1574         /* Pull up the node. */
1575         pull = *samep;
1576         *samep = JF(pull);
1577         JF(pull) = *diffp;
1578
1579         /*
1580          * At the top of the chain, each predecessor needs to point at the
1581          * pulled up node.  Inside the chain, there is only one predecessor
1582          * to worry about.
1583          */
1584         if (at_top) {
1585                 for (ep = b->in_edges; ep != 0; ep = ep->next) {
1586                         if (JT(ep->pred) == b)
1587                                 JT(ep->pred) = pull;
1588                         else
1589                                 JF(ep->pred) = pull;
1590                 }
1591         }
1592         else
1593                 *diffp = pull;
1594
1595         opt_state->done = 0;
1596 }
1597
1598 static void
1599 and_pullup(opt_state_t *opt_state, struct block *b)
1600 {
1601         int val, at_top;
1602         struct block *pull;
1603         struct block **diffp, **samep;
1604         struct edge *ep;
1605
1606         ep = b->in_edges;
1607         if (ep == 0)
1608                 return;
1609
1610         /*
1611          * Make sure each predecessor loads the same value.
1612          */
1613         val = ep->pred->val[A_ATOM];
1614         for (ep = ep->next; ep != 0; ep = ep->next)
1615                 if (val != ep->pred->val[A_ATOM])
1616                         return;
1617
1618         if (JT(b->in_edges->pred) == b)
1619                 diffp = &JT(b->in_edges->pred);
1620         else
1621                 diffp = &JF(b->in_edges->pred);
1622
1623         at_top = 1;
1624         for (;;) {
1625                 if (*diffp == 0)
1626                         return;
1627
1628                 if (JF(*diffp) != JF(b))
1629                         return;
1630
1631                 if (!SET_MEMBER((*diffp)->dom, b->id))
1632                         return;
1633
1634                 if ((*diffp)->val[A_ATOM] != val)
1635                         break;
1636
1637                 diffp = &JT(*diffp);
1638                 at_top = 0;
1639         }
1640         samep = &JT(*diffp);
1641         for (;;) {
1642                 if (*samep == 0)
1643                         return;
1644
1645                 if (JF(*samep) != JF(b))
1646                         return;
1647
1648                 if (!SET_MEMBER((*samep)->dom, b->id))
1649                         return;
1650
1651                 if ((*samep)->val[A_ATOM] == val)
1652                         break;
1653
1654                 /* XXX Need to check that there are no data dependencies
1655                    between diffp and samep.  Currently, the code generator
1656                    will not produce such dependencies. */
1657                 samep = &JT(*samep);
1658         }
1659 #ifdef notdef
1660         /* XXX This doesn't cover everything. */
1661         for (i = 0; i < N_ATOMS; ++i)
1662                 if ((*samep)->val[i] != pred->val[i])
1663                         return;
1664 #endif
1665         /* Pull up the node. */
1666         pull = *samep;
1667         *samep = JT(pull);
1668         JT(pull) = *diffp;
1669
1670         /*
1671          * At the top of the chain, each predecessor needs to point at the
1672          * pulled up node.  Inside the chain, there is only one predecessor
1673          * to worry about.
1674          */
1675         if (at_top) {
1676                 for (ep = b->in_edges; ep != 0; ep = ep->next) {
1677                         if (JT(ep->pred) == b)
1678                                 JT(ep->pred) = pull;
1679                         else
1680                                 JF(ep->pred) = pull;
1681                 }
1682         }
1683         else
1684                 *diffp = pull;
1685
1686         opt_state->done = 0;
1687 }
1688
1689 static void
1690 opt_blks(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic,
1691     int do_stmts)
1692 {
1693         int i, maxlevel;
1694         struct block *p;
1695
1696         init_val(opt_state);
1697         maxlevel = ic->root->level;
1698
1699         find_inedges(opt_state, ic->root);
1700         for (i = maxlevel; i >= 0; --i)
1701                 for (p = opt_state->levels[i]; p; p = p->link)
1702                         opt_blk(cstate, opt_state, p, do_stmts);
1703
1704         if (do_stmts)
1705                 /*
1706                  * No point trying to move branches; it can't possibly
1707                  * make a difference at this point.
1708                  */
1709                 return;
1710
1711         for (i = 1; i <= maxlevel; ++i) {
1712                 for (p = opt_state->levels[i]; p; p = p->link) {
1713                         opt_j(opt_state, &p->et);
1714                         opt_j(opt_state, &p->ef);
1715                 }
1716         }
1717
1718         find_inedges(opt_state, ic->root);
1719         for (i = 1; i <= maxlevel; ++i) {
1720                 for (p = opt_state->levels[i]; p; p = p->link) {
1721                         or_pullup(opt_state, p);
1722                         and_pullup(opt_state, p);
1723                 }
1724         }
1725 }
1726
1727 static inline void
1728 link_inedge(struct edge *parent, struct block *child)
1729 {
1730         parent->next = child->in_edges;
1731         child->in_edges = parent;
1732 }
1733
1734 static void
1735 find_inedges(opt_state_t *opt_state, struct block *root)
1736 {
1737         int i;
1738         struct block *b;
1739
1740         for (i = 0; i < opt_state->n_blocks; ++i)
1741                 opt_state->blocks[i]->in_edges = 0;
1742
1743         /*
1744          * Traverse the graph, adding each edge to the predecessor
1745          * list of its successors.  Skip the leaves (i.e. level 0).
1746          */
1747         for (i = root->level; i > 0; --i) {
1748                 for (b = opt_state->levels[i]; b != 0; b = b->link) {
1749                         link_inedge(&b->et, JT(b));
1750                         link_inedge(&b->ef, JF(b));
1751                 }
1752         }
1753 }
1754
1755 static void
1756 opt_root(struct block **b)
1757 {
1758         struct slist *tmp, *s;
1759
1760         s = (*b)->stmts;
1761         (*b)->stmts = 0;
1762         while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
1763                 *b = JT(*b);
1764
1765         tmp = (*b)->stmts;
1766         if (tmp != 0)
1767                 sappend(s, tmp);
1768         (*b)->stmts = s;
1769
1770         /*
1771          * If the root node is a return, then there is no
1772          * point executing any statements (since the bpf machine
1773          * has no side effects).
1774          */
1775         if (BPF_CLASS((*b)->s.code) == BPF_RET)
1776                 (*b)->stmts = 0;
1777 }
1778
1779 static void
1780 opt_loop(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic,
1781     int do_stmts)
1782 {
1783
1784 #ifdef BDEBUG
1785         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1786                 printf("opt_loop(root, %d) begin\n", do_stmts);
1787                 opt_dump(cstate, ic);
1788         }
1789 #endif
1790         do {
1791                 opt_state->done = 1;
1792                 find_levels(opt_state, ic);
1793                 find_dom(opt_state, ic->root);
1794                 find_closure(opt_state, ic->root);
1795                 find_ud(opt_state, ic->root);
1796                 find_edom(opt_state, ic->root);
1797                 opt_blks(cstate, opt_state, ic, do_stmts);
1798 #ifdef BDEBUG
1799                 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1800                         printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, opt_state->done);
1801                         opt_dump(cstate, ic);
1802                 }
1803 #endif
1804         } while (!opt_state->done);
1805 }
1806
1807 /*
1808  * Optimize the filter code in its dag representation.
1809  */
1810 void
1811 bpf_optimize(compiler_state_t *cstate, struct icode *ic)
1812 {
1813         opt_state_t opt_state;
1814
1815         opt_init(cstate, &opt_state, ic);
1816         opt_loop(cstate, &opt_state, ic, 0);
1817         opt_loop(cstate, &opt_state, ic, 1);
1818         intern_blocks(&opt_state, ic);
1819 #ifdef BDEBUG
1820         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1821                 printf("after intern_blocks()\n");
1822                 opt_dump(cstate, ic);
1823         }
1824 #endif
1825         opt_root(&ic->root);
1826 #ifdef BDEBUG
1827         if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1828                 printf("after opt_root()\n");
1829                 opt_dump(cstate, ic);
1830         }
1831 #endif
1832         opt_cleanup(&opt_state);
1833 }
1834
1835 static void
1836 make_marks(struct icode *ic, struct block *p)
1837 {
1838         if (!isMarked(ic, p)) {
1839                 Mark(ic, p);
1840                 if (BPF_CLASS(p->s.code) != BPF_RET) {
1841                         make_marks(ic, JT(p));
1842                         make_marks(ic, JF(p));
1843                 }
1844         }
1845 }
1846
1847 /*
1848  * Mark code array such that isMarked(ic->cur_mark, i) is true
1849  * only for nodes that are alive.
1850  */
1851 static void
1852 mark_code(struct icode *ic)
1853 {
1854         ic->cur_mark += 1;
1855         make_marks(ic, ic->root);
1856 }
1857
1858 /*
1859  * True iff the two stmt lists load the same value from the packet into
1860  * the accumulator.
1861  */
1862 static int
1863 eq_slist(struct slist *x, struct slist *y)
1864 {
1865         for (;;) {
1866                 while (x && x->s.code == NOP)
1867                         x = x->next;
1868                 while (y && y->s.code == NOP)
1869                         y = y->next;
1870                 if (x == 0)
1871                         return y == 0;
1872                 if (y == 0)
1873                         return x == 0;
1874                 if (x->s.code != y->s.code || x->s.k != y->s.k)
1875                         return 0;
1876                 x = x->next;
1877                 y = y->next;
1878         }
1879 }
1880
1881 static inline int
1882 eq_blk(struct block *b0, struct block *b1)
1883 {
1884         if (b0->s.code == b1->s.code &&
1885             b0->s.k == b1->s.k &&
1886             b0->et.succ == b1->et.succ &&
1887             b0->ef.succ == b1->ef.succ)
1888                 return eq_slist(b0->stmts, b1->stmts);
1889         return 0;
1890 }
1891
1892 static void
1893 intern_blocks(opt_state_t *opt_state, struct icode *ic)
1894 {
1895         struct block *p;
1896         int i, j;
1897         int done1; /* don't shadow global */
1898  top:
1899         done1 = 1;
1900         for (i = 0; i < opt_state->n_blocks; ++i)
1901                 opt_state->blocks[i]->link = 0;
1902
1903         mark_code(ic);
1904
1905         for (i = opt_state->n_blocks - 1; --i >= 0; ) {
1906                 if (!isMarked(ic, opt_state->blocks[i]))
1907                         continue;
1908                 for (j = i + 1; j < opt_state->n_blocks; ++j) {
1909                         if (!isMarked(ic, opt_state->blocks[j]))
1910                                 continue;
1911                         if (eq_blk(opt_state->blocks[i], opt_state->blocks[j])) {
1912                                 opt_state->blocks[i]->link = opt_state->blocks[j]->link ?
1913                                         opt_state->blocks[j]->link : opt_state->blocks[j];
1914                                 break;
1915                         }
1916                 }
1917         }
1918         for (i = 0; i < opt_state->n_blocks; ++i) {
1919                 p = opt_state->blocks[i];
1920                 if (JT(p) == 0)
1921                         continue;
1922                 if (JT(p)->link) {
1923                         done1 = 0;
1924                         JT(p) = JT(p)->link;
1925                 }
1926                 if (JF(p)->link) {
1927                         done1 = 0;
1928                         JF(p) = JF(p)->link;
1929                 }
1930         }
1931         if (!done1)
1932                 goto top;
1933 }
1934
1935 static void
1936 opt_cleanup(opt_state_t *opt_state)
1937 {
1938         free((void *)opt_state->vnode_base);
1939         free((void *)opt_state->vmap);
1940         free((void *)opt_state->edges);
1941         free((void *)opt_state->space);
1942         free((void *)opt_state->levels);
1943         free((void *)opt_state->blocks);
1944 }
1945
1946 /*
1947  * Return the number of stmts in 's'.
1948  */
1949 static u_int
1950 slength(struct slist *s)
1951 {
1952         u_int n = 0;
1953
1954         for (; s; s = s->next)
1955                 if (s->s.code != NOP)
1956                         ++n;
1957         return n;
1958 }
1959
/*
 * Count the blocks reachable from 'p', 'p' included.  Each block is
 * marked as it is counted, so every block must be unmarked on entry.
 */
static int
count_blocks(struct icode *ic, struct block *p)
{
        int total;

        if (p == 0)
                return 0;
        if (isMarked(ic, p))
                return 0;

        Mark(ic, p);
        total = 1;
        total += count_blocks(ic, JT(p));
        total += count_blocks(ic, JF(p));
        return total;
}
1972
1973 /*
1974  * Do a depth first search on the flow graph, numbering the
1975  * the basic blocks, and entering them into the 'blocks' array.`
1976  */
1977 static void
1978 number_blks_r(opt_state_t *opt_state, struct icode *ic, struct block *p)
1979 {
1980         int n;
1981
1982         if (p == 0 || isMarked(ic, p))
1983                 return;
1984
1985         Mark(ic, p);
1986         n = opt_state->n_blocks++;
1987         p->id = n;
1988         opt_state->blocks[n] = p;
1989
1990         number_blks_r(opt_state, ic, JT(p));
1991         number_blks_r(opt_state, ic, JF(p));
1992 }
1993
1994 /*
1995  * Return the number of stmts in the flowgraph reachable by 'p'.
1996  * The nodes should be unmarked before calling.
1997  *
1998  * Note that "stmts" means "instructions", and that this includes
1999  *
2000  *      side-effect statements in 'p' (slength(p->stmts));
2001  *
2002  *      statements in the true branch from 'p' (count_stmts(JT(p)));
2003  *
2004  *      statements in the false branch from 'p' (count_stmts(JF(p)));
2005  *
2006  *      the conditional jump itself (1);
2007  *
2008  *      an extra long jump if the true branch requires it (p->longjt);
2009  *
2010  *      an extra long jump if the false branch requires it (p->longjf).
2011  */
2012 static u_int
2013 count_stmts(struct icode *ic, struct block *p)
2014 {
2015         u_int n;
2016
2017         if (p == 0 || isMarked(ic, p))
2018                 return 0;
2019         Mark(ic, p);
2020         n = count_stmts(ic, JT(p)) + count_stmts(ic, JF(p));
2021         return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
2022 }
2023
2024 /*
2025  * Allocate memory.  All allocation is done before optimization
2026  * is begun.  A linear bound on the size of all data structures is computed
2027  * from the total number of blocks and/or statements.
2028  */
2029 static void
2030 opt_init(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic)
2031 {
2032         bpf_u_int32 *p;
2033         int i, n, max_stmts;
2034
2035         /*
2036          * First, count the blocks, so we can malloc an array to map
2037          * block number to block.  Then, put the blocks into the array.
2038          */
2039         unMarkAll(ic);
2040         n = count_blocks(ic, ic->root);
2041         opt_state->blocks = (struct block **)calloc(n, sizeof(*opt_state->blocks));
2042         if (opt_state->blocks == NULL)
2043                 bpf_error(cstate, "malloc");
2044         unMarkAll(ic);
2045         opt_state->n_blocks = 0;
2046         number_blks_r(opt_state, ic, ic->root);
2047
2048         opt_state->n_edges = 2 * opt_state->n_blocks;
2049         opt_state->edges = (struct edge **)calloc(opt_state->n_edges, sizeof(*opt_state->edges));
2050         if (opt_state->edges == NULL)
2051                 bpf_error(cstate, "malloc");
2052
2053         /*
2054          * The number of levels is bounded by the number of nodes.
2055          */
2056         opt_state->levels = (struct block **)calloc(opt_state->n_blocks, sizeof(*opt_state->levels));
2057         if (opt_state->levels == NULL)
2058                 bpf_error(cstate, "malloc");
2059
2060         opt_state->edgewords = opt_state->n_edges / (8 * sizeof(bpf_u_int32)) + 1;
2061         opt_state->nodewords = opt_state->n_blocks / (8 * sizeof(bpf_u_int32)) + 1;
2062
2063         /* XXX */
2064         opt_state->space = (bpf_u_int32 *)malloc(2 * opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->space)
2065                                  + opt_state->n_edges * opt_state->edgewords * sizeof(*opt_state->space));
2066         if (opt_state->space == NULL)
2067                 bpf_error(cstate, "malloc");
2068         p = opt_state->space;
2069         opt_state->all_dom_sets = p;
2070         for (i = 0; i < n; ++i) {
2071                 opt_state->blocks[i]->dom = p;
2072                 p += opt_state->nodewords;
2073         }
2074         opt_state->all_closure_sets = p;
2075         for (i = 0; i < n; ++i) {
2076                 opt_state->blocks[i]->closure = p;
2077                 p += opt_state->nodewords;
2078         }
2079         opt_state->all_edge_sets = p;
2080         for (i = 0; i < n; ++i) {
2081                 register struct block *b = opt_state->blocks[i];
2082
2083                 b->et.edom = p;
2084                 p += opt_state->edgewords;
2085                 b->ef.edom = p;
2086                 p += opt_state->edgewords;
2087                 b->et.id = i;
2088                 opt_state->edges[i] = &b->et;
2089                 b->ef.id = opt_state->n_blocks + i;
2090                 opt_state->edges[opt_state->n_blocks + i] = &b->ef;
2091                 b->et.pred = b;
2092                 b->ef.pred = b;
2093         }
2094         max_stmts = 0;
2095         for (i = 0; i < n; ++i)
2096                 max_stmts += slength(opt_state->blocks[i]->stmts) + 1;
2097         /*
2098          * We allocate at most 3 value numbers per statement,
2099          * so this is an upper bound on the number of valnodes
2100          * we'll need.
2101          */
2102         opt_state->maxval = 3 * max_stmts;
2103         opt_state->vmap = (struct vmapinfo *)calloc(opt_state->maxval, sizeof(*opt_state->vmap));
2104         opt_state->vnode_base = (struct valnode *)calloc(opt_state->maxval, sizeof(*opt_state->vnode_base));
2105         if (opt_state->vmap == NULL || opt_state->vnode_base == NULL)
2106                 bpf_error(cstate, "malloc");
2107 }
2108
/*
 * This is only used when supporting optimizer debugging.  It is
 * global state, so do *not* do more than one compile in parallel
 * and expect it to provide meaningful information.
 *
 * convert_code_r() stores a block's id plus one at the index of the
 * instruction generated from the block's own s.code, provided that
 * index is below NBIDS.  dot_dump() and plain_dump() zero the whole
 * array before they regenerate the code.
 */
#ifdef BDEBUG
int bids[NBIDS];
#endif
2117
2118 /*
2119  * Returns true if successful.  Returns false if a branch has
2120  * an offset that is too large.  If so, we have marked that
2121  * branch so that on a subsequent iteration, it will be treated
2122  * properly.
2123  */
2124 static int
2125 convert_code_r(compiler_state_t *cstate, conv_state_t *conv_state,
2126     struct icode *ic, struct block *p)
2127 {
2128         struct bpf_insn *dst;
2129         struct slist *src;
2130         u_int slen;
2131         u_int off;
2132         u_int extrajmps;        /* number of extra jumps inserted */
2133         struct slist **offset = NULL;
2134
2135         if (p == 0 || isMarked(ic, p))
2136                 return (1);
2137         Mark(ic, p);
2138
2139         if (convert_code_r(cstate, conv_state, ic, JF(p)) == 0)
2140                 return (0);
2141         if (convert_code_r(cstate, conv_state, ic, JT(p)) == 0)
2142                 return (0);
2143
2144         slen = slength(p->stmts);
2145         dst = conv_state->ftail -= (slen + 1 + p->longjt + p->longjf);
2146                 /* inflate length by any extra jumps */
2147
2148         p->offset = (int)(dst - conv_state->fstart);
2149
2150         /* generate offset[] for convenience  */
2151         if (slen) {
2152                 offset = (struct slist **)calloc(slen, sizeof(struct slist *));
2153                 if (!offset) {
2154                         bpf_error(cstate, "not enough core");
2155                         /*NOTREACHED*/
2156                 }
2157         }
2158         src = p->stmts;
2159         for (off = 0; off < slen && src; off++) {
2160 #if 0
2161                 printf("off=%d src=%x\n", off, src);
2162 #endif
2163                 offset[off] = src;
2164                 src = src->next;
2165         }
2166
2167         off = 0;
2168         for (src = p->stmts; src; src = src->next) {
2169                 if (src->s.code == NOP)
2170                         continue;
2171                 dst->code = (u_short)src->s.code;
2172                 dst->k = src->s.k;
2173
2174                 /* fill block-local relative jump */
2175                 if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
2176 #if 0
2177                         if (src->s.jt || src->s.jf) {
2178                                 bpf_error(cstate, "illegal jmp destination");
2179                                 /*NOTREACHED*/
2180                         }
2181 #endif
2182                         goto filled;
2183                 }
2184                 if (off == slen - 2)    /*???*/
2185                         goto filled;
2186
2187             {
2188                 u_int i;
2189                 int jt, jf;
2190                 const char ljerr[] = "%s for block-local relative jump: off=%d";
2191
2192 #if 0
2193                 printf("code=%x off=%d %x %x\n", src->s.code,
2194                         off, src->s.jt, src->s.jf);
2195 #endif
2196
2197                 if (!src->s.jt || !src->s.jf) {
2198                         bpf_error(cstate, ljerr, "no jmp destination", off);
2199                         /*NOTREACHED*/
2200                 }
2201
2202                 jt = jf = 0;
2203                 for (i = 0; i < slen; i++) {
2204                         if (offset[i] == src->s.jt) {
2205                                 if (jt) {
2206                                         bpf_error(cstate, ljerr, "multiple matches", off);
2207                                         /*NOTREACHED*/
2208                                 }
2209
2210                                 if (i - off - 1 >= 256) {
2211                                         bpf_error(cstate, ljerr, "out-of-range jump", off);
2212                                         /*NOTREACHED*/
2213                                 }
2214                                 dst->jt = (u_char)(i - off - 1);
2215                                 jt++;
2216                         }
2217                         if (offset[i] == src->s.jf) {
2218                                 if (jf) {
2219                                         bpf_error(cstate, ljerr, "multiple matches", off);
2220                                         /*NOTREACHED*/
2221                                 }
2222                                 if (i - off - 1 >= 256) {
2223                                         bpf_error(cstate, ljerr, "out-of-range jump", off);
2224                                         /*NOTREACHED*/
2225                                 }
2226                                 dst->jf = (u_char)(i - off - 1);
2227                                 jf++;
2228                         }
2229                 }
2230                 if (!jt || !jf) {
2231                         bpf_error(cstate, ljerr, "no destination found", off);
2232                         /*NOTREACHED*/
2233                 }
2234             }
2235 filled:
2236                 ++dst;
2237                 ++off;
2238         }
2239         if (offset)
2240                 free(offset);
2241
2242 #ifdef BDEBUG
2243         if (dst - conv_state->fstart < NBIDS)
2244                 bids[dst - conv_state->fstart] = p->id + 1;
2245 #endif
2246         dst->code = (u_short)p->s.code;
2247         dst->k = p->s.k;
2248         if (JT(p)) {
2249                 extrajmps = 0;
2250                 off = JT(p)->offset - (p->offset + slen) - 1;
2251                 if (off >= 256) {
2252                     /* offset too large for branch, must add a jump */
2253                     if (p->longjt == 0) {
2254                         /* mark this instruction and retry */
2255                         p->longjt++;
2256                         return(0);
2257                     }
2258                     /* branch if T to following jump */
2259                     if (extrajmps >= 256) {
2260                         bpf_error(cstate, "too many extra jumps");
2261                         /*NOTREACHED*/
2262                     }
2263                     dst->jt = (u_char)extrajmps;
2264                     extrajmps++;
2265                     dst[extrajmps].code = BPF_JMP|BPF_JA;
2266                     dst[extrajmps].k = off - extrajmps;
2267                 }
2268                 else
2269                     dst->jt = (u_char)off;
2270                 off = JF(p)->offset - (p->offset + slen) - 1;
2271                 if (off >= 256) {
2272                     /* offset too large for branch, must add a jump */
2273                     if (p->longjf == 0) {
2274                         /* mark this instruction and retry */
2275                         p->longjf++;
2276                         return(0);
2277                     }
2278                     /* branch if F to following jump */
2279                     /* if two jumps are inserted, F goes to second one */
2280                     if (extrajmps >= 256) {
2281                         bpf_error(cstate, "too many extra jumps");
2282                         /*NOTREACHED*/
2283                     }
2284                     dst->jf = (u_char)extrajmps;
2285                     extrajmps++;
2286                     dst[extrajmps].code = BPF_JMP|BPF_JA;
2287                     dst[extrajmps].k = off - extrajmps;
2288                 }
2289                 else
2290                     dst->jf = (u_char)off;
2291         }
2292         return (1);
2293 }
2294
2295
2296 /*
2297  * Convert flowgraph intermediate representation to the
2298  * BPF array representation.  Set *lenp to the number of instructions.
2299  *
2300  * This routine does *NOT* leak the memory pointed to by fp.  It *must
2301  * not* do free(fp) before returning fp; doing so would make no sense,
2302  * as the BPF array pointed to by the return value of icode_to_fcode()
2303  * must be valid - it's being returned for use in a bpf_program structure.
2304  *
2305  * If it appears that icode_to_fcode() is leaking, the problem is that
2306  * the program using pcap_compile() is failing to free the memory in
2307  * the BPF program when it's done - the leak is in the program, not in
2308  * the routine that happens to be allocating the memory.  (By analogy, if
2309  * a program calls fopen() without ever calling fclose() on the FILE *,
2310  * it will leak the FILE structure; the leak is not in fopen(), it's in
2311  * the program.)  Change the program to use pcap_freecode() when it's
2312  * done with the filter program.  See the pcap man page.
2313  */
2314 struct bpf_insn *
2315 icode_to_fcode(compiler_state_t *cstate, struct icode *ic,
2316     struct block *root, u_int *lenp)
2317 {
2318         u_int n;
2319         struct bpf_insn *fp;
2320         conv_state_t conv_state;
2321
2322         /*
2323          * Loop doing convert_code_r() until no branches remain
2324          * with too-large offsets.
2325          */
2326         for (;;) {
2327             unMarkAll(ic);
2328             n = *lenp = count_stmts(ic, root);
2329
2330             fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
2331             if (fp == NULL)
2332                     bpf_error(cstate, "malloc");
2333             memset((char *)fp, 0, sizeof(*fp) * n);
2334             conv_state.fstart = fp;
2335             conv_state.ftail = fp + n;
2336
2337             unMarkAll(ic);
2338             if (convert_code_r(cstate, &conv_state, ic, root))
2339                 break;
2340             free(fp);
2341         }
2342
2343         return fp;
2344 }
2345
2346 /*
2347  * Make a copy of a BPF program and put it in the "fcode" member of
2348  * a "pcap_t".
2349  *
2350  * If we fail to allocate memory for the copy, fill in the "errbuf"
2351  * member of the "pcap_t" with an error message, and return -1;
2352  * otherwise, return 0.
2353  */
2354 int
2355 install_bpf_program(pcap_t *p, struct bpf_program *fp)
2356 {
2357         size_t prog_size;
2358
2359         /*
2360          * Validate the program.
2361          */
2362         if (!bpf_validate(fp->bf_insns, fp->bf_len)) {
2363                 pcap_snprintf(p->errbuf, sizeof(p->errbuf),
2364                         "BPF program is not valid");
2365                 return (-1);
2366         }
2367
2368         /*
2369          * Free up any already installed program.
2370          */
2371         pcap_freecode(&p->fcode);
2372
2373         prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
2374         p->fcode.bf_len = fp->bf_len;
2375         p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
2376         if (p->fcode.bf_insns == NULL) {
2377                 pcap_fmt_errmsg_for_errno(p->errbuf, sizeof(p->errbuf),
2378                     errno, "malloc");
2379                 return (-1);
2380         }
2381         memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
2382         return (0);
2383 }
2384
2385 #ifdef BDEBUG
2386 static void
2387 dot_dump_node(struct icode *ic, struct block *block, struct bpf_program *prog,
2388     FILE *out)
2389 {
2390         int icount, noffset;
2391         int i;
2392
2393         if (block == NULL || isMarked(ic, block))
2394                 return;
2395         Mark(ic, block);
2396
2397         icount = slength(block->stmts) + 1 + block->longjt + block->longjf;
2398         noffset = min(block->offset + icount, (int)prog->bf_len);
2399
2400         fprintf(out, "\tblock%d [shape=ellipse, id=\"block-%d\" label=\"BLOCK%d\\n", block->id, block->id, block->id);
2401         for (i = block->offset; i < noffset; i++) {
2402                 fprintf(out, "\\n%s", bpf_image(prog->bf_insns + i, i));
2403         }
2404         fprintf(out, "\" tooltip=\"");
2405         for (i = 0; i < BPF_MEMWORDS; i++)
2406                 if (block->val[i] != VAL_UNKNOWN)
2407                         fprintf(out, "val[%d]=%d ", i, block->val[i]);
2408         fprintf(out, "val[A]=%d ", block->val[A_ATOM]);
2409         fprintf(out, "val[X]=%d", block->val[X_ATOM]);
2410         fprintf(out, "\"");
2411         if (JT(block) == NULL)
2412                 fprintf(out, ", peripheries=2");
2413         fprintf(out, "];\n");
2414
2415         dot_dump_node(ic, JT(block), prog, out);
2416         dot_dump_node(ic, JF(block), prog, out);
2417 }
2418
2419 static void
2420 dot_dump_edge(struct icode *ic, struct block *block, FILE *out)
2421 {
2422         if (block == NULL || isMarked(ic, block))
2423                 return;
2424         Mark(ic, block);
2425
2426         if (JT(block)) {
2427                 fprintf(out, "\t\"block%d\":se -> \"block%d\":n [label=\"T\"]; \n",
2428                                 block->id, JT(block)->id);
2429                 fprintf(out, "\t\"block%d\":sw -> \"block%d\":n [label=\"F\"]; \n",
2430                            block->id, JF(block)->id);
2431         }
2432         dot_dump_edge(ic, JT(block), out);
2433         dot_dump_edge(ic, JF(block), out);
2434 }
2435
2436 /* Output the block CFG using graphviz/DOT language
2437  * In the CFG, block's code, value index for each registers at EXIT,
2438  * and the jump relationship is show.
2439  *
2440  * example DOT for BPF `ip src host 1.1.1.1' is:
2441     digraph BPF {
2442         block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh      [12]\n(001) jeq      #0x800           jt 2  jf 5" tooltip="val[A]=0 val[X]=0"];
2443         block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld       [26]\n(003) jeq      #0x1010101       jt 4  jf 5" tooltip="val[A]=0 val[X]=0"];
2444         block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret      #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
2445         block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret      #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
2446         "block0":se -> "block1":n [label="T"];
2447         "block0":sw -> "block3":n [label="F"];
2448         "block1":se -> "block2":n [label="T"];
2449         "block1":sw -> "block3":n [label="F"];
2450     }
2451  *
2452  *  After install graphviz on http://www.graphviz.org/, save it as bpf.dot
2453  *  and run `dot -Tpng -O bpf.dot' to draw the graph.
2454  */
2455 static void
2456 dot_dump(compiler_state_t *cstate, struct icode *ic)
2457 {
2458         struct bpf_program f;
2459         FILE *out = stdout;
2460
2461         memset(bids, 0, sizeof bids);
2462         f.bf_insns = icode_to_fcode(cstate, ic, ic->root, &f.bf_len);
2463
2464         fprintf(out, "digraph BPF {\n");
2465         unMarkAll(ic);
2466         dot_dump_node(ic, ic->root, &f, out);
2467         unMarkAll(ic);
2468         dot_dump_edge(ic, ic->root, out);
2469         fprintf(out, "}\n");
2470
2471         free((char *)f.bf_insns);
2472 }
2473
2474 static void
2475 plain_dump(compiler_state_t *cstate, struct icode *ic)
2476 {
2477         struct bpf_program f;
2478
2479         memset(bids, 0, sizeof bids);
2480         f.bf_insns = icode_to_fcode(cstate, ic, ic->root, &f.bf_len);
2481         bpf_dump(&f, 1);
2482         putchar('\n');
2483         free((char *)f.bf_insns);
2484 }
2485
2486 static void
2487 opt_dump(compiler_state_t *cstate, struct icode *ic)
2488 {
2489         /*
2490          * If the CFG, in DOT format, is requested, output it rather than
2491          * the code that would be generated from that graph.
2492          */
2493         if (pcap_print_dot_graph)
2494                 dot_dump(cstate, ic);
2495         else
2496                 plain_dump(cstate, ic);
2497 }
2498 #endif