]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/unbound/services/mesh.c
Merge missed sources for lldb-specific TableGen tool.
[FreeBSD/FreeBSD.git] / contrib / unbound / services / mesh.c
1 /*
2  * services/mesh.c - deal with mesh of query states and handle events for that.
3  *
4  * Copyright (c) 2007, NLnet Labs. All rights reserved.
5  *
6  * This software is open source.
7  * 
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 
12  * Redistributions of source code must retain the above copyright notice,
13  * this list of conditions and the following disclaimer.
14  * 
15  * Redistributions in binary form must reproduce the above copyright notice,
16  * this list of conditions and the following disclaimer in the documentation
17  * and/or other materials provided with the distribution.
18  * 
19  * Neither the name of the NLNET LABS nor the names of its contributors may
20  * be used to endorse or promote products derived from this software without
21  * specific prior written permission.
22  * 
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
29  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  */
35
36 /**
37  * \file
38  *
39  * This file contains functions to assist in dealing with a mesh of
40  * query states. This mesh is supposed to be thread-specific.
41  * It consists of query states (per qname, qtype, qclass) and connections
42  * between query states and the super and subquery states, and replies to
43  * send back to clients.
44  */
45 #include "config.h"
46 #include "services/mesh.h"
47 #include "services/outbound_list.h"
48 #include "services/cache/dns.h"
49 #include "util/log.h"
50 #include "util/net_help.h"
51 #include "util/module.h"
52 #include "util/regional.h"
53 #include "util/data/msgencode.h"
54 #include "util/timehist.h"
55 #include "util/fptr_wlist.h"
56 #include "util/alloc.h"
57 #include "util/config_file.h"
58 #include "util/edns.h"
59 #include "sldns/sbuffer.h"
60 #include "sldns/wire2str.h"
61 #include "services/localzone.h"
62 #include "util/data/dname.h"
63 #include "respip/respip.h"
64 #include "services/listen_dnsport.h"
65
66 /** subtract timers and the values do not overflow or become negative */
67 static void
68 timeval_subtract(struct timeval* d, const struct timeval* end, const struct timeval* start)
69 {
70 #ifndef S_SPLINT_S
71         time_t end_usec = end->tv_usec;
72         d->tv_sec = end->tv_sec - start->tv_sec;
73         if(end_usec < start->tv_usec) {
74                 end_usec += 1000000;
75                 d->tv_sec--;
76         }
77         d->tv_usec = end_usec - start->tv_usec;
78 #endif
79 }
80
81 /** add timers and the values do not overflow or become negative */
82 static void
83 timeval_add(struct timeval* d, const struct timeval* add)
84 {
85 #ifndef S_SPLINT_S
86         d->tv_sec += add->tv_sec;
87         d->tv_usec += add->tv_usec;
88         if(d->tv_usec > 1000000 ) {
89                 d->tv_usec -= 1000000;
90                 d->tv_sec++;
91         }
92 #endif
93 }
94
95 /** divide sum of timers to get average */
96 static void
97 timeval_divide(struct timeval* avg, const struct timeval* sum, size_t d)
98 {
99 #ifndef S_SPLINT_S
100         size_t leftover;
101         if(d == 0) {
102                 avg->tv_sec = 0;
103                 avg->tv_usec = 0;
104                 return;
105         }
106         avg->tv_sec = sum->tv_sec / d;
107         avg->tv_usec = sum->tv_usec / d;
108         /* handle fraction from seconds divide */
109         leftover = sum->tv_sec - avg->tv_sec*d;
110         avg->tv_usec += (leftover*1000000)/d;
111 #endif
112 }
113
114 /** histogram compare of time values */
115 static int
116 timeval_smaller(const struct timeval* x, const struct timeval* y)
117 {
118 #ifndef S_SPLINT_S
119         if(x->tv_sec < y->tv_sec)
120                 return 1;
121         else if(x->tv_sec == y->tv_sec) {
122                 if(x->tv_usec <= y->tv_usec)
123                         return 1;
124                 else    return 0;
125         }
126         else    return 0;
127 #endif
128 }
129
130 /*
131  * Compare two response-ip client info entries for the purpose of mesh state
132  * compare.  It returns 0 if ci_a and ci_b are considered equal; otherwise
133  * 1 or -1 (they mean 'ci_a is larger/smaller than ci_b', respectively, but
134  * in practice it should be only used to mean they are different).
135  * We cannot share the mesh state for two queries if different response-ip
136  * actions can apply in the end, even if those queries are otherwise identical.
137  * For this purpose we compare tag lists and tag action lists; they should be
138  * identical to share the same state.
139  * For tag data, we don't look into the data content, as it can be
140  * expensive; unless tag data are not defined for both or they point to the
141  * exact same data in memory (i.e., they come from the same ACL entry), we
142  * consider these data different.
143  * Likewise, if the client info is associated with views, we don't look into
144  * the views.  They are considered different unless they are exactly the same
145  * even if the views only differ in the names.
146  */
147 static int
148 client_info_compare(const struct respip_client_info* ci_a,
149         const struct respip_client_info* ci_b)
150 {
151         int cmp;
152
153         if(!ci_a && !ci_b)
154                 return 0;
155         if(ci_a && !ci_b)
156                 return -1;
157         if(!ci_a && ci_b)
158                 return 1;
159         if(ci_a->taglen != ci_b->taglen)
160                 return (ci_a->taglen < ci_b->taglen) ? -1 : 1;
161         cmp = memcmp(ci_a->taglist, ci_b->taglist, ci_a->taglen);
162         if(cmp != 0)
163                 return cmp;
164         if(ci_a->tag_actions_size != ci_b->tag_actions_size)
165                 return (ci_a->tag_actions_size < ci_b->tag_actions_size) ?
166                         -1 : 1;
167         cmp = memcmp(ci_a->tag_actions, ci_b->tag_actions,
168                 ci_a->tag_actions_size);
169         if(cmp != 0)
170                 return cmp;
171         if(ci_a->tag_datas != ci_b->tag_datas)
172                 return ci_a->tag_datas < ci_b->tag_datas ? -1 : 1;
173         if(ci_a->view != ci_b->view)
174                 return ci_a->view < ci_b->view ? -1 : 1;
175         /* For the unbound daemon these should be non-NULL and identical,
176          * but we check that just in case. */
177         if(ci_a->respip_set != ci_b->respip_set)
178                 return ci_a->respip_set < ci_b->respip_set ? -1 : 1;
179         return 0;
180 }
181
182 int
183 mesh_state_compare(const void* ap, const void* bp)
184 {
185         struct mesh_state* a = (struct mesh_state*)ap;
186         struct mesh_state* b = (struct mesh_state*)bp;
187         int cmp;
188
189         if(a->unique < b->unique)
190                 return -1;
191         if(a->unique > b->unique)
192                 return 1;
193
194         if(a->s.is_priming && !b->s.is_priming)
195                 return -1;
196         if(!a->s.is_priming && b->s.is_priming)
197                 return 1;
198
199         if(a->s.is_valrec && !b->s.is_valrec)
200                 return -1;
201         if(!a->s.is_valrec && b->s.is_valrec)
202                 return 1;
203
204         if((a->s.query_flags&BIT_RD) && !(b->s.query_flags&BIT_RD))
205                 return -1;
206         if(!(a->s.query_flags&BIT_RD) && (b->s.query_flags&BIT_RD))
207                 return 1;
208
209         if((a->s.query_flags&BIT_CD) && !(b->s.query_flags&BIT_CD))
210                 return -1;
211         if(!(a->s.query_flags&BIT_CD) && (b->s.query_flags&BIT_CD))
212                 return 1;
213
214         cmp = query_info_compare(&a->s.qinfo, &b->s.qinfo);
215         if(cmp != 0)
216                 return cmp;
217         return client_info_compare(a->s.client_info, b->s.client_info);
218 }
219
220 int
221 mesh_state_ref_compare(const void* ap, const void* bp)
222 {
223         struct mesh_state_ref* a = (struct mesh_state_ref*)ap;
224         struct mesh_state_ref* b = (struct mesh_state_ref*)bp;
225         return mesh_state_compare(a->s, b->s);
226 }
227
228 struct mesh_area* 
229 mesh_create(struct module_stack* stack, struct module_env* env)
230 {
231         struct mesh_area* mesh = calloc(1, sizeof(struct mesh_area));
232         if(!mesh) {
233                 log_err("mesh area alloc: out of memory");
234                 return NULL;
235         }
236         mesh->histogram = timehist_setup();
237         mesh->qbuf_bak = sldns_buffer_new(env->cfg->msg_buffer_size);
238         if(!mesh->histogram || !mesh->qbuf_bak) {
239                 free(mesh);
240                 log_err("mesh area alloc: out of memory");
241                 return NULL;
242         }
243         mesh->mods = *stack;
244         mesh->env = env;
245         rbtree_init(&mesh->run, &mesh_state_compare);
246         rbtree_init(&mesh->all, &mesh_state_compare);
247         mesh->num_reply_addrs = 0;
248         mesh->num_reply_states = 0;
249         mesh->num_detached_states = 0;
250         mesh->num_forever_states = 0;
251         mesh->stats_jostled = 0;
252         mesh->stats_dropped = 0;
253         mesh->max_reply_states = env->cfg->num_queries_per_thread;
254         mesh->max_forever_states = (mesh->max_reply_states+1)/2;
255 #ifndef S_SPLINT_S
256         mesh->jostle_max.tv_sec = (time_t)(env->cfg->jostle_time / 1000);
257         mesh->jostle_max.tv_usec = (time_t)((env->cfg->jostle_time % 1000)
258                 *1000);
259 #endif
260         return mesh;
261 }
262
263 /** help mesh delete delete mesh states */
264 static void
265 mesh_delete_helper(rbnode_type* n)
266 {
267         struct mesh_state* mstate = (struct mesh_state*)n->key;
268         /* perform a full delete, not only 'cleanup' routine,
269          * because other callbacks expect a clean state in the mesh.
270          * For 're-entrant' calls */
271         mesh_state_delete(&mstate->s);
272         /* but because these delete the items from the tree, postorder
273          * traversal and rbtree rebalancing do not work together */
274 }
275
276 void 
277 mesh_delete(struct mesh_area* mesh)
278 {
279         if(!mesh)
280                 return;
281         /* free all query states */
282         while(mesh->all.count)
283                 mesh_delete_helper(mesh->all.root);
284         timehist_delete(mesh->histogram);
285         sldns_buffer_free(mesh->qbuf_bak);
286         free(mesh);
287 }
288
289 void
290 mesh_delete_all(struct mesh_area* mesh)
291 {
292         /* free all query states */
293         while(mesh->all.count)
294                 mesh_delete_helper(mesh->all.root);
295         mesh->stats_dropped += mesh->num_reply_addrs;
296         /* clear mesh area references */
297         rbtree_init(&mesh->run, &mesh_state_compare);
298         rbtree_init(&mesh->all, &mesh_state_compare);
299         mesh->num_reply_addrs = 0;
300         mesh->num_reply_states = 0;
301         mesh->num_detached_states = 0;
302         mesh->num_forever_states = 0;
303         mesh->forever_first = NULL;
304         mesh->forever_last = NULL;
305         mesh->jostle_first = NULL;
306         mesh->jostle_last = NULL;
307 }
308
309 int mesh_make_new_space(struct mesh_area* mesh, sldns_buffer* qbuf)
310 {
311         struct mesh_state* m = mesh->jostle_first;
312         /* free space is available */
313         if(mesh->num_reply_states < mesh->max_reply_states)
314                 return 1;
315         /* try to kick out a jostle-list item */
316         if(m && m->reply_list && m->list_select == mesh_jostle_list) {
317                 /* how old is it? */
318                 struct timeval age;
319                 timeval_subtract(&age, mesh->env->now_tv, 
320                         &m->reply_list->start_time);
321                 if(timeval_smaller(&mesh->jostle_max, &age)) {
322                         /* its a goner */
323                         log_nametypeclass(VERB_ALGO, "query jostled out to "
324                                 "make space for a new one",
325                                 m->s.qinfo.qname, m->s.qinfo.qtype,
326                                 m->s.qinfo.qclass);
327                         /* backup the query */
328                         if(qbuf) sldns_buffer_copy(mesh->qbuf_bak, qbuf);
329                         /* notify supers */
330                         if(m->super_set.count > 0) {
331                                 verbose(VERB_ALGO, "notify supers of failure");
332                                 m->s.return_msg = NULL;
333                                 m->s.return_rcode = LDNS_RCODE_SERVFAIL;
334                                 mesh_walk_supers(mesh, m);
335                         }
336                         mesh->stats_jostled ++;
337                         mesh_state_delete(&m->s);
338                         /* restore the query - note that the qinfo ptr to
339                          * the querybuffer is then correct again. */
340                         if(qbuf) sldns_buffer_copy(qbuf, mesh->qbuf_bak);
341                         return 1;
342                 }
343         }
344         /* no space for new item */
345         return 0;
346 }
347
348 void mesh_new_client(struct mesh_area* mesh, struct query_info* qinfo,
349         struct respip_client_info* cinfo, uint16_t qflags,
350         struct edns_data* edns, struct comm_reply* rep, uint16_t qid)
351 {
352         struct mesh_state* s = NULL;
353         int unique = unique_mesh_state(edns->opt_list, mesh->env);
354         int was_detached = 0;
355         int was_noreply = 0;
356         int added = 0;
357         struct sldns_buffer* r_buffer = rep->c->buffer;
358         if(rep->c->tcp_req_info) {
359                 r_buffer = rep->c->tcp_req_info->spool_buffer;
360         }
361         if(!unique)
362                 s = mesh_area_find(mesh, cinfo, qinfo, qflags&(BIT_RD|BIT_CD), 0, 0);
363         /* does this create a new reply state? */
364         if(!s || s->list_select == mesh_no_list) {
365                 if(!mesh_make_new_space(mesh, rep->c->buffer)) {
366                         verbose(VERB_ALGO, "Too many queries. dropping "
367                                 "incoming query.");
368                         comm_point_drop_reply(rep);
369                         mesh->stats_dropped ++;
370                         return;
371                 }
372                 /* for this new reply state, the reply address is free,
373                  * so the limit of reply addresses does not stop reply states*/
374         } else {
375                 /* protect our memory usage from storing reply addresses */
376                 if(mesh->num_reply_addrs > mesh->max_reply_states*16) {
377                         verbose(VERB_ALGO, "Too many requests queued. "
378                                 "dropping incoming query.");
379                         mesh->stats_dropped++;
380                         comm_point_drop_reply(rep);
381                         return;
382                 }
383         }
384         /* see if it already exists, if not, create one */
385         if(!s) {
386 #ifdef UNBOUND_DEBUG
387                 struct rbnode_type* n;
388 #endif
389                 s = mesh_state_create(mesh->env, qinfo, cinfo,
390                         qflags&(BIT_RD|BIT_CD), 0, 0);
391                 if(!s) {
392                         log_err("mesh_state_create: out of memory; SERVFAIL");
393                         if(!inplace_cb_reply_servfail_call(mesh->env, qinfo, NULL, NULL,
394                                 LDNS_RCODE_SERVFAIL, edns, rep, mesh->env->scratch))
395                                         edns->opt_list = NULL;
396                         error_encode(r_buffer, LDNS_RCODE_SERVFAIL,
397                                 qinfo, qid, qflags, edns);
398                         comm_point_send_reply(rep);
399                         return;
400                 }
401                 if(unique)
402                         mesh_state_make_unique(s);
403                 /* copy the edns options we got from the front */
404                 if(edns->opt_list) {
405                         s->s.edns_opts_front_in = edns_opt_copy_region(edns->opt_list,
406                                 s->s.region);
407                         if(!s->s.edns_opts_front_in) {
408                                 log_err("mesh_state_create: out of memory; SERVFAIL");
409                                 if(!inplace_cb_reply_servfail_call(mesh->env, qinfo, NULL,
410                                         NULL, LDNS_RCODE_SERVFAIL, edns, rep, mesh->env->scratch))
411                                                 edns->opt_list = NULL;
412                                 error_encode(r_buffer, LDNS_RCODE_SERVFAIL,
413                                         qinfo, qid, qflags, edns);
414                                 comm_point_send_reply(rep);
415                                 return;
416                         }
417                 }
418
419 #ifdef UNBOUND_DEBUG
420                 n =
421 #else
422                 (void)
423 #endif
424                 rbtree_insert(&mesh->all, &s->node);
425                 log_assert(n != NULL);
426                 /* set detached (it is now) */
427                 mesh->num_detached_states++;
428                 added = 1;
429         }
430         if(!s->reply_list && !s->cb_list && s->super_set.count == 0)
431                 was_detached = 1;
432         if(!s->reply_list && !s->cb_list)
433                 was_noreply = 1;
434         /* add reply to s */
435         if(!mesh_state_add_reply(s, edns, rep, qid, qflags, qinfo)) {
436                         log_err("mesh_new_client: out of memory; SERVFAIL");
437                 servfail_mem:
438                         if(!inplace_cb_reply_servfail_call(mesh->env, qinfo, &s->s,
439                                 NULL, LDNS_RCODE_SERVFAIL, edns, rep, mesh->env->scratch))
440                                         edns->opt_list = NULL;
441                         error_encode(r_buffer, LDNS_RCODE_SERVFAIL,
442                                 qinfo, qid, qflags, edns);
443                         comm_point_send_reply(rep);
444                         if(added)
445                                 mesh_state_delete(&s->s);
446                         return;
447         }
448         if(rep->c->tcp_req_info) {
449                 if(!tcp_req_info_add_meshstate(rep->c->tcp_req_info, mesh, s)) {
450                         log_err("mesh_new_client: out of memory add tcpreqinfo");
451                         goto servfail_mem;
452                 }
453         }
454         /* update statistics */
455         if(was_detached) {
456                 log_assert(mesh->num_detached_states > 0);
457                 mesh->num_detached_states--;
458         }
459         if(was_noreply) {
460                 mesh->num_reply_states ++;
461         }
462         mesh->num_reply_addrs++;
463         if(s->list_select == mesh_no_list) {
464                 /* move to either the forever or the jostle_list */
465                 if(mesh->num_forever_states < mesh->max_forever_states) {
466                         mesh->num_forever_states ++;
467                         mesh_list_insert(s, &mesh->forever_first, 
468                                 &mesh->forever_last);
469                         s->list_select = mesh_forever_list;
470                 } else {
471                         mesh_list_insert(s, &mesh->jostle_first, 
472                                 &mesh->jostle_last);
473                         s->list_select = mesh_jostle_list;
474                 }
475         }
476         if(added)
477                 mesh_run(mesh, s, module_event_new, NULL);
478 }
479
480 int 
481 mesh_new_callback(struct mesh_area* mesh, struct query_info* qinfo,
482         uint16_t qflags, struct edns_data* edns, sldns_buffer* buf, 
483         uint16_t qid, mesh_cb_func_type cb, void* cb_arg)
484 {
485         struct mesh_state* s = NULL;
486         int unique = unique_mesh_state(edns->opt_list, mesh->env);
487         int was_detached = 0;
488         int was_noreply = 0;
489         int added = 0;
490         if(!unique)
491                 s = mesh_area_find(mesh, NULL, qinfo, qflags&(BIT_RD|BIT_CD), 0, 0);
492
493         /* there are no limits on the number of callbacks */
494
495         /* see if it already exists, if not, create one */
496         if(!s) {
497 #ifdef UNBOUND_DEBUG
498                 struct rbnode_type* n;
499 #endif
500                 s = mesh_state_create(mesh->env, qinfo, NULL,
501                         qflags&(BIT_RD|BIT_CD), 0, 0);
502                 if(!s) {
503                         return 0;
504                 }
505                 if(unique)
506                         mesh_state_make_unique(s);
507                 if(edns->opt_list) {
508                         s->s.edns_opts_front_in = edns_opt_copy_region(edns->opt_list,
509                                 s->s.region);
510                         if(!s->s.edns_opts_front_in) {
511                                 return 0;
512                         }
513                 }
514 #ifdef UNBOUND_DEBUG
515                 n =
516 #else
517                 (void)
518 #endif
519                 rbtree_insert(&mesh->all, &s->node);
520                 log_assert(n != NULL);
521                 /* set detached (it is now) */
522                 mesh->num_detached_states++;
523                 added = 1;
524         }
525         if(!s->reply_list && !s->cb_list && s->super_set.count == 0)
526                 was_detached = 1;
527         if(!s->reply_list && !s->cb_list)
528                 was_noreply = 1;
529         /* add reply to s */
530         if(!mesh_state_add_cb(s, edns, buf, cb, cb_arg, qid, qflags)) {
531                         if(added)
532                                 mesh_state_delete(&s->s);
533                         return 0;
534         }
535         /* update statistics */
536         if(was_detached) {
537                 log_assert(mesh->num_detached_states > 0);
538                 mesh->num_detached_states--;
539         }
540         if(was_noreply) {
541                 mesh->num_reply_states ++;
542         }
543         mesh->num_reply_addrs++;
544         if(added)
545                 mesh_run(mesh, s, module_event_new, NULL);
546         return 1;
547 }
548
549 static void mesh_schedule_prefetch(struct mesh_area* mesh,
550         struct query_info* qinfo, uint16_t qflags, time_t leeway, int run);
551
552 void mesh_new_prefetch(struct mesh_area* mesh, struct query_info* qinfo,
553         uint16_t qflags, time_t leeway)
554 {
555         mesh_schedule_prefetch(mesh, qinfo, qflags, leeway, 1);
556 }
557
558 /* Internal backend routine of mesh_new_prefetch().  It takes one additional
559  * parameter, 'run', which controls whether to run the prefetch state
560  * immediately.  When this function is called internally 'run' could be
561  * 0 (false), in which case the new state is only made runnable so it
562  * will not be run recursively on top of the current state. */
563 static void mesh_schedule_prefetch(struct mesh_area* mesh,
564         struct query_info* qinfo, uint16_t qflags, time_t leeway, int run)
565 {
566         struct mesh_state* s = mesh_area_find(mesh, NULL, qinfo,
567                 qflags&(BIT_RD|BIT_CD), 0, 0);
568 #ifdef UNBOUND_DEBUG
569         struct rbnode_type* n;
570 #endif
571         /* already exists, and for a different purpose perhaps.
572          * if mesh_no_list, keep it that way. */
573         if(s) {
574                 /* make it ignore the cache from now on */
575                 if(!s->s.blacklist)
576                         sock_list_insert(&s->s.blacklist, NULL, 0, s->s.region);
577                 if(s->s.prefetch_leeway < leeway)
578                         s->s.prefetch_leeway = leeway;
579                 return;
580         }
581         if(!mesh_make_new_space(mesh, NULL)) {
582                 verbose(VERB_ALGO, "Too many queries. dropped prefetch.");
583                 mesh->stats_dropped ++;
584                 return;
585         }
586
587         s = mesh_state_create(mesh->env, qinfo, NULL,
588                 qflags&(BIT_RD|BIT_CD), 0, 0);
589         if(!s) {
590                 log_err("prefetch mesh_state_create: out of memory");
591                 return;
592         }
593 #ifdef UNBOUND_DEBUG
594         n =
595 #else
596         (void)
597 #endif
598         rbtree_insert(&mesh->all, &s->node);
599         log_assert(n != NULL);
600         /* set detached (it is now) */
601         mesh->num_detached_states++;
602         /* make it ignore the cache */
603         sock_list_insert(&s->s.blacklist, NULL, 0, s->s.region);
604         s->s.prefetch_leeway = leeway;
605
606         if(s->list_select == mesh_no_list) {
607                 /* move to either the forever or the jostle_list */
608                 if(mesh->num_forever_states < mesh->max_forever_states) {
609                         mesh->num_forever_states ++;
610                         mesh_list_insert(s, &mesh->forever_first, 
611                                 &mesh->forever_last);
612                         s->list_select = mesh_forever_list;
613                 } else {
614                         mesh_list_insert(s, &mesh->jostle_first, 
615                                 &mesh->jostle_last);
616                         s->list_select = mesh_jostle_list;
617                 }
618         }
619
620         if(!run) {
621 #ifdef UNBOUND_DEBUG
622                 n =
623 #else
624                 (void)
625 #endif
626                 rbtree_insert(&mesh->run, &s->run_node);
627                 log_assert(n != NULL);
628                 return;
629         }
630
631         mesh_run(mesh, s, module_event_new, NULL);
632 }
633
634 void mesh_report_reply(struct mesh_area* mesh, struct outbound_entry* e,
635         struct comm_reply* reply, int what)
636 {
637         enum module_ev event = module_event_reply;
638         e->qstate->reply = reply;
639         if(what != NETEVENT_NOERROR) {
640                 event = module_event_noreply;
641                 if(what == NETEVENT_CAPSFAIL)
642                         event = module_event_capsfail;
643         }
644         mesh_run(mesh, e->qstate->mesh_info, event, e);
645 }
646
647 struct mesh_state*
648 mesh_state_create(struct module_env* env, struct query_info* qinfo,
649         struct respip_client_info* cinfo, uint16_t qflags, int prime,
650         int valrec)
651 {
652         struct regional* region = alloc_reg_obtain(env->alloc);
653         struct mesh_state* mstate;
654         int i;
655         if(!region)
656                 return NULL;
657         mstate = (struct mesh_state*)regional_alloc(region, 
658                 sizeof(struct mesh_state));
659         if(!mstate) {
660                 alloc_reg_release(env->alloc, region);
661                 return NULL;
662         }
663         memset(mstate, 0, sizeof(*mstate));
664         mstate->node = *RBTREE_NULL;
665         mstate->run_node = *RBTREE_NULL;
666         mstate->node.key = mstate;
667         mstate->run_node.key = mstate;
668         mstate->reply_list = NULL;
669         mstate->list_select = mesh_no_list;
670         mstate->replies_sent = 0;
671         rbtree_init(&mstate->super_set, &mesh_state_ref_compare);
672         rbtree_init(&mstate->sub_set, &mesh_state_ref_compare);
673         mstate->num_activated = 0;
674         mstate->unique = NULL;
675         /* init module qstate */
676         mstate->s.qinfo.qtype = qinfo->qtype;
677         mstate->s.qinfo.qclass = qinfo->qclass;
678         mstate->s.qinfo.local_alias = NULL;
679         mstate->s.qinfo.qname_len = qinfo->qname_len;
680         mstate->s.qinfo.qname = regional_alloc_init(region, qinfo->qname,
681                 qinfo->qname_len);
682         if(!mstate->s.qinfo.qname) {
683                 alloc_reg_release(env->alloc, region);
684                 return NULL;
685         }
686         if(cinfo) {
687                 mstate->s.client_info = regional_alloc_init(region, cinfo,
688                         sizeof(*cinfo));
689                 if(!mstate->s.client_info) {
690                         alloc_reg_release(env->alloc, region);
691                         return NULL;
692                 }
693         }
694         /* remove all weird bits from qflags */
695         mstate->s.query_flags = (qflags & (BIT_RD|BIT_CD));
696         mstate->s.is_priming = prime;
697         mstate->s.is_valrec = valrec;
698         mstate->s.reply = NULL;
699         mstate->s.region = region;
700         mstate->s.curmod = 0;
701         mstate->s.return_msg = 0;
702         mstate->s.return_rcode = LDNS_RCODE_NOERROR;
703         mstate->s.env = env;
704         mstate->s.mesh_info = mstate;
705         mstate->s.prefetch_leeway = 0;
706         mstate->s.no_cache_lookup = 0;
707         mstate->s.no_cache_store = 0;
708         mstate->s.need_refetch = 0;
709         mstate->s.was_ratelimited = 0;
710
711         /* init modules */
712         for(i=0; i<env->mesh->mods.num; i++) {
713                 mstate->s.minfo[i] = NULL;
714                 mstate->s.ext_state[i] = module_state_initial;
715         }
716         /* init edns option lists */
717         mstate->s.edns_opts_front_in = NULL;
718         mstate->s.edns_opts_back_out = NULL;
719         mstate->s.edns_opts_back_in = NULL;
720         mstate->s.edns_opts_front_out = NULL;
721
722         return mstate;
723 }
724
725 int
726 mesh_state_is_unique(struct mesh_state* mstate)
727 {
728         return mstate->unique != NULL;
729 }
730
731 void
732 mesh_state_make_unique(struct mesh_state* mstate)
733 {
734         mstate->unique = mstate;
735 }
736
737 void 
738 mesh_state_cleanup(struct mesh_state* mstate)
739 {
740         struct mesh_area* mesh;
741         int i;
742         if(!mstate)
743                 return;
744         mesh = mstate->s.env->mesh;
745         /* drop unsent replies */
746         if(!mstate->replies_sent) {
747                 struct mesh_reply* rep = mstate->reply_list;
748                 struct mesh_cb* cb;
749                 /* in tcp_req_info, the mstates linked are removed, but
750                  * the reply_list is now NULL, so the remove-from-empty-list
751                  * takes no time and also it does not do the mesh accounting */
752                 mstate->reply_list = NULL;
753                 for(; rep; rep=rep->next) {
754                         comm_point_drop_reply(&rep->query_reply);
755                         mesh->num_reply_addrs--;
756                 }
757                 while((cb = mstate->cb_list)!=NULL) {
758                         mstate->cb_list = cb->next;
759                         fptr_ok(fptr_whitelist_mesh_cb(cb->cb));
760                         (*cb->cb)(cb->cb_arg, LDNS_RCODE_SERVFAIL, NULL,
761                                 sec_status_unchecked, NULL, 0);
762                         mesh->num_reply_addrs--;
763                 }
764         }
765
766         /* de-init modules */
767         for(i=0; i<mesh->mods.num; i++) {
768                 fptr_ok(fptr_whitelist_mod_clear(mesh->mods.mod[i]->clear));
769                 (*mesh->mods.mod[i]->clear)(&mstate->s, i);
770                 mstate->s.minfo[i] = NULL;
771                 mstate->s.ext_state[i] = module_finished;
772         }
773         alloc_reg_release(mstate->s.env->alloc, mstate->s.region);
774 }
775
776 void 
777 mesh_state_delete(struct module_qstate* qstate)
778 {
779         struct mesh_area* mesh;
780         struct mesh_state_ref* super, ref;
781         struct mesh_state* mstate;
782         if(!qstate)
783                 return;
784         mstate = qstate->mesh_info;
785         mesh = mstate->s.env->mesh;
786         mesh_detach_subs(&mstate->s);
787         if(mstate->list_select == mesh_forever_list) {
788                 mesh->num_forever_states --;
789                 mesh_list_remove(mstate, &mesh->forever_first, 
790                         &mesh->forever_last);
791         } else if(mstate->list_select == mesh_jostle_list) {
792                 mesh_list_remove(mstate, &mesh->jostle_first, 
793                         &mesh->jostle_last);
794         }
795         if(!mstate->reply_list && !mstate->cb_list
796                 && mstate->super_set.count == 0) {
797                 log_assert(mesh->num_detached_states > 0);
798                 mesh->num_detached_states--;
799         }
800         if(mstate->reply_list || mstate->cb_list) {
801                 log_assert(mesh->num_reply_states > 0);
802                 mesh->num_reply_states--;
803         }
804         ref.node.key = &ref;
805         ref.s = mstate;
806         RBTREE_FOR(super, struct mesh_state_ref*, &mstate->super_set) {
807                 (void)rbtree_delete(&super->s->sub_set, &ref);
808         }
809         (void)rbtree_delete(&mesh->run, mstate);
810         (void)rbtree_delete(&mesh->all, mstate);
811         mesh_state_cleanup(mstate);
812 }
813
814 /** helper recursive rbtree find routine */
815 static int
816 find_in_subsub(struct mesh_state* m, struct mesh_state* tofind, size_t *c)
817 {
818         struct mesh_state_ref* r;
819         if((*c)++ > MESH_MAX_SUBSUB)
820                 return 1;
821         RBTREE_FOR(r, struct mesh_state_ref*, &m->sub_set) {
822                 if(r->s == tofind || find_in_subsub(r->s, tofind, c))
823                         return 1;
824         }
825         return 0;
826 }
827
828 /** find cycle for already looked up mesh_state */
829 static int 
830 mesh_detect_cycle_found(struct module_qstate* qstate, struct mesh_state* dep_m)
831 {
832         struct mesh_state* cyc_m = qstate->mesh_info;
833         size_t counter = 0;
834         if(!dep_m)
835                 return 0;
836         if(dep_m == cyc_m || find_in_subsub(dep_m, cyc_m, &counter)) {
837                 if(counter > MESH_MAX_SUBSUB)
838                         return 2;
839                 return 1;
840         }
841         return 0;
842 }
843
844 void mesh_detach_subs(struct module_qstate* qstate)
845 {
846         struct mesh_area* mesh = qstate->env->mesh;
847         struct mesh_state_ref* ref, lookup;
848 #ifdef UNBOUND_DEBUG
849         struct rbnode_type* n;
850 #endif
851         lookup.node.key = &lookup;
852         lookup.s = qstate->mesh_info;
853         RBTREE_FOR(ref, struct mesh_state_ref*, &qstate->mesh_info->sub_set) {
854 #ifdef UNBOUND_DEBUG
855                 n =
856 #else
857                 (void)
858 #endif
859                 rbtree_delete(&ref->s->super_set, &lookup);
860                 log_assert(n != NULL); /* must have been present */
861                 if(!ref->s->reply_list && !ref->s->cb_list
862                         && ref->s->super_set.count == 0) {
863                         mesh->num_detached_states++;
864                         log_assert(mesh->num_detached_states + 
865                                 mesh->num_reply_states <= mesh->all.count);
866                 }
867         }
868         rbtree_init(&qstate->mesh_info->sub_set, &mesh_state_ref_compare);
869 }
870
871 int mesh_add_sub(struct module_qstate* qstate, struct query_info* qinfo,
872         uint16_t qflags, int prime, int valrec, struct module_qstate** newq,
873         struct mesh_state** sub)
874 {
875         /* find it, if not, create it */
876         struct mesh_area* mesh = qstate->env->mesh;
877         *sub = mesh_area_find(mesh, NULL, qinfo, qflags,
878                 prime, valrec);
879         if(mesh_detect_cycle_found(qstate, *sub)) {
880                 verbose(VERB_ALGO, "attach failed, cycle detected");
881                 return 0;
882         }
883         if(!*sub) {
884 #ifdef UNBOUND_DEBUG
885                 struct rbnode_type* n;
886 #endif
887                 /* create a new one */
888                 *sub = mesh_state_create(qstate->env, qinfo, NULL, qflags, prime,
889                         valrec);
890                 if(!*sub) {
891                         log_err("mesh_attach_sub: out of memory");
892                         return 0;
893                 }
894 #ifdef UNBOUND_DEBUG
895                 n =
896 #else
897                 (void)
898 #endif
899                 rbtree_insert(&mesh->all, &(*sub)->node);
900                 log_assert(n != NULL);
901                 /* set detached (it is now) */
902                 mesh->num_detached_states++;
903                 /* set new query state to run */
904 #ifdef UNBOUND_DEBUG
905                 n =
906 #else
907                 (void)
908 #endif
909                 rbtree_insert(&mesh->run, &(*sub)->run_node);
910                 log_assert(n != NULL);
911                 *newq = &(*sub)->s;
912         } else
913                 *newq = NULL;
914         return 1;
915 }
916
917 int mesh_attach_sub(struct module_qstate* qstate, struct query_info* qinfo,
918         uint16_t qflags, int prime, int valrec, struct module_qstate** newq)
919 {
920         struct mesh_area* mesh = qstate->env->mesh;
921         struct mesh_state* sub = NULL;
922         int was_detached;
923         if(!mesh_add_sub(qstate, qinfo, qflags, prime, valrec, newq, &sub))
924                 return 0;
925         was_detached = (sub->super_set.count == 0);
926         if(!mesh_state_attachment(qstate->mesh_info, sub))
927                 return 0;
928         /* if it was a duplicate  attachment, the count was not zero before */
929         if(!sub->reply_list && !sub->cb_list && was_detached && 
930                 sub->super_set.count == 1) {
931                 /* it used to be detached, before this one got added */
932                 log_assert(mesh->num_detached_states > 0);
933                 mesh->num_detached_states--;
934         }
935         /* *newq will be run when inited after the current module stops */
936         return 1;
937 }
938
939 int mesh_state_attachment(struct mesh_state* super, struct mesh_state* sub)
940 {
941 #ifdef UNBOUND_DEBUG
942         struct rbnode_type* n;
943 #endif
944         struct mesh_state_ref* subref; /* points to sub, inserted in super */
945         struct mesh_state_ref* superref; /* points to super, inserted in sub */
946         if( !(subref = regional_alloc(super->s.region,
947                 sizeof(struct mesh_state_ref))) ||
948                 !(superref = regional_alloc(sub->s.region,
949                 sizeof(struct mesh_state_ref))) ) {
950                 log_err("mesh_state_attachment: out of memory");
951                 return 0;
952         }
953         superref->node.key = superref;
954         superref->s = super;
955         subref->node.key = subref;
956         subref->s = sub;
957         if(!rbtree_insert(&sub->super_set, &superref->node)) {
958                 /* this should not happen, iterator and validator do not
959                  * attach subqueries that are identical. */
960                 /* already attached, we are done, nothing todo.
961                  * since superref and subref already allocated in region,
962                  * we cannot free them */
963                 return 1;
964         }
965 #ifdef UNBOUND_DEBUG
966         n =
967 #else
968         (void)
969 #endif
970         rbtree_insert(&super->sub_set, &subref->node);
971         log_assert(n != NULL); /* we checked above if statement, the reverse
972           administration should not fail now, unless they are out of sync */
973         return 1;
974 }
975
976 /**
977  * callback results to mesh cb entry
978  * @param m: mesh state to send it for.
979  * @param rcode: if not 0, error code.
980  * @param rep: reply to send (or NULL if rcode is set).
981  * @param r: callback entry
982  */
983 static void
984 mesh_do_callback(struct mesh_state* m, int rcode, struct reply_info* rep,
985         struct mesh_cb* r)
986 {
987         int secure;
988         char* reason = NULL;
989         int was_ratelimited = m->s.was_ratelimited;
990         /* bogus messages are not made into servfail, sec_status passed
991          * to the callback function */
992         if(rep && rep->security == sec_status_secure)
993                 secure = 1;
994         else    secure = 0;
995         if(!rep && rcode == LDNS_RCODE_NOERROR)
996                 rcode = LDNS_RCODE_SERVFAIL;
997         if(!rcode && (rep->security == sec_status_bogus ||
998                 rep->security == sec_status_secure_sentinel_fail)) {
999                 if(!(reason = errinf_to_str_bogus(&m->s)))
1000                         rcode = LDNS_RCODE_SERVFAIL;
1001         }
1002         /* send the reply */
1003         if(rcode) {
1004                 if(rcode == LDNS_RCODE_SERVFAIL) {
1005                         if(!inplace_cb_reply_servfail_call(m->s.env, &m->s.qinfo, &m->s,
1006                                 rep, rcode, &r->edns, NULL, m->s.region))
1007                                         r->edns.opt_list = NULL;
1008                 } else {
1009                         if(!inplace_cb_reply_call(m->s.env, &m->s.qinfo, &m->s, rep, rcode,
1010                                 &r->edns, NULL, m->s.region))
1011                                         r->edns.opt_list = NULL;
1012                 }
1013                 fptr_ok(fptr_whitelist_mesh_cb(r->cb));
1014                 (*r->cb)(r->cb_arg, rcode, r->buf, sec_status_unchecked, NULL,
1015                         was_ratelimited);
1016         } else {
1017                 size_t udp_size = r->edns.udp_size;
1018                 sldns_buffer_clear(r->buf);
1019                 r->edns.edns_version = EDNS_ADVERTISED_VERSION;
1020                 r->edns.udp_size = EDNS_ADVERTISED_SIZE;
1021                 r->edns.ext_rcode = 0;
1022                 r->edns.bits &= EDNS_DO;
1023
1024                 if(!inplace_cb_reply_call(m->s.env, &m->s.qinfo, &m->s, rep,
1025                         LDNS_RCODE_NOERROR, &r->edns, NULL, m->s.region) ||
1026                         !reply_info_answer_encode(&m->s.qinfo, rep, r->qid, 
1027                         r->qflags, r->buf, 0, 1, 
1028                         m->s.env->scratch, udp_size, &r->edns, 
1029                         (int)(r->edns.bits & EDNS_DO), secure)) 
1030                 {
1031                         fptr_ok(fptr_whitelist_mesh_cb(r->cb));
1032                         (*r->cb)(r->cb_arg, LDNS_RCODE_SERVFAIL, r->buf,
1033                                 sec_status_unchecked, NULL, 0);
1034                 } else {
1035                         fptr_ok(fptr_whitelist_mesh_cb(r->cb));
1036                         (*r->cb)(r->cb_arg, LDNS_RCODE_NOERROR, r->buf,
1037                                 rep->security, reason, was_ratelimited);
1038                 }
1039         }
1040         free(reason);
1041         m->s.env->mesh->num_reply_addrs--;
1042 }
1043
1044 /**
1045  * Send reply to mesh reply entry
1046  * @param m: mesh state to send it for.
1047  * @param rcode: if not 0, error code.
1048  * @param rep: reply to send (or NULL if rcode is set).
1049  * @param r: reply entry
1050  * @param r_buffer: buffer to use for reply entry.
1051  * @param prev: previous reply, already has its answer encoded in buffer.
1052  * @param prev_buffer: buffer for previous reply.
1053  */
1054 static void
1055 mesh_send_reply(struct mesh_state* m, int rcode, struct reply_info* rep,
1056         struct mesh_reply* r, struct sldns_buffer* r_buffer,
1057         struct mesh_reply* prev, struct sldns_buffer* prev_buffer)
1058 {
1059         struct timeval end_time;
1060         struct timeval duration;
1061         int secure;
1062         /* Copy the client's EDNS for later restore, to make sure the edns
1063          * compare is with the correct edns options. */
1064         struct edns_data edns_bak = r->edns;
1065         /* examine security status */
1066         if(m->s.env->need_to_validate && (!(r->qflags&BIT_CD) ||
1067                 m->s.env->cfg->ignore_cd) && rep && 
1068                 (rep->security <= sec_status_bogus ||
1069                 rep->security == sec_status_secure_sentinel_fail)) {
1070                 rcode = LDNS_RCODE_SERVFAIL;
1071                 if(m->s.env->cfg->stat_extended) 
1072                         m->s.env->mesh->ans_bogus++;
1073         }
1074         if(rep && rep->security == sec_status_secure)
1075                 secure = 1;
1076         else    secure = 0;
1077         if(!rep && rcode == LDNS_RCODE_NOERROR)
1078                 rcode = LDNS_RCODE_SERVFAIL;
1079         /* send the reply */
1080         /* We don't reuse the encoded answer if either the previous or current
1081          * response has a local alias.  We could compare the alias records
1082          * and still reuse the previous answer if they are the same, but that
1083          * would be complicated and error prone for the relatively minor case.
1084          * So we err on the side of safety. */
1085         if(prev && prev_buffer && prev->qflags == r->qflags && 
1086                 !prev->local_alias && !r->local_alias &&
1087                 prev->edns.edns_present == r->edns.edns_present && 
1088                 prev->edns.bits == r->edns.bits && 
1089                 prev->edns.udp_size == r->edns.udp_size &&
1090                 edns_opt_list_compare(prev->edns.opt_list, r->edns.opt_list)
1091                 == 0) {
1092                 /* if the previous reply is identical to this one, fix ID */
1093                 if(prev_buffer != r_buffer)
1094                         sldns_buffer_copy(r_buffer, prev_buffer);
1095                 sldns_buffer_write_at(r_buffer, 0, &r->qid, sizeof(uint16_t));
1096                 sldns_buffer_write_at(r_buffer, 12, r->qname,
1097                         m->s.qinfo.qname_len);
1098                 comm_point_send_reply(&r->query_reply);
1099         } else if(rcode) {
1100                 m->s.qinfo.qname = r->qname;
1101                 m->s.qinfo.local_alias = r->local_alias;
1102                 if(rcode == LDNS_RCODE_SERVFAIL) {
1103                         if(!inplace_cb_reply_servfail_call(m->s.env, &m->s.qinfo, &m->s,
1104                                 rep, rcode, &r->edns, NULL, m->s.region))
1105                                         r->edns.opt_list = NULL;
1106                 } else { 
1107                         if(!inplace_cb_reply_call(m->s.env, &m->s.qinfo, &m->s, rep, rcode,
1108                                 &r->edns, NULL, m->s.region))
1109                                         r->edns.opt_list = NULL;
1110                 }
1111                 error_encode(r_buffer, rcode, &m->s.qinfo, r->qid,
1112                         r->qflags, &r->edns);
1113                 comm_point_send_reply(&r->query_reply);
1114         } else {
1115                 size_t udp_size = r->edns.udp_size;
1116                 r->edns.edns_version = EDNS_ADVERTISED_VERSION;
1117                 r->edns.udp_size = EDNS_ADVERTISED_SIZE;
1118                 r->edns.ext_rcode = 0;
1119                 r->edns.bits &= EDNS_DO;
1120                 m->s.qinfo.qname = r->qname;
1121                 m->s.qinfo.local_alias = r->local_alias;
1122                 if(!inplace_cb_reply_call(m->s.env, &m->s.qinfo, &m->s, rep,
1123                         LDNS_RCODE_NOERROR, &r->edns, NULL, m->s.region) ||
1124                         !apply_edns_options(&r->edns, &edns_bak,
1125                                 m->s.env->cfg, r->query_reply.c,
1126                                 m->s.region) ||
1127                         !reply_info_answer_encode(&m->s.qinfo, rep, r->qid, 
1128                         r->qflags, r_buffer, 0, 1, m->s.env->scratch,
1129                         udp_size, &r->edns, (int)(r->edns.bits & EDNS_DO),
1130                         secure)) 
1131                 {
1132                         if(!inplace_cb_reply_servfail_call(m->s.env, &m->s.qinfo, &m->s,
1133                         rep, LDNS_RCODE_SERVFAIL, &r->edns, NULL, m->s.region))
1134                                 r->edns.opt_list = NULL;
1135                         error_encode(r_buffer, LDNS_RCODE_SERVFAIL,
1136                                 &m->s.qinfo, r->qid, r->qflags, &r->edns);
1137                 }
1138                 r->edns = edns_bak;
1139                 comm_point_send_reply(&r->query_reply);
1140         }
1141         /* account */
1142         m->s.env->mesh->num_reply_addrs--;
1143         end_time = *m->s.env->now_tv;
1144         timeval_subtract(&duration, &end_time, &r->start_time);
1145         verbose(VERB_ALGO, "query took " ARG_LL "d.%6.6d sec",
1146                 (long long)duration.tv_sec, (int)duration.tv_usec);
1147         m->s.env->mesh->replies_sent++;
1148         timeval_add(&m->s.env->mesh->replies_sum_wait, &duration);
1149         timehist_insert(m->s.env->mesh->histogram, &duration);
1150         if(m->s.env->cfg->stat_extended) {
1151                 uint16_t rc = FLAGS_GET_RCODE(sldns_buffer_read_u16_at(
1152                         r_buffer, 2));
1153                 if(secure) m->s.env->mesh->ans_secure++;
1154                 m->s.env->mesh->ans_rcode[ rc ] ++;
1155                 if(rc == 0 && LDNS_ANCOUNT(sldns_buffer_begin(r_buffer)) == 0)
1156                         m->s.env->mesh->ans_nodata++;
1157         }
1158         /* Log reply sent */
1159         if(m->s.env->cfg->log_replies) {
1160                 log_reply_info(0, &m->s.qinfo, &r->query_reply.addr,
1161                         r->query_reply.addrlen, duration, 0, r_buffer);
1162         }
1163 }
1164
1165 void mesh_query_done(struct mesh_state* mstate)
1166 {
1167         struct mesh_reply* r;
1168         struct mesh_reply* prev = NULL;
1169         struct sldns_buffer* prev_buffer = NULL;
1170         struct mesh_cb* c;
1171         struct reply_info* rep = (mstate->s.return_msg?
1172                 mstate->s.return_msg->rep:NULL);
1173         if((mstate->s.return_rcode == LDNS_RCODE_SERVFAIL ||
1174                 (rep && FLAGS_GET_RCODE(rep->flags) == LDNS_RCODE_SERVFAIL))
1175                 && mstate->s.env->cfg->log_servfail
1176                 && !mstate->s.env->cfg->val_log_squelch) {
1177                 char* err = errinf_to_str_servfail(&mstate->s);
1178                 if(err)
1179                         log_err("%s", err);
1180                 free(err);
1181         }
1182         for(r = mstate->reply_list; r; r = r->next) {
1183                 /* if a response-ip address block has been stored the
1184                  *  information should be logged for each client. */
1185                 if(mstate->s.respip_action_info &&
1186                         mstate->s.respip_action_info->addrinfo) {
1187                         respip_inform_print(mstate->s.respip_action_info->addrinfo,
1188                                 r->qname, mstate->s.qinfo.qtype,
1189                                 mstate->s.qinfo.qclass, r->local_alias,
1190                                 &r->query_reply);
1191                 }
1192
1193                 /* if this query is determined to be dropped during the
1194                  * mesh processing, this is the point to take that action. */
1195                 if(mstate->s.is_drop)
1196                         comm_point_drop_reply(&r->query_reply);
1197                 else {
1198                         struct sldns_buffer* r_buffer = r->query_reply.c->buffer;
1199                         if(r->query_reply.c->tcp_req_info) {
1200                                 r_buffer = r->query_reply.c->tcp_req_info->spool_buffer;
1201                                 prev_buffer = NULL;
1202                         }
1203                         mesh_send_reply(mstate, mstate->s.return_rcode, rep,
1204                                 r, r_buffer, prev, prev_buffer);
1205                         if(r->query_reply.c->tcp_req_info) {
1206                                 tcp_req_info_remove_mesh_state(r->query_reply.c->tcp_req_info, mstate);
1207                                 r_buffer = NULL;
1208                         }
1209                         prev = r;
1210                         prev_buffer = r_buffer;
1211                 }
1212         }
1213         mstate->replies_sent = 1;
1214         while((c = mstate->cb_list) != NULL) {
1215                 /* take this cb off the list; so that the list can be
1216                  * changed, eg. by adds from the callback routine */
1217                 if(!mstate->reply_list && mstate->cb_list && !c->next) {
1218                         /* was a reply state, not anymore */
1219                         mstate->s.env->mesh->num_reply_states--;
1220                 }
1221                 mstate->cb_list = c->next;
1222                 if(!mstate->reply_list && !mstate->cb_list &&
1223                         mstate->super_set.count == 0)
1224                         mstate->s.env->mesh->num_detached_states++;
1225                 mesh_do_callback(mstate, mstate->s.return_rcode, rep, c);
1226         }
1227 }
1228
1229 void mesh_walk_supers(struct mesh_area* mesh, struct mesh_state* mstate)
1230 {
1231         struct mesh_state_ref* ref;
1232         RBTREE_FOR(ref, struct mesh_state_ref*, &mstate->super_set)
1233         {
1234                 /* make super runnable */
1235                 (void)rbtree_insert(&mesh->run, &ref->s->run_node);
1236                 /* callback the function to inform super of result */
1237                 fptr_ok(fptr_whitelist_mod_inform_super(
1238                         mesh->mods.mod[ref->s->s.curmod]->inform_super));
1239                 (*mesh->mods.mod[ref->s->s.curmod]->inform_super)(&mstate->s, 
1240                         ref->s->s.curmod, &ref->s->s);
1241                 /* copy state that is always relevant to super */
1242                 copy_state_to_super(&mstate->s, ref->s->s.curmod, &ref->s->s);
1243         }
1244 }
1245
1246 struct mesh_state* mesh_area_find(struct mesh_area* mesh,
1247         struct respip_client_info* cinfo, struct query_info* qinfo,
1248         uint16_t qflags, int prime, int valrec)
1249 {
1250         struct mesh_state key;
1251         struct mesh_state* result;
1252
1253         key.node.key = &key;
1254         key.s.is_priming = prime;
1255         key.s.is_valrec = valrec;
1256         key.s.qinfo = *qinfo;
1257         key.s.query_flags = qflags;
1258         /* We are searching for a similar mesh state when we DO want to
1259          * aggregate the state. Thus unique is set to NULL. (default when we
1260          * desire aggregation).*/
1261         key.unique = NULL;
1262         key.s.client_info = cinfo;
1263         
1264         result = (struct mesh_state*)rbtree_search(&mesh->all, &key);
1265         return result;
1266 }
1267
1268 int mesh_state_add_cb(struct mesh_state* s, struct edns_data* edns,
1269         sldns_buffer* buf, mesh_cb_func_type cb, void* cb_arg,
1270         uint16_t qid, uint16_t qflags)
1271 {
1272         struct mesh_cb* r = regional_alloc(s->s.region, 
1273                 sizeof(struct mesh_cb));
1274         if(!r)
1275                 return 0;
1276         r->buf = buf;
1277         log_assert(fptr_whitelist_mesh_cb(cb)); /* early failure ifmissing*/
1278         r->cb = cb;
1279         r->cb_arg = cb_arg;
1280         r->edns = *edns;
1281         if(edns->opt_list) {
1282                 r->edns.opt_list = edns_opt_copy_region(edns->opt_list,
1283                         s->s.region);
1284                 if(!r->edns.opt_list)
1285                         return 0;
1286         }
1287         r->qid = qid;
1288         r->qflags = qflags;
1289         r->next = s->cb_list;
1290         s->cb_list = r;
1291         return 1;
1292
1293 }
1294
1295 int mesh_state_add_reply(struct mesh_state* s, struct edns_data* edns,
1296         struct comm_reply* rep, uint16_t qid, uint16_t qflags,
1297         const struct query_info* qinfo)
1298 {
1299         struct mesh_reply* r = regional_alloc(s->s.region, 
1300                 sizeof(struct mesh_reply));
1301         if(!r)
1302                 return 0;
1303         r->query_reply = *rep;
1304         r->edns = *edns;
1305         if(edns->opt_list) {
1306                 r->edns.opt_list = edns_opt_copy_region(edns->opt_list,
1307                         s->s.region);
1308                 if(!r->edns.opt_list)
1309                         return 0;
1310         }
1311         r->qid = qid;
1312         r->qflags = qflags;
1313         r->start_time = *s->s.env->now_tv;
1314         r->next = s->reply_list;
1315         r->qname = regional_alloc_init(s->s.region, qinfo->qname,
1316                 s->s.qinfo.qname_len);
1317         if(!r->qname)
1318                 return 0;
1319
1320         /* Data related to local alias stored in 'qinfo' (if any) is ephemeral
1321          * and can be different for different original queries (even if the
1322          * replaced query name is the same).  So we need to make a deep copy
1323          * and store the copy for each reply info. */
1324         if(qinfo->local_alias) {
1325                 struct packed_rrset_data* d;
1326                 struct packed_rrset_data* dsrc;
1327                 r->local_alias = regional_alloc_zero(s->s.region,
1328                         sizeof(*qinfo->local_alias));
1329                 if(!r->local_alias)
1330                         return 0;
1331                 r->local_alias->rrset = regional_alloc_init(s->s.region,
1332                         qinfo->local_alias->rrset,
1333                         sizeof(*qinfo->local_alias->rrset));
1334                 if(!r->local_alias->rrset)
1335                         return 0;
1336                 dsrc = qinfo->local_alias->rrset->entry.data;
1337
1338                 /* In the current implementation, a local alias must be
1339                  * a single CNAME RR (see worker_handle_request()). */
1340                 log_assert(!qinfo->local_alias->next && dsrc->count == 1 &&
1341                         qinfo->local_alias->rrset->rk.type ==
1342                         htons(LDNS_RR_TYPE_CNAME));
1343                 /* Technically, we should make a local copy for the owner
1344                  * name of the RRset, but in the case of the first (and
1345                  * currently only) local alias RRset, the owner name should
1346                  * point to the qname of the corresponding query, which should
1347                  * be valid throughout the lifetime of this mesh_reply.  So
1348                  * we can skip copying. */
1349                 log_assert(qinfo->local_alias->rrset->rk.dname ==
1350                         sldns_buffer_at(rep->c->buffer, LDNS_HEADER_SIZE));
1351
1352                 /* the rrset is not packed, like in the cache, but it is
1353                  * individualy allocated with an allocator from localzone. */
1354                 d = regional_alloc_zero(s->s.region, sizeof(*d));
1355                 if(!d)
1356                         return 0;
1357                 r->local_alias->rrset->entry.data = d;
1358                 if(!rrset_insert_rr(s->s.region, d, dsrc->rr_data[0],
1359                         dsrc->rr_len[0], dsrc->rr_ttl[0], "CNAME local alias"))
1360                         return 0;
1361         } else
1362                 r->local_alias = NULL;
1363
1364         s->reply_list = r;
1365         return 1;
1366 }
1367
1368 /* Extract the query info and flags from 'mstate' into '*qinfop' and '*qflags'.
1369  * Since this is only used for internal refetch of otherwise-expired answer,
1370  * we simply ignore the rare failure mode when memory allocation fails. */
1371 static void
1372 mesh_copy_qinfo(struct mesh_state* mstate, struct query_info** qinfop,
1373         uint16_t* qflags)
1374 {
1375         struct regional* region = mstate->s.env->scratch;
1376         struct query_info* qinfo;
1377
1378         qinfo = regional_alloc_init(region, &mstate->s.qinfo, sizeof(*qinfo));
1379         if(!qinfo)
1380                 return;
1381         qinfo->qname = regional_alloc_init(region, qinfo->qname,
1382                 qinfo->qname_len);
1383         if(!qinfo->qname)
1384                 return;
1385         *qinfop = qinfo;
1386         *qflags = mstate->s.query_flags;
1387 }
1388
1389 /**
1390  * Continue processing the mesh state at another module.
1391  * Handles module to modules transfer of control.
1392  * Handles module finished.
1393  * @param mesh: the mesh area.
1394  * @param mstate: currently active mesh state.
1395  *      Deleted if finished, calls _done and _supers to 
1396  *      send replies to clients and inform other mesh states.
1397  *      This in turn may create additional runnable mesh states.
1398  * @param s: state at which the current module exited.
1399  * @param ev: the event sent to the module.
1400  *      returned is the event to send to the next module.
1401  * @return true if continue processing at the new module.
1402  *      false if not continued processing is needed.
1403  */
1404 static int
1405 mesh_continue(struct mesh_area* mesh, struct mesh_state* mstate,
1406         enum module_ext_state s, enum module_ev* ev)
1407 {
1408         mstate->num_activated++;
1409         if(mstate->num_activated > MESH_MAX_ACTIVATION) {
1410                 /* module is looping. Stop it. */
1411                 log_err("internal error: looping module (%s) stopped",
1412                         mesh->mods.mod[mstate->s.curmod]->name);
1413                 log_query_info(0, "pass error for qstate",
1414                         &mstate->s.qinfo);
1415                 s = module_error;
1416         }
1417         if(s == module_wait_module || s == module_restart_next) {
1418                 /* start next module */
1419                 mstate->s.curmod++;
1420                 if(mesh->mods.num == mstate->s.curmod) {
1421                         log_err("Cannot pass to next module; at last module");
1422                         log_query_info(VERB_QUERY, "pass error for qstate",
1423                                 &mstate->s.qinfo);
1424                         mstate->s.curmod--;
1425                         return mesh_continue(mesh, mstate, module_error, ev);
1426                 }
1427                 if(s == module_restart_next) {
1428                         int curmod = mstate->s.curmod;
1429                         for(; mstate->s.curmod < mesh->mods.num; 
1430                                 mstate->s.curmod++) {
1431                                 fptr_ok(fptr_whitelist_mod_clear(
1432                                         mesh->mods.mod[mstate->s.curmod]->clear));
1433                                 (*mesh->mods.mod[mstate->s.curmod]->clear)
1434                                         (&mstate->s, mstate->s.curmod);
1435                                 mstate->s.minfo[mstate->s.curmod] = NULL;
1436                         }
1437                         mstate->s.curmod = curmod;
1438                 }
1439                 *ev = module_event_pass;
1440                 return 1;
1441         }
1442         if(s == module_wait_subquery && mstate->sub_set.count == 0) {
1443                 log_err("module cannot wait for subquery, subquery list empty");
1444                 log_query_info(VERB_QUERY, "pass error for qstate",
1445                         &mstate->s.qinfo);
1446                 s = module_error;
1447         }
1448         if(s == module_error && mstate->s.return_rcode == LDNS_RCODE_NOERROR) {
1449                 /* error is bad, handle pass back up below */
1450                 mstate->s.return_rcode = LDNS_RCODE_SERVFAIL;
1451         }
1452         if(s == module_error) {
1453                 mesh_query_done(mstate);
1454                 mesh_walk_supers(mesh, mstate);
1455                 mesh_state_delete(&mstate->s);
1456                 return 0;
1457         }
1458         if(s == module_finished) {
1459                 if(mstate->s.curmod == 0) {
1460                         struct query_info* qinfo = NULL;
1461                         uint16_t qflags;
1462
1463                         mesh_query_done(mstate);
1464                         mesh_walk_supers(mesh, mstate);
1465
1466                         /* If the answer to the query needs to be refetched
1467                          * from an external DNS server, we'll need to schedule
1468                          * a prefetch after removing the current state, so
1469                          * we need to make a copy of the query info here. */
1470                         if(mstate->s.need_refetch)
1471                                 mesh_copy_qinfo(mstate, &qinfo, &qflags);
1472
1473                         mesh_state_delete(&mstate->s);
1474                         if(qinfo) {
1475                                 mesh_schedule_prefetch(mesh, qinfo, qflags,
1476                                         0, 1);
1477                         }
1478                         return 0;
1479                 }
1480                 /* pass along the locus of control */
1481                 mstate->s.curmod --;
1482                 *ev = module_event_moddone;
1483                 return 1;
1484         }
1485         return 0;
1486 }
1487
1488 void mesh_run(struct mesh_area* mesh, struct mesh_state* mstate,
1489         enum module_ev ev, struct outbound_entry* e)
1490 {
1491         enum module_ext_state s;
1492         verbose(VERB_ALGO, "mesh_run: start");
1493         while(mstate) {
1494                 /* run the module */
1495                 fptr_ok(fptr_whitelist_mod_operate(
1496                         mesh->mods.mod[mstate->s.curmod]->operate));
1497                 (*mesh->mods.mod[mstate->s.curmod]->operate)
1498                         (&mstate->s, ev, mstate->s.curmod, e);
1499
1500                 /* examine results */
1501                 mstate->s.reply = NULL;
1502                 regional_free_all(mstate->s.env->scratch);
1503                 s = mstate->s.ext_state[mstate->s.curmod];
1504                 verbose(VERB_ALGO, "mesh_run: %s module exit state is %s", 
1505                         mesh->mods.mod[mstate->s.curmod]->name, strextstate(s));
1506                 e = NULL;
1507                 if(mesh_continue(mesh, mstate, s, &ev))
1508                         continue;
1509
1510                 /* run more modules */
1511                 ev = module_event_pass;
1512                 if(mesh->run.count > 0) {
1513                         /* pop random element off the runnable tree */
1514                         mstate = (struct mesh_state*)mesh->run.root->key;
1515                         (void)rbtree_delete(&mesh->run, mstate);
1516                 } else mstate = NULL;
1517         }
1518         if(verbosity >= VERB_ALGO) {
1519                 mesh_stats(mesh, "mesh_run: end");
1520                 mesh_log_list(mesh);
1521         }
1522 }
1523
1524 void 
1525 mesh_log_list(struct mesh_area* mesh)
1526 {
1527         char buf[30];
1528         struct mesh_state* m;
1529         int num = 0;
1530         RBTREE_FOR(m, struct mesh_state*, &mesh->all) {
1531                 snprintf(buf, sizeof(buf), "%d%s%s%s%s%s%s mod%d %s%s", 
1532                         num++, (m->s.is_priming)?"p":"",  /* prime */
1533                         (m->s.is_valrec)?"v":"",  /* prime */
1534                         (m->s.query_flags&BIT_RD)?"RD":"",
1535                         (m->s.query_flags&BIT_CD)?"CD":"",
1536                         (m->super_set.count==0)?"d":"", /* detached */
1537                         (m->sub_set.count!=0)?"c":"",  /* children */
1538                         m->s.curmod, (m->reply_list)?"rep":"", /*hasreply*/
1539                         (m->cb_list)?"cb":"" /* callbacks */
1540                         ); 
1541                 log_query_info(VERB_ALGO, buf, &m->s.qinfo);
1542         }
1543 }
1544
1545 void 
1546 mesh_stats(struct mesh_area* mesh, const char* str)
1547 {
1548         verbose(VERB_DETAIL, "%s %u recursion states (%u with reply, "
1549                 "%u detached), %u waiting replies, %u recursion replies "
1550                 "sent, %d replies dropped, %d states jostled out", 
1551                 str, (unsigned)mesh->all.count, 
1552                 (unsigned)mesh->num_reply_states,
1553                 (unsigned)mesh->num_detached_states,
1554                 (unsigned)mesh->num_reply_addrs,
1555                 (unsigned)mesh->replies_sent,
1556                 (unsigned)mesh->stats_dropped,
1557                 (unsigned)mesh->stats_jostled);
1558         if(mesh->replies_sent > 0) {
1559                 struct timeval avg;
1560                 timeval_divide(&avg, &mesh->replies_sum_wait, 
1561                         mesh->replies_sent);
1562                 log_info("average recursion processing time "
1563                         ARG_LL "d.%6.6d sec",
1564                         (long long)avg.tv_sec, (int)avg.tv_usec);
1565                 log_info("histogram of recursion processing times");
1566                 timehist_log(mesh->histogram, "recursions");
1567         }
1568 }
1569
1570 void 
1571 mesh_stats_clear(struct mesh_area* mesh)
1572 {
1573         if(!mesh)
1574                 return;
1575         mesh->replies_sent = 0;
1576         mesh->replies_sum_wait.tv_sec = 0;
1577         mesh->replies_sum_wait.tv_usec = 0;
1578         mesh->stats_jostled = 0;
1579         mesh->stats_dropped = 0;
1580         timehist_clear(mesh->histogram);
1581         mesh->ans_secure = 0;
1582         mesh->ans_bogus = 0;
1583         memset(&mesh->ans_rcode[0], 0, sizeof(size_t)*16);
1584         mesh->ans_nodata = 0;
1585 }
1586
1587 size_t 
1588 mesh_get_mem(struct mesh_area* mesh)
1589 {
1590         struct mesh_state* m;
1591         size_t s = sizeof(*mesh) + sizeof(struct timehist) +
1592                 sizeof(struct th_buck)*mesh->histogram->num +
1593                 sizeof(sldns_buffer) + sldns_buffer_capacity(mesh->qbuf_bak);
1594         RBTREE_FOR(m, struct mesh_state*, &mesh->all) {
1595                 /* all, including m itself allocated in qstate region */
1596                 s += regional_get_mem(m->s.region);
1597         }
1598         return s;
1599 }
1600
1601 int 
1602 mesh_detect_cycle(struct module_qstate* qstate, struct query_info* qinfo,
1603         uint16_t flags, int prime, int valrec)
1604 {
1605         struct mesh_area* mesh = qstate->env->mesh;
1606         struct mesh_state* dep_m = NULL;
1607         if(!mesh_state_is_unique(qstate->mesh_info))
1608                 dep_m = mesh_area_find(mesh, NULL, qinfo, flags, prime, valrec);
1609         return mesh_detect_cycle_found(qstate, dep_m);
1610 }
1611
1612 void mesh_list_insert(struct mesh_state* m, struct mesh_state** fp,
1613         struct mesh_state** lp)
1614 {
1615         /* insert as last element */
1616         m->prev = *lp;
1617         m->next = NULL;
1618         if(*lp)
1619                 (*lp)->next = m;
1620         else    *fp = m;
1621         *lp = m;
1622 }
1623
1624 void mesh_list_remove(struct mesh_state* m, struct mesh_state** fp,
1625         struct mesh_state** lp)
1626 {
1627         if(m->next)
1628                 m->next->prev = m->prev;
1629         else    *lp = m->prev;
1630         if(m->prev)
1631                 m->prev->next = m->next;
1632         else    *fp = m->next;
1633 }
1634
1635 void mesh_state_remove_reply(struct mesh_area* mesh, struct mesh_state* m,
1636         struct comm_point* cp)
1637 {
1638         struct mesh_reply* n, *prev = NULL;
1639         n = m->reply_list;
1640         /* when in mesh_cleanup, it sets the reply_list to NULL, so that
1641          * there is no accounting twice */
1642         if(!n) return; /* nothing to remove, also no accounting needed */
1643         while(n) {
1644                 if(n->query_reply.c == cp) {
1645                         /* unlink it */
1646                         if(prev) prev->next = n->next;
1647                         else m->reply_list = n->next;
1648                         /* delete it, but allocated in m region */
1649                         mesh->num_reply_addrs--;
1650
1651                         /* prev = prev; */
1652                         n = n->next;
1653                         continue;
1654                 }
1655                 prev = n;
1656                 n = n->next;
1657         }
1658         /* it was not detached (because it had a reply list), could be now */
1659         if(!m->reply_list && !m->cb_list
1660                 && m->super_set.count == 0) {
1661                 mesh->num_detached_states++;
1662         }
1663         /* if not replies any more in mstate, it is no longer a reply_state */
1664         if(!m->reply_list && !m->cb_list) {
1665                 log_assert(mesh->num_reply_states > 0);
1666                 mesh->num_reply_states--;
1667         }
1668 }