]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/kern/kern_cpu.c
MFV r329502: 7614 zfs device evacuation/removal
[FreeBSD/FreeBSD.git] / sys / kern / kern_cpu.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2004-2007 Nate Lawson (SDG)
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/bus.h>
34 #include <sys/cpu.h>
35 #include <sys/eventhandler.h>
36 #include <sys/kernel.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/module.h>
40 #include <sys/proc.h>
41 #include <sys/queue.h>
42 #include <sys/sbuf.h>
43 #include <sys/sched.h>
44 #include <sys/smp.h>
45 #include <sys/sysctl.h>
46 #include <sys/systm.h>
47 #include <sys/sx.h>
48 #include <sys/timetc.h>
49 #include <sys/taskqueue.h>
50
51 #include "cpufreq_if.h"
52
53 /*
54  * Common CPU frequency glue code.  Drivers for specific hardware can
55  * attach this interface to allow users to get/set the CPU frequency.
56  */
57
/*
 * Number of levels we can handle.  Levels are synthesized from settings
 * so for M settings and N drivers, there may be M*N levels.
 *
 * This sizes the level arrays allocated in this file (the softc's
 * levels_buf in cpufreq_attach() and the scratch array in
 * cf_get_method()).  If the levels method reports E2BIG against an
 * array of this size, cf_get_method() logs that this value needs to be
 * increased.
 */
#define CF_MAX_LEVELS   64
63
/*
 * A level/priority pair remembered on the softc's saved_freq list.
 * cf_set_method() pushes the current level here when a higher-priority
 * request replaces it, and pops the head again when it is called with a
 * NULL level to restore the saved state.
 */
struct cf_saved_freq {
        /* The level that was current when the entry was saved. */
        struct cf_level                 level;
        /* The priority that level had been set with. */
        int                             priority;
        /* Linkage on cpufreq_softc.saved_freq. */
        SLIST_ENTRY(cf_saved_freq)      link;
};
69
/*
 * Per-device state for a cpufreq instance.
 */
struct cpufreq_softc {
        /* sx lock, taken exclusively through the CF_MTX_*() macros. */
        struct sx                       lock;
        /*
         * Level currently in effect.  total_set.freq is
         * CPUFREQ_VAL_UNKNOWN until cf_get_method()/cf_set_method()
         * establish it.
         */
        struct cf_level                 curr_level;
        /* Priority that curr_level was set with. */
        int                             curr_priority;
        /* Stack of levels displaced by higher-priority requests. */
        SLIST_HEAD(, cf_saved_freq)     saved_freq;
        /*
         * Scratch list of levels, rebuilt and emptied again on every
         * cf_levels_method() call; all_count tracks its length.
         */
        struct cf_level_lst             all_levels;
        int                             all_count;
        /*
         * Nominal (or, failing that, estimated) CPU frequency in MHz,
         * or CPUFREQ_VAL_UNKNOWN if neither could be obtained.
         */
        int                             max_mhz;
        /* The cpufreq device this softc belongs to. */
        device_t                        dev;
        /* Context owning the sysctls added in cpufreq_attach(). */
        struct sysctl_ctx_list          sysctl_ctx;
        /* One-shot task queued by cpufreq_attach(). */
        struct task                     startup_task;
        /*
         * Array of CF_MAX_LEVELS levels allocated in cpufreq_attach()
         * for the instance that registers the sysctls.
         * NOTE(review): its consumers are outside the visible part of
         * this file (presumably the sysctl handlers) -- confirm.
         */
        struct cf_level                 *levels_buf;
};
83
/*
 * One driver's batch of settings, kept on a cf_setting_lst.
 * cf_levels_method() builds one of these for each relative-type driver
 * and later hands each one to cpufreq_expand_set().
 */
struct cf_setting_array {
        /* Settings copied from the driver; the first 'count' are valid. */
        struct cf_setting               sets[MAX_SETTINGS];
        int                             count;
        /* Linkage on a cf_setting_lst. */
        TAILQ_ENTRY(cf_setting_array)   link;
};

/* List head type for a queue of cf_setting_array entries. */
TAILQ_HEAD(cf_setting_lst, cf_setting_array);
91
/*
 * Wrappers around the softc's sx lock.  Despite the MTX in the names,
 * these map onto sx_*() calls; the lock is only ever taken exclusively
 * and CF_MTX_ASSERT() checks that it is exclusively held.
 */
#define CF_MTX_INIT(x)          sx_init((x), "cpufreq lock")
#define CF_MTX_LOCK(x)          sx_xlock((x))
#define CF_MTX_UNLOCK(x)        sx_xunlock((x))
#define CF_MTX_ASSERT(x)        sx_assert((x), SX_XLOCKED)

/*
 * printf() with a "cpufreq: " prefix, emitted only while cf_verbose
 * (debug.cpufreq.verbose) is nonzero.  The first argument must be a
 * string literal since it is concatenated onto the prefix.
 */
#define CF_DEBUG(msg...)        do {            \
        if (cf_verbose)                         \
                printf("cpufreq: " msg);        \
        } while (0)
101
/* Newbus attach/detach and the deferred startup notification. */
static int      cpufreq_attach(device_t dev);
static void     cpufreq_startup_task(void *ctx, int pending);
static int      cpufreq_detach(device_t dev);
/* Implementations of the cpufreq_set/get/levels interface methods. */
static int      cf_set_method(device_t dev, const struct cf_level *level,
                    int priority);
static int      cf_get_method(device_t dev, struct cf_level *level);
static int      cf_levels_method(device_t dev, struct cf_level *levels,
                    int *count);
/* Helpers that build the softc's all_levels list (lock must be held). */
static int      cpufreq_insert_abs(struct cpufreq_softc *sc,
                    struct cf_setting *sets, int count);
static int      cpufreq_expand_set(struct cpufreq_softc *sc,
                    struct cf_setting_array *set_arr);
static struct cf_level *cpufreq_dup_set(struct cpufreq_softc *sc,
                    struct cf_level *dup, struct cf_setting *set);
/* sysctl handlers. */
static int      cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS);
static int      cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS);
static int      cpufreq_settings_sysctl(SYSCTL_HANDLER_ARGS);
119
/*
 * Method table: generic probe, our attach/detach, and the three cpufreq
 * interface methods implemented in this file.
 */
static device_method_t cpufreq_methods[] = {
        DEVMETHOD(device_probe,         bus_generic_probe),
        DEVMETHOD(device_attach,        cpufreq_attach),
        DEVMETHOD(device_detach,        cpufreq_detach),

        DEVMETHOD(cpufreq_set,          cf_set_method),
        DEVMETHOD(cpufreq_get,          cf_get_method),
        DEVMETHOD(cpufreq_levels,       cf_levels_method),
        {0, 0}
};
/* Driver description; the softc is a struct cpufreq_softc. */
static driver_t cpufreq_driver = {
        "cpufreq", cpufreq_methods, sizeof(struct cpufreq_softc)
};
/* Devclass of all cpufreq instances; used to count them at attach. */
static devclass_t cpufreq_dc;
/* Register the "cpufreq" driver as a child of "cpu" devices. */
DRIVER_MODULE(cpufreq, cpu, cpufreq_driver, cpufreq_dc, 0, 0);
135
/*
 * Levels whose total frequency is below this value are rejected by
 * cf_set_method() and omitted by cf_levels_method().
 */
static int              cf_lowest_freq;
/* Nonzero enables CF_DEBUG() output. */
static int              cf_verbose;
/* Both knobs are exported under the debug.cpufreq sysctl node. */
static SYSCTL_NODE(_debug, OID_AUTO, cpufreq, CTLFLAG_RD, NULL,
    "cpufreq debugging");
SYSCTL_INT(_debug_cpufreq, OID_AUTO, lowest, CTLFLAG_RWTUN, &cf_lowest_freq, 1,
    "Don't provide levels below this frequency.");
SYSCTL_INT(_debug_cpufreq, OID_AUTO, verbose, CTLFLAG_RWTUN, &cf_verbose, 1,
    "Print verbose debugging messages");
144
145 static int
146 cpufreq_attach(device_t dev)
147 {
148         struct cpufreq_softc *sc;
149         struct pcpu *pc;
150         device_t parent;
151         uint64_t rate;
152         int numdevs;
153
154         CF_DEBUG("initializing %s\n", device_get_nameunit(dev));
155         sc = device_get_softc(dev);
156         parent = device_get_parent(dev);
157         sc->dev = dev;
158         sysctl_ctx_init(&sc->sysctl_ctx);
159         TAILQ_INIT(&sc->all_levels);
160         CF_MTX_INIT(&sc->lock);
161         sc->curr_level.total_set.freq = CPUFREQ_VAL_UNKNOWN;
162         SLIST_INIT(&sc->saved_freq);
163         /* Try to get nominal CPU freq to use it as maximum later if needed */
164         sc->max_mhz = cpu_get_nominal_mhz(dev);
165         /* If that fails, try to measure the current rate */
166         if (sc->max_mhz <= 0) {
167                 pc = cpu_get_pcpu(dev);
168                 if (cpu_est_clockrate(pc->pc_cpuid, &rate) == 0)
169                         sc->max_mhz = rate / 1000000;
170                 else
171                         sc->max_mhz = CPUFREQ_VAL_UNKNOWN;
172         }
173
174         /*
175          * Only initialize one set of sysctls for all CPUs.  In the future,
176          * if multiple CPUs can have different settings, we can move these
177          * sysctls to be under every CPU instead of just the first one.
178          */
179         numdevs = devclass_get_count(cpufreq_dc);
180         if (numdevs > 1)
181                 return (0);
182
183         CF_DEBUG("initializing one-time data for %s\n",
184             device_get_nameunit(dev));
185         sc->levels_buf = malloc(CF_MAX_LEVELS * sizeof(*sc->levels_buf),
186             M_DEVBUF, M_WAITOK);
187         SYSCTL_ADD_PROC(&sc->sysctl_ctx,
188             SYSCTL_CHILDREN(device_get_sysctl_tree(parent)),
189             OID_AUTO, "freq", CTLTYPE_INT | CTLFLAG_RW, sc, 0,
190             cpufreq_curr_sysctl, "I", "Current CPU frequency");
191         SYSCTL_ADD_PROC(&sc->sysctl_ctx,
192             SYSCTL_CHILDREN(device_get_sysctl_tree(parent)),
193             OID_AUTO, "freq_levels", CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
194             cpufreq_levels_sysctl, "A", "CPU frequency levels");
195
196         /*
197          * Queue a one-shot broadcast that levels have changed.
198          * It will run once the system has completed booting.
199          */
200         TASK_INIT(&sc->startup_task, 0, cpufreq_startup_task, dev);
201         taskqueue_enqueue(taskqueue_thread, &sc->startup_task);
202
203         return (0);
204 }
205
206 /* Handle any work to be done for all drivers that attached during boot. */
207 static void 
208 cpufreq_startup_task(void *ctx, int pending)
209 {
210
211         cpufreq_settings_changed((device_t)ctx);
212 }
213
214 static int
215 cpufreq_detach(device_t dev)
216 {
217         struct cpufreq_softc *sc;
218         struct cf_saved_freq *saved_freq;
219         int numdevs;
220
221         CF_DEBUG("shutdown %s\n", device_get_nameunit(dev));
222         sc = device_get_softc(dev);
223         sysctl_ctx_free(&sc->sysctl_ctx);
224
225         while ((saved_freq = SLIST_FIRST(&sc->saved_freq)) != NULL) {
226                 SLIST_REMOVE_HEAD(&sc->saved_freq, link);
227                 free(saved_freq, M_TEMP);
228         }
229
230         /* Only clean up these resources when the last device is detaching. */
231         numdevs = devclass_get_count(cpufreq_dc);
232         if (numdevs == 1) {
233                 CF_DEBUG("final shutdown for %s\n", device_get_nameunit(dev));
234                 free(sc->levels_buf, M_DEVBUF);
235         }
236
237         return (0);
238 }
239
240 static int
241 cf_set_method(device_t dev, const struct cf_level *level, int priority)
242 {
243         struct cpufreq_softc *sc;
244         const struct cf_setting *set;
245         struct cf_saved_freq *saved_freq, *curr_freq;
246         struct pcpu *pc;
247         int error, i;
248
249         sc = device_get_softc(dev);
250         error = 0;
251         set = NULL;
252         saved_freq = NULL;
253
254         /* We are going to change levels so notify the pre-change handler. */
255         EVENTHANDLER_INVOKE(cpufreq_pre_change, level, &error);
256         if (error != 0) {
257                 EVENTHANDLER_INVOKE(cpufreq_post_change, level, error);
258                 return (error);
259         }
260
261         CF_MTX_LOCK(&sc->lock);
262
263 #ifdef SMP
264 #ifdef EARLY_AP_STARTUP
265         MPASS(mp_ncpus == 1 || smp_started);
266 #else
267         /*
268          * If still booting and secondary CPUs not started yet, don't allow
269          * changing the frequency until they're online.  This is because we
270          * can't switch to them using sched_bind() and thus we'd only be
271          * switching the main CPU.  XXXTODO: Need to think more about how to
272          * handle having different CPUs at different frequencies.  
273          */
274         if (mp_ncpus > 1 && !smp_started) {
275                 device_printf(dev, "rejecting change, SMP not started yet\n");
276                 error = ENXIO;
277                 goto out;
278         }
279 #endif
280 #endif /* SMP */
281
282         /*
283          * If the requested level has a lower priority, don't allow
284          * the new level right now.
285          */
286         if (priority < sc->curr_priority) {
287                 CF_DEBUG("ignoring, curr prio %d less than %d\n", priority,
288                     sc->curr_priority);
289                 error = EPERM;
290                 goto out;
291         }
292
293         /*
294          * If the caller didn't specify a level and one is saved, prepare to
295          * restore the saved level.  If none has been saved, return an error.
296          */
297         if (level == NULL) {
298                 saved_freq = SLIST_FIRST(&sc->saved_freq);
299                 if (saved_freq == NULL) {
300                         CF_DEBUG("NULL level, no saved level\n");
301                         error = ENXIO;
302                         goto out;
303                 }
304                 level = &saved_freq->level;
305                 priority = saved_freq->priority;
306                 CF_DEBUG("restoring saved level, freq %d prio %d\n",
307                     level->total_set.freq, priority);
308         }
309
310         /* Reject levels that are below our specified threshold. */
311         if (level->total_set.freq < cf_lowest_freq) {
312                 CF_DEBUG("rejecting freq %d, less than %d limit\n",
313                     level->total_set.freq, cf_lowest_freq);
314                 error = EINVAL;
315                 goto out;
316         }
317
318         /* If already at this level, just return. */
319         if (sc->curr_level.total_set.freq == level->total_set.freq) {
320                 CF_DEBUG("skipping freq %d, same as current level %d\n",
321                     level->total_set.freq, sc->curr_level.total_set.freq);
322                 goto skip;
323         }
324
325         /* First, set the absolute frequency via its driver. */
326         set = &level->abs_set;
327         if (set->dev) {
328                 if (!device_is_attached(set->dev)) {
329                         error = ENXIO;
330                         goto out;
331                 }
332
333                 /* Bind to the target CPU before switching. */
334                 pc = cpu_get_pcpu(set->dev);
335                 thread_lock(curthread);
336                 sched_bind(curthread, pc->pc_cpuid);
337                 thread_unlock(curthread);
338                 CF_DEBUG("setting abs freq %d on %s (cpu %d)\n", set->freq,
339                     device_get_nameunit(set->dev), PCPU_GET(cpuid));
340                 error = CPUFREQ_DRV_SET(set->dev, set);
341                 thread_lock(curthread);
342                 sched_unbind(curthread);
343                 thread_unlock(curthread);
344                 if (error) {
345                         goto out;
346                 }
347         }
348
349         /* Next, set any/all relative frequencies via their drivers. */
350         for (i = 0; i < level->rel_count; i++) {
351                 set = &level->rel_set[i];
352                 if (!device_is_attached(set->dev)) {
353                         error = ENXIO;
354                         goto out;
355                 }
356
357                 /* Bind to the target CPU before switching. */
358                 pc = cpu_get_pcpu(set->dev);
359                 thread_lock(curthread);
360                 sched_bind(curthread, pc->pc_cpuid);
361                 thread_unlock(curthread);
362                 CF_DEBUG("setting rel freq %d on %s (cpu %d)\n", set->freq,
363                     device_get_nameunit(set->dev), PCPU_GET(cpuid));
364                 error = CPUFREQ_DRV_SET(set->dev, set);
365                 thread_lock(curthread);
366                 sched_unbind(curthread);
367                 thread_unlock(curthread);
368                 if (error) {
369                         /* XXX Back out any successful setting? */
370                         goto out;
371                 }
372         }
373
374 skip:
375         /*
376          * Before recording the current level, check if we're going to a
377          * higher priority.  If so, save the previous level and priority.
378          */
379         if (sc->curr_level.total_set.freq != CPUFREQ_VAL_UNKNOWN &&
380             priority > sc->curr_priority) {
381                 CF_DEBUG("saving level, freq %d prio %d\n",
382                     sc->curr_level.total_set.freq, sc->curr_priority);
383                 curr_freq = malloc(sizeof(*curr_freq), M_TEMP, M_NOWAIT);
384                 if (curr_freq == NULL) {
385                         error = ENOMEM;
386                         goto out;
387                 }
388                 curr_freq->level = sc->curr_level;
389                 curr_freq->priority = sc->curr_priority;
390                 SLIST_INSERT_HEAD(&sc->saved_freq, curr_freq, link);
391         }
392         sc->curr_level = *level;
393         sc->curr_priority = priority;
394
395         /* If we were restoring a saved state, reset it to "unused". */
396         if (saved_freq != NULL) {
397                 CF_DEBUG("resetting saved level\n");
398                 sc->curr_level.total_set.freq = CPUFREQ_VAL_UNKNOWN;
399                 SLIST_REMOVE_HEAD(&sc->saved_freq, link);
400                 free(saved_freq, M_TEMP);
401         }
402
403 out:
404         CF_MTX_UNLOCK(&sc->lock);
405
406         /*
407          * We changed levels (or attempted to) so notify the post-change
408          * handler of new frequency or error.
409          */
410         EVENTHANDLER_INVOKE(cpufreq_post_change, level, error);
411         if (error && set)
412                 device_printf(set->dev, "set freq failed, err %d\n", error);
413
414         return (error);
415 }
416
417 static int
418 cf_get_method(device_t dev, struct cf_level *level)
419 {
420         struct cpufreq_softc *sc;
421         struct cf_level *levels;
422         struct cf_setting *curr_set, set;
423         struct pcpu *pc;
424         device_t *devs;
425         int bdiff, count, diff, error, i, n, numdevs;
426         uint64_t rate;
427
428         sc = device_get_softc(dev);
429         error = 0;
430         levels = NULL;
431
432         /* If we already know the current frequency, we're done. */
433         CF_MTX_LOCK(&sc->lock);
434         curr_set = &sc->curr_level.total_set;
435         if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) {
436                 CF_DEBUG("get returning known freq %d\n", curr_set->freq);
437                 goto out;
438         }
439         CF_MTX_UNLOCK(&sc->lock);
440
441         /*
442          * We need to figure out the current level.  Loop through every
443          * driver, getting the current setting.  Then, attempt to get a best
444          * match of settings against each level.
445          */
446         count = CF_MAX_LEVELS;
447         levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT);
448         if (levels == NULL)
449                 return (ENOMEM);
450         error = CPUFREQ_LEVELS(sc->dev, levels, &count);
451         if (error) {
452                 if (error == E2BIG)
453                         printf("cpufreq: need to increase CF_MAX_LEVELS\n");
454                 free(levels, M_TEMP);
455                 return (error);
456         }
457         error = device_get_children(device_get_parent(dev), &devs, &numdevs);
458         if (error) {
459                 free(levels, M_TEMP);
460                 return (error);
461         }
462
463         /*
464          * Reacquire the lock and search for the given level.
465          *
466          * XXX Note: this is not quite right since we really need to go
467          * through each level and compare both absolute and relative
468          * settings for each driver in the system before making a match.
469          * The estimation code below catches this case though.
470          */
471         CF_MTX_LOCK(&sc->lock);
472         for (n = 0; n < numdevs && curr_set->freq == CPUFREQ_VAL_UNKNOWN; n++) {
473                 if (!device_is_attached(devs[n]))
474                         continue;
475                 if (CPUFREQ_DRV_GET(devs[n], &set) != 0)
476                         continue;
477                 for (i = 0; i < count; i++) {
478                         if (set.freq == levels[i].total_set.freq) {
479                                 sc->curr_level = levels[i];
480                                 break;
481                         }
482                 }
483         }
484         free(devs, M_TEMP);
485         if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) {
486                 CF_DEBUG("get matched freq %d from drivers\n", curr_set->freq);
487                 goto out;
488         }
489
490         /*
491          * We couldn't find an exact match, so attempt to estimate and then
492          * match against a level.
493          */
494         pc = cpu_get_pcpu(dev);
495         if (pc == NULL) {
496                 error = ENXIO;
497                 goto out;
498         }
499         cpu_est_clockrate(pc->pc_cpuid, &rate);
500         rate /= 1000000;
501         bdiff = 1 << 30;
502         for (i = 0; i < count; i++) {
503                 diff = abs(levels[i].total_set.freq - rate);
504                 if (diff < bdiff) {
505                         bdiff = diff;
506                         sc->curr_level = levels[i];
507                 }
508         }
509         CF_DEBUG("get estimated freq %d\n", curr_set->freq);
510
511 out:
512         if (error == 0)
513                 *level = sc->curr_level;
514
515         CF_MTX_UNLOCK(&sc->lock);
516         if (levels)
517                 free(levels, M_TEMP);
518         return (error);
519 }
520
521 static int
522 cf_levels_method(device_t dev, struct cf_level *levels, int *count)
523 {
524         struct cf_setting_array *set_arr;
525         struct cf_setting_lst rel_sets;
526         struct cpufreq_softc *sc;
527         struct cf_level *lev;
528         struct cf_setting *sets;
529         struct pcpu *pc;
530         device_t *devs;
531         int error, i, numdevs, set_count, type;
532         uint64_t rate;
533
534         if (levels == NULL || count == NULL)
535                 return (EINVAL);
536
537         TAILQ_INIT(&rel_sets);
538         sc = device_get_softc(dev);
539         error = device_get_children(device_get_parent(dev), &devs, &numdevs);
540         if (error)
541                 return (error);
542         sets = malloc(MAX_SETTINGS * sizeof(*sets), M_TEMP, M_NOWAIT);
543         if (sets == NULL) {
544                 free(devs, M_TEMP);
545                 return (ENOMEM);
546         }
547
548         /* Get settings from all cpufreq drivers. */
549         CF_MTX_LOCK(&sc->lock);
550         for (i = 0; i < numdevs; i++) {
551                 /* Skip devices that aren't ready. */
552                 if (!device_is_attached(devs[i]))
553                         continue;
554
555                 /*
556                  * Get settings, skipping drivers that offer no settings or
557                  * provide settings for informational purposes only.
558                  */
559                 error = CPUFREQ_DRV_TYPE(devs[i], &type);
560                 if (error || (type & CPUFREQ_FLAG_INFO_ONLY)) {
561                         if (error == 0) {
562                                 CF_DEBUG("skipping info-only driver %s\n",
563                                     device_get_nameunit(devs[i]));
564                         }
565                         continue;
566                 }
567                 set_count = MAX_SETTINGS;
568                 error = CPUFREQ_DRV_SETTINGS(devs[i], sets, &set_count);
569                 if (error || set_count == 0)
570                         continue;
571
572                 /* Add the settings to our absolute/relative lists. */
573                 switch (type & CPUFREQ_TYPE_MASK) {
574                 case CPUFREQ_TYPE_ABSOLUTE:
575                         error = cpufreq_insert_abs(sc, sets, set_count);
576                         break;
577                 case CPUFREQ_TYPE_RELATIVE:
578                         CF_DEBUG("adding %d relative settings\n", set_count);
579                         set_arr = malloc(sizeof(*set_arr), M_TEMP, M_NOWAIT);
580                         if (set_arr == NULL) {
581                                 error = ENOMEM;
582                                 goto out;
583                         }
584                         bcopy(sets, set_arr->sets, set_count * sizeof(*sets));
585                         set_arr->count = set_count;
586                         TAILQ_INSERT_TAIL(&rel_sets, set_arr, link);
587                         break;
588                 default:
589                         error = EINVAL;
590                 }
591                 if (error)
592                         goto out;
593         }
594
595         /*
596          * If there are no absolute levels, create a fake one at 100%.  We
597          * then cache the clockrate for later use as our base frequency.
598          */
599         if (TAILQ_EMPTY(&sc->all_levels)) {
600                 if (sc->max_mhz == CPUFREQ_VAL_UNKNOWN) {
601                         sc->max_mhz = cpu_get_nominal_mhz(dev);
602                         /*
603                          * If the CPU can't report a rate for 100%, hope
604                          * the CPU is running at its nominal rate right now,
605                          * and use that instead.
606                          */
607                         if (sc->max_mhz <= 0) {
608                                 pc = cpu_get_pcpu(dev);
609                                 cpu_est_clockrate(pc->pc_cpuid, &rate);
610                                 sc->max_mhz = rate / 1000000;
611                         }
612                 }
613                 memset(&sets[0], CPUFREQ_VAL_UNKNOWN, sizeof(*sets));
614                 sets[0].freq = sc->max_mhz;
615                 sets[0].dev = NULL;
616                 error = cpufreq_insert_abs(sc, sets, 1);
617                 if (error)
618                         goto out;
619         }
620
621         /* Create a combined list of absolute + relative levels. */
622         TAILQ_FOREACH(set_arr, &rel_sets, link)
623                 cpufreq_expand_set(sc, set_arr);
624
625         /* If the caller doesn't have enough space, return the actual count. */
626         if (sc->all_count > *count) {
627                 *count = sc->all_count;
628                 error = E2BIG;
629                 goto out;
630         }
631
632         /* Finally, output the list of levels. */
633         i = 0;
634         TAILQ_FOREACH(lev, &sc->all_levels, link) {
635
636                 /* Skip levels that have a frequency that is too low. */
637                 if (lev->total_set.freq < cf_lowest_freq) {
638                         sc->all_count--;
639                         continue;
640                 }
641
642                 levels[i] = *lev;
643                 i++;
644         }
645         *count = sc->all_count;
646         error = 0;
647
648 out:
649         /* Clear all levels since we regenerate them each time. */
650         while ((lev = TAILQ_FIRST(&sc->all_levels)) != NULL) {
651                 TAILQ_REMOVE(&sc->all_levels, lev, link);
652                 free(lev, M_TEMP);
653         }
654         sc->all_count = 0;
655
656         CF_MTX_UNLOCK(&sc->lock);
657         while ((set_arr = TAILQ_FIRST(&rel_sets)) != NULL) {
658                 TAILQ_REMOVE(&rel_sets, set_arr, link);
659                 free(set_arr, M_TEMP);
660         }
661         free(devs, M_TEMP);
662         free(sets, M_TEMP);
663         return (error);
664 }
665
666 /*
667  * Create levels for an array of absolute settings and insert them in
668  * sorted order in the specified list.
669  */
670 static int
671 cpufreq_insert_abs(struct cpufreq_softc *sc, struct cf_setting *sets,
672     int count)
673 {
674         struct cf_level_lst *list;
675         struct cf_level *level, *search;
676         int i;
677
678         CF_MTX_ASSERT(&sc->lock);
679
680         list = &sc->all_levels;
681         for (i = 0; i < count; i++) {
682                 level = malloc(sizeof(*level), M_TEMP, M_NOWAIT | M_ZERO);
683                 if (level == NULL)
684                         return (ENOMEM);
685                 level->abs_set = sets[i];
686                 level->total_set = sets[i];
687                 level->total_set.dev = NULL;
688                 sc->all_count++;
689
690                 if (TAILQ_EMPTY(list)) {
691                         CF_DEBUG("adding abs setting %d at head\n",
692                             sets[i].freq);
693                         TAILQ_INSERT_HEAD(list, level, link);
694                         continue;
695                 }
696
697                 TAILQ_FOREACH_REVERSE(search, list, cf_level_lst, link) {
698                         if (sets[i].freq <= search->total_set.freq) {
699                                 CF_DEBUG("adding abs setting %d after %d\n",
700                                     sets[i].freq, search->total_set.freq);
701                                 TAILQ_INSERT_AFTER(list, search, level, link);
702                                 break;
703                         }
704                 }
705         }
706         return (0);
707 }
708
709 /*
710  * Expand a group of relative settings, creating derived levels from them.
711  */
712 static int
713 cpufreq_expand_set(struct cpufreq_softc *sc, struct cf_setting_array *set_arr)
714 {
715         struct cf_level *fill, *search;
716         struct cf_setting *set;
717         int i;
718
719         CF_MTX_ASSERT(&sc->lock);
720
721         /*
722          * Walk the set of all existing levels in reverse.  This is so we
723          * create derived states from the lowest absolute settings first
724          * and discard duplicates created from higher absolute settings.
725          * For instance, a level of 50 Mhz derived from 100 Mhz + 50% is
726          * preferable to 200 Mhz + 25% because absolute settings are more
727          * efficient since they often change the voltage as well.
728          */
729         TAILQ_FOREACH_REVERSE(search, &sc->all_levels, cf_level_lst, link) {
730                 /* Add each setting to the level, duplicating if necessary. */
731                 for (i = 0; i < set_arr->count; i++) {
732                         set = &set_arr->sets[i];
733
734                         /*
735                          * If this setting is less than 100%, split the level
736                          * into two and add this setting to the new level.
737                          */
738                         fill = search;
739                         if (set->freq < 10000) {
740                                 fill = cpufreq_dup_set(sc, search, set);
741
742                                 /*
743                                  * The new level was a duplicate of an existing
744                                  * level or its absolute setting is too high
745                                  * so we freed it.  For example, we discard a
746                                  * derived level of 1000 MHz/25% if a level
747                                  * of 500 MHz/100% already exists.
748                                  */
749                                 if (fill == NULL)
750                                         break;
751                         }
752
753                         /* Add this setting to the existing or new level. */
754                         KASSERT(fill->rel_count < MAX_SETTINGS,
755                             ("cpufreq: too many relative drivers (%d)",
756                             MAX_SETTINGS));
757                         fill->rel_set[fill->rel_count] = *set;
758                         fill->rel_count++;
759                         CF_DEBUG(
760                         "expand set added rel setting %d%% to %d level\n",
761                             set->freq / 100, fill->total_set.freq);
762                 }
763         }
764
765         return (0);
766 }
767
768 static struct cf_level *
769 cpufreq_dup_set(struct cpufreq_softc *sc, struct cf_level *dup,
770     struct cf_setting *set)
771 {
772         struct cf_level_lst *list;
773         struct cf_level *fill, *itr;
774         struct cf_setting *fill_set, *itr_set;
775         int i;
776
777         CF_MTX_ASSERT(&sc->lock);
778
779         /*
780          * Create a new level, copy it from the old one, and update the
781          * total frequency and power by the percentage specified in the
782          * relative setting.
783          */
784         fill = malloc(sizeof(*fill), M_TEMP, M_NOWAIT);
785         if (fill == NULL)
786                 return (NULL);
787         *fill = *dup;
788         fill_set = &fill->total_set;
789         fill_set->freq =
790             ((uint64_t)fill_set->freq * set->freq) / 10000;
791         if (fill_set->power != CPUFREQ_VAL_UNKNOWN) {
792                 fill_set->power = ((uint64_t)fill_set->power * set->freq)
793                     / 10000;
794         }
795         if (set->lat != CPUFREQ_VAL_UNKNOWN) {
796                 if (fill_set->lat != CPUFREQ_VAL_UNKNOWN)
797                         fill_set->lat += set->lat;
798                 else
799                         fill_set->lat = set->lat;
800         }
801         CF_DEBUG("dup set considering derived setting %d\n", fill_set->freq);
802
803         /*
804          * If we copied an old level that we already modified (say, at 100%),
805          * we need to remove that setting before adding this one.  Since we
806          * process each setting array in order, we know any settings for this
807          * driver will be found at the end.
808          */
809         for (i = fill->rel_count; i != 0; i--) {
810                 if (fill->rel_set[i - 1].dev != set->dev)
811                         break;
812                 CF_DEBUG("removed last relative driver: %s\n",
813                     device_get_nameunit(set->dev));
814                 fill->rel_count--;
815         }
816
817         /*
818          * Insert the new level in sorted order.  If it is a duplicate of an
819          * existing level (1) or has an absolute setting higher than the
820          * existing level (2), do not add it.  We can do this since any such
821          * level is guaranteed use less power.  For example (1), a level with
822          * one absolute setting of 800 Mhz uses less power than one composed
823          * of an absolute setting of 1600 Mhz and a relative setting at 50%.
824          * Also for example (2), a level of 800 Mhz/75% is preferable to
825          * 1600 Mhz/25% even though the latter has a lower total frequency.
826          */
827         list = &sc->all_levels;
828         KASSERT(!TAILQ_EMPTY(list), ("all levels list empty in dup set"));
829         TAILQ_FOREACH_REVERSE(itr, list, cf_level_lst, link) {
830                 itr_set = &itr->total_set;
831                 if (CPUFREQ_CMP(fill_set->freq, itr_set->freq)) {
832                         CF_DEBUG("dup set rejecting %d (dupe)\n",
833                             fill_set->freq);
834                         itr = NULL;
835                         break;
836                 } else if (fill_set->freq < itr_set->freq) {
837                         if (fill->abs_set.freq <= itr->abs_set.freq) {
838                                 CF_DEBUG(
839                         "dup done, inserting new level %d after %d\n",
840                                     fill_set->freq, itr_set->freq);
841                                 TAILQ_INSERT_AFTER(list, itr, fill, link);
842                                 sc->all_count++;
843                         } else {
844                                 CF_DEBUG("dup set rejecting %d (abs too big)\n",
845                                     fill_set->freq);
846                                 itr = NULL;
847                         }
848                         break;
849                 }
850         }
851
852         /* We didn't find a good place for this new level so free it. */
853         if (itr == NULL) {
854                 CF_DEBUG("dup set freeing new level %d (not optimal)\n",
855                     fill_set->freq);
856                 free(fill, M_TEMP);
857                 fill = NULL;
858         }
859
860         return (fill);
861 }
862
863 static int
864 cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS)
865 {
866         struct cpufreq_softc *sc;
867         struct cf_level *levels;
868         int best, count, diff, bdiff, devcount, error, freq, i, n;
869         device_t *devs;
870
871         devs = NULL;
872         sc = oidp->oid_arg1;
873         levels = sc->levels_buf;
874
875         error = CPUFREQ_GET(sc->dev, &levels[0]);
876         if (error)
877                 goto out;
878         freq = levels[0].total_set.freq;
879         error = sysctl_handle_int(oidp, &freq, 0, req);
880         if (error != 0 || req->newptr == NULL)
881                 goto out;
882
883         /*
884          * While we only call cpufreq_get() on one device (assuming all
885          * CPUs have equal levels), we call cpufreq_set() on all CPUs.
886          * This is needed for some MP systems.
887          */
888         error = devclass_get_devices(cpufreq_dc, &devs, &devcount);
889         if (error)
890                 goto out;
891         for (n = 0; n < devcount; n++) {
892                 count = CF_MAX_LEVELS;
893                 error = CPUFREQ_LEVELS(devs[n], levels, &count);
894                 if (error) {
895                         if (error == E2BIG)
896                                 printf(
897                         "cpufreq: need to increase CF_MAX_LEVELS\n");
898                         break;
899                 }
900                 best = 0;
901                 bdiff = 1 << 30;
902                 for (i = 0; i < count; i++) {
903                         diff = abs(levels[i].total_set.freq - freq);
904                         if (diff < bdiff) {
905                                 bdiff = diff;
906                                 best = i;
907                         }
908                 }
909                 error = CPUFREQ_SET(devs[n], &levels[best], CPUFREQ_PRIO_USER);
910         }
911
912 out:
913         if (devs)
914                 free(devs, M_TEMP);
915         return (error);
916 }
917
918 static int
919 cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS)
920 {
921         struct cpufreq_softc *sc;
922         struct cf_level *levels;
923         struct cf_setting *set;
924         struct sbuf sb;
925         int count, error, i;
926
927         sc = oidp->oid_arg1;
928         sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
929
930         /* Get settings from the device and generate the output string. */
931         count = CF_MAX_LEVELS;
932         levels = sc->levels_buf;
933         if (levels == NULL) {
934                 sbuf_delete(&sb);
935                 return (ENOMEM);
936         }
937         error = CPUFREQ_LEVELS(sc->dev, levels, &count);
938         if (error) {
939                 if (error == E2BIG)
940                         printf("cpufreq: need to increase CF_MAX_LEVELS\n");
941                 goto out;
942         }
943         if (count) {
944                 for (i = 0; i < count; i++) {
945                         set = &levels[i].total_set;
946                         sbuf_printf(&sb, "%d/%d ", set->freq, set->power);
947                 }
948         } else
949                 sbuf_cpy(&sb, "0");
950         sbuf_trim(&sb);
951         sbuf_finish(&sb);
952         error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
953
954 out:
955         sbuf_delete(&sb);
956         return (error);
957 }
958
959 static int
960 cpufreq_settings_sysctl(SYSCTL_HANDLER_ARGS)
961 {
962         device_t dev;
963         struct cf_setting *sets;
964         struct sbuf sb;
965         int error, i, set_count;
966
967         dev = oidp->oid_arg1;
968         sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
969
970         /* Get settings from the device and generate the output string. */
971         set_count = MAX_SETTINGS;
972         sets = malloc(set_count * sizeof(*sets), M_TEMP, M_NOWAIT);
973         if (sets == NULL) {
974                 sbuf_delete(&sb);
975                 return (ENOMEM);
976         }
977         error = CPUFREQ_DRV_SETTINGS(dev, sets, &set_count);
978         if (error)
979                 goto out;
980         if (set_count) {
981                 for (i = 0; i < set_count; i++)
982                         sbuf_printf(&sb, "%d/%d ", sets[i].freq, sets[i].power);
983         } else
984                 sbuf_cpy(&sb, "0");
985         sbuf_trim(&sb);
986         sbuf_finish(&sb);
987         error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
988
989 out:
990         free(sets, M_TEMP);
991         sbuf_delete(&sb);
992         return (error);
993 }
994
995 int
996 cpufreq_register(device_t dev)
997 {
998         struct cpufreq_softc *sc;
999         device_t cf_dev, cpu_dev;
1000
1001         /* Add a sysctl to get each driver's settings separately. */
1002         SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
1003             SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
1004             OID_AUTO, "freq_settings", CTLTYPE_STRING | CTLFLAG_RD, dev, 0,
1005             cpufreq_settings_sysctl, "A", "CPU frequency driver settings");
1006
1007         /*
1008          * Add only one cpufreq device to each CPU.  Currently, all CPUs
1009          * must offer the same levels and be switched at the same time.
1010          */
1011         cpu_dev = device_get_parent(dev);
1012         if ((cf_dev = device_find_child(cpu_dev, "cpufreq", -1))) {
1013                 sc = device_get_softc(cf_dev);
1014                 sc->max_mhz = CPUFREQ_VAL_UNKNOWN;
1015                 return (0);
1016         }
1017
1018         /* Add the child device and possibly sysctls. */
1019         cf_dev = BUS_ADD_CHILD(cpu_dev, 0, "cpufreq", -1);
1020         if (cf_dev == NULL)
1021                 return (ENOMEM);
1022         device_quiet(cf_dev);
1023
1024         return (device_probe_and_attach(cf_dev));
1025 }
1026
1027 int
1028 cpufreq_unregister(device_t dev)
1029 {
1030         device_t cf_dev, *devs;
1031         int cfcount, devcount, error, i, type;
1032
1033         /*
1034          * If this is the last cpufreq child device, remove the control
1035          * device as well.  We identify cpufreq children by calling a method
1036          * they support.
1037          */
1038         error = device_get_children(device_get_parent(dev), &devs, &devcount);
1039         if (error)
1040                 return (error);
1041         cf_dev = device_find_child(device_get_parent(dev), "cpufreq", -1);
1042         if (cf_dev == NULL) {
1043                 device_printf(dev,
1044         "warning: cpufreq_unregister called with no cpufreq device active\n");
1045                 free(devs, M_TEMP);
1046                 return (0);
1047         }
1048         cfcount = 0;
1049         for (i = 0; i < devcount; i++) {
1050                 if (!device_is_attached(devs[i]))
1051                         continue;
1052                 if (CPUFREQ_DRV_TYPE(devs[i], &type) == 0)
1053                         cfcount++;
1054         }
1055         if (cfcount <= 1)
1056                 device_delete_child(device_get_parent(cf_dev), cf_dev);
1057         free(devs, M_TEMP);
1058
1059         return (0);
1060 }
1061
1062 int
1063 cpufreq_settings_changed(device_t dev)
1064 {
1065
1066         EVENTHANDLER_INVOKE(cpufreq_levels_changed,
1067             device_get_unit(device_get_parent(dev)));
1068         return (0);
1069 }