CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/kern/kern_cpu.c
This commit was generated by cvs2svn to compensate for changes in r147455,
[FreeBSD/FreeBSD.git] / sys / kern / kern_cpu.c
1 /*-
2  * Copyright (c) 2004-2005 Nate Lawson (SDG)
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 #include <sys/param.h>
31 #include <sys/bus.h>
32 #include <sys/cpu.h>
33 #include <sys/eventhandler.h>
34 #include <sys/kernel.h>
35 #include <sys/lock.h>
36 #include <sys/malloc.h>
37 #include <sys/module.h>
38 #include <sys/proc.h>
39 #include <sys/queue.h>
40 #include <sys/sched.h>
41 #include <sys/sysctl.h>
42 #include <sys/systm.h>
43 #include <sys/sbuf.h>
44 #include <sys/sx.h>
45 #include <sys/timetc.h>
46
47 #include "cpufreq_if.h"
48
49 /*
50  * Common CPU frequency glue code.  Drivers for specific hardware can
51  * attach this interface to allow users to get/set the CPU frequency.
52  */
53
54 /*
55  * Number of levels we can handle.  Levels are synthesized from settings
56  * so for M settings and N drivers, there may be M*N levels.
57  */
58 #define CF_MAX_LEVELS   64
59
60 struct cpufreq_softc {
61         struct sx                       lock;
62         struct cf_level                 curr_level;
63         int                             curr_priority;
64         struct cf_level                 saved_level;
65         int                             saved_priority;
66         struct cf_level_lst             all_levels;
67         int                             all_count;
68         int                             max_mhz;
69         device_t                        dev;
70         struct sysctl_ctx_list          sysctl_ctx;
71 };
72
73 struct cf_setting_array {
74         struct cf_setting               sets[MAX_SETTINGS];
75         int                             count;
76         TAILQ_ENTRY(cf_setting_array)   link;
77 };
78
79 TAILQ_HEAD(cf_setting_lst, cf_setting_array);
80
/* Locking helpers for the softc sx lock; it is only ever held exclusively. */
#define CF_MTX_INIT(x)          sx_init((x), "cpufreq lock")
#define CF_MTX_LOCK(x)          sx_xlock((x))
#define CF_MTX_UNLOCK(x)        sx_xunlock((x))
#define CF_MTX_ASSERT(x)        sx_assert((x), SX_XLOCKED)

/*
 * printf-style debug output, prefixed with "cpufreq: " and emitted only
 * while cf_verbose is non-zero.  The first argument must be a string
 * literal because it is pasted onto the prefix.
 */
#define CF_DEBUG(...)           do {                    \
        if (cf_verbose)                                 \
                printf("cpufreq: " __VA_ARGS__);        \
} while (0)
90
91 static int      cpufreq_attach(device_t dev);
92 static int      cpufreq_detach(device_t dev);
93 static void     cpufreq_evaluate(void *arg);
94 static int      cf_set_method(device_t dev, const struct cf_level *level,
95                     int priority);
96 static int      cf_get_method(device_t dev, struct cf_level *level);
97 static int      cf_levels_method(device_t dev, struct cf_level *levels,
98                     int *count);
99 static int      cpufreq_insert_abs(struct cpufreq_softc *sc,
100                     struct cf_setting *sets, int count);
101 static int      cpufreq_expand_set(struct cpufreq_softc *sc,
102                     struct cf_setting_array *set_arr);
103 static struct cf_level *cpufreq_dup_set(struct cpufreq_softc *sc,
104                     struct cf_level *dup, struct cf_setting *set);
105 static int      cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS);
106 static int      cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS);
107 static int      cpufreq_settings_sysctl(SYSCTL_HANDLER_ARGS);
108
109 static device_method_t cpufreq_methods[] = {
110         DEVMETHOD(device_probe,         bus_generic_probe),
111         DEVMETHOD(device_attach,        cpufreq_attach),
112         DEVMETHOD(device_detach,        cpufreq_detach),
113
114         DEVMETHOD(cpufreq_set,          cf_set_method),
115         DEVMETHOD(cpufreq_get,          cf_get_method),
116         DEVMETHOD(cpufreq_levels,       cf_levels_method),
117         {0, 0}
118 };
119 static driver_t cpufreq_driver = {
120         "cpufreq", cpufreq_methods, sizeof(struct cpufreq_softc)
121 };
122 static devclass_t cpufreq_dc;
123 DRIVER_MODULE(cpufreq, cpu, cpufreq_driver, cpufreq_dc, 0, 0);
124
125 static eventhandler_tag cf_ev_tag;
126
127 static int              cf_lowest_freq;
128 static int              cf_verbose;
129 TUNABLE_INT("debug.cpufreq.lowest", &cf_lowest_freq);
130 TUNABLE_INT("debug.cpufreq.verbose", &cf_verbose);
131 SYSCTL_NODE(_debug, OID_AUTO, cpufreq, CTLFLAG_RD, NULL, "cpufreq debugging");
132 SYSCTL_INT(_debug_cpufreq, OID_AUTO, lowest, CTLFLAG_RW, &cf_lowest_freq, 1,
133     "Don't provide levels below this frequency.");
134 SYSCTL_INT(_debug_cpufreq, OID_AUTO, verbose, CTLFLAG_RW, &cf_verbose, 1,
135     "Print verbose debugging messages");
136
137 static int
138 cpufreq_attach(device_t dev)
139 {
140         struct cpufreq_softc *sc;
141         device_t parent;
142         int numdevs;
143
144         CF_DEBUG("initializing %s\n", device_get_nameunit(dev));
145         sc = device_get_softc(dev);
146         parent = device_get_parent(dev);
147         sc->dev = dev;
148         sysctl_ctx_init(&sc->sysctl_ctx);
149         TAILQ_INIT(&sc->all_levels);
150         CF_MTX_INIT(&sc->lock);
151         sc->curr_level.total_set.freq = CPUFREQ_VAL_UNKNOWN;
152         sc->saved_level.total_set.freq = CPUFREQ_VAL_UNKNOWN;
153         sc->max_mhz = CPUFREQ_VAL_UNKNOWN;
154
155         /*
156          * Only initialize one set of sysctls for all CPUs.  In the future,
157          * if multiple CPUs can have different settings, we can move these
158          * sysctls to be under every CPU instead of just the first one.
159          */
160         numdevs = devclass_get_count(cpufreq_dc);
161         if (numdevs > 1)
162                 return (0);
163
164         CF_DEBUG("initializing one-time data for %s\n",
165             device_get_nameunit(dev));
166         SYSCTL_ADD_PROC(&sc->sysctl_ctx,
167             SYSCTL_CHILDREN(device_get_sysctl_tree(parent)),
168             OID_AUTO, "freq", CTLTYPE_INT | CTLFLAG_RW, sc, 0,
169             cpufreq_curr_sysctl, "I", "Current CPU frequency");
170         SYSCTL_ADD_PROC(&sc->sysctl_ctx,
171             SYSCTL_CHILDREN(device_get_sysctl_tree(parent)),
172             OID_AUTO, "freq_levels", CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
173             cpufreq_levels_sysctl, "A", "CPU frequency levels");
174         cf_ev_tag = EVENTHANDLER_REGISTER(cpufreq_changed, cpufreq_evaluate,
175             NULL, EVENTHANDLER_PRI_ANY);
176
177         return (0);
178 }
179
180 static int
181 cpufreq_detach(device_t dev)
182 {
183         struct cpufreq_softc *sc;
184         int numdevs;
185
186         CF_DEBUG("shutdown %s\n", device_get_nameunit(dev));
187         sc = device_get_softc(dev);
188         sysctl_ctx_free(&sc->sysctl_ctx);
189
190         /* Only clean up these resources when the last device is detaching. */
191         numdevs = devclass_get_count(cpufreq_dc);
192         if (numdevs == 1) {
193                 CF_DEBUG("final shutdown for %s\n", device_get_nameunit(dev));
194                 EVENTHANDLER_DEREGISTER(cpufreq_changed, cf_ev_tag);
195         }
196
197         return (0);
198 }
199
/*
 * Handler for the cpufreq_changed event.  Currently a placeholder: it is
 * registered by cpufreq_attach() with a NULL argument and does nothing.
 */
static void
cpufreq_evaluate(void *arg)
{

        /* TODO: Re-evaluate when notified of changes to drivers. */
}
205
206 static int
207 cf_set_method(device_t dev, const struct cf_level *level, int priority)
208 {
209         struct cpufreq_softc *sc;
210         const struct cf_setting *set;
211         struct pcpu *pc;
212         int cpu_id, error, i;
213
214         sc = device_get_softc(dev);
215         error = 0;
216         set = NULL;
217
218         /*
219          * Check that the TSC isn't being used as a timecounter.
220          * If it is, then return EBUSY and refuse to change the
221          * clock speed.
222          */
223         if (strcmp(timecounter->tc_name, "TSC") == 0)
224                 return (EBUSY);
225
226         /*
227          * If the caller didn't specify a level and one is saved, prepare to
228          * restore the saved level.  If none has been saved, return an error.
229          * If they did specify one, but the requested level has a lower
230          * priority, don't allow the new level right now.
231          */
232         CF_MTX_LOCK(&sc->lock);
233         if (level == NULL) {
234                 if (sc->saved_level.total_set.freq != CPUFREQ_VAL_UNKNOWN) {
235                         level = &sc->saved_level;
236                         priority = sc->saved_priority;
237                         CF_DEBUG("restoring saved level, freq %d prio %d\n",
238                             level->total_set.freq, priority);
239                 } else {
240                         CF_DEBUG("NULL level, no saved level\n");
241                         error = ENXIO;
242                         goto out;
243                 }
244         } else if (priority < sc->curr_priority) {
245                 CF_DEBUG("ignoring, curr prio %d less than %d\n", priority,
246                     sc->curr_priority);
247                 error = EPERM;
248                 goto out;
249         }
250
251         /* Reject levels that are below our specified threshold. */
252         if (level->total_set.freq <= cf_lowest_freq) {
253                 CF_DEBUG("rejecting freq %d, less than %d limit\n",
254                     level->total_set.freq, cf_lowest_freq);
255                 error = EINVAL;
256                 goto out;
257         }
258
259         /* If already at this level, just return. */
260         if (CPUFREQ_CMP(sc->curr_level.total_set.freq, level->total_set.freq)) {
261                 CF_DEBUG("skipping freq %d, same as current level %d\n",
262                     level->total_set.freq, sc->curr_level.total_set.freq);
263                 goto out;
264         }
265
266         /* First, set the absolute frequency via its driver. */
267         set = &level->abs_set;
268         if (set->dev) {
269                 if (!device_is_attached(set->dev)) {
270                         error = ENXIO;
271                         goto out;
272                 }
273
274                 /* Bind to the target CPU before switching, if necessary. */
275                 cpu_id = PCPU_GET(cpuid);
276                 pc = cpu_get_pcpu(set->dev);
277                 if (cpu_id != pc->pc_cpuid) {
278                         mtx_lock_spin(&sched_lock);
279                         sched_bind(curthread, pc->pc_cpuid);
280                         mtx_unlock_spin(&sched_lock);
281                 }
282                 CF_DEBUG("setting abs freq %d on %s (cpu %d)\n", set->freq,
283                     device_get_nameunit(set->dev), PCPU_GET(cpuid));
284                 error = CPUFREQ_DRV_SET(set->dev, set);
285                 if (cpu_id != pc->pc_cpuid) {
286                         mtx_lock_spin(&sched_lock);
287                         sched_unbind(curthread);
288                         mtx_unlock_spin(&sched_lock);
289                 }
290                 if (error) {
291                         goto out;
292                 }
293         }
294
295         /* Next, set any/all relative frequencies via their drivers. */
296         for (i = 0; i < level->rel_count; i++) {
297                 set = &level->rel_set[i];
298                 if (!device_is_attached(set->dev)) {
299                         error = ENXIO;
300                         goto out;
301                 }
302
303                 /* Bind to the target CPU before switching, if necessary. */
304                 cpu_id = PCPU_GET(cpuid);
305                 pc = cpu_get_pcpu(set->dev);
306                 if (cpu_id != pc->pc_cpuid) {
307                         mtx_lock_spin(&sched_lock);
308                         sched_bind(curthread, pc->pc_cpuid);
309                         mtx_unlock_spin(&sched_lock);
310                 }
311                 CF_DEBUG("setting rel freq %d on %s (cpu %d)\n", set->freq,
312                     device_get_nameunit(set->dev), PCPU_GET(cpuid));
313                 error = CPUFREQ_DRV_SET(set->dev, set);
314                 if (cpu_id != pc->pc_cpuid) {
315                         mtx_lock_spin(&sched_lock);
316                         sched_unbind(curthread);
317                         mtx_unlock_spin(&sched_lock);
318                 }
319                 if (error) {
320                         /* XXX Back out any successful setting? */
321                         goto out;
322                 }
323         }
324
325         /* If we were restoring a saved state, reset it to "unused". */
326         if (level == &sc->saved_level) {
327                 CF_DEBUG("resetting saved level\n");
328                 sc->saved_level.total_set.freq = CPUFREQ_VAL_UNKNOWN;
329                 sc->saved_priority = 0;
330         }
331
332         /*
333          * Before recording the current level, check if we're going to a
334          * higher priority and have not saved a level yet.  If so, save the
335          * previous level and priority.
336          */
337         if (sc->curr_level.total_set.freq != CPUFREQ_VAL_UNKNOWN &&
338             sc->saved_level.total_set.freq == CPUFREQ_VAL_UNKNOWN &&
339             priority > sc->curr_priority) {
340                 CF_DEBUG("saving level, freq %d prio %d\n",
341                     sc->curr_level.total_set.freq, sc->curr_priority);
342                 sc->saved_level = sc->curr_level;
343                 sc->saved_priority = sc->curr_priority;
344         }
345         sc->curr_level = *level;
346         sc->curr_priority = priority;
347         error = 0;
348
349 out:
350         CF_MTX_UNLOCK(&sc->lock);
351         if (error && set)
352                 device_printf(set->dev, "set freq failed, err %d\n", error);
353         return (error);
354 }
355
356 static int
357 cf_get_method(device_t dev, struct cf_level *level)
358 {
359         struct cpufreq_softc *sc;
360         struct cf_level *levels;
361         struct cf_setting *curr_set, set;
362         struct pcpu *pc;
363         device_t *devs;
364         int count, error, i, numdevs;
365         uint64_t rate;
366
367         sc = device_get_softc(dev);
368         error = 0;
369         levels = NULL;
370
371         /* If we already know the current frequency, we're done. */
372         CF_MTX_LOCK(&sc->lock);
373         curr_set = &sc->curr_level.total_set;
374         if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) {
375                 CF_DEBUG("get returning known freq %d\n", curr_set->freq);
376                 goto out;
377         }
378         CF_MTX_UNLOCK(&sc->lock);
379
380         /*
381          * We need to figure out the current level.  Loop through every
382          * driver, getting the current setting.  Then, attempt to get a best
383          * match of settings against each level.
384          */
385         count = CF_MAX_LEVELS;
386         levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT);
387         if (levels == NULL)
388                 return (ENOMEM);
389         error = CPUFREQ_LEVELS(sc->dev, levels, &count);
390         if (error) {
391                 if (error == E2BIG)
392                         printf("cpufreq: need to increase CF_MAX_LEVELS\n");
393                 free(levels, M_TEMP);
394                 return (error);
395         }
396         error = device_get_children(device_get_parent(dev), &devs, &numdevs);
397         if (error) {
398                 free(levels, M_TEMP);
399                 return (error);
400         }
401
402         /*
403          * Reacquire the lock and search for the given level.
404          *
405          * XXX Note: this is not quite right since we really need to go
406          * through each level and compare both absolute and relative
407          * settings for each driver in the system before making a match.
408          * The estimation code below catches this case though.
409          */
410         CF_MTX_LOCK(&sc->lock);
411         for (i = 0; i < numdevs && curr_set->freq == CPUFREQ_VAL_UNKNOWN; i++) {
412                 if (!device_is_attached(devs[i]))
413                         continue;
414                 error = CPUFREQ_DRV_GET(devs[i], &set);
415                 if (error)
416                         continue;
417                 for (i = 0; i < count; i++) {
418                         if (CPUFREQ_CMP(set.freq, levels[i].total_set.freq)) {
419                                 sc->curr_level = levels[i];
420                                 break;
421                         }
422                 }
423         }
424         free(devs, M_TEMP);
425         if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) {
426                 CF_DEBUG("get matched freq %d from drivers\n", curr_set->freq);
427                 goto out;
428         }
429
430         /*
431          * We couldn't find an exact match, so attempt to estimate and then
432          * match against a level.
433          */
434         pc = cpu_get_pcpu(dev);
435         if (pc == NULL) {
436                 error = ENXIO;
437                 goto out;
438         }
439         cpu_est_clockrate(pc->pc_cpuid, &rate);
440         rate /= 1000000;
441         for (i = 0; i < count; i++) {
442                 if (CPUFREQ_CMP(rate, levels[i].total_set.freq)) {
443                         sc->curr_level = levels[i];
444                         CF_DEBUG("get estimated freq %d\n", curr_set->freq);
445                         break;
446                 }
447         }
448
449 out:
450         if (error == 0)
451                 *level = sc->curr_level;
452
453         CF_MTX_UNLOCK(&sc->lock);
454         if (levels)
455                 free(levels, M_TEMP);
456         return (error);
457 }
458
459 static int
460 cf_levels_method(device_t dev, struct cf_level *levels, int *count)
461 {
462         struct cf_setting_array *set_arr;
463         struct cf_setting_lst rel_sets;
464         struct cpufreq_softc *sc;
465         struct cf_level *lev;
466         struct cf_setting *sets;
467         struct pcpu *pc;
468         device_t *devs;
469         int error, i, numdevs, set_count, type;
470         uint64_t rate;
471
472         if (levels == NULL || count == NULL)
473                 return (EINVAL);
474
475         TAILQ_INIT(&rel_sets);
476         sc = device_get_softc(dev);
477         error = device_get_children(device_get_parent(dev), &devs, &numdevs);
478         if (error)
479                 return (error);
480         sets = malloc(MAX_SETTINGS * sizeof(*sets), M_TEMP, M_NOWAIT);
481         if (sets == NULL) {
482                 free(devs, M_TEMP);
483                 return (ENOMEM);
484         }
485
486         /* Get settings from all cpufreq drivers. */
487         CF_MTX_LOCK(&sc->lock);
488         for (i = 0; i < numdevs; i++) {
489                 /* Skip devices that aren't ready. */
490                 if (!device_is_attached(devs[i]))
491                         continue;
492
493                 /*
494                  * Get settings, skipping drivers that offer no settings or
495                  * provide settings for informational purposes only.
496                  */
497                 error = CPUFREQ_DRV_TYPE(devs[i], &type);
498                 if (error || (type & CPUFREQ_FLAG_INFO_ONLY)) {
499                         if (error == 0) {
500                                 CF_DEBUG("skipping info-only driver %s\n",
501                                     device_get_nameunit(devs[i]));
502                         }
503                         continue;
504                 }
505                 set_count = MAX_SETTINGS;
506                 error = CPUFREQ_DRV_SETTINGS(devs[i], sets, &set_count);
507                 if (error || set_count == 0)
508                         continue;
509
510                 /* Add the settings to our absolute/relative lists. */
511                 switch (type & CPUFREQ_TYPE_MASK) {
512                 case CPUFREQ_TYPE_ABSOLUTE:
513                         error = cpufreq_insert_abs(sc, sets, set_count);
514                         break;
515                 case CPUFREQ_TYPE_RELATIVE:
516                         CF_DEBUG("adding %d relative settings\n", set_count);
517                         set_arr = malloc(sizeof(*set_arr), M_TEMP, M_NOWAIT);
518                         if (set_arr == NULL) {
519                                 error = ENOMEM;
520                                 goto out;
521                         }
522                         bcopy(sets, set_arr->sets, set_count * sizeof(*sets));
523                         set_arr->count = set_count;
524                         TAILQ_INSERT_TAIL(&rel_sets, set_arr, link);
525                         break;
526                 default:
527                         error = EINVAL;
528                 }
529                 if (error)
530                         goto out;
531         }
532
533         /*
534          * If there are no absolute levels, create a fake one at 100%.  We
535          * then cache the clockrate for later use as our base frequency.
536          *
537          * XXX This assumes that the first time through, if we only have
538          * relative drivers, the CPU is currently running at 100%.
539          */
540         if (TAILQ_EMPTY(&sc->all_levels)) {
541                 if (sc->max_mhz == CPUFREQ_VAL_UNKNOWN) {
542                         pc = cpu_get_pcpu(dev);
543                         cpu_est_clockrate(pc->pc_cpuid, &rate);
544                         sc->max_mhz = rate / 1000000;
545                 }
546                 memset(&sets[0], CPUFREQ_VAL_UNKNOWN, sizeof(*sets));
547                 sets[0].freq = sc->max_mhz;
548                 sets[0].dev = NULL;
549                 error = cpufreq_insert_abs(sc, sets, 1);
550                 if (error)
551                         goto out;
552         }
553
554         /* Create a combined list of absolute + relative levels. */
555         TAILQ_FOREACH(set_arr, &rel_sets, link)
556                 cpufreq_expand_set(sc, set_arr);
557
558         /* If the caller doesn't have enough space, return the actual count. */
559         if (sc->all_count > *count) {
560                 *count = sc->all_count;
561                 error = E2BIG;
562                 goto out;
563         }
564
565         /* Finally, output the list of levels. */
566         i = 0;
567         TAILQ_FOREACH(lev, &sc->all_levels, link) {
568                 /* Skip levels that have a frequency that is too low. */
569                 if (lev->total_set.freq <= cf_lowest_freq) {
570                         sc->all_count--;
571                         continue;
572                 }
573
574                 levels[i] = *lev;
575                 i++;
576         }
577         *count = sc->all_count;
578         error = 0;
579
580 out:
581         /* Clear all levels since we regenerate them each time. */
582         while ((lev = TAILQ_FIRST(&sc->all_levels)) != NULL) {
583                 TAILQ_REMOVE(&sc->all_levels, lev, link);
584                 free(lev, M_TEMP);
585         }
586         sc->all_count = 0;
587
588         CF_MTX_UNLOCK(&sc->lock);
589         while ((set_arr = TAILQ_FIRST(&rel_sets)) != NULL) {
590                 TAILQ_REMOVE(&rel_sets, set_arr, link);
591                 free(set_arr, M_TEMP);
592         }
593         free(devs, M_TEMP);
594         free(sets, M_TEMP);
595         return (error);
596 }
597
598 /*
599  * Create levels for an array of absolute settings and insert them in
600  * sorted order in the specified list.
601  */
602 static int
603 cpufreq_insert_abs(struct cpufreq_softc *sc, struct cf_setting *sets,
604     int count)
605 {
606         struct cf_level_lst *list;
607         struct cf_level *level, *search;
608         int i;
609
610         CF_MTX_ASSERT(&sc->lock);
611
612         list = &sc->all_levels;
613         for (i = 0; i < count; i++) {
614                 level = malloc(sizeof(*level), M_TEMP, M_NOWAIT | M_ZERO);
615                 if (level == NULL)
616                         return (ENOMEM);
617                 level->abs_set = sets[i];
618                 level->total_set = sets[i];
619                 level->total_set.dev = NULL;
620                 sc->all_count++;
621
622                 if (TAILQ_EMPTY(list)) {
623                         CF_DEBUG("adding abs setting %d at head\n",
624                             sets[i].freq);
625                         TAILQ_INSERT_HEAD(list, level, link);
626                         continue;
627                 }
628
629                 TAILQ_FOREACH_REVERSE(search, list, cf_level_lst, link) {
630                         if (sets[i].freq <= search->total_set.freq) {
631                                 CF_DEBUG("adding abs setting %d after %d\n",
632                                     sets[i].freq, search->total_set.freq);
633                                 TAILQ_INSERT_AFTER(list, search, level, link);
634                                 break;
635                         }
636                 }
637         }
638         return (0);
639 }
640
641 /*
642  * Expand a group of relative settings, creating derived levels from them.
643  */
644 static int
645 cpufreq_expand_set(struct cpufreq_softc *sc, struct cf_setting_array *set_arr)
646 {
647         struct cf_level *fill, *search;
648         struct cf_setting *set;
649         int i;
650
651         CF_MTX_ASSERT(&sc->lock);
652
653         TAILQ_FOREACH(search, &sc->all_levels, link) {
654                 /* Skip this level if we've already modified it. */
655                 for (i = 0; i < search->rel_count; i++) {
656                         if (search->rel_set[i].dev == set_arr->sets[0].dev)
657                                 break;
658                 }
659                 if (i != search->rel_count) {
660                         CF_DEBUG("skipping modified level, freq %d (dev %s)\n",
661                             search->total_set.freq,
662                             device_get_nameunit(search->rel_set[i].dev));
663                         continue;
664                 }
665
666                 /* Add each setting to the level, duplicating if necessary. */
667                 for (i = 0; i < set_arr->count; i++) {
668                         set = &set_arr->sets[i];
669
670                         /*
671                          * If this setting is less than 100%, split the level
672                          * into two and add this setting to the new level.
673                          */
674                         fill = search;
675                         if (set->freq < 10000)
676                                 fill = cpufreq_dup_set(sc, search, set);
677
678                         /*
679                          * The new level was a duplicate of an existing level
680                          * so we freed it.  Go to the next setting.
681                          */
682                         if (fill == NULL)
683                                 continue;
684
685                         /* Add this setting to the existing or new level. */
686                         KASSERT(fill->rel_count < MAX_SETTINGS,
687                             ("cpufreq: too many relative drivers (%d)",
688                             MAX_SETTINGS));
689                         fill->rel_set[fill->rel_count] = *set;
690                         fill->rel_count++;
691                         CF_DEBUG(
692                         "expand set added rel setting %d%% to %d level\n",
693                             set->freq / 100, fill->total_set.freq);
694                 }
695         }
696
697         return (0);
698 }
699
700 static struct cf_level *
701 cpufreq_dup_set(struct cpufreq_softc *sc, struct cf_level *dup,
702     struct cf_setting *set)
703 {
704         struct cf_level_lst *list;
705         struct cf_level *fill, *itr;
706         struct cf_setting *fill_set, *itr_set;
707         int i;
708
709         CF_MTX_ASSERT(&sc->lock);
710
711         /*
712          * Create a new level, copy it from the old one, and update the
713          * total frequency and power by the percentage specified in the
714          * relative setting.
715          */
716         fill = malloc(sizeof(*fill), M_TEMP, M_NOWAIT);
717         if (fill == NULL)
718                 return (NULL);
719         *fill = *dup;
720         fill_set = &fill->total_set;
721         fill_set->freq =
722             ((uint64_t)fill_set->freq * set->freq) / 10000;
723         if (fill_set->power != CPUFREQ_VAL_UNKNOWN) {
724                 fill_set->power = ((uint64_t)fill_set->power * set->freq)
725                     / 10000;
726         }
727         if (set->lat != CPUFREQ_VAL_UNKNOWN) {
728                 if (fill_set->lat != CPUFREQ_VAL_UNKNOWN)
729                         fill_set->lat += set->lat;
730                 else
731                         fill_set->lat = set->lat;
732         }
733         CF_DEBUG("dup set considering derived setting %d\n", fill_set->freq);
734
735         /*
736          * If we copied an old level that we already modified (say, at 100%),
737          * we need to remove that setting before adding this one.  Since we
738          * process each setting array in order, we know any settings for this
739          * driver will be found at the end.
740          */
741         for (i = fill->rel_count; i != 0; i--) {
742                 if (fill->rel_set[i - 1].dev != set->dev)
743                         break;
744                 CF_DEBUG("removed last relative driver: %s\n",
745                     device_get_nameunit(set->dev));
746                 fill->rel_count--;
747         }
748
749         /*
750          * Insert the new level in sorted order.  If we find a duplicate,
751          * free the new level.  We can do this since any existing level will
752          * be guaranteed to have the same or less settings and thus consume
753          * less power.  For example, a level with one absolute setting of
754          * 800 Mhz uses less power than one composed of an absolute setting
755          * of 1600 Mhz and a relative setting at 50%.
756          */
757         list = &sc->all_levels;
758         if (TAILQ_EMPTY(list)) {
759                 CF_DEBUG("dup done, inserted %d at head\n", fill_set->freq);
760                 TAILQ_INSERT_HEAD(list, fill, link);
761         } else {
762                 TAILQ_FOREACH_REVERSE(itr, list, cf_level_lst, link) {
763                         itr_set = &itr->total_set;
764                         if (CPUFREQ_CMP(fill_set->freq, itr_set->freq)) {
765                                 CF_DEBUG(
766                         "dup done, freeing new level %d, matches %d\n",
767                                     fill_set->freq, itr_set->freq);
768                                 free(fill, M_TEMP);
769                                 fill = NULL;
770                                 break;
771                         } else if (fill_set->freq < itr_set->freq) {
772                                 CF_DEBUG(
773                         "dup done, inserting new level %d after %d\n",
774                                     fill_set->freq, itr_set->freq);
775                                 TAILQ_INSERT_AFTER(list, itr, fill, link);
776                                 sc->all_count++;
777                                 break;
778                         }
779                 }
780         }
781
782         return (fill);
783 }
784
785 static int
786 cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS)
787 {
788         struct cpufreq_softc *sc;
789         struct cf_level *levels;
790         int count, devcount, error, freq, i, n;
791         device_t *devs;
792
793         devs = NULL;
794         sc = oidp->oid_arg1;
795         levels = malloc(CF_MAX_LEVELS * sizeof(*levels), M_TEMP, M_NOWAIT);
796         if (levels == NULL)
797                 return (ENOMEM);
798
799         error = CPUFREQ_GET(sc->dev, &levels[0]);
800         if (error)
801                 goto out;
802         freq = levels[0].total_set.freq;
803         error = sysctl_handle_int(oidp, &freq, 0, req);
804         if (error != 0 || req->newptr == NULL)
805                 goto out;
806
807         /*
808          * While we only call cpufreq_get() on one device (assuming all
809          * CPUs have equal levels), we call cpufreq_set() on all CPUs.
810          * This is needed for some MP systems.
811          */
812         error = devclass_get_devices(cpufreq_dc, &devs, &devcount);
813         if (error)
814                 goto out;
815         for (n = 0; n < devcount; n++) {
816                 count = CF_MAX_LEVELS;
817                 error = CPUFREQ_LEVELS(devs[n], levels, &count);
818                 if (error) {
819                         if (error == E2BIG)
820                                 printf(
821                         "cpufreq: need to increase CF_MAX_LEVELS\n");
822                         break;
823                 }
824                 for (i = 0; i < count; i++) {
825                         if (CPUFREQ_CMP(levels[i].total_set.freq, freq)) {
826                                 error = CPUFREQ_SET(devs[n], &levels[i],
827                                     CPUFREQ_PRIO_USER);
828                                 break;
829                         }
830                 }
831                 if (i == count) {
832                         error = EINVAL;
833                         break;
834                 }
835         }
836
837 out:
838         if (devs)
839                 free(devs, M_TEMP);
840         if (levels)
841                 free(levels, M_TEMP);
842         return (error);
843 }
844
845 static int
846 cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS)
847 {
848         struct cpufreq_softc *sc;
849         struct cf_level *levels;
850         struct cf_setting *set;
851         struct sbuf sb;
852         int count, error, i;
853
854         sc = oidp->oid_arg1;
855         sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
856
857         /* Get settings from the device and generate the output string. */
858         count = CF_MAX_LEVELS;
859         levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT);
860         if (levels == NULL)
861                 return (ENOMEM);
862         error = CPUFREQ_LEVELS(sc->dev, levels, &count);
863         if (error) {
864                 if (error == E2BIG)
865                         printf("cpufreq: need to increase CF_MAX_LEVELS\n");
866                 goto out;
867         }
868         if (count) {
869                 for (i = 0; i < count; i++) {
870                         set = &levels[i].total_set;
871                         sbuf_printf(&sb, "%d/%d ", set->freq, set->power);
872                 }
873         } else
874                 sbuf_cpy(&sb, "0");
875         sbuf_trim(&sb);
876         sbuf_finish(&sb);
877         error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
878
879 out:
880         free(levels, M_TEMP);
881         sbuf_delete(&sb);
882         return (error);
883 }
884
885 static int
886 cpufreq_settings_sysctl(SYSCTL_HANDLER_ARGS)
887 {
888         device_t dev;
889         struct cf_setting *sets;
890         struct sbuf sb;
891         int error, i, set_count;
892
893         dev = oidp->oid_arg1;
894         sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
895
896         /* Get settings from the device and generate the output string. */
897         set_count = MAX_SETTINGS;
898         sets = malloc(set_count * sizeof(*sets), M_TEMP, M_NOWAIT);
899         if (sets == NULL)
900                 return (ENOMEM);
901         error = CPUFREQ_DRV_SETTINGS(dev, sets, &set_count);
902         if (error)
903                 goto out;
904         if (set_count) {
905                 for (i = 0; i < set_count; i++)
906                         sbuf_printf(&sb, "%d/%d ", sets[i].freq, sets[i].power);
907         } else
908                 sbuf_cpy(&sb, "0");
909         sbuf_trim(&sb);
910         sbuf_finish(&sb);
911         error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
912
913 out:
914         free(sets, M_TEMP);
915         sbuf_delete(&sb);
916         return (error);
917 }
918
919 int
920 cpufreq_register(device_t dev)
921 {
922         struct cpufreq_softc *sc;
923         device_t cf_dev, cpu_dev;
924
925         /* Add a sysctl to get each driver's settings separately. */
926         SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
927             SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
928             OID_AUTO, "freq_settings", CTLTYPE_STRING | CTLFLAG_RD, dev, 0,
929             cpufreq_settings_sysctl, "A", "CPU frequency driver settings");
930
931         /*
932          * Add only one cpufreq device to each CPU.  Currently, all CPUs
933          * must offer the same levels and be switched at the same time.
934          */
935         cpu_dev = device_get_parent(dev);
936         if ((cf_dev = device_find_child(cpu_dev, "cpufreq", -1))) {
937                 sc = device_get_softc(cf_dev);
938                 sc->max_mhz = CPUFREQ_VAL_UNKNOWN;
939                 return (0);
940         }
941
942         /* Add the child device and possibly sysctls. */
943         cf_dev = BUS_ADD_CHILD(cpu_dev, 0, "cpufreq", -1);
944         if (cf_dev == NULL)
945                 return (ENOMEM);
946         device_quiet(cf_dev);
947
948         return (device_probe_and_attach(cf_dev));
949 }
950
951 int
952 cpufreq_unregister(device_t dev)
953 {
954         device_t cf_dev, *devs;
955         int cfcount, devcount, error, i, type;
956
957         /*
958          * If this is the last cpufreq child device, remove the control
959          * device as well.  We identify cpufreq children by calling a method
960          * they support.
961          */
962         error = device_get_children(device_get_parent(dev), &devs, &devcount);
963         if (error)
964                 return (error);
965         cf_dev = device_find_child(device_get_parent(dev), "cpufreq", -1);
966         if (cf_dev == NULL) {
967                 device_printf(dev,
968         "warning: cpufreq_unregister called with no cpufreq device active\n");
969                 return (0);
970         }
971         cfcount = 0;
972         for (i = 0; i < devcount; i++) {
973                 if (!device_is_attached(devs[i]))
974                         continue;
975                 if (CPUFREQ_DRV_TYPE(devs[i], &type) == 0)
976                         cfcount++;
977         }
978         if (cfcount <= 1)
979                 device_delete_child(device_get_parent(cf_dev), cf_dev);
980         free(devs, M_TEMP);
981
982         return (0);
983 }