Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
[sfrench/cifs-2.6.git] / tools / power / x86 / turbostat / turbostat.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * turbostat -- show CPU frequency and C-state residency
4  * on modern Intel and AMD processors.
5  *
6  * Copyright (c) 2024 Intel Corporation.
7  * Len Brown <len.brown@intel.com>
8  */
9
10 #define _GNU_SOURCE
11 #include MSRHEADER
12 #include INTEL_FAMILY_HEADER
13 #include <stdarg.h>
14 #include <stdio.h>
15 #include <err.h>
16 #include <unistd.h>
17 #include <sys/types.h>
18 #include <sys/wait.h>
19 #include <sys/stat.h>
20 #include <sys/select.h>
21 #include <sys/resource.h>
22 #include <fcntl.h>
23 #include <signal.h>
24 #include <sys/time.h>
25 #include <stdlib.h>
26 #include <getopt.h>
27 #include <dirent.h>
28 #include <string.h>
29 #include <ctype.h>
30 #include <sched.h>
31 #include <time.h>
32 #include <cpuid.h>
33 #include <sys/capability.h>
34 #include <errno.h>
35 #include <math.h>
36 #include <linux/perf_event.h>
37 #include <asm/unistd.h>
38 #include <stdbool.h>
39 #include <assert.h>
40 #include <linux/kernel.h>
41
/* Evaluate x and discard the result, marking it as intentionally unused. */
#define UNUSED(x) ((void)(x))
43
/*
 * The bic[] table below lists the built-in column headers, with two caveats:
 * 1. Only built-in counters appear here; the sysfs counters are discovered at run-time.
 * 2. "Core" and "CPU" are moved toward the end, so that longer names which merely
 *    contain those strings are not matched by them for --show and --hide.
 */
50
/*
 * NAME_BYTES: buffer size used by sscanf() for added column names.
 * Names are usually truncated to 7 characters, but the buffer also
 * handles 18 columns for raw 64-bit counters.
 */
#define NAME_BYTES	20

/* PATH_BYTES: size of the path[] buffer in struct msr_counter. */
#define PATH_BYTES	128

#define MAX_NOFILE	0x8000
59
/* Granularity of a counter: per CPU, per core, or per package. */
enum counter_scope {
	SCOPE_CPU,
	SCOPE_CORE,
	SCOPE_PACKAGE
};

/* What a counter counts. */
enum counter_type {
	COUNTER_ITEMS,
	COUNTER_CYCLES,
	COUNTER_SECONDS,
	COUNTER_USEC
};

/* How a counter value is presented. */
enum counter_format {
	FORMAT_RAW,
	FORMAT_DELTA,
	FORMAT_PERCENT
};

/* Where the APERF/MPERF values are read from. */
enum amperf_source {
	AMPERF_SOURCE_PERF,
	AMPERF_SOURCE_MSR
};

/* Where the RAPL values are read from, if anywhere. */
enum rapl_source {
	RAPL_SOURCE_NONE,
	RAPL_SOURCE_PERF,
	RAPL_SOURCE_MSR
};
65
66 struct msr_counter {
67         unsigned int msr_num;
68         char name[NAME_BYTES];
69         char path[PATH_BYTES];
70         unsigned int width;
71         enum counter_type type;
72         enum counter_format format;
73         struct msr_counter *next;
74         unsigned int flags;
75 #define FLAGS_HIDE      (1 << 0)
76 #define FLAGS_SHOW      (1 << 1)
77 #define SYSFS_PERCPU    (1 << 1)
78 };
79
80 struct msr_counter bic[] = {
81         { 0x0, "usec", "", 0, 0, 0, NULL, 0 },
82         { 0x0, "Time_Of_Day_Seconds", "", 0, 0, 0, NULL, 0 },
83         { 0x0, "Package", "", 0, 0, 0, NULL, 0 },
84         { 0x0, "Node", "", 0, 0, 0, NULL, 0 },
85         { 0x0, "Avg_MHz", "", 0, 0, 0, NULL, 0 },
86         { 0x0, "Busy%", "", 0, 0, 0, NULL, 0 },
87         { 0x0, "Bzy_MHz", "", 0, 0, 0, NULL, 0 },
88         { 0x0, "TSC_MHz", "", 0, 0, 0, NULL, 0 },
89         { 0x0, "IRQ", "", 0, 0, 0, NULL, 0 },
90         { 0x0, "SMI", "", 32, 0, FORMAT_DELTA, NULL, 0 },
91         { 0x0, "sysfs", "", 0, 0, 0, NULL, 0 },
92         { 0x0, "CPU%c1", "", 0, 0, 0, NULL, 0 },
93         { 0x0, "CPU%c3", "", 0, 0, 0, NULL, 0 },
94         { 0x0, "CPU%c6", "", 0, 0, 0, NULL, 0 },
95         { 0x0, "CPU%c7", "", 0, 0, 0, NULL, 0 },
96         { 0x0, "ThreadC", "", 0, 0, 0, NULL, 0 },
97         { 0x0, "CoreTmp", "", 0, 0, 0, NULL, 0 },
98         { 0x0, "CoreCnt", "", 0, 0, 0, NULL, 0 },
99         { 0x0, "PkgTmp", "", 0, 0, 0, NULL, 0 },
100         { 0x0, "GFX%rc6", "", 0, 0, 0, NULL, 0 },
101         { 0x0, "GFXMHz", "", 0, 0, 0, NULL, 0 },
102         { 0x0, "Pkg%pc2", "", 0, 0, 0, NULL, 0 },
103         { 0x0, "Pkg%pc3", "", 0, 0, 0, NULL, 0 },
104         { 0x0, "Pkg%pc6", "", 0, 0, 0, NULL, 0 },
105         { 0x0, "Pkg%pc7", "", 0, 0, 0, NULL, 0 },
106         { 0x0, "Pkg%pc8", "", 0, 0, 0, NULL, 0 },
107         { 0x0, "Pkg%pc9", "", 0, 0, 0, NULL, 0 },
108         { 0x0, "Pk%pc10", "", 0, 0, 0, NULL, 0 },
109         { 0x0, "CPU%LPI", "", 0, 0, 0, NULL, 0 },
110         { 0x0, "SYS%LPI", "", 0, 0, 0, NULL, 0 },
111         { 0x0, "PkgWatt", "", 0, 0, 0, NULL, 0 },
112         { 0x0, "CorWatt", "", 0, 0, 0, NULL, 0 },
113         { 0x0, "GFXWatt", "", 0, 0, 0, NULL, 0 },
114         { 0x0, "PkgCnt", "", 0, 0, 0, NULL, 0 },
115         { 0x0, "RAMWatt", "", 0, 0, 0, NULL, 0 },
116         { 0x0, "PKG_%", "", 0, 0, 0, NULL, 0 },
117         { 0x0, "RAM_%", "", 0, 0, 0, NULL, 0 },
118         { 0x0, "Pkg_J", "", 0, 0, 0, NULL, 0 },
119         { 0x0, "Cor_J", "", 0, 0, 0, NULL, 0 },
120         { 0x0, "GFX_J", "", 0, 0, 0, NULL, 0 },
121         { 0x0, "RAM_J", "", 0, 0, 0, NULL, 0 },
122         { 0x0, "Mod%c6", "", 0, 0, 0, NULL, 0 },
123         { 0x0, "Totl%C0", "", 0, 0, 0, NULL, 0 },
124         { 0x0, "Any%C0", "", 0, 0, 0, NULL, 0 },
125         { 0x0, "GFX%C0", "", 0, 0, 0, NULL, 0 },
126         { 0x0, "CPUGFX%", "", 0, 0, 0, NULL, 0 },
127         { 0x0, "Core", "", 0, 0, 0, NULL, 0 },
128         { 0x0, "CPU", "", 0, 0, 0, NULL, 0 },
129         { 0x0, "APIC", "", 0, 0, 0, NULL, 0 },
130         { 0x0, "X2APIC", "", 0, 0, 0, NULL, 0 },
131         { 0x0, "Die", "", 0, 0, 0, NULL, 0 },
132         { 0x0, "GFXAMHz", "", 0, 0, 0, NULL, 0 },
133         { 0x0, "IPC", "", 0, 0, 0, NULL, 0 },
134         { 0x0, "CoreThr", "", 0, 0, 0, NULL, 0 },
135         { 0x0, "UncMHz", "", 0, 0, 0, NULL, 0 },
136         { 0x0, "SAM%mc6", "", 0, 0, 0, NULL, 0 },
137         { 0x0, "SAMMHz", "", 0, 0, 0, NULL, 0 },
138         { 0x0, "SAMAMHz", "", 0, 0, 0, NULL, 0 },
139 };
140
141 #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter))
/*
 * One bit per built-in counter, defined in the same order as the
 * entries of the bic[] table.
 */
#define BIC_USEC	(1ULL << 0)
#define BIC_TOD		(1ULL << 1)
#define BIC_Package	(1ULL << 2)
#define BIC_Node	(1ULL << 3)
#define BIC_Avg_MHz	(1ULL << 4)
#define BIC_Busy	(1ULL << 5)
#define BIC_Bzy_MHz	(1ULL << 6)
#define BIC_TSC_MHz	(1ULL << 7)
#define BIC_IRQ		(1ULL << 8)
#define BIC_SMI		(1ULL << 9)
#define BIC_sysfs	(1ULL << 10)
#define BIC_CPU_c1	(1ULL << 11)
#define BIC_CPU_c3	(1ULL << 12)
#define BIC_CPU_c6	(1ULL << 13)
#define BIC_CPU_c7	(1ULL << 14)
#define BIC_ThreadC	(1ULL << 15)
#define BIC_CoreTmp	(1ULL << 16)
#define BIC_CoreCnt	(1ULL << 17)
#define BIC_PkgTmp	(1ULL << 18)
#define BIC_GFX_rc6	(1ULL << 19)
#define BIC_GFXMHz	(1ULL << 20)
#define BIC_Pkgpc2	(1ULL << 21)
#define BIC_Pkgpc3	(1ULL << 22)
#define BIC_Pkgpc6	(1ULL << 23)
#define BIC_Pkgpc7	(1ULL << 24)
#define BIC_Pkgpc8	(1ULL << 25)
#define BIC_Pkgpc9	(1ULL << 26)
#define BIC_Pkgpc10	(1ULL << 27)
#define BIC_CPU_LPI	(1ULL << 28)
#define BIC_SYS_LPI	(1ULL << 29)
#define BIC_PkgWatt	(1ULL << 30)
#define BIC_CorWatt	(1ULL << 31)
#define BIC_GFXWatt	(1ULL << 32)
#define BIC_PkgCnt	(1ULL << 33)
#define BIC_RAMWatt	(1ULL << 34)
#define BIC_PKG__	(1ULL << 35)
#define BIC_RAM__	(1ULL << 36)
#define BIC_Pkg_J	(1ULL << 37)
#define BIC_Cor_J	(1ULL << 38)
#define BIC_GFX_J	(1ULL << 39)
#define BIC_RAM_J	(1ULL << 40)
#define BIC_Mod_c6	(1ULL << 41)
#define BIC_Totl_c0	(1ULL << 42)
#define BIC_Any_c0	(1ULL << 43)
#define BIC_GFX_c0	(1ULL << 44)
#define BIC_CPUGFX	(1ULL << 45)
#define BIC_Core	(1ULL << 46)
#define BIC_CPU		(1ULL << 47)
#define BIC_APIC	(1ULL << 48)
#define BIC_X2APIC	(1ULL << 49)
#define BIC_Die		(1ULL << 50)
#define BIC_GFXACTMHz	(1ULL << 51)
#define BIC_IPC		(1ULL << 52)
#define BIC_CORE_THROT_CNT	(1ULL << 53)
#define BIC_UNCORE_MHZ		(1ULL << 54)
#define BIC_SAM_mc6		(1ULL << 55)
#define BIC_SAMMHz		(1ULL << 56)
#define BIC_SAMACTMHz		(1ULL << 57)

/*
 * Named groups of counters.
 * NOTE(review): BIC_CoreTmp is a member of both BIC_THERMAL_PWR and
 * BIC_OTHER -- confirm this overlap is intended.
 */
#define BIC_TOPOLOGY \
	(BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die)
#define BIC_THERMAL_PWR \
	(BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | \
	 BIC_RAMWatt | BIC_PKG__ | BIC_RAM__)
#define BIC_FREQUENCY \
	(BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | \
	 BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ)
#define BIC_IDLE \
	(BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | \
	 BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | \
	 BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | \
	 BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | \
	 BIC_SAM_mc6)
#define BIC_OTHER \
	(BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC)

/* Counters that start out cleared in bic_enabled. */
#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC)

/* Every counter is enabled initially, except the BIC_DISABLED_BY_DEFAULT set. */
unsigned long long bic_enabled = (0xFFFFFFFFFFFFFFFFULL & ~BIC_DISABLED_BY_DEFAULT);
/* Counters marked present from the start; more are added with BIC_PRESENT(). */
unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC_X2APIC;

/*
 * Accessors for bic_enabled / bic_present.
 *
 * The argument is parenthesized in every expansion so that a compound
 * mask such as (A | B) is applied as a whole.  Without the parentheses
 * BIC_NOT_PRESENT(A | B) would expand to "bic_present &= ~A | B" and
 * DO_BIC(A | B) to "(bic_enabled & bic_present & A) | B".
 *
 * DO_BIC         - non-zero if any given counter is both enabled and present
 * DO_BIC_READ    - non-zero if any given counter is present
 * ENABLE_BIC     - set the given bits in bic_enabled
 * BIC_PRESENT    - set the given bits in bic_present
 * BIC_NOT_PRESENT- clear the given bits in bic_present
 * BIC_IS_ENABLED - non-zero if any given counter is enabled
 */
#define DO_BIC(COUNTER_NAME) (bic_enabled & bic_present & (COUNTER_NAME))
#define DO_BIC_READ(COUNTER_NAME) (bic_present & (COUNTER_NAME))
#define ENABLE_BIC(COUNTER_NAME) (bic_enabled |= (COUNTER_NAME))
#define BIC_PRESENT(COUNTER_BIT) (bic_present |= (COUNTER_BIT))
#define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~(COUNTER_BIT))
#define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & (COUNTER_BIT))
218
219 struct amperf_group_fd;
220
221 char *proc_stat = "/proc/stat";
222 FILE *outf;
223 int *fd_percpu;
224 int *fd_instr_count_percpu;
225 struct amperf_group_fd *fd_amperf_percpu;       /* File descriptors for perf group with APERF and MPERF counters. */
226 struct timeval interval_tv = { 5, 0 };
227 struct timespec interval_ts = { 5, 0 };
228
229 unsigned int num_iterations;
230 unsigned int header_iterations;
231 unsigned int debug;
232 unsigned int quiet;
233 unsigned int shown;
234 unsigned int sums_need_wide_columns;
235 unsigned int rapl_joules;
236 unsigned int summary_only;
237 unsigned int list_header_only;
238 unsigned int dump_only;
239 unsigned int has_aperf;
240 unsigned int has_epb;
241 unsigned int has_turbo;
242 unsigned int is_hybrid;
243 unsigned int units = 1000000;   /* MHz etc */
244 unsigned int genuine_intel;
245 unsigned int authentic_amd;
246 unsigned int hygon_genuine;
247 unsigned int max_level, max_extended_level;
248 unsigned int has_invariant_tsc;
249 unsigned int aperf_mperf_multiplier = 1;
250 double bclk;
251 double base_hz;
252 unsigned int has_base_hz;
253 double tsc_tweak = 1.0;
254 unsigned int show_pkg_only;
255 unsigned int show_core_only;
256 char *output_buffer, *outp;
257 unsigned int do_dts;
258 unsigned int do_ptm;
259 unsigned int do_ipc;
260 unsigned long long cpuidle_cur_cpu_lpi_us;
261 unsigned long long cpuidle_cur_sys_lpi_us;
262 unsigned int tj_max;
263 unsigned int tj_max_override;
264 double rapl_power_units, rapl_time_units;
265 double rapl_dram_energy_units, rapl_energy_units;
266 double rapl_joule_counter_range;
267 unsigned int crystal_hz;
268 unsigned long long tsc_hz;
269 int base_cpu;
270 unsigned int has_hwp;           /* IA32_PM_ENABLE, IA32_HWP_CAPABILITIES */
271                         /* IA32_HWP_REQUEST, IA32_HWP_STATUS */
272 unsigned int has_hwp_notify;    /* IA32_HWP_INTERRUPT */
273 unsigned int has_hwp_activity_window;   /* IA32_HWP_REQUEST[bits 41:32] */
274 unsigned int has_hwp_epp;       /* IA32_HWP_REQUEST[bits 31:24] */
275 unsigned int has_hwp_pkg;       /* IA32_HWP_REQUEST_PKG */
276 unsigned int first_counter_read = 1;
277 int ignore_stdin;
278 bool no_msr;
279 bool no_perf;
280 enum amperf_source amperf_source;
281
/* Slots of gfx_info[]; GFX_MAX is the number of slots. */
enum gfx_sysfs_idx {
	GFX_rc6,
	GFX_MHz,
	GFX_ACTMHz,
	SAM_mc6,
	SAM_MHz,
	SAM_ACTMHz,
	GFX_MAX
};

/* One sysfs-backed value: its path, an associated stream, and stored values. */
struct gfx_sysfs_info {
	const char *path;
	FILE *fp;
	unsigned int val;
	unsigned long long val_ull;
};

/* Indexed by enum gfx_sysfs_idx; zero-initialized. */
static struct gfx_sysfs_info gfx_info[GFX_MAX];
300
301 int get_msr(int cpu, off_t offset, unsigned long long *msr);
302
303 /* Model specific support Start */
304
/* List of features that may diverge among different platforms */
struct platform_features {
	/* MSR_MISC_FEATURE_CONTROL */
	bool has_msr_misc_feature_control;
	/* MSR_MISC_PWR_MGMT */
	bool has_msr_misc_pwr_mgmt;
	/*
	 * MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT,
	 * MSR_PKG_CST_CONFIG_CONTROL, MSR_IA32_POWER_CTL, TRL MSRs
	 */
	bool has_nhm_msrs;
	/* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */
	bool has_config_tdp;
	/* CPU base clock */
	int bclk_freq;
	/* Crystal clock to use when not available from CPUID.15 */
	int crystal_freq;
	/* Core cstates and Package cstates supported */
	int supported_cstates;
	/* MSR_PKG_CST_CONFIG_CONTROL */
	int cst_limit;
	/* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */
	bool has_cst_auto_convension;
	/* MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL */
	bool has_irtl_msrs;
	/* MSR_CORE_C1_RES */
	bool has_msr_core_c1_res;
	/* MSR_MODULE_C6_RES_MS */
	bool has_msr_module_c6_res_ms;
	/* MSR_CC6_DEMOTION_POLICY_CONFIG/MSR_MC6_DEMOTION_POLICY_CONFIG */
	bool has_msr_c6_demotion_policy_config;
	/* MSR_ATOM_PKG_C6_RESIDENCY */
	bool has_msr_atom_pkg_c6_residency;
	/* MSR_KNL_CORE_C6_RESIDENCY */
	bool has_msr_knl_core_c6_residency;
	/*
	 * MSR_PKG_WEIGHTED_CORE_C0_RES/MSR_PKG_ANY_CORE_C0_RES/
	 * MSR_PKG_ANY_GFXE_C0_RES/MSR_PKG_BOTH_CORE_GFXE_C0_RES
	 */
	bool has_ext_cst_msrs;
	/* Cstate prewake bit in MSR_IA32_POWER_CTL */
	bool has_cst_prewake_bit;
	/* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */
	int trl_msrs;
	/* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */
	int plr_msrs;
	/* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */
	int rapl_msrs;
	/* Indicates cores energy collection is per-core, not per-package. AMD specific for now */
	bool has_per_core_rapl;
	/* Divisor for Energy unit raw value from MSR_RAPL_POWER_UNIT */
	bool has_rapl_divisor;
	/* Fixed Energy Unit used for DRAM RAPL Domain */
	bool has_fixed_rapl_unit;
	/* Hardcoded TDP value when cannot be retrieved from hardware */
	int rapl_quirk_tdp;
	/* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */
	int tcc_offset_bits;
	/* Use CPU Base freq instead of TSC freq for aperf/mperf counter */
	bool enable_tsc_tweak;
	/* mperf/aperf multiplier */
	bool need_perf_multiplier;
};
335
/* Associates a model number with the feature set that describes it. */
struct platform_data {
	unsigned int model;
	const struct platform_features *features;
};
340
/* For BCLK: values stored in platform_features.bclk_freq (0 is not used here). */
enum bclk_freq {
	BCLK_100MHZ = 1,
	BCLK_133MHZ = 2,
	BCLK_SLV = 3,
};
347
/* Bus clock candidates in MHz, selected by index in slm_bclk(). */
#define SLM_BCLK_FREQS 5
double slm_freq_table[SLM_BCLK_FREQS] = {
	[0] = 83.3,
	[1] = 100.0,
	[2] = 133.3,
	[3] = 116.7,
	[4] = 80.0,
};
350
351 double slm_bclk(void)
352 {
353         unsigned long long msr = 3;
354         unsigned int i;
355         double freq;
356
357         if (get_msr(base_cpu, MSR_FSB_FREQ, &msr))
358                 fprintf(outf, "SLM BCLK: unknown\n");
359
360         i = msr & 0xf;
361         if (i >= SLM_BCLK_FREQS) {
362                 fprintf(outf, "SLM BCLK[%d] invalid\n", i);
363                 i = 3;
364         }
365         freq = slm_freq_table[i];
366
367         if (!quiet)
368                 fprintf(outf, "SLM BCLK: %.1f Mhz\n", freq);
369
370         return freq;
371 }
372
/* For Package cstate limit: values stored in platform_features.cst_limit. */
enum package_cstate_limit {
	CST_LIMIT_NHM = 1,
	CST_LIMIT_SNB = 2,
	CST_LIMIT_HSW = 3,
	CST_LIMIT_SKX = 4,
	CST_LIMIT_ICX = 5,
	CST_LIMIT_SLV = 6,
	CST_LIMIT_AMT = 7,
	CST_LIMIT_KNL = 8,
	CST_LIMIT_GMT = 9,
};
385
386 /* For Turbo Ratio Limit MSRs */
387 enum turbo_ratio_limit_msrs {
388         TRL_BASE = BIT(0),
389         TRL_LIMIT1 = BIT(1),
390         TRL_LIMIT2 = BIT(2),
391         TRL_ATOM = BIT(3),
392         TRL_KNL = BIT(4),
393         TRL_CORECOUNT = BIT(5),
394 };
395
396 /* For Perf Limit Reason MSRs */
397 enum perf_limit_reason_msrs {
398         PLR_CORE = BIT(0),
399         PLR_GFX = BIT(1),
400         PLR_RING = BIT(2),
401 };
402
403 /* For RAPL MSRs */
404 enum rapl_msrs {
405         RAPL_PKG_POWER_LIMIT = BIT(0),  /* 0x610 MSR_PKG_POWER_LIMIT */
406         RAPL_PKG_ENERGY_STATUS = BIT(1),        /* 0x611 MSR_PKG_ENERGY_STATUS */
407         RAPL_PKG_PERF_STATUS = BIT(2),  /* 0x613 MSR_PKG_PERF_STATUS */
408         RAPL_PKG_POWER_INFO = BIT(3),   /* 0x614 MSR_PKG_POWER_INFO */
409         RAPL_DRAM_POWER_LIMIT = BIT(4), /* 0x618 MSR_DRAM_POWER_LIMIT */
410         RAPL_DRAM_ENERGY_STATUS = BIT(5),       /* 0x619 MSR_DRAM_ENERGY_STATUS */
411         RAPL_DRAM_PERF_STATUS = BIT(6), /* 0x61b MSR_DRAM_PERF_STATUS */
412         RAPL_DRAM_POWER_INFO = BIT(7),  /* 0x61c MSR_DRAM_POWER_INFO */
413         RAPL_CORE_POWER_LIMIT = BIT(8), /* 0x638 MSR_PP0_POWER_LIMIT */
414         RAPL_CORE_ENERGY_STATUS = BIT(9),       /* 0x639 MSR_PP0_ENERGY_STATUS */
415         RAPL_CORE_POLICY = BIT(10),     /* 0x63a MSR_PP0_POLICY */
416         RAPL_GFX_POWER_LIMIT = BIT(11), /* 0x640 MSR_PP1_POWER_LIMIT */
417         RAPL_GFX_ENERGY_STATUS = BIT(12),       /* 0x641 MSR_PP1_ENERGY_STATUS */
418         RAPL_GFX_POLICY = BIT(13),      /* 0x642 MSR_PP1_POLICY */
419         RAPL_AMD_PWR_UNIT = BIT(14),    /* 0xc0010299 MSR_AMD_RAPL_POWER_UNIT */
420         RAPL_AMD_CORE_ENERGY_STAT = BIT(15),    /* 0xc001029a MSR_AMD_CORE_ENERGY_STATUS */
421         RAPL_AMD_PKG_ENERGY_STAT = BIT(16),     /* 0xc001029b MSR_AMD_PKG_ENERGY_STATUS */
422 };
423
/* Per-domain pairs: the energy status MSR plus the power limit MSR. */
#define RAPL_PKG	(RAPL_PKG_ENERGY_STATUS | RAPL_PKG_POWER_LIMIT)
#define RAPL_DRAM	(RAPL_DRAM_ENERGY_STATUS | RAPL_DRAM_POWER_LIMIT)
#define RAPL_CORE	(RAPL_CORE_ENERGY_STATUS | RAPL_CORE_POWER_LIMIT)
#define RAPL_GFX	(RAPL_GFX_POWER_LIMIT | RAPL_GFX_ENERGY_STATUS)

/* Per-domain sets covering every enum rapl_msrs flag of that domain. */
#define RAPL_PKG_ALL	(RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO)
#define RAPL_DRAM_ALL	(RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_DRAM_POWER_INFO)
#define RAPL_CORE_ALL	(RAPL_CORE | RAPL_CORE_POLICY)
/* Uses RAPL_GFX_POLICY; the former spelling "RAPL_GFX_POLIGY" named no enumerator. */
#define RAPL_GFX_ALL	(RAPL_GFX | RAPL_GFX_POLICY)

/* The three AMD RAPL MSR flags. */
#define RAPL_AMD_F17H	(RAPL_AMD_PWR_UNIT | RAPL_AMD_CORE_ENERGY_STAT | RAPL_AMD_PKG_ENERGY_STAT)
435
436 /* For Cstates */
437 enum cstates {
438         CC1 = BIT(0),
439         CC3 = BIT(1),
440         CC6 = BIT(2),
441         CC7 = BIT(3),
442         PC2 = BIT(4),
443         PC3 = BIT(5),
444         PC6 = BIT(6),
445         PC7 = BIT(7),
446         PC8 = BIT(8),
447         PC9 = BIT(9),
448         PC10 = BIT(10),
449 };
450
451 static const struct platform_features nhm_features = {
452         .has_msr_misc_pwr_mgmt = 1,
453         .has_nhm_msrs = 1,
454         .bclk_freq = BCLK_133MHZ,
455         .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6,
456         .cst_limit = CST_LIMIT_NHM,
457         .trl_msrs = TRL_BASE,
458 };
459
460 static const struct platform_features nhx_features = {
461         .has_msr_misc_pwr_mgmt = 1,
462         .has_nhm_msrs = 1,
463         .bclk_freq = BCLK_133MHZ,
464         .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6,
465         .cst_limit = CST_LIMIT_NHM,
466 };
467
468 static const struct platform_features snb_features = {
469         .has_msr_misc_feature_control = 1,
470         .has_msr_misc_pwr_mgmt = 1,
471         .has_nhm_msrs = 1,
472         .bclk_freq = BCLK_100MHZ,
473         .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
474         .cst_limit = CST_LIMIT_SNB,
475         .has_irtl_msrs = 1,
476         .trl_msrs = TRL_BASE,
477         .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
478 };
479
480 static const struct platform_features snx_features = {
481         .has_msr_misc_feature_control = 1,
482         .has_msr_misc_pwr_mgmt = 1,
483         .has_nhm_msrs = 1,
484         .bclk_freq = BCLK_100MHZ,
485         .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
486         .cst_limit = CST_LIMIT_SNB,
487         .has_irtl_msrs = 1,
488         .trl_msrs = TRL_BASE,
489         .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL,
490 };
491
492 static const struct platform_features ivb_features = {
493         .has_msr_misc_feature_control = 1,
494         .has_msr_misc_pwr_mgmt = 1,
495         .has_nhm_msrs = 1,
496         .has_config_tdp = 1,
497         .bclk_freq = BCLK_100MHZ,
498         .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
499         .cst_limit = CST_LIMIT_SNB,
500         .has_irtl_msrs = 1,
501         .trl_msrs = TRL_BASE,
502         .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
503 };
504
505 static const struct platform_features ivx_features = {
506         .has_msr_misc_feature_control = 1,
507         .has_msr_misc_pwr_mgmt = 1,
508         .has_nhm_msrs = 1,
509         .bclk_freq = BCLK_100MHZ,
510         .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
511         .cst_limit = CST_LIMIT_SNB,
512         .has_irtl_msrs = 1,
513         .trl_msrs = TRL_BASE | TRL_LIMIT1,
514         .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL,
515 };
516
517 static const struct platform_features hsw_features = {
518         .has_msr_misc_feature_control = 1,
519         .has_msr_misc_pwr_mgmt = 1,
520         .has_nhm_msrs = 1,
521         .has_config_tdp = 1,
522         .bclk_freq = BCLK_100MHZ,
523         .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
524         .cst_limit = CST_LIMIT_HSW,
525         .has_irtl_msrs = 1,
526         .trl_msrs = TRL_BASE,
527         .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING,
528         .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
529 };
530
531 static const struct platform_features hsx_features = {
532         .has_msr_misc_feature_control = 1,
533         .has_msr_misc_pwr_mgmt = 1,
534         .has_nhm_msrs = 1,
535         .has_config_tdp = 1,
536         .bclk_freq = BCLK_100MHZ,
537         .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
538         .cst_limit = CST_LIMIT_HSW,
539         .has_irtl_msrs = 1,
540         .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2,
541         .plr_msrs = PLR_CORE | PLR_RING,
542         .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
543         .has_fixed_rapl_unit = 1,
544 };
545
546 static const struct platform_features hswl_features = {
547         .has_msr_misc_feature_control = 1,
548         .has_msr_misc_pwr_mgmt = 1,
549         .has_nhm_msrs = 1,
550         .has_config_tdp = 1,
551         .bclk_freq = BCLK_100MHZ,
552         .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
553         .cst_limit = CST_LIMIT_HSW,
554         .has_irtl_msrs = 1,
555         .trl_msrs = TRL_BASE,
556         .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING,
557         .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
558 };
559
560 static const struct platform_features hswg_features = {
561         .has_msr_misc_feature_control = 1,
562         .has_msr_misc_pwr_mgmt = 1,
563         .has_nhm_msrs = 1,
564         .has_config_tdp = 1,
565         .bclk_freq = BCLK_100MHZ,
566         .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
567         .cst_limit = CST_LIMIT_HSW,
568         .has_irtl_msrs = 1,
569         .trl_msrs = TRL_BASE,
570         .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING,
571         .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
572 };
573
574 static const struct platform_features bdw_features = {
575         .has_msr_misc_feature_control = 1,
576         .has_msr_misc_pwr_mgmt = 1,
577         .has_nhm_msrs = 1,
578         .has_config_tdp = 1,
579         .bclk_freq = BCLK_100MHZ,
580         .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
581         .cst_limit = CST_LIMIT_HSW,
582         .has_irtl_msrs = 1,
583         .trl_msrs = TRL_BASE,
584         .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
585 };
586
587 static const struct platform_features bdwg_features = {
588         .has_msr_misc_feature_control = 1,
589         .has_msr_misc_pwr_mgmt = 1,
590         .has_nhm_msrs = 1,
591         .has_config_tdp = 1,
592         .bclk_freq = BCLK_100MHZ,
593         .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
594         .cst_limit = CST_LIMIT_HSW,
595         .has_irtl_msrs = 1,
596         .trl_msrs = TRL_BASE,
597         .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
598 };
599
600 static const struct platform_features bdx_features = {
601         .has_msr_misc_feature_control = 1,
602         .has_msr_misc_pwr_mgmt = 1,
603         .has_nhm_msrs = 1,
604         .has_config_tdp = 1,
605         .bclk_freq = BCLK_100MHZ,
606         .supported_cstates = CC1 | CC3 | CC6 | PC2 | PC3 | PC6,
607         .cst_limit = CST_LIMIT_HSW,
608         .has_irtl_msrs = 1,
609         .has_cst_auto_convension = 1,
610         .trl_msrs = TRL_BASE,
611         .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
612         .has_fixed_rapl_unit = 1,
613 };
614
615 static const struct platform_features skl_features = {
616         .has_msr_misc_feature_control = 1,
617         .has_msr_misc_pwr_mgmt = 1,
618         .has_nhm_msrs = 1,
619         .has_config_tdp = 1,
620         .bclk_freq = BCLK_100MHZ,
621         .crystal_freq = 24000000,
622         .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
623         .cst_limit = CST_LIMIT_HSW,
624         .has_irtl_msrs = 1,
625         .has_ext_cst_msrs = 1,
626         .trl_msrs = TRL_BASE,
627         .tcc_offset_bits = 6,
628         .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
629         .enable_tsc_tweak = 1,
630 };
631
632 static const struct platform_features cnl_features = {
633         .has_msr_misc_feature_control = 1,
634         .has_msr_misc_pwr_mgmt = 1,
635         .has_nhm_msrs = 1,
636         .has_config_tdp = 1,
637         .bclk_freq = BCLK_100MHZ,
638         .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
639         .cst_limit = CST_LIMIT_HSW,
640         .has_irtl_msrs = 1,
641         .has_msr_core_c1_res = 1,
642         .has_ext_cst_msrs = 1,
643         .trl_msrs = TRL_BASE,
644         .tcc_offset_bits = 6,
645         .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
646         .enable_tsc_tweak = 1,
647 };
648
649 static const struct platform_features adl_features = {
650         .has_msr_misc_feature_control = 1,
651         .has_msr_misc_pwr_mgmt = 1,
652         .has_nhm_msrs = 1,
653         .has_config_tdp = 1,
654         .bclk_freq = BCLK_100MHZ,
655         .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC8 | PC10,
656         .cst_limit = CST_LIMIT_HSW,
657         .has_irtl_msrs = 1,
658         .has_msr_core_c1_res = 1,
659         .has_ext_cst_msrs = 1,
660         .trl_msrs = TRL_BASE,
661         .tcc_offset_bits = 6,
662         .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
663         .enable_tsc_tweak = 1,
664 };
665
666 static const struct platform_features skx_features = {
667         .has_msr_misc_feature_control = 1,
668         .has_msr_misc_pwr_mgmt = 1,
669         .has_nhm_msrs = 1,
670         .has_config_tdp = 1,
671         .bclk_freq = BCLK_100MHZ,
672         .supported_cstates = CC1 | CC6 | PC2 | PC6,
673         .cst_limit = CST_LIMIT_SKX,
674         .has_irtl_msrs = 1,
675         .has_cst_auto_convension = 1,
676         .trl_msrs = TRL_BASE | TRL_CORECOUNT,
677         .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
678         .has_fixed_rapl_unit = 1,
679 };
680
681 static const struct platform_features icx_features = {
682         .has_msr_misc_feature_control = 1,
683         .has_msr_misc_pwr_mgmt = 1,
684         .has_nhm_msrs = 1,
685         .has_config_tdp = 1,
686         .bclk_freq = BCLK_100MHZ,
687         .supported_cstates = CC1 | CC6 | PC2 | PC6,
688         .cst_limit = CST_LIMIT_ICX,
689         .has_msr_core_c1_res = 1,
690         .has_irtl_msrs = 1,
691         .has_cst_prewake_bit = 1,
692         .trl_msrs = TRL_BASE | TRL_CORECOUNT,
693         .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
694         .has_fixed_rapl_unit = 1,
695 };
696
697 static const struct platform_features spr_features = {
698         .has_msr_misc_feature_control = 1,
699         .has_msr_misc_pwr_mgmt = 1,
700         .has_nhm_msrs = 1,
701         .has_config_tdp = 1,
702         .bclk_freq = BCLK_100MHZ,
703         .supported_cstates = CC1 | CC6 | PC2 | PC6,
704         .cst_limit = CST_LIMIT_SKX,
705         .has_msr_core_c1_res = 1,
706         .has_irtl_msrs = 1,
707         .has_cst_prewake_bit = 1,
708         .trl_msrs = TRL_BASE | TRL_CORECOUNT,
709         .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
710 };
711
712 static const struct platform_features srf_features = {
713         .has_msr_misc_feature_control = 1,
714         .has_msr_misc_pwr_mgmt = 1,
715         .has_nhm_msrs = 1,
716         .has_config_tdp = 1,
717         .bclk_freq = BCLK_100MHZ,
718         .supported_cstates = CC1 | CC6 | PC2 | PC6,
719         .cst_limit = CST_LIMIT_SKX,
720         .has_msr_core_c1_res = 1,
721         .has_msr_module_c6_res_ms = 1,
722         .has_irtl_msrs = 1,
723         .has_cst_prewake_bit = 1,
724         .trl_msrs = TRL_BASE | TRL_CORECOUNT,
725         .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
726 };
727
728 static const struct platform_features grr_features = {
729         .has_msr_misc_feature_control = 1,
730         .has_msr_misc_pwr_mgmt = 1,
731         .has_nhm_msrs = 1,
732         .has_config_tdp = 1,
733         .bclk_freq = BCLK_100MHZ,
734         .supported_cstates = CC1 | CC6,
735         .cst_limit = CST_LIMIT_SKX,
736         .has_msr_core_c1_res = 1,
737         .has_msr_module_c6_res_ms = 1,
738         .has_irtl_msrs = 1,
739         .has_cst_prewake_bit = 1,
740         .trl_msrs = TRL_BASE | TRL_CORECOUNT,
741         .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
742 };
743
744 static const struct platform_features slv_features = {
745         .has_nhm_msrs = 1,
746         .bclk_freq = BCLK_SLV,
747         .supported_cstates = CC1 | CC6 | PC6,
748         .cst_limit = CST_LIMIT_SLV,
749         .has_msr_core_c1_res = 1,
750         .has_msr_module_c6_res_ms = 1,
751         .has_msr_c6_demotion_policy_config = 1,
752         .has_msr_atom_pkg_c6_residency = 1,
753         .trl_msrs = TRL_ATOM,
754         .rapl_msrs = RAPL_PKG | RAPL_CORE,
755         .has_rapl_divisor = 1,
756         .rapl_quirk_tdp = 30,
757 };
758
759 static const struct platform_features slvd_features = {
760         .has_msr_misc_pwr_mgmt = 1,
761         .has_nhm_msrs = 1,
762         .bclk_freq = BCLK_SLV,
763         .supported_cstates = CC1 | CC6 | PC3 | PC6,
764         .cst_limit = CST_LIMIT_SLV,
765         .has_msr_atom_pkg_c6_residency = 1,
766         .trl_msrs = TRL_BASE,
767         .rapl_msrs = RAPL_PKG | RAPL_CORE,
768         .rapl_quirk_tdp = 30,
769 };
770
771 static const struct platform_features amt_features = {
772         .has_nhm_msrs = 1,
773         .bclk_freq = BCLK_133MHZ,
774         .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6,
775         .cst_limit = CST_LIMIT_AMT,
776         .trl_msrs = TRL_BASE,
777 };
778
779 static const struct platform_features gmt_features = {
780         .has_msr_misc_pwr_mgmt = 1,
781         .has_nhm_msrs = 1,
782         .bclk_freq = BCLK_100MHZ,
783         .crystal_freq = 19200000,
784         .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
785         .cst_limit = CST_LIMIT_GMT,
786         .has_irtl_msrs = 1,
787         .trl_msrs = TRL_BASE | TRL_CORECOUNT,
788         .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO,
789 };
790
791 static const struct platform_features gmtd_features = {
792         .has_msr_misc_pwr_mgmt = 1,
793         .has_nhm_msrs = 1,
794         .bclk_freq = BCLK_100MHZ,
795         .crystal_freq = 25000000,
796         .supported_cstates = CC1 | CC6 | PC2 | PC6,
797         .cst_limit = CST_LIMIT_GMT,
798         .has_irtl_msrs = 1,
799         .has_msr_core_c1_res = 1,
800         .trl_msrs = TRL_BASE | TRL_CORECOUNT,
801         .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS,
802 };
803
804 static const struct platform_features gmtp_features = {
805         .has_msr_misc_pwr_mgmt = 1,
806         .has_nhm_msrs = 1,
807         .bclk_freq = BCLK_100MHZ,
808         .crystal_freq = 19200000,
809         .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
810         .cst_limit = CST_LIMIT_GMT,
811         .has_irtl_msrs = 1,
812         .trl_msrs = TRL_BASE,
813         .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO,
814 };
815
816 static const struct platform_features tmt_features = {
817         .has_msr_misc_pwr_mgmt = 1,
818         .has_nhm_msrs = 1,
819         .bclk_freq = BCLK_100MHZ,
820         .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
821         .cst_limit = CST_LIMIT_GMT,
822         .has_irtl_msrs = 1,
823         .trl_msrs = TRL_BASE,
824         .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
825         .enable_tsc_tweak = 1,
826 };
827
828 static const struct platform_features tmtd_features = {
829         .has_msr_misc_pwr_mgmt = 1,
830         .has_nhm_msrs = 1,
831         .bclk_freq = BCLK_100MHZ,
832         .supported_cstates = CC1 | CC6,
833         .cst_limit = CST_LIMIT_GMT,
834         .has_irtl_msrs = 1,
835         .trl_msrs = TRL_BASE | TRL_CORECOUNT,
836         .rapl_msrs = RAPL_PKG_ALL,
837 };
838
839 static const struct platform_features knl_features = {
840         .has_msr_misc_pwr_mgmt = 1,
841         .has_nhm_msrs = 1,
842         .has_config_tdp = 1,
843         .bclk_freq = BCLK_100MHZ,
844         .supported_cstates = CC1 | CC6 | PC3 | PC6,
845         .cst_limit = CST_LIMIT_KNL,
846         .has_msr_knl_core_c6_residency = 1,
847         .trl_msrs = TRL_KNL,
848         .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
849         .has_fixed_rapl_unit = 1,
850         .need_perf_multiplier = 1,
851 };
852
853 static const struct platform_features default_features = {
854 };
855
856 static const struct platform_features amd_features_with_rapl = {
857         .rapl_msrs = RAPL_AMD_F17H,
858         .has_per_core_rapl = 1,
859         .rapl_quirk_tdp = 280,  /* This is the max stock TDP of HEDT/Server Fam17h+ chips */
860 };
861
862 static const struct platform_data turbostat_pdata[] = {
863         { INTEL_FAM6_NEHALEM, &nhm_features },
864         { INTEL_FAM6_NEHALEM_G, &nhm_features },
865         { INTEL_FAM6_NEHALEM_EP, &nhm_features },
866         { INTEL_FAM6_NEHALEM_EX, &nhx_features },
867         { INTEL_FAM6_WESTMERE, &nhm_features },
868         { INTEL_FAM6_WESTMERE_EP, &nhm_features },
869         { INTEL_FAM6_WESTMERE_EX, &nhx_features },
870         { INTEL_FAM6_SANDYBRIDGE, &snb_features },
871         { INTEL_FAM6_SANDYBRIDGE_X, &snx_features },
872         { INTEL_FAM6_IVYBRIDGE, &ivb_features },
873         { INTEL_FAM6_IVYBRIDGE_X, &ivx_features },
874         { INTEL_FAM6_HASWELL, &hsw_features },
875         { INTEL_FAM6_HASWELL_X, &hsx_features },
876         { INTEL_FAM6_HASWELL_L, &hswl_features },
877         { INTEL_FAM6_HASWELL_G, &hswg_features },
878         { INTEL_FAM6_BROADWELL, &bdw_features },
879         { INTEL_FAM6_BROADWELL_G, &bdwg_features },
880         { INTEL_FAM6_BROADWELL_X, &bdx_features },
881         { INTEL_FAM6_BROADWELL_D, &bdx_features },
882         { INTEL_FAM6_SKYLAKE_L, &skl_features },
883         { INTEL_FAM6_SKYLAKE, &skl_features },
884         { INTEL_FAM6_SKYLAKE_X, &skx_features },
885         { INTEL_FAM6_KABYLAKE_L, &skl_features },
886         { INTEL_FAM6_KABYLAKE, &skl_features },
887         { INTEL_FAM6_COMETLAKE, &skl_features },
888         { INTEL_FAM6_COMETLAKE_L, &skl_features },
889         { INTEL_FAM6_CANNONLAKE_L, &cnl_features },
890         { INTEL_FAM6_ICELAKE_X, &icx_features },
891         { INTEL_FAM6_ICELAKE_D, &icx_features },
892         { INTEL_FAM6_ICELAKE_L, &cnl_features },
893         { INTEL_FAM6_ICELAKE_NNPI, &cnl_features },
894         { INTEL_FAM6_ROCKETLAKE, &cnl_features },
895         { INTEL_FAM6_TIGERLAKE_L, &cnl_features },
896         { INTEL_FAM6_TIGERLAKE, &cnl_features },
897         { INTEL_FAM6_SAPPHIRERAPIDS_X, &spr_features },
898         { INTEL_FAM6_EMERALDRAPIDS_X, &spr_features },
899         { INTEL_FAM6_GRANITERAPIDS_X, &spr_features },
900         { INTEL_FAM6_LAKEFIELD, &cnl_features },
901         { INTEL_FAM6_ALDERLAKE, &adl_features },
902         { INTEL_FAM6_ALDERLAKE_L, &adl_features },
903         { INTEL_FAM6_RAPTORLAKE, &adl_features },
904         { INTEL_FAM6_RAPTORLAKE_P, &adl_features },
905         { INTEL_FAM6_RAPTORLAKE_S, &adl_features },
906         { INTEL_FAM6_METEORLAKE, &cnl_features },
907         { INTEL_FAM6_METEORLAKE_L, &cnl_features },
908         { INTEL_FAM6_ARROWLAKE, &cnl_features },
909         { INTEL_FAM6_LUNARLAKE_M, &cnl_features },
910         { INTEL_FAM6_ATOM_SILVERMONT, &slv_features },
911         { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features },
912         { INTEL_FAM6_ATOM_AIRMONT, &amt_features },
913         { INTEL_FAM6_ATOM_GOLDMONT, &gmt_features },
914         { INTEL_FAM6_ATOM_GOLDMONT_D, &gmtd_features },
915         { INTEL_FAM6_ATOM_GOLDMONT_PLUS, &gmtp_features },
916         { INTEL_FAM6_ATOM_TREMONT_D, &tmtd_features },
917         { INTEL_FAM6_ATOM_TREMONT, &tmt_features },
918         { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features },
919         { INTEL_FAM6_ATOM_GRACEMONT, &adl_features },
920         { INTEL_FAM6_ATOM_CRESTMONT_X, &srf_features },
921         { INTEL_FAM6_ATOM_CRESTMONT, &grr_features },
922         { INTEL_FAM6_XEON_PHI_KNL, &knl_features },
923         { INTEL_FAM6_XEON_PHI_KNM, &knl_features },
924         /*
925          * Missing support for
926          * INTEL_FAM6_ICELAKE
927          * INTEL_FAM6_ATOM_SILVERMONT_MID
928          * INTEL_FAM6_ATOM_AIRMONT_MID
929          * INTEL_FAM6_ATOM_AIRMONT_NP
930          */
931         { 0, NULL },
932 };
933
934 static const struct platform_features *platform;
935
936 void probe_platform_features(unsigned int family, unsigned int model)
937 {
938         int i;
939
940         platform = &default_features;
941
942         if (authentic_amd || hygon_genuine) {
943                 if (max_extended_level >= 0x80000007) {
944                         unsigned int eax, ebx, ecx, edx;
945
946                         __cpuid(0x80000007, eax, ebx, ecx, edx);
947                         /* RAPL (Fam 17h+) */
948                         if ((edx & (1 << 14)) && family >= 0x17)
949                                 platform = &amd_features_with_rapl;
950                 }
951                 return;
952         }
953
954         if (!genuine_intel || family != 6)
955                 return;
956
957         for (i = 0; turbostat_pdata[i].features; i++) {
958                 if (turbostat_pdata[i].model == model) {
959                         platform = turbostat_pdata[i].features;
960                         return;
961                 }
962         }
963 }
964
965 /* Model specific support End */
966
#define TJMAX_DEFAULT	100

/* MSRs that are not yet in the kernel-provided header. */
#define MSR_RAPL_PWR_UNIT	0xc0010299
#define MSR_CORE_ENERGY_STAT	0xc001029a
#define MSR_PKG_ENERGY_STAT	0xc001029b

/* NOTE: evaluates the larger argument twice; do not pass expressions with side effects. */
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int backwards_count;
char *progname;

#define CPU_SUBSET_MAXCPUS	1024	/* need to use before probe... */

/* CPU sets, one pointer per set */
cpu_set_t *cpu_present_set;
cpu_set_t *cpu_effective_set;
cpu_set_t *cpu_allowed_set;
cpu_set_t *cpu_affinity_set;
cpu_set_t *cpu_subset;

/* sizes paired with the sets above */
size_t cpu_present_setsize;
size_t cpu_effective_setsize;
size_t cpu_allowed_setsize;
size_t cpu_affinity_setsize;
size_t cpu_subset_size;

#define MAX_ADDED_COUNTERS 8
#define MAX_ADDED_THREAD_COUNTERS 24
#define BITMASK_SIZE 32
985
/*
 * Indexes used to map data read from perf and MSRs into global variables.
 * Values are consecutive from 0; NUM_RAPL_COUNTERS is the number of
 * indexes and must stay last.
 */
enum rapl_rci_index {
	RAPL_RCI_INDEX_ENERGY_PKG = 0,
	RAPL_RCI_INDEX_ENERGY_CORES,
	RAPL_RCI_INDEX_DRAM,
	RAPL_RCI_INDEX_GFX,
	RAPL_RCI_INDEX_PKG_PERF_STATUS,
	RAPL_RCI_INDEX_DRAM_PERF_STATUS,
	RAPL_RCI_INDEX_CORE_ENERGY,
	NUM_RAPL_COUNTERS,
};
997
/* Unit tag carried alongside a RAPL counter value. */
enum rapl_unit {
	RAPL_UNIT_INVALID = 0,	/* also the value of zero-initialized storage */
	RAPL_UNIT_JOULES = 1,
	RAPL_UNIT_WATTS = 2,
};
1003
1004 struct rapl_counter_info_t {
1005         unsigned long long data[NUM_RAPL_COUNTERS];
1006         enum rapl_source source[NUM_RAPL_COUNTERS];
1007         unsigned long long flags[NUM_RAPL_COUNTERS];
1008         double scale[NUM_RAPL_COUNTERS];
1009         enum rapl_unit unit[NUM_RAPL_COUNTERS];
1010
1011         union {
1012                 /* Active when source == RAPL_SOURCE_MSR */
1013                 struct {
1014                         unsigned long long msr[NUM_RAPL_COUNTERS];
1015                         unsigned long long msr_mask[NUM_RAPL_COUNTERS];
1016                         int msr_shift[NUM_RAPL_COUNTERS];
1017                 };
1018         };
1019
1020         int fd_perf;
1021 };
1022
/* struct rapl_counter_info_t for each RAPL domain */
struct rapl_counter_info_t *rapl_counter_info_perdomain;

/* Flag bit 1, as used in the .flags members of the RAPL counter tables. */
#define RAPL_COUNTER_FLAG_USE_MSR_SUM (1u << 1)
1027
/* Static description of one RAPL counter; see rapl_counter_arch_infos[]. */
struct rapl_counter_arch_info {
	/* Mask for testing if the counter is supported on host */
	int feature_mask;
	const char *perf_subsys, *perf_name;
	unsigned long long msr, msr_mask;
	/* Positive means shift right, negative means shift left */
	int msr_shift;
	/* Scale applied to values read by MSR (platform dependent, filled at runtime) */
	double *platform_rapl_msr_scale;
	/* Maps data from perf counters to global variables */
	unsigned int rci_index;
	unsigned long long bic;
	/* Some counters require constant scaling to be in the same range as other, similar ones */
	double compat_scale;
	unsigned long long flags;
};
1041
1042 static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = {
1043         {
1044          .feature_mask = RAPL_PKG,
1045          .perf_subsys = "power",
1046          .perf_name = "energy-pkg",
1047          .msr = MSR_PKG_ENERGY_STATUS,
1048          .msr_mask = 0xFFFFFFFFFFFFFFFF,
1049          .msr_shift = 0,
1050          .platform_rapl_msr_scale = &rapl_energy_units,
1051          .rci_index = RAPL_RCI_INDEX_ENERGY_PKG,
1052          .bic = BIC_PkgWatt | BIC_Pkg_J,
1053          .compat_scale = 1.0,
1054          .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1055           },
1056         {
1057          .feature_mask = RAPL_AMD_F17H,
1058          .perf_subsys = "power",
1059          .perf_name = "energy-pkg",
1060          .msr = MSR_PKG_ENERGY_STAT,
1061          .msr_mask = 0xFFFFFFFFFFFFFFFF,
1062          .msr_shift = 0,
1063          .platform_rapl_msr_scale = &rapl_energy_units,
1064          .rci_index = RAPL_RCI_INDEX_ENERGY_PKG,
1065          .bic = BIC_PkgWatt | BIC_Pkg_J,
1066          .compat_scale = 1.0,
1067          .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1068           },
1069         {
1070          .feature_mask = RAPL_CORE_ENERGY_STATUS,
1071          .perf_subsys = "power",
1072          .perf_name = "energy-cores",
1073          .msr = MSR_PP0_ENERGY_STATUS,
1074          .msr_mask = 0xFFFFFFFFFFFFFFFF,
1075          .msr_shift = 0,
1076          .platform_rapl_msr_scale = &rapl_energy_units,
1077          .rci_index = RAPL_RCI_INDEX_ENERGY_CORES,
1078          .bic = BIC_CorWatt | BIC_Cor_J,
1079          .compat_scale = 1.0,
1080          .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1081           },
1082         {
1083          .feature_mask = RAPL_DRAM,
1084          .perf_subsys = "power",
1085          .perf_name = "energy-ram",
1086          .msr = MSR_DRAM_ENERGY_STATUS,
1087          .msr_mask = 0xFFFFFFFFFFFFFFFF,
1088          .msr_shift = 0,
1089          .platform_rapl_msr_scale = &rapl_dram_energy_units,
1090          .rci_index = RAPL_RCI_INDEX_DRAM,
1091          .bic = BIC_RAMWatt | BIC_RAM_J,
1092          .compat_scale = 1.0,
1093          .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1094           },
1095         {
1096          .feature_mask = RAPL_GFX,
1097          .perf_subsys = "power",
1098          .perf_name = "energy-gpu",
1099          .msr = MSR_PP1_ENERGY_STATUS,
1100          .msr_mask = 0xFFFFFFFFFFFFFFFF,
1101          .msr_shift = 0,
1102          .platform_rapl_msr_scale = &rapl_energy_units,
1103          .rci_index = RAPL_RCI_INDEX_GFX,
1104          .bic = BIC_GFXWatt | BIC_GFX_J,
1105          .compat_scale = 1.0,
1106          .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1107           },
1108         {
1109          .feature_mask = RAPL_PKG_PERF_STATUS,
1110          .perf_subsys = NULL,
1111          .perf_name = NULL,
1112          .msr = MSR_PKG_PERF_STATUS,
1113          .msr_mask = 0xFFFFFFFFFFFFFFFF,
1114          .msr_shift = 0,
1115          .platform_rapl_msr_scale = &rapl_time_units,
1116          .rci_index = RAPL_RCI_INDEX_PKG_PERF_STATUS,
1117          .bic = BIC_PKG__,
1118          .compat_scale = 100.0,
1119          .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1120           },
1121         {
1122          .feature_mask = RAPL_DRAM_PERF_STATUS,
1123          .perf_subsys = NULL,
1124          .perf_name = NULL,
1125          .msr = MSR_DRAM_PERF_STATUS,
1126          .msr_mask = 0xFFFFFFFFFFFFFFFF,
1127          .msr_shift = 0,
1128          .platform_rapl_msr_scale = &rapl_time_units,
1129          .rci_index = RAPL_RCI_INDEX_DRAM_PERF_STATUS,
1130          .bic = BIC_RAM__,
1131          .compat_scale = 100.0,
1132          .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
1133           },
1134         {
1135          .feature_mask = RAPL_AMD_F17H,
1136          .perf_subsys = NULL,
1137          .perf_name = NULL,
1138          .msr = MSR_CORE_ENERGY_STAT,
1139          .msr_mask = 0xFFFFFFFF,
1140          .msr_shift = 0,
1141          .platform_rapl_msr_scale = &rapl_energy_units,
1142          .rci_index = RAPL_RCI_INDEX_CORE_ENERGY,
1143          .bic = BIC_CorWatt | BIC_Cor_J,
1144          .compat_scale = 1.0,
1145          .flags = 0,
1146           },
1147 };
1148
1149 struct rapl_counter {
1150         unsigned long long raw_value;
1151         enum rapl_unit unit;
1152         double scale;
1153 };
1154
1155 struct thread_data {
1156         struct timeval tv_begin;
1157         struct timeval tv_end;
1158         struct timeval tv_delta;
1159         unsigned long long tsc;
1160         unsigned long long aperf;
1161         unsigned long long mperf;
1162         unsigned long long c1;
1163         unsigned long long instr_count;
1164         unsigned long long irq_count;
1165         unsigned int smi_count;
1166         unsigned int cpu_id;
1167         unsigned int apic_id;
1168         unsigned int x2apic_id;
1169         unsigned int flags;
1170         bool is_atom;
1171         unsigned long long counter[MAX_ADDED_THREAD_COUNTERS];
1172 } *thread_even, *thread_odd;
1173
1174 struct core_data {
1175         int base_cpu;
1176         unsigned long long c3;
1177         unsigned long long c6;
1178         unsigned long long c7;
1179         unsigned long long mc6_us;      /* duplicate as per-core for now, even though per module */
1180         unsigned int core_temp_c;
1181         struct rapl_counter core_energy;        /* MSR_CORE_ENERGY_STAT */
1182         unsigned int core_id;
1183         unsigned long long core_throt_cnt;
1184         unsigned long long counter[MAX_ADDED_COUNTERS];
1185 } *core_even, *core_odd;
1186
1187 struct pkg_data {
1188         int base_cpu;
1189         unsigned long long pc2;
1190         unsigned long long pc3;
1191         unsigned long long pc6;
1192         unsigned long long pc7;
1193         unsigned long long pc8;
1194         unsigned long long pc9;
1195         unsigned long long pc10;
1196         long long cpu_lpi;
1197         long long sys_lpi;
1198         unsigned long long pkg_wtd_core_c0;
1199         unsigned long long pkg_any_core_c0;
1200         unsigned long long pkg_any_gfxe_c0;
1201         unsigned long long pkg_both_core_gfxe_c0;
1202         long long gfx_rc6_ms;
1203         unsigned int gfx_mhz;
1204         unsigned int gfx_act_mhz;
1205         long long sam_mc6_ms;
1206         unsigned int sam_mhz;
1207         unsigned int sam_act_mhz;
1208         unsigned int package_id;
1209         struct rapl_counter energy_pkg; /* MSR_PKG_ENERGY_STATUS */
1210         struct rapl_counter energy_dram;        /* MSR_DRAM_ENERGY_STATUS */
1211         struct rapl_counter energy_cores;       /* MSR_PP0_ENERGY_STATUS */
1212         struct rapl_counter energy_gfx; /* MSR_PP1_ENERGY_STATUS */
1213         struct rapl_counter rapl_pkg_perf_status;       /* MSR_PKG_PERF_STATUS */
1214         struct rapl_counter rapl_dram_perf_status;      /* MSR_DRAM_PERF_STATUS */
1215         unsigned int pkg_temp_c;
1216         unsigned int uncore_mhz;
1217         unsigned long long counter[MAX_ADDED_COUNTERS];
1218 } *package_even, *package_odd;
1219
/* Argument triples (thread, core, package bases) for the odd and even counter sets. */
#define ODD_COUNTERS thread_odd, core_odd, package_odd
#define EVEN_COUNTERS thread_even, core_even, package_even

/*
 * Address of the thread_data element for (pkg_no, node_no, core_no, thread_no),
 * assuming thread_base is laid out package-major, then node, core, thread,
 * with the per-level counts taken from 'topo'.
 */
#define GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no)          \
	((thread_base) +                                                      \
	 ((pkg_no) *                                                          \
	  topo.nodes_per_pkg * topo.cores_per_node * topo.threads_per_core) + \
	 ((node_no) * topo.cores_per_node * topo.threads_per_core) +          \
	 ((core_no) * topo.threads_per_core) +                                \
	 (thread_no))

/* Address of the core_data element for (pkg_no, node_no, core_no). */
#define GET_CORE(core_base, core_no, node_no, pkg_no)                   \
	((core_base) +                                                  \
	 ((pkg_no) *  topo.nodes_per_pkg * topo.cores_per_node) +       \
	 ((node_no) * topo.cores_per_node) +                            \
	 (core_no))

/*
 * Address of the pkg_data element for pkg_no.  Both arguments are
 * parenthesized, like in GET_THREAD()/GET_CORE(), so that expression
 * arguments (e.g. "cond ? a : b") expand with the intended precedence.
 */
#define GET_PKG(pkg_base, pkg_no) ((pkg_base) + (pkg_no))
1238
/*
 * Slots of the per-CPU accumulated MSR sums (struct msr_sum_array).
 *
 * Each slot tracks one monotonically increasing MSR.  The value is
 * accumulated periodically, so the running sum is not limited by the
 * bit width of the hardware register.
 *
 * IDX_COUNT is the number of slots and must stay last.
 */
enum {
	IDX_PKG_ENERGY = 0,
	IDX_DRAM_ENERGY = 1,
	IDX_PP0_ENERGY = 2,
	IDX_PP1_ENERGY = 3,
	IDX_PKG_PERF = 4,
	IDX_DRAM_PERF = 5,
	IDX_COUNT = 6,
};
1253
1254 int get_msr_sum(int cpu, off_t offset, unsigned long long *msr);
1255
1256 struct msr_sum_array {
1257         /* get_msr_sum() = sum + (get_msr() - last) */
1258         struct {
1259                 /*The accumulated MSR value is updated by the timer */
1260                 unsigned long long sum;
1261                 /*The MSR footprint recorded in last timer */
1262                 unsigned long long last;
1263         } entries[IDX_COUNT];
1264 };
1265
1266 /* The percpu MSR sum array.*/
1267 struct msr_sum_array *per_cpu_msr_sum;
1268
1269 off_t idx_to_offset(int idx)
1270 {
1271         off_t offset;
1272
1273         switch (idx) {
1274         case IDX_PKG_ENERGY:
1275                 if (platform->rapl_msrs & RAPL_AMD_F17H)
1276                         offset = MSR_PKG_ENERGY_STAT;
1277                 else
1278                         offset = MSR_PKG_ENERGY_STATUS;
1279                 break;
1280         case IDX_DRAM_ENERGY:
1281                 offset = MSR_DRAM_ENERGY_STATUS;
1282                 break;
1283         case IDX_PP0_ENERGY:
1284                 offset = MSR_PP0_ENERGY_STATUS;
1285                 break;
1286         case IDX_PP1_ENERGY:
1287                 offset = MSR_PP1_ENERGY_STATUS;
1288                 break;
1289         case IDX_PKG_PERF:
1290                 offset = MSR_PKG_PERF_STATUS;
1291                 break;
1292         case IDX_DRAM_PERF:
1293                 offset = MSR_DRAM_PERF_STATUS;
1294                 break;
1295         default:
1296                 offset = -1;
1297         }
1298         return offset;
1299 }
1300
1301 int offset_to_idx(off_t offset)
1302 {
1303         int idx;
1304
1305         switch (offset) {
1306         case MSR_PKG_ENERGY_STATUS:
1307         case MSR_PKG_ENERGY_STAT:
1308                 idx = IDX_PKG_ENERGY;
1309                 break;
1310         case MSR_DRAM_ENERGY_STATUS:
1311                 idx = IDX_DRAM_ENERGY;
1312                 break;
1313         case MSR_PP0_ENERGY_STATUS:
1314                 idx = IDX_PP0_ENERGY;
1315                 break;
1316         case MSR_PP1_ENERGY_STATUS:
1317                 idx = IDX_PP1_ENERGY;
1318                 break;
1319         case MSR_PKG_PERF_STATUS:
1320                 idx = IDX_PKG_PERF;
1321                 break;
1322         case MSR_DRAM_PERF_STATUS:
1323                 idx = IDX_DRAM_PERF;
1324                 break;
1325         default:
1326                 idx = -1;
1327         }
1328         return idx;
1329 }
1330
1331 int idx_valid(int idx)
1332 {
1333         switch (idx) {
1334         case IDX_PKG_ENERGY:
1335                 return platform->rapl_msrs & (RAPL_PKG | RAPL_AMD_F17H);
1336         case IDX_DRAM_ENERGY:
1337                 return platform->rapl_msrs & RAPL_DRAM;
1338         case IDX_PP0_ENERGY:
1339                 return platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS;
1340         case IDX_PP1_ENERGY:
1341                 return platform->rapl_msrs & RAPL_GFX;
1342         case IDX_PKG_PERF:
1343                 return platform->rapl_msrs & RAPL_PKG_PERF_STATUS;
1344         case IDX_DRAM_PERF:
1345                 return platform->rapl_msrs & RAPL_DRAM_PERF_STATUS;
1346         default:
1347                 return 0;
1348         }
1349 }
1350
/*
 * Added counters: a count and a singly linked list head (linked through
 * msr_counter.next) for each of the thread, core and package scopes.
 */
struct sys_counters {
	unsigned int added_thread_counters;
	unsigned int added_core_counters;
	unsigned int added_package_counters;
	struct msr_counter *tp;		/* thread list */
	struct msr_counter *cp;		/* core list */
	struct msr_counter *pp;		/* package list */
};

struct sys_counters sys;
1359
1360 void free_sys_counters(void)
1361 {
1362         struct msr_counter *p = sys.tp, *pnext = NULL;
1363
1364         while (p) {
1365                 pnext = p->next;
1366                 free(p);
1367                 p = pnext;
1368         }
1369
1370         p = sys.cp, pnext = NULL;
1371         while (p) {
1372                 pnext = p->next;
1373                 free(p);
1374                 p = pnext;
1375         }
1376
1377         p = sys.pp, pnext = NULL;
1378         while (p) {
1379                 pnext = p->next;
1380                 free(p);
1381                 p = pnext;
1382         }
1383
1384         sys.added_thread_counters = 0;
1385         sys.added_core_counters = 0;
1386         sys.added_package_counters = 0;
1387         sys.tp = NULL;
1388         sys.cp = NULL;
1389         sys.pp = NULL;
1390 }
1391
1392 struct system_summary {
1393         struct thread_data threads;
1394         struct core_data cores;
1395         struct pkg_data packages;
1396 } average;
1397
/* Topology identifiers for one CPU; 'cpus' points at the array of them. */
struct cpu_topology {
	int physical_package_id;
	int die_id;
	int logical_cpu_id;
	int physical_node_id;
	/* 0-based count within the package */
	int logical_node_id;
	int physical_core_id;
	int thread_id;
	/* Processing Unit/Thread IDs */
	cpu_set_t *put_ids;
};

struct cpu_topology *cpus;
1408
/*
 * System-wide topology counts.  nodes_per_pkg, cores_per_node and
 * threads_per_core are the strides used by GET_THREAD()/GET_CORE()
 * and the loop bounds of for_all_cpus().
 */
struct topo_params {
	int num_packages;
	int num_die;
	int num_cpus;
	int num_cores;
	int allowed_packages;
	int allowed_cpus;
	int allowed_cores;
	int max_cpu_num;
	int max_node_num;
	int nodes_per_pkg;
	int cores_per_node;
	int threads_per_core;
};

struct topo_params topo;
1423
struct timeval tv_even;
struct timeval tv_odd;
struct timeval tv_delta;

int *irq_column_2_cpu;		/* /proc/interrupts column numbers */
int *irqs_per_cpu;		/* indexed by cpu_num */

void setup_all_buffers(bool startup);

/* System low-power-idle residency file in use, and the two candidate paths. */
char *sys_lpi_file;
char *sys_lpi_file_sysfs = "/sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us";
char *sys_lpi_file_debugfs = "/sys/kernel/debug/pmc_core/slp_s0_residency_usec";
1434
1435 int cpu_is_not_present(int cpu)
1436 {
1437         return !CPU_ISSET_S(cpu, cpu_present_setsize, cpu_present_set);
1438 }
1439
1440 int cpu_is_not_allowed(int cpu)
1441 {
1442         return !CPU_ISSET_S(cpu, cpu_allowed_setsize, cpu_allowed_set);
1443 }
1444
1445 /*
1446  * run func(thread, core, package) in topology order
1447  * skip non-present cpus
1448  */
1449
1450 int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pkg_data *),
1451                  struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base)
1452 {
1453         int retval, pkg_no, core_no, thread_no, node_no;
1454
1455         for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) {
1456                 for (node_no = 0; node_no < topo.nodes_per_pkg; node_no++) {
1457                         for (core_no = 0; core_no < topo.cores_per_node; ++core_no) {
1458                                 for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) {
1459                                         struct thread_data *t;
1460                                         struct core_data *c;
1461                                         struct pkg_data *p;
1462                                         t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no);
1463
1464                                         if (cpu_is_not_allowed(t->cpu_id))
1465                                                 continue;
1466
1467                                         c = GET_CORE(core_base, core_no, node_no, pkg_no);
1468                                         p = GET_PKG(pkg_base, pkg_no);
1469
1470                                         retval = func(t, c, p);
1471                                         if (retval)
1472                                                 return retval;
1473                                 }
1474                         }
1475                 }
1476         }
1477         return 0;
1478 }
1479
1480 int is_cpu_first_thread_in_core(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1481 {
1482         UNUSED(p);
1483
1484         return ((int)t->cpu_id == c->base_cpu || c->base_cpu < 0);
1485 }
1486
1487 int is_cpu_first_core_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1488 {
1489         UNUSED(c);
1490
1491         return ((int)t->cpu_id == p->base_cpu || p->base_cpu < 0);
1492 }
1493
/*
 * Return 1 when both is_cpu_first_thread_in_core() and
 * is_cpu_first_core_in_package() hold for (t, c, p), else 0.
 * The package check is only made when the core check passes.
 */
int is_cpu_first_thread_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	if (!is_cpu_first_thread_in_core(t, c, p))
		return 0;

	return is_cpu_first_core_in_package(t, c, p) != 0;
}
1498
1499 int cpu_migrate(int cpu)
1500 {
1501         CPU_ZERO_S(cpu_affinity_setsize, cpu_affinity_set);
1502         CPU_SET_S(cpu, cpu_affinity_setsize, cpu_affinity_set);
1503         if (sched_setaffinity(0, cpu_affinity_setsize, cpu_affinity_set) == -1)
1504                 return -1;
1505         else
1506                 return 0;
1507 }
1508
1509 int get_msr_fd(int cpu)
1510 {
1511         char pathname[32];
1512         int fd;
1513
1514         fd = fd_percpu[cpu];
1515
1516         if (fd)
1517                 return fd;
1518
1519         sprintf(pathname, "/dev/cpu/%d/msr", cpu);
1520         fd = open(pathname, O_RDONLY);
1521         if (fd < 0)
1522                 err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, "
1523                     "or run with --no-msr, or run as root", pathname);
1524
1525         fd_percpu[cpu] = fd;
1526
1527         return fd;
1528 }
1529
1530 static void bic_disable_msr_access(void)
1531 {
1532         const unsigned long bic_msrs =
1533             BIC_SMI |
1534             BIC_CPU_c1 |
1535             BIC_CPU_c3 |
1536             BIC_CPU_c6 |
1537             BIC_CPU_c7 |
1538             BIC_Mod_c6 |
1539             BIC_CoreTmp |
1540             BIC_Totl_c0 |
1541             BIC_Any_c0 |
1542             BIC_GFX_c0 |
1543             BIC_CPUGFX |
1544             BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_PkgTmp;
1545
1546         bic_enabled &= ~bic_msrs;
1547
1548         free_sys_counters();
1549 }
1550
1551 static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags)
1552 {
1553         assert(!no_perf);
1554
1555         return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
1556 }
1557
1558 static long open_perf_counter(int cpu, unsigned int type, unsigned int config, int group_fd, __u64 read_format)
1559 {
1560         struct perf_event_attr attr;
1561         const pid_t pid = -1;
1562         const unsigned long flags = 0;
1563
1564         assert(!no_perf);
1565
1566         memset(&attr, 0, sizeof(struct perf_event_attr));
1567
1568         attr.type = type;
1569         attr.size = sizeof(struct perf_event_attr);
1570         attr.config = config;
1571         attr.disabled = 0;
1572         attr.sample_type = PERF_SAMPLE_IDENTIFIER;
1573         attr.read_format = read_format;
1574
1575         const int fd = perf_event_open(&attr, pid, cpu, group_fd, flags);
1576
1577         return fd;
1578 }
1579
1580 int get_instr_count_fd(int cpu)
1581 {
1582         if (fd_instr_count_percpu[cpu])
1583                 return fd_instr_count_percpu[cpu];
1584
1585         fd_instr_count_percpu[cpu] = open_perf_counter(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0);
1586
1587         return fd_instr_count_percpu[cpu];
1588 }
1589
1590 int get_msr(int cpu, off_t offset, unsigned long long *msr)
1591 {
1592         ssize_t retval;
1593
1594         assert(!no_msr);
1595
1596         retval = pread(get_msr_fd(cpu), msr, sizeof(*msr), offset);
1597
1598         if (retval != sizeof *msr)
1599                 err(-1, "cpu%d: msr offset 0x%llx read failed", cpu, (unsigned long long)offset);
1600
1601         return 0;
1602 }
1603
1604 int probe_msr(int cpu, off_t offset)
1605 {
1606         ssize_t retval;
1607         unsigned long long dummy;
1608
1609         assert(!no_msr);
1610
1611         retval = pread(get_msr_fd(cpu), &dummy, sizeof(dummy), offset);
1612
1613         if (retval != sizeof(dummy))
1614                 return 1;
1615
1616         return 0;
1617 }
1618
/* Capacity of each deferred-name array below. */
#define MAX_DEFERRED 16

char *deferred_add_names[MAX_DEFERRED];
int deferred_add_index;

char *deferred_skip_names[MAX_DEFERRED];
int deferred_skip_index;

/*
 * HIDE_LIST - hide this list of counters, show the rest [default]
 * SHOW_LIST - show this list of counters, hide the rest
 */
enum show_hide_mode {
	SHOW_LIST,
	HIDE_LIST,
};

enum show_hide_mode global_show_hide_mode = HIDE_LIST;
1630
1631 void help(void)
1632 {
1633         fprintf(outf,
1634                 "Usage: turbostat [OPTIONS][(--interval seconds) | COMMAND ...]\n"
1635                 "\n"
1636                 "Turbostat forks the specified COMMAND and prints statistics\n"
1637                 "when COMMAND completes.\n"
1638                 "If no COMMAND is specified, turbostat wakes every 5-seconds\n"
1639                 "to print statistics, until interrupted.\n"
1640                 "  -a, --add    add a counter\n"
1641                 "                 eg. --add msr0x10,u64,cpu,delta,MY_TSC\n"
1642                 "  -c, --cpu    cpu-set limit output to summary plus cpu-set:\n"
1643                 "                 {core | package | j,k,l..m,n-p }\n"
1644                 "  -d, --debug  displays usec, Time_Of_Day_Seconds and more debugging\n"
1645                 "  -D, --Dump   displays the raw counter values\n"
1646                 "  -e, --enable [all | column]\n"
1647                 "               shows all or the specified disabled column\n"
1648                 "  -H, --hide [column|column,column,...]\n"
1649                 "               hide the specified column(s)\n"
1650                 "  -i, --interval sec.subsec\n"
1651                 "               Override default 5-second measurement interval\n"
1652                 "  -J, --Joules displays energy in Joules instead of Watts\n"
1653                 "  -l, --list   list column headers only\n"
1654                 "  -M, --no-msr Disable all uses of the MSR driver\n"
1655                 "  -P, --no-perf Disable all uses of the perf API\n"
1656                 "  -n, --num_iterations num\n"
1657                 "               number of the measurement iterations\n"
1658                 "  -N, --header_iterations num\n"
1659                 "               print header every num iterations\n"
1660                 "  -o, --out file\n"
1661                 "               create or truncate \"file\" for all output\n"
1662                 "  -q, --quiet  skip decoding system configuration header\n"
1663                 "  -s, --show [column|column,column,...]\n"
1664                 "               show only the specified column(s)\n"
1665                 "  -S, --Summary\n"
1666                 "               limits output to 1-line system summary per interval\n"
1667                 "  -T, --TCC temperature\n"
1668                 "               sets the Thermal Control Circuit temperature in\n"
1669                 "                 degrees Celsius\n"
1670                 "  -h, --help   print this help message\n"
1671                 "  -v, --version        print version information\n" "\n" "For more help, run \"man turbostat\"\n");
1672 }
1673
1674 /*
1675  * bic_lookup
1676  * for all the strings in comma separate name_list,
1677  * set the approprate bit in return value.
1678  */
1679 unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode)
1680 {
1681         unsigned int i;
1682         unsigned long long retval = 0;
1683
1684         while (name_list) {
1685                 char *comma;
1686
1687                 comma = strchr(name_list, ',');
1688
1689                 if (comma)
1690                         *comma = '\0';
1691
1692                 for (i = 0; i < MAX_BIC; ++i) {
1693                         if (!strcmp(name_list, bic[i].name)) {
1694                                 retval |= (1ULL << i);
1695                                 break;
1696                         }
1697                         if (!strcmp(name_list, "all")) {
1698                                 retval |= ~0;
1699                                 break;
1700                         } else if (!strcmp(name_list, "topology")) {
1701                                 retval |= BIC_TOPOLOGY;
1702                                 break;
1703                         } else if (!strcmp(name_list, "power")) {
1704                                 retval |= BIC_THERMAL_PWR;
1705                                 break;
1706                         } else if (!strcmp(name_list, "idle")) {
1707                                 retval |= BIC_IDLE;
1708                                 break;
1709                         } else if (!strcmp(name_list, "frequency")) {
1710                                 retval |= BIC_FREQUENCY;
1711                                 break;
1712                         } else if (!strcmp(name_list, "other")) {
1713                                 retval |= BIC_OTHER;
1714                                 break;
1715                         }
1716
1717                 }
1718                 if (i == MAX_BIC) {
1719                         if (mode == SHOW_LIST) {
1720                                 deferred_add_names[deferred_add_index++] = name_list;
1721                                 if (deferred_add_index >= MAX_DEFERRED) {
1722                                         fprintf(stderr, "More than max %d un-recognized --add options '%s'\n",
1723                                                 MAX_DEFERRED, name_list);
1724                                         help();
1725                                         exit(1);
1726                                 }
1727                         } else {
1728                                 deferred_skip_names[deferred_skip_index++] = name_list;
1729                                 if (debug)
1730                                         fprintf(stderr, "deferred \"%s\"\n", name_list);
1731                                 if (deferred_skip_index >= MAX_DEFERRED) {
1732                                         fprintf(stderr, "More than max %d un-recognized --skip options '%s'\n",
1733                                                 MAX_DEFERRED, name_list);
1734                                         help();
1735                                         exit(1);
1736                                 }
1737                         }
1738                 }
1739
1740                 name_list = comma;
1741                 if (name_list)
1742                         name_list++;
1743
1744         }
1745         return retval;
1746 }
1747
1748 void print_header(char *delim)
1749 {
1750         struct msr_counter *mp;
1751         int printed = 0;
1752
1753         if (DO_BIC(BIC_USEC))
1754                 outp += sprintf(outp, "%susec", (printed++ ? delim : ""));
1755         if (DO_BIC(BIC_TOD))
1756                 outp += sprintf(outp, "%sTime_Of_Day_Seconds", (printed++ ? delim : ""));
1757         if (DO_BIC(BIC_Package))
1758                 outp += sprintf(outp, "%sPackage", (printed++ ? delim : ""));
1759         if (DO_BIC(BIC_Die))
1760                 outp += sprintf(outp, "%sDie", (printed++ ? delim : ""));
1761         if (DO_BIC(BIC_Node))
1762                 outp += sprintf(outp, "%sNode", (printed++ ? delim : ""));
1763         if (DO_BIC(BIC_Core))
1764                 outp += sprintf(outp, "%sCore", (printed++ ? delim : ""));
1765         if (DO_BIC(BIC_CPU))
1766                 outp += sprintf(outp, "%sCPU", (printed++ ? delim : ""));
1767         if (DO_BIC(BIC_APIC))
1768                 outp += sprintf(outp, "%sAPIC", (printed++ ? delim : ""));
1769         if (DO_BIC(BIC_X2APIC))
1770                 outp += sprintf(outp, "%sX2APIC", (printed++ ? delim : ""));
1771         if (DO_BIC(BIC_Avg_MHz))
1772                 outp += sprintf(outp, "%sAvg_MHz", (printed++ ? delim : ""));
1773         if (DO_BIC(BIC_Busy))
1774                 outp += sprintf(outp, "%sBusy%%", (printed++ ? delim : ""));
1775         if (DO_BIC(BIC_Bzy_MHz))
1776                 outp += sprintf(outp, "%sBzy_MHz", (printed++ ? delim : ""));
1777         if (DO_BIC(BIC_TSC_MHz))
1778                 outp += sprintf(outp, "%sTSC_MHz", (printed++ ? delim : ""));
1779
1780         if (DO_BIC(BIC_IPC))
1781                 outp += sprintf(outp, "%sIPC", (printed++ ? delim : ""));
1782
1783         if (DO_BIC(BIC_IRQ)) {
1784                 if (sums_need_wide_columns)
1785                         outp += sprintf(outp, "%s     IRQ", (printed++ ? delim : ""));
1786                 else
1787                         outp += sprintf(outp, "%sIRQ", (printed++ ? delim : ""));
1788         }
1789
1790         if (DO_BIC(BIC_SMI))
1791                 outp += sprintf(outp, "%sSMI", (printed++ ? delim : ""));
1792
1793         for (mp = sys.tp; mp; mp = mp->next) {
1794
1795                 if (mp->format == FORMAT_RAW) {
1796                         if (mp->width == 64)
1797                                 outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), mp->name);
1798                         else
1799                                 outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), mp->name);
1800                 } else {
1801                         if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
1802                                 outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), mp->name);
1803                         else
1804                                 outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), mp->name);
1805                 }
1806         }
1807
1808         if (DO_BIC(BIC_CPU_c1))
1809                 outp += sprintf(outp, "%sCPU%%c1", (printed++ ? delim : ""));
1810         if (DO_BIC(BIC_CPU_c3))
1811                 outp += sprintf(outp, "%sCPU%%c3", (printed++ ? delim : ""));
1812         if (DO_BIC(BIC_CPU_c6))
1813                 outp += sprintf(outp, "%sCPU%%c6", (printed++ ? delim : ""));
1814         if (DO_BIC(BIC_CPU_c7))
1815                 outp += sprintf(outp, "%sCPU%%c7", (printed++ ? delim : ""));
1816
1817         if (DO_BIC(BIC_Mod_c6))
1818                 outp += sprintf(outp, "%sMod%%c6", (printed++ ? delim : ""));
1819
1820         if (DO_BIC(BIC_CoreTmp))
1821                 outp += sprintf(outp, "%sCoreTmp", (printed++ ? delim : ""));
1822
1823         if (DO_BIC(BIC_CORE_THROT_CNT))
1824                 outp += sprintf(outp, "%sCoreThr", (printed++ ? delim : ""));
1825
1826         if (platform->rapl_msrs && !rapl_joules) {
1827                 if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl)
1828                         outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : ""));
1829         } else if (platform->rapl_msrs && rapl_joules) {
1830                 if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl)
1831                         outp += sprintf(outp, "%sCor_J", (printed++ ? delim : ""));
1832         }
1833
1834         for (mp = sys.cp; mp; mp = mp->next) {
1835                 if (mp->format == FORMAT_RAW) {
1836                         if (mp->width == 64)
1837                                 outp += sprintf(outp, "%s%18.18s", delim, mp->name);
1838                         else
1839                                 outp += sprintf(outp, "%s%10.10s", delim, mp->name);
1840                 } else {
1841                         if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
1842                                 outp += sprintf(outp, "%s%8s", delim, mp->name);
1843                         else
1844                                 outp += sprintf(outp, "%s%s", delim, mp->name);
1845                 }
1846         }
1847
1848         if (DO_BIC(BIC_PkgTmp))
1849                 outp += sprintf(outp, "%sPkgTmp", (printed++ ? delim : ""));
1850
1851         if (DO_BIC(BIC_GFX_rc6))
1852                 outp += sprintf(outp, "%sGFX%%rc6", (printed++ ? delim : ""));
1853
1854         if (DO_BIC(BIC_GFXMHz))
1855                 outp += sprintf(outp, "%sGFXMHz", (printed++ ? delim : ""));
1856
1857         if (DO_BIC(BIC_GFXACTMHz))
1858                 outp += sprintf(outp, "%sGFXAMHz", (printed++ ? delim : ""));
1859
1860         if (DO_BIC(BIC_SAM_mc6))
1861                 outp += sprintf(outp, "%sSAM%%mc6", (printed++ ? delim : ""));
1862
1863         if (DO_BIC(BIC_SAMMHz))
1864                 outp += sprintf(outp, "%sSAMMHz", (printed++ ? delim : ""));
1865
1866         if (DO_BIC(BIC_SAMACTMHz))
1867                 outp += sprintf(outp, "%sSAMAMHz", (printed++ ? delim : ""));
1868
1869         if (DO_BIC(BIC_Totl_c0))
1870                 outp += sprintf(outp, "%sTotl%%C0", (printed++ ? delim : ""));
1871         if (DO_BIC(BIC_Any_c0))
1872                 outp += sprintf(outp, "%sAny%%C0", (printed++ ? delim : ""));
1873         if (DO_BIC(BIC_GFX_c0))
1874                 outp += sprintf(outp, "%sGFX%%C0", (printed++ ? delim : ""));
1875         if (DO_BIC(BIC_CPUGFX))
1876                 outp += sprintf(outp, "%sCPUGFX%%", (printed++ ? delim : ""));
1877
1878         if (DO_BIC(BIC_Pkgpc2))
1879                 outp += sprintf(outp, "%sPkg%%pc2", (printed++ ? delim : ""));
1880         if (DO_BIC(BIC_Pkgpc3))
1881                 outp += sprintf(outp, "%sPkg%%pc3", (printed++ ? delim : ""));
1882         if (DO_BIC(BIC_Pkgpc6))
1883                 outp += sprintf(outp, "%sPkg%%pc6", (printed++ ? delim : ""));
1884         if (DO_BIC(BIC_Pkgpc7))
1885                 outp += sprintf(outp, "%sPkg%%pc7", (printed++ ? delim : ""));
1886         if (DO_BIC(BIC_Pkgpc8))
1887                 outp += sprintf(outp, "%sPkg%%pc8", (printed++ ? delim : ""));
1888         if (DO_BIC(BIC_Pkgpc9))
1889                 outp += sprintf(outp, "%sPkg%%pc9", (printed++ ? delim : ""));
1890         if (DO_BIC(BIC_Pkgpc10))
1891                 outp += sprintf(outp, "%sPk%%pc10", (printed++ ? delim : ""));
1892         if (DO_BIC(BIC_CPU_LPI))
1893                 outp += sprintf(outp, "%sCPU%%LPI", (printed++ ? delim : ""));
1894         if (DO_BIC(BIC_SYS_LPI))
1895                 outp += sprintf(outp, "%sSYS%%LPI", (printed++ ? delim : ""));
1896
1897         if (platform->rapl_msrs && !rapl_joules) {
1898                 if (DO_BIC(BIC_PkgWatt))
1899                         outp += sprintf(outp, "%sPkgWatt", (printed++ ? delim : ""));
1900                 if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl)
1901                         outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : ""));
1902                 if (DO_BIC(BIC_GFXWatt))
1903                         outp += sprintf(outp, "%sGFXWatt", (printed++ ? delim : ""));
1904                 if (DO_BIC(BIC_RAMWatt))
1905                         outp += sprintf(outp, "%sRAMWatt", (printed++ ? delim : ""));
1906                 if (DO_BIC(BIC_PKG__))
1907                         outp += sprintf(outp, "%sPKG_%%", (printed++ ? delim : ""));
1908                 if (DO_BIC(BIC_RAM__))
1909                         outp += sprintf(outp, "%sRAM_%%", (printed++ ? delim : ""));
1910         } else if (platform->rapl_msrs && rapl_joules) {
1911                 if (DO_BIC(BIC_Pkg_J))
1912                         outp += sprintf(outp, "%sPkg_J", (printed++ ? delim : ""));
1913                 if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl)
1914                         outp += sprintf(outp, "%sCor_J", (printed++ ? delim : ""));
1915                 if (DO_BIC(BIC_GFX_J))
1916                         outp += sprintf(outp, "%sGFX_J", (printed++ ? delim : ""));
1917                 if (DO_BIC(BIC_RAM_J))
1918                         outp += sprintf(outp, "%sRAM_J", (printed++ ? delim : ""));
1919                 if (DO_BIC(BIC_PKG__))
1920                         outp += sprintf(outp, "%sPKG_%%", (printed++ ? delim : ""));
1921                 if (DO_BIC(BIC_RAM__))
1922                         outp += sprintf(outp, "%sRAM_%%", (printed++ ? delim : ""));
1923         }
1924         if (DO_BIC(BIC_UNCORE_MHZ))
1925                 outp += sprintf(outp, "%sUncMHz", (printed++ ? delim : ""));
1926
1927         for (mp = sys.pp; mp; mp = mp->next) {
1928                 if (mp->format == FORMAT_RAW) {
1929                         if (mp->width == 64)
1930                                 outp += sprintf(outp, "%s%18.18s", delim, mp->name);
1931                         else
1932                                 outp += sprintf(outp, "%s%10.10s", delim, mp->name);
1933                 } else {
1934                         if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
1935                                 outp += sprintf(outp, "%s%8s", delim, mp->name);
1936                         else
1937                                 outp += sprintf(outp, "%s%s", delim, mp->name);
1938                 }
1939         }
1940
1941         outp += sprintf(outp, "\n");
1942 }
1943
1944 int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1945 {
1946         int i;
1947         struct msr_counter *mp;
1948
1949         outp += sprintf(outp, "t %p, c %p, p %p\n", t, c, p);
1950
1951         if (t) {
1952                 outp += sprintf(outp, "CPU: %d flags 0x%x\n", t->cpu_id, t->flags);
1953                 outp += sprintf(outp, "TSC: %016llX\n", t->tsc);
1954                 outp += sprintf(outp, "aperf: %016llX\n", t->aperf);
1955                 outp += sprintf(outp, "mperf: %016llX\n", t->mperf);
1956                 outp += sprintf(outp, "c1: %016llX\n", t->c1);
1957
1958                 if (DO_BIC(BIC_IPC))
1959                         outp += sprintf(outp, "IPC: %lld\n", t->instr_count);
1960
1961                 if (DO_BIC(BIC_IRQ))
1962                         outp += sprintf(outp, "IRQ: %lld\n", t->irq_count);
1963                 if (DO_BIC(BIC_SMI))
1964                         outp += sprintf(outp, "SMI: %d\n", t->smi_count);
1965
1966                 for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
1967                         outp +=
1968                             sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num,
1969                                     t->counter[i], mp->path);
1970                 }
1971         }
1972
1973         if (c && is_cpu_first_thread_in_core(t, c, p)) {
1974                 outp += sprintf(outp, "core: %d\n", c->core_id);
1975                 outp += sprintf(outp, "c3: %016llX\n", c->c3);
1976                 outp += sprintf(outp, "c6: %016llX\n", c->c6);
1977                 outp += sprintf(outp, "c7: %016llX\n", c->c7);
1978                 outp += sprintf(outp, "DTS: %dC\n", c->core_temp_c);
1979                 outp += sprintf(outp, "cpu_throt_count: %016llX\n", c->core_throt_cnt);
1980
1981                 const unsigned long long energy_value = c->core_energy.raw_value * c->core_energy.scale;
1982                 const double energy_scale = c->core_energy.scale;
1983
1984                 if (c->core_energy.unit == RAPL_UNIT_JOULES)
1985                         outp += sprintf(outp, "Joules: %0llX (scale: %lf)\n", energy_value, energy_scale);
1986
1987                 for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
1988                         outp +=
1989                             sprintf(outp, "cADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num,
1990                                     c->counter[i], mp->path);
1991                 }
1992                 outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us);
1993         }
1994
1995         if (p && is_cpu_first_core_in_package(t, c, p)) {
1996                 outp += sprintf(outp, "package: %d\n", p->package_id);
1997
1998                 outp += sprintf(outp, "Weighted cores: %016llX\n", p->pkg_wtd_core_c0);
1999                 outp += sprintf(outp, "Any cores: %016llX\n", p->pkg_any_core_c0);
2000                 outp += sprintf(outp, "Any GFX: %016llX\n", p->pkg_any_gfxe_c0);
2001                 outp += sprintf(outp, "CPU + GFX: %016llX\n", p->pkg_both_core_gfxe_c0);
2002
2003                 outp += sprintf(outp, "pc2: %016llX\n", p->pc2);
2004                 if (DO_BIC(BIC_Pkgpc3))
2005                         outp += sprintf(outp, "pc3: %016llX\n", p->pc3);
2006                 if (DO_BIC(BIC_Pkgpc6))
2007                         outp += sprintf(outp, "pc6: %016llX\n", p->pc6);
2008                 if (DO_BIC(BIC_Pkgpc7))
2009                         outp += sprintf(outp, "pc7: %016llX\n", p->pc7);
2010                 outp += sprintf(outp, "pc8: %016llX\n", p->pc8);
2011                 outp += sprintf(outp, "pc9: %016llX\n", p->pc9);
2012                 outp += sprintf(outp, "pc10: %016llX\n", p->pc10);
2013                 outp += sprintf(outp, "cpu_lpi: %016llX\n", p->cpu_lpi);
2014                 outp += sprintf(outp, "sys_lpi: %016llX\n", p->sys_lpi);
2015                 outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg.raw_value);
2016                 outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores.raw_value);
2017                 outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx.raw_value);
2018                 outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram.raw_value);
2019                 outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status.raw_value);
2020                 outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status.raw_value);
2021                 outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c);
2022
2023                 for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
2024                         outp +=
2025                             sprintf(outp, "pADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num,
2026                                     p->counter[i], mp->path);
2027                 }
2028         }
2029
2030         outp += sprintf(outp, "\n");
2031
2032         return 0;
2033 }
2034
2035 double rapl_counter_get_value(const struct rapl_counter *c, enum rapl_unit desired_unit, double interval)
2036 {
2037         assert(desired_unit != RAPL_UNIT_INVALID);
2038
2039         /*
2040          * For now we don't expect anything other than joules,
2041          * so just simplify the logic.
2042          */
2043         assert(c->unit == RAPL_UNIT_JOULES);
2044
2045         const double scaled = c->raw_value * c->scale;
2046
2047         if (desired_unit == RAPL_UNIT_WATTS)
2048                 return scaled / interval;
2049         return scaled;
2050 }
2051
2052 /*
2053  * column formatting convention & formats
2054  */
2055 int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
2056 {
2057         double interval_float, tsc;
2058         char *fmt8;
2059         int i;
2060         struct msr_counter *mp;
2061         char *delim = "\t";
2062         int printed = 0;
2063
2064         /* if showing only 1st thread in core and this isn't one, bail out */
2065         if (show_core_only && !is_cpu_first_thread_in_core(t, c, p))
2066                 return 0;
2067
2068         /* if showing only 1st thread in pkg and this isn't one, bail out */
2069         if (show_pkg_only && !is_cpu_first_core_in_package(t, c, p))
2070                 return 0;
2071
2072         /*if not summary line and --cpu is used */
2073         if ((t != &average.threads) && (cpu_subset && !CPU_ISSET_S(t->cpu_id, cpu_subset_size, cpu_subset)))
2074                 return 0;
2075
2076         if (DO_BIC(BIC_USEC)) {
2077                 /* on each row, print how many usec each timestamp took to gather */
2078                 struct timeval tv;
2079
2080                 timersub(&t->tv_end, &t->tv_begin, &tv);
2081                 outp += sprintf(outp, "%5ld\t", tv.tv_sec * 1000000 + tv.tv_usec);
2082         }
2083
2084         /* Time_Of_Day_Seconds: on each row, print sec.usec last timestamp taken */
2085         if (DO_BIC(BIC_TOD))
2086                 outp += sprintf(outp, "%10ld.%06ld\t", t->tv_end.tv_sec, t->tv_end.tv_usec);
2087
2088         interval_float = t->tv_delta.tv_sec + t->tv_delta.tv_usec / 1000000.0;
2089
2090         tsc = t->tsc * tsc_tweak;
2091
2092         /* topo columns, print blanks on 1st (average) line */
2093         if (t == &average.threads) {
2094                 if (DO_BIC(BIC_Package))
2095                         outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2096                 if (DO_BIC(BIC_Die))
2097                         outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2098                 if (DO_BIC(BIC_Node))
2099                         outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2100                 if (DO_BIC(BIC_Core))
2101                         outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2102                 if (DO_BIC(BIC_CPU))
2103                         outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2104                 if (DO_BIC(BIC_APIC))
2105                         outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2106                 if (DO_BIC(BIC_X2APIC))
2107                         outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2108         } else {
2109                 if (DO_BIC(BIC_Package)) {
2110                         if (p)
2111                                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->package_id);
2112                         else
2113                                 outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2114                 }
2115                 if (DO_BIC(BIC_Die)) {
2116                         if (c)
2117                                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), cpus[t->cpu_id].die_id);
2118                         else
2119                                 outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2120                 }
2121                 if (DO_BIC(BIC_Node)) {
2122                         if (t)
2123                                 outp += sprintf(outp, "%s%d",
2124                                                 (printed++ ? delim : ""), cpus[t->cpu_id].physical_node_id);
2125                         else
2126                                 outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2127                 }
2128                 if (DO_BIC(BIC_Core)) {
2129                         if (c)
2130                                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), c->core_id);
2131                         else
2132                                 outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2133                 }
2134                 if (DO_BIC(BIC_CPU))
2135                         outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->cpu_id);
2136                 if (DO_BIC(BIC_APIC))
2137                         outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->apic_id);
2138                 if (DO_BIC(BIC_X2APIC))
2139                         outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->x2apic_id);
2140         }
2141
2142         if (DO_BIC(BIC_Avg_MHz))
2143                 outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 / units * t->aperf / interval_float);
2144
2145         if (DO_BIC(BIC_Busy))
2146                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->mperf / tsc);
2147
2148         if (DO_BIC(BIC_Bzy_MHz)) {
2149                 if (has_base_hz)
2150                         outp +=
2151                             sprintf(outp, "%s%.0f", (printed++ ? delim : ""), base_hz / units * t->aperf / t->mperf);
2152                 else
2153                         outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""),
2154                                         tsc / units * t->aperf / t->mperf / interval_float);
2155         }
2156
2157         if (DO_BIC(BIC_TSC_MHz))
2158                 outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 * t->tsc / units / interval_float);
2159
2160         if (DO_BIC(BIC_IPC))
2161                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 1.0 * t->instr_count / t->aperf);
2162
2163         /* IRQ */
2164         if (DO_BIC(BIC_IRQ)) {
2165                 if (sums_need_wide_columns)
2166                         outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->irq_count);
2167                 else
2168                         outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->irq_count);
2169         }
2170
2171         /* SMI */
2172         if (DO_BIC(BIC_SMI))
2173                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->smi_count);
2174
2175         /* Added counters */
2176         for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
2177                 if (mp->format == FORMAT_RAW) {
2178                         if (mp->width == 32)
2179                                 outp +=
2180                                     sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)t->counter[i]);
2181                         else
2182                                 outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->counter[i]);
2183                 } else if (mp->format == FORMAT_DELTA) {
2184                         if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2185                                 outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->counter[i]);
2186                         else
2187                                 outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->counter[i]);
2188                 } else if (mp->format == FORMAT_PERCENT) {
2189                         if (mp->type == COUNTER_USEC)
2190                                 outp +=
2191                                     sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
2192                                             t->counter[i] / interval_float / 10000);
2193                         else
2194                                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->counter[i] / tsc);
2195                 }
2196         }
2197
2198         /* C1 */
2199         if (DO_BIC(BIC_CPU_c1))
2200                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc);
2201
2202         /* print per-core data only for 1st thread in core */
2203         if (!is_cpu_first_thread_in_core(t, c, p))
2204                 goto done;
2205
2206         if (DO_BIC(BIC_CPU_c3))
2207                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c3 / tsc);
2208         if (DO_BIC(BIC_CPU_c6))
2209                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c6 / tsc);
2210         if (DO_BIC(BIC_CPU_c7))
2211                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c7 / tsc);
2212
2213         /* Mod%c6 */
2214         if (DO_BIC(BIC_Mod_c6))
2215                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->mc6_us / tsc);
2216
2217         if (DO_BIC(BIC_CoreTmp))
2218                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), c->core_temp_c);
2219
2220         /* Core throttle count */
2221         if (DO_BIC(BIC_CORE_THROT_CNT))
2222                 outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->core_throt_cnt);
2223
2224         for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
2225                 if (mp->format == FORMAT_RAW) {
2226                         if (mp->width == 32)
2227                                 outp +=
2228                                     sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)c->counter[i]);
2229                         else
2230                                 outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->counter[i]);
2231                 } else if (mp->format == FORMAT_DELTA) {
2232                         if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2233                                 outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), c->counter[i]);
2234                         else
2235                                 outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->counter[i]);
2236                 } else if (mp->format == FORMAT_PERCENT) {
2237                         outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->counter[i] / tsc);
2238                 }
2239         }
2240
2241         fmt8 = "%s%.2f";
2242
2243         if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl)
2244                 outp +=
2245                     sprintf(outp, fmt8, (printed++ ? delim : ""),
2246                             rapl_counter_get_value(&c->core_energy, RAPL_UNIT_WATTS, interval_float));
2247         if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl)
2248                 outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
2249                                 rapl_counter_get_value(&c->core_energy, RAPL_UNIT_JOULES, interval_float));
2250
2251         /* print per-package data only for 1st core in package */
2252         if (!is_cpu_first_core_in_package(t, c, p))
2253                 goto done;
2254
2255         /* PkgTmp */
2256         if (DO_BIC(BIC_PkgTmp))
2257                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->pkg_temp_c);
2258
2259         /* GFXrc6 */
2260         if (DO_BIC(BIC_GFX_rc6)) {
2261                 if (p->gfx_rc6_ms == -1) {      /* detect GFX counter reset */
2262                         outp += sprintf(outp, "%s**.**", (printed++ ? delim : ""));
2263                 } else {
2264                         outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
2265                                         p->gfx_rc6_ms / 10.0 / interval_float);
2266                 }
2267         }
2268
2269         /* GFXMHz */
2270         if (DO_BIC(BIC_GFXMHz))
2271                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_mhz);
2272
2273         /* GFXACTMHz */
2274         if (DO_BIC(BIC_GFXACTMHz))
2275                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_act_mhz);
2276
2277         /* SAMmc6 */
2278         if (DO_BIC(BIC_SAM_mc6)) {
2279                 if (p->sam_mc6_ms == -1) {      /* detect GFX counter reset */
2280                         outp += sprintf(outp, "%s**.**", (printed++ ? delim : ""));
2281                 } else {
2282                         outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
2283                                         p->sam_mc6_ms / 10.0 / interval_float);
2284                 }
2285         }
2286
2287         /* SAMMHz */
2288         if (DO_BIC(BIC_SAMMHz))
2289                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_mhz);
2290
2291         /* SAMACTMHz */
2292         if (DO_BIC(BIC_SAMACTMHz))
2293                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_act_mhz);
2294
2295         /* Totl%C0, Any%C0 GFX%C0 CPUGFX% */
2296         if (DO_BIC(BIC_Totl_c0))
2297                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0 / tsc);
2298         if (DO_BIC(BIC_Any_c0))
2299                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_core_c0 / tsc);
2300         if (DO_BIC(BIC_GFX_c0))
2301                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_gfxe_c0 / tsc);
2302         if (DO_BIC(BIC_CPUGFX))
2303                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_both_core_gfxe_c0 / tsc);
2304
2305         if (DO_BIC(BIC_Pkgpc2))
2306                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc2 / tsc);
2307         if (DO_BIC(BIC_Pkgpc3))
2308                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc3 / tsc);
2309         if (DO_BIC(BIC_Pkgpc6))
2310                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc6 / tsc);
2311         if (DO_BIC(BIC_Pkgpc7))
2312                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc7 / tsc);
2313         if (DO_BIC(BIC_Pkgpc8))
2314                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc8 / tsc);
2315         if (DO_BIC(BIC_Pkgpc9))
2316                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc9 / tsc);
2317         if (DO_BIC(BIC_Pkgpc10))
2318                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc);
2319
2320         if (DO_BIC(BIC_CPU_LPI)) {
2321                 if (p->cpu_lpi >= 0)
2322                         outp +=
2323                             sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
2324                                     100.0 * p->cpu_lpi / 1000000.0 / interval_float);
2325                 else
2326                         outp += sprintf(outp, "%s(neg)", (printed++ ? delim : ""));
2327         }
2328         if (DO_BIC(BIC_SYS_LPI)) {
2329                 if (p->sys_lpi >= 0)
2330                         outp +=
2331                             sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
2332                                     100.0 * p->sys_lpi / 1000000.0 / interval_float);
2333                 else
2334                         outp += sprintf(outp, "%s(neg)", (printed++ ? delim : ""));
2335         }
2336
2337         if (DO_BIC(BIC_PkgWatt))
2338                 outp +=
2339                     sprintf(outp, fmt8, (printed++ ? delim : ""),
2340                             rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_WATTS, interval_float));
2341         if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl)
2342                 outp +=
2343                     sprintf(outp, fmt8, (printed++ ? delim : ""),
2344                             rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_WATTS, interval_float));
2345         if (DO_BIC(BIC_GFXWatt))
2346                 outp +=
2347                     sprintf(outp, fmt8, (printed++ ? delim : ""),
2348                             rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_WATTS, interval_float));
2349         if (DO_BIC(BIC_RAMWatt))
2350                 outp +=
2351                     sprintf(outp, fmt8, (printed++ ? delim : ""),
2352                             rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_WATTS, interval_float));
2353         if (DO_BIC(BIC_Pkg_J))
2354                 outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
2355                                 rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_JOULES, interval_float));
2356         if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl)
2357                 outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
2358                                 rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_JOULES, interval_float));
2359         if (DO_BIC(BIC_GFX_J))
2360                 outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
2361                                 rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_JOULES, interval_float));
2362         if (DO_BIC(BIC_RAM_J))
2363                 outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
2364                                 rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_JOULES, interval_float));
2365         if (DO_BIC(BIC_PKG__))
2366                 outp +=
2367                     sprintf(outp, fmt8, (printed++ ? delim : ""),
2368                             rapl_counter_get_value(&p->rapl_pkg_perf_status, RAPL_UNIT_WATTS, interval_float));
2369         if (DO_BIC(BIC_RAM__))
2370                 outp +=
2371                     sprintf(outp, fmt8, (printed++ ? delim : ""),
2372                             rapl_counter_get_value(&p->rapl_dram_perf_status, RAPL_UNIT_WATTS, interval_float));
2373         /* UncMHz */
2374         if (DO_BIC(BIC_UNCORE_MHZ))
2375                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->uncore_mhz);
2376
2377         for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
2378                 if (mp->format == FORMAT_RAW) {
2379                         if (mp->width == 32)
2380                                 outp +=
2381                                     sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)p->counter[i]);
2382                         else
2383                                 outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->counter[i]);
2384                 } else if (mp->format == FORMAT_DELTA) {
2385                         if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2386                                 outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), p->counter[i]);
2387                         else
2388                                 outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->counter[i]);
2389                 } else if (mp->format == FORMAT_PERCENT) {
2390                         outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->counter[i] / tsc);
2391                 }
2392         }
2393
2394 done:
2395         if (*(outp - 1) != '\n')
2396                 outp += sprintf(outp, "\n");
2397
2398         return 0;
2399 }
2400
2401 void flush_output_stdout(void)
2402 {
2403         FILE *filep;
2404
2405         if (outf == stderr)
2406                 filep = stdout;
2407         else
2408                 filep = outf;
2409
2410         fputs(output_buffer, filep);
2411         fflush(filep);
2412
2413         outp = output_buffer;
2414 }
2415
2416 void flush_output_stderr(void)
2417 {
2418         fputs(output_buffer, outf);
2419         fflush(outf);
2420         outp = output_buffer;
2421 }
2422
2423 void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
2424 {
2425         static int count;
2426
2427         if ((!count || (header_iterations && !(count % header_iterations))) || !summary_only)
2428                 print_header("\t");
2429
2430         format_counters(&average.threads, &average.cores, &average.packages);
2431
2432         count++;
2433
2434         if (summary_only)
2435                 return;
2436
2437         for_all_cpus(format_counters, t, c, p);
2438 }
2439
/*
 * DELTA_WRAP32(new, old)
 *
 * Replace "old" with (new - old) computed modulo 2^32, so that a 32-bit
 * counter which wrapped between the two samples still yields the right delta.
 * Both values are shifted into the upper 32 bits of a 64-bit quantity,
 * subtracted, and shifted back down, which discards any borrow.
 *
 * "old" must be an lvalue; it is evaluated twice.
 * The arguments are parenthesized and the body is wrapped in do/while(0)
 * so the macro behaves as a single statement (e.g. in an unbraced if/else).
 */
#define DELTA_WRAP32(new, old)                                          \
        do {                                                            \
                (old) = ((((unsigned long long)(new) << 32) -           \
                          ((unsigned long long)(old) << 32)) >> 32);    \
        } while (0)
2442
2443 int delta_package(struct pkg_data *new, struct pkg_data *old)
2444 {
2445         int i;
2446         struct msr_counter *mp;
2447
2448         if (DO_BIC(BIC_Totl_c0))
2449                 old->pkg_wtd_core_c0 = new->pkg_wtd_core_c0 - old->pkg_wtd_core_c0;
2450         if (DO_BIC(BIC_Any_c0))
2451                 old->pkg_any_core_c0 = new->pkg_any_core_c0 - old->pkg_any_core_c0;
2452         if (DO_BIC(BIC_GFX_c0))
2453                 old->pkg_any_gfxe_c0 = new->pkg_any_gfxe_c0 - old->pkg_any_gfxe_c0;
2454         if (DO_BIC(BIC_CPUGFX))
2455                 old->pkg_both_core_gfxe_c0 = new->pkg_both_core_gfxe_c0 - old->pkg_both_core_gfxe_c0;
2456
2457         old->pc2 = new->pc2 - old->pc2;
2458         if (DO_BIC(BIC_Pkgpc3))
2459                 old->pc3 = new->pc3 - old->pc3;
2460         if (DO_BIC(BIC_Pkgpc6))
2461                 old->pc6 = new->pc6 - old->pc6;
2462         if (DO_BIC(BIC_Pkgpc7))
2463                 old->pc7 = new->pc7 - old->pc7;
2464         old->pc8 = new->pc8 - old->pc8;
2465         old->pc9 = new->pc9 - old->pc9;
2466         old->pc10 = new->pc10 - old->pc10;
2467         old->cpu_lpi = new->cpu_lpi - old->cpu_lpi;
2468         old->sys_lpi = new->sys_lpi - old->sys_lpi;
2469         old->pkg_temp_c = new->pkg_temp_c;
2470
2471         /* flag an error when rc6 counter resets/wraps */
2472         if (old->gfx_rc6_ms > new->gfx_rc6_ms)
2473                 old->gfx_rc6_ms = -1;
2474         else
2475                 old->gfx_rc6_ms = new->gfx_rc6_ms - old->gfx_rc6_ms;
2476
2477         old->uncore_mhz = new->uncore_mhz;
2478         old->gfx_mhz = new->gfx_mhz;
2479         old->gfx_act_mhz = new->gfx_act_mhz;
2480
2481         /* flag an error when mc6 counter resets/wraps */
2482         if (old->sam_mc6_ms > new->sam_mc6_ms)
2483                 old->sam_mc6_ms = -1;
2484         else
2485                 old->sam_mc6_ms = new->sam_mc6_ms - old->sam_mc6_ms;
2486
2487         old->sam_mhz = new->sam_mhz;
2488         old->sam_act_mhz = new->sam_act_mhz;
2489
2490         old->energy_pkg.raw_value = new->energy_pkg.raw_value - old->energy_pkg.raw_value;
2491         old->energy_cores.raw_value = new->energy_cores.raw_value - old->energy_cores.raw_value;
2492         old->energy_gfx.raw_value = new->energy_gfx.raw_value - old->energy_gfx.raw_value;
2493         old->energy_dram.raw_value = new->energy_dram.raw_value - old->energy_dram.raw_value;
2494         old->rapl_pkg_perf_status.raw_value = new->rapl_pkg_perf_status.raw_value - old->rapl_pkg_perf_status.raw_value;
2495         old->rapl_dram_perf_status.raw_value =
2496             new->rapl_dram_perf_status.raw_value - old->rapl_dram_perf_status.raw_value;
2497
2498         for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
2499                 if (mp->format == FORMAT_RAW)
2500                         old->counter[i] = new->counter[i];
2501                 else
2502                         old->counter[i] = new->counter[i] - old->counter[i];
2503         }
2504
2505         return 0;
2506 }
2507
2508 void delta_core(struct core_data *new, struct core_data *old)
2509 {
2510         int i;
2511         struct msr_counter *mp;
2512
2513         old->c3 = new->c3 - old->c3;
2514         old->c6 = new->c6 - old->c6;
2515         old->c7 = new->c7 - old->c7;
2516         old->core_temp_c = new->core_temp_c;
2517         old->core_throt_cnt = new->core_throt_cnt;
2518         old->mc6_us = new->mc6_us - old->mc6_us;
2519
2520         DELTA_WRAP32(new->core_energy.raw_value, old->core_energy.raw_value);
2521
2522         for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
2523                 if (mp->format == FORMAT_RAW)
2524                         old->counter[i] = new->counter[i];
2525                 else
2526                         old->counter[i] = new->counter[i] - old->counter[i];
2527         }
2528 }
2529
2530 int soft_c1_residency_display(int bic)
2531 {
2532         if (!DO_BIC(BIC_CPU_c1) || platform->has_msr_core_c1_res)
2533                 return 0;
2534
2535         return DO_BIC_READ(bic);
2536 }
2537
2538 /*
2539  * old = new - old
2540  */
2541 int delta_thread(struct thread_data *new, struct thread_data *old, struct core_data *core_delta)
2542 {
2543         int i;
2544         struct msr_counter *mp;
2545
2546         /* we run cpuid just the 1st time, copy the results */
2547         if (DO_BIC(BIC_APIC))
2548                 new->apic_id = old->apic_id;
2549         if (DO_BIC(BIC_X2APIC))
2550                 new->x2apic_id = old->x2apic_id;
2551
2552         /*
2553          * the timestamps from start of measurement interval are in "old"
2554          * the timestamp from end of measurement interval are in "new"
2555          * over-write old w/ new so we can print end of interval values
2556          */
2557
2558         timersub(&new->tv_begin, &old->tv_begin, &old->tv_delta);
2559         old->tv_begin = new->tv_begin;
2560         old->tv_end = new->tv_end;
2561
2562         old->tsc = new->tsc - old->tsc;
2563
2564         /* check for TSC < 1 Mcycles over interval */
2565         if (old->tsc < (1000 * 1000))
2566                 errx(-3, "Insanely slow TSC rate, TSC stops in idle?\n"
2567                      "You can disable all c-states by booting with \"idle=poll\"\n"
2568                      "or just the deep ones with \"processor.max_cstate=1\"");
2569
2570         old->c1 = new->c1 - old->c1;
2571
2572         if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC)
2573             || soft_c1_residency_display(BIC_Avg_MHz)) {
2574                 if ((new->aperf > old->aperf) && (new->mperf > old->mperf)) {
2575                         old->aperf = new->aperf - old->aperf;
2576                         old->mperf = new->mperf - old->mperf;
2577                 } else {
2578                         return -1;
2579                 }
2580         }
2581
2582         if (platform->has_msr_core_c1_res) {
2583                 /*
2584                  * Some models have a dedicated C1 residency MSR,
2585                  * which should be more accurate than the derivation below.
2586                  */
2587         } else {
2588                 /*
2589                  * As counter collection is not atomic,
2590                  * it is possible for mperf's non-halted cycles + idle states
2591                  * to exceed TSC's all cycles: show c1 = 0% in that case.
2592                  */
2593                 if ((old->mperf + core_delta->c3 + core_delta->c6 + core_delta->c7) > (old->tsc * tsc_tweak))
2594                         old->c1 = 0;
2595                 else {
2596                         /* normal case, derive c1 */
2597                         old->c1 = (old->tsc * tsc_tweak) - old->mperf - core_delta->c3
2598                             - core_delta->c6 - core_delta->c7;
2599                 }
2600         }
2601
2602         if (old->mperf == 0) {
2603                 if (debug > 1)
2604                         fprintf(outf, "cpu%d MPERF 0!\n", old->cpu_id);
2605                 old->mperf = 1; /* divide by 0 protection */
2606         }
2607
2608         if (DO_BIC(BIC_IPC))
2609                 old->instr_count = new->instr_count - old->instr_count;
2610
2611         if (DO_BIC(BIC_IRQ))
2612                 old->irq_count = new->irq_count - old->irq_count;
2613
2614         if (DO_BIC(BIC_SMI))
2615                 old->smi_count = new->smi_count - old->smi_count;
2616
2617         for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
2618                 if (mp->format == FORMAT_RAW)
2619                         old->counter[i] = new->counter[i];
2620                 else
2621                         old->counter[i] = new->counter[i] - old->counter[i];
2622         }
2623         return 0;
2624 }
2625
/*
 * delta_cpu()
 *
 * Compute the per-interval deltas for one CPU: (t, c, p) is the later
 * snapshot and (t2, c2, p2) the earlier one, which receives the result.
 * The core delta is computed only for the 1st thread in a core and the
 * package delta only for the 1st core in a package; the thread delta is
 * always computed.
 *
 * Returns the first non-zero value from delta_thread()/delta_package(),
 * or 0.
 */
int delta_cpu(struct thread_data *t, struct core_data *c,
              struct pkg_data *p, struct thread_data *t2, struct core_data *c2, struct pkg_data *p2)
{
        int rc;

        /* the core delta must be ready before delta_thread() consumes it */
        if (is_cpu_first_thread_in_core(t, c, p))
                delta_core(c, c2);

        rc = delta_thread(t, t2, c2);   /* c2 is core delta */
        if (rc != 0)
                return rc;

        if (!is_cpu_first_core_in_package(t, c, p))
                return 0;

        return delta_package(p, p2);
}
2646
2647 void rapl_counter_clear(struct rapl_counter *c)
2648 {
2649         c->raw_value = 0;
2650         c->scale = 0.0;
2651         c->unit = RAPL_UNIT_INVALID;
2652 }
2653
2654 void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
2655 {
2656         int i;
2657         struct msr_counter *mp;
2658
2659         t->tv_begin.tv_sec = 0;
2660         t->tv_begin.tv_usec = 0;
2661         t->tv_end.tv_sec = 0;
2662         t->tv_end.tv_usec = 0;
2663         t->tv_delta.tv_sec = 0;
2664         t->tv_delta.tv_usec = 0;
2665
2666         t->tsc = 0;
2667         t->aperf = 0;
2668         t->mperf = 0;
2669         t->c1 = 0;
2670
2671         t->instr_count = 0;
2672
2673         t->irq_count = 0;
2674         t->smi_count = 0;
2675
2676         c->c3 = 0;
2677         c->c6 = 0;
2678         c->c7 = 0;
2679         c->mc6_us = 0;
2680         c->core_temp_c = 0;
2681         rapl_counter_clear(&c->core_energy);
2682         c->core_throt_cnt = 0;
2683
2684         p->pkg_wtd_core_c0 = 0;
2685         p->pkg_any_core_c0 = 0;
2686         p->pkg_any_gfxe_c0 = 0;
2687         p->pkg_both_core_gfxe_c0 = 0;
2688
2689         p->pc2 = 0;
2690         if (DO_BIC(BIC_Pkgpc3))
2691                 p->pc3 = 0;
2692         if (DO_BIC(BIC_Pkgpc6))
2693                 p->pc6 = 0;
2694         if (DO_BIC(BIC_Pkgpc7))
2695                 p->pc7 = 0;
2696         p->pc8 = 0;
2697         p->pc9 = 0;
2698         p->pc10 = 0;
2699         p->cpu_lpi = 0;
2700         p->sys_lpi = 0;
2701
2702         rapl_counter_clear(&p->energy_pkg);
2703         rapl_counter_clear(&p->energy_dram);
2704         rapl_counter_clear(&p->energy_cores);
2705         rapl_counter_clear(&p->energy_gfx);
2706         rapl_counter_clear(&p->rapl_pkg_perf_status);
2707         rapl_counter_clear(&p->rapl_dram_perf_status);
2708         p->pkg_temp_c = 0;
2709
2710         p->gfx_rc6_ms = 0;
2711         p->uncore_mhz = 0;
2712         p->gfx_mhz = 0;
2713         p->gfx_act_mhz = 0;
2714         p->sam_mc6_ms = 0;
2715         p->sam_mhz = 0;
2716         p->sam_act_mhz = 0;
2717         for (i = 0, mp = sys.tp; mp; i++, mp = mp->next)
2718                 t->counter[i] = 0;
2719
2720         for (i = 0, mp = sys.cp; mp; i++, mp = mp->next)
2721                 c->counter[i] = 0;
2722
2723         for (i = 0, mp = sys.pp; mp; i++, mp = mp->next)
2724                 p->counter[i] = 0;
2725 }
2726
2727 void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src)
2728 {
2729         /* Copy unit and scale from src if dst is not initialized */
2730         if (dst->unit == RAPL_UNIT_INVALID) {
2731                 dst->unit = src->unit;
2732                 dst->scale = src->scale;
2733         }
2734
2735         assert(dst->unit == src->unit);
2736         assert(dst->scale == src->scale);
2737
2738         dst->raw_value += src->raw_value;
2739 }
2740
2741 int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
2742 {
2743         int i;
2744         struct msr_counter *mp;
2745
2746         /* copy un-changing apic_id's */
2747         if (DO_BIC(BIC_APIC))
2748                 average.threads.apic_id = t->apic_id;
2749         if (DO_BIC(BIC_X2APIC))
2750                 average.threads.x2apic_id = t->x2apic_id;
2751
2752         /* remember first tv_begin */
2753         if (average.threads.tv_begin.tv_sec == 0)
2754                 average.threads.tv_begin = t->tv_begin;
2755
2756         /* remember last tv_end */
2757         average.threads.tv_end = t->tv_end;
2758
2759         average.threads.tsc += t->tsc;
2760         average.threads.aperf += t->aperf;
2761         average.threads.mperf += t->mperf;
2762         average.threads.c1 += t->c1;
2763
2764         average.threads.instr_count += t->instr_count;
2765
2766         average.threads.irq_count += t->irq_count;
2767         average.threads.smi_count += t->smi_count;
2768
2769         for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
2770                 if (mp->format == FORMAT_RAW)
2771                         continue;
2772                 average.threads.counter[i] += t->counter[i];
2773         }
2774
2775         /* sum per-core values only for 1st thread in core */
2776         if (!is_cpu_first_thread_in_core(t, c, p))
2777                 return 0;
2778
2779         average.cores.c3 += c->c3;
2780         average.cores.c6 += c->c6;
2781         average.cores.c7 += c->c7;
2782         average.cores.mc6_us += c->mc6_us;
2783
2784         average.cores.core_temp_c = MAX(average.cores.core_temp_c, c->core_temp_c);
2785         average.cores.core_throt_cnt = MAX(average.cores.core_throt_cnt, c->core_throt_cnt);
2786
2787         rapl_counter_accumulate(&average.cores.core_energy, &c->core_energy);
2788
2789         for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
2790                 if (mp->format == FORMAT_RAW)
2791                         continue;
2792                 average.cores.counter[i] += c->counter[i];
2793         }
2794
2795         /* sum per-pkg values only for 1st core in pkg */
2796         if (!is_cpu_first_core_in_package(t, c, p))
2797                 return 0;
2798
2799         if (DO_BIC(BIC_Totl_c0))
2800                 average.packages.pkg_wtd_core_c0 += p->pkg_wtd_core_c0;
2801         if (DO_BIC(BIC_Any_c0))
2802                 average.packages.pkg_any_core_c0 += p->pkg_any_core_c0;
2803         if (DO_BIC(BIC_GFX_c0))
2804                 average.packages.pkg_any_gfxe_c0 += p->pkg_any_gfxe_c0;
2805         if (DO_BIC(BIC_CPUGFX))
2806                 average.packages.pkg_both_core_gfxe_c0 += p->pkg_both_core_gfxe_c0;
2807
2808         average.packages.pc2 += p->pc2;
2809         if (DO_BIC(BIC_Pkgpc3))
2810                 average.packages.pc3 += p->pc3;
2811         if (DO_BIC(BIC_Pkgpc6))
2812                 average.packages.pc6 += p->pc6;
2813         if (DO_BIC(BIC_Pkgpc7))
2814                 average.packages.pc7 += p->pc7;
2815         average.packages.pc8 += p->pc8;
2816         average.packages.pc9 += p->pc9;
2817         average.packages.pc10 += p->pc10;
2818
2819         average.packages.cpu_lpi = p->cpu_lpi;
2820         average.packages.sys_lpi = p->sys_lpi;
2821
2822         rapl_counter_accumulate(&average.packages.energy_pkg, &p->energy_pkg);
2823         rapl_counter_accumulate(&average.packages.energy_dram, &p->energy_dram);
2824         rapl_counter_accumulate(&average.packages.energy_cores, &p->energy_cores);
2825         rapl_counter_accumulate(&average.packages.energy_gfx, &p->energy_gfx);
2826
2827         average.packages.gfx_rc6_ms = p->gfx_rc6_ms;
2828         average.packages.uncore_mhz = p->uncore_mhz;
2829         average.packages.gfx_mhz = p->gfx_mhz;
2830         average.packages.gfx_act_mhz = p->gfx_act_mhz;
2831         average.packages.sam_mc6_ms = p->sam_mc6_ms;
2832         average.packages.sam_mhz = p->sam_mhz;
2833         average.packages.sam_act_mhz = p->sam_act_mhz;
2834
2835         average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c);
2836
2837         rapl_counter_accumulate(&average.packages.rapl_pkg_perf_status, &p->rapl_pkg_perf_status);
2838         rapl_counter_accumulate(&average.packages.rapl_dram_perf_status, &p->rapl_dram_perf_status);
2839
2840         for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
2841                 if ((mp->format == FORMAT_RAW) && (topo.num_packages == 0))
2842                         average.packages.counter[i] = p->counter[i];
2843                 else
2844                         average.packages.counter[i] += p->counter[i];
2845         }
2846         return 0;
2847 }
2848
2849 /*
2850  * sum the counters for all cpus in the system
2851  * compute the weighted average
2852  */
2853 void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data *p)
2854 {
2855         int i;
2856         struct msr_counter *mp;
2857
2858         clear_counters(&average.threads, &average.cores, &average.packages);
2859
2860         for_all_cpus(sum_counters, t, c, p);
2861
2862         /* Use the global time delta for the average. */
2863         average.threads.tv_delta = tv_delta;
2864
2865         average.threads.tsc /= topo.allowed_cpus;
2866         average.threads.aperf /= topo.allowed_cpus;
2867         average.threads.mperf /= topo.allowed_cpus;
2868         average.threads.instr_count /= topo.allowed_cpus;
2869         average.threads.c1 /= topo.allowed_cpus;
2870
2871         if (average.threads.irq_count > 9999999)
2872                 sums_need_wide_columns = 1;
2873
2874         average.cores.c3 /= topo.allowed_cores;
2875         average.cores.c6 /= topo.allowed_cores;
2876         average.cores.c7 /= topo.allowed_cores;
2877         average.cores.mc6_us /= topo.allowed_cores;
2878
2879         if (DO_BIC(BIC_Totl_c0))
2880                 average.packages.pkg_wtd_core_c0 /= topo.allowed_packages;
2881         if (DO_BIC(BIC_Any_c0))
2882                 average.packages.pkg_any_core_c0 /= topo.allowed_packages;
2883         if (DO_BIC(BIC_GFX_c0))
2884                 average.packages.pkg_any_gfxe_c0 /= topo.allowed_packages;
2885         if (DO_BIC(BIC_CPUGFX))
2886                 average.packages.pkg_both_core_gfxe_c0 /= topo.allowed_packages;
2887
2888         average.packages.pc2 /= topo.allowed_packages;
2889         if (DO_BIC(BIC_Pkgpc3))
2890                 average.packages.pc3 /= topo.allowed_packages;
2891         if (DO_BIC(BIC_Pkgpc6))
2892                 average.packages.pc6 /= topo.allowed_packages;
2893         if (DO_BIC(BIC_Pkgpc7))
2894                 average.packages.pc7 /= topo.allowed_packages;
2895
2896         average.packages.pc8 /= topo.allowed_packages;
2897         average.packages.pc9 /= topo.allowed_packages;
2898         average.packages.pc10 /= topo.allowed_packages;
2899
2900         for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
2901                 if (mp->format == FORMAT_RAW)
2902                         continue;
2903                 if (mp->type == COUNTER_ITEMS) {
2904                         if (average.threads.counter[i] > 9999999)
2905                                 sums_need_wide_columns = 1;
2906                         continue;
2907                 }
2908                 average.threads.counter[i] /= topo.allowed_cpus;
2909         }
2910         for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
2911                 if (mp->format == FORMAT_RAW)
2912                         continue;
2913                 if (mp->type == COUNTER_ITEMS) {
2914                         if (average.cores.counter[i] > 9999999)
2915                                 sums_need_wide_columns = 1;
2916                 }
2917                 average.cores.counter[i] /= topo.allowed_cores;
2918         }
2919         for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
2920                 if (mp->format == FORMAT_RAW)
2921                         continue;
2922                 if (mp->type == COUNTER_ITEMS) {
2923                         if (average.packages.counter[i] > 9999999)
2924                                 sums_need_wide_columns = 1;
2925                 }
2926                 average.packages.counter[i] /= topo.allowed_packages;
2927         }
2928 }
2929
/*
 * rdtsc()
 *
 * Execute the x86 RDTSC instruction and return the 64-bit result,
 * assembled from the low half in EAX and the high half in EDX.
 */
static unsigned long long rdtsc(void)
{
        unsigned int lo, hi;
        unsigned long long tsc;

        __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));

        tsc = hi;
        tsc <<= 32;
        tsc |= lo;

        return tsc;
}
2938
/*
 * fopen_or_die()
 *
 * Open "path" with the given fopen() mode and return the stream.
 * On failure, print "<path>: open failed" via err() and exit with status 1,
 * so the return value is never NULL.
 */
FILE *fopen_or_die(const char *path, const char *mode)
{
        FILE *stream;

        stream = fopen(path, mode);
        if (stream == NULL)
                err(1, "%s: open failed", path);

        return stream;
}
2950
/*
 * snapshot_sysfs_counter()
 *
 * Open "path", parse one integer from it and return it as a snapshot
 * of the given counter.
 *
 * Exits (status 1) if the file cannot be opened or no number can be parsed.
 */
unsigned long long snapshot_sysfs_counter(char *path)
{
        FILE *fp;
        int retval;
        unsigned long long counter;

        fp = fopen_or_die(path, "r");

        /* "%llu" matches the unsigned long long destination ("%lld" did not) */
        retval = fscanf(fp, "%llu", &counter);
        if (retval != 1)
                err(1, "snapshot_sysfs_counter(%s)", path);

        fclose(fp);

        return counter;
}
2972
2973 int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp)
2974 {
2975         if (mp->msr_num != 0) {
2976                 assert(!no_msr);
2977                 if (get_msr(cpu, mp->msr_num, counterp))
2978                         return -1;
2979         } else {
2980                 char path[128 + PATH_BYTES];
2981
2982                 if (mp->flags & SYSFS_PERCPU) {
2983                         sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->path);
2984
2985                         *counterp = snapshot_sysfs_counter(path);
2986                 } else {
2987                         *counterp = snapshot_sysfs_counter(mp->path);
2988                 }
2989         }
2990
2991         return 0;
2992 }
2993
/*
 * get_uncore_mhz()
 *
 * Read current_freq_khz from the intel_uncore_frequency sysfs directory
 * of the given package/die and return it divided by 1000.
 */
unsigned long long get_uncore_mhz(int package, int die)
{
	unsigned long long khz;
	char path[128];

	sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", package,
		die);

	khz = snapshot_sysfs_counter(path);

	return khz / 1000;
}
3003
3004 int get_epb(int cpu)
3005 {
3006         char path[128 + PATH_BYTES];
3007         unsigned long long msr;
3008         int ret, epb = -1;
3009         FILE *fp;
3010
3011         sprintf(path, "/sys/devices/system/cpu/cpu%d/power/energy_perf_bias", cpu);
3012
3013         fp = fopen(path, "r");
3014         if (!fp)
3015                 goto msr_fallback;
3016
3017         ret = fscanf(fp, "%d", &epb);
3018         if (ret != 1)
3019                 err(1, "%s(%s)", __func__, path);
3020
3021         fclose(fp);
3022
3023         return epb;
3024
3025 msr_fallback:
3026         if (no_msr)
3027                 return -1;
3028
3029         get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr);
3030
3031         return msr & 0xf;
3032 }
3033
3034 void get_apic_id(struct thread_data *t)
3035 {
3036         unsigned int eax, ebx, ecx, edx;
3037
3038         if (DO_BIC(BIC_APIC)) {
3039                 eax = ebx = ecx = edx = 0;
3040                 __cpuid(1, eax, ebx, ecx, edx);
3041
3042                 t->apic_id = (ebx >> 24) & 0xff;
3043         }
3044
3045         if (!DO_BIC(BIC_X2APIC))
3046                 return;
3047
3048         if (authentic_amd || hygon_genuine) {
3049                 unsigned int topology_extensions;
3050
3051                 if (max_extended_level < 0x8000001e)
3052                         return;
3053
3054                 eax = ebx = ecx = edx = 0;
3055                 __cpuid(0x80000001, eax, ebx, ecx, edx);
3056                 topology_extensions = ecx & (1 << 22);
3057
3058                 if (topology_extensions == 0)
3059                         return;
3060
3061                 eax = ebx = ecx = edx = 0;
3062                 __cpuid(0x8000001e, eax, ebx, ecx, edx);
3063
3064                 t->x2apic_id = eax;
3065                 return;
3066         }
3067
3068         if (!genuine_intel)
3069                 return;
3070
3071         if (max_level < 0xb)
3072                 return;
3073
3074         ecx = 0;
3075         __cpuid(0xb, eax, ebx, ecx, edx);
3076         t->x2apic_id = edx;
3077
3078         if (debug && (t->apic_id != (t->x2apic_id & 0xff)))
3079                 fprintf(outf, "cpu%d: BIOS BUG: apic 0x%x x2apic 0x%x\n", t->cpu_id, t->apic_id, t->x2apic_id);
3080 }
3081
3082 int get_core_throt_cnt(int cpu, unsigned long long *cnt)
3083 {
3084         char path[128 + PATH_BYTES];
3085         unsigned long long tmp;
3086         FILE *fp;
3087         int ret;
3088
3089         sprintf(path, "/sys/devices/system/cpu/cpu%d/thermal_throttle/core_throttle_count", cpu);
3090         fp = fopen(path, "r");
3091         if (!fp)
3092                 return -1;
3093         ret = fscanf(fp, "%lld", &tmp);
3094         fclose(fp);
3095         if (ret != 1)
3096                 return -1;
3097         *cnt = tmp;
3098
3099         return 0;
3100 }
3101
/*
 * File descriptors of the APERF/MPERF perf event pair opened for one cpu.
 */
struct amperf_group_fd {
	/* APERF event; its fd is also passed as the group fd when opening MPERF */
	int aperf;
	/* MPERF event */
	int mperf;
};
3106
3107 static int read_perf_counter_info(const char *const path, const char *const parse_format, void *value_ptr)
3108 {
3109         int fdmt;
3110         int bytes_read;
3111         char buf[64];
3112         int ret = -1;
3113
3114         fdmt = open(path, O_RDONLY, 0);
3115         if (fdmt == -1) {
3116                 if (debug)
3117                         fprintf(stderr, "Failed to parse perf counter info %s\n", path);
3118                 ret = -1;
3119                 goto cleanup_and_exit;
3120         }
3121
3122         bytes_read = read(fdmt, buf, sizeof(buf) - 1);
3123         if (bytes_read <= 0 || bytes_read >= (int)sizeof(buf)) {
3124                 if (debug)
3125                         fprintf(stderr, "Failed to parse perf counter info %s\n", path);
3126                 ret = -1;
3127                 goto cleanup_and_exit;
3128         }
3129
3130         buf[bytes_read] = '\0';
3131
3132         if (sscanf(buf, parse_format, value_ptr) != 1) {
3133                 if (debug)
3134                         fprintf(stderr, "Failed to parse perf counter info %s\n", path);
3135                 ret = -1;
3136                 goto cleanup_and_exit;
3137         }
3138
3139         ret = 0;
3140
3141 cleanup_and_exit:
3142         close(fdmt);
3143         return ret;
3144 }
3145
/*
 * read_perf_counter_info_n()
 *
 * Parse one unsigned int from the file at @path using @parse_format.
 * Returns the parsed value, or (unsigned int)-1 if it could not be read.
 */
static unsigned int read_perf_counter_info_n(const char *const path, const char *const parse_format)
{
	unsigned int value;

	if (read_perf_counter_info(path, parse_format, &value) != 0)
		return (unsigned int)-1;

	return value;
}
3157
/*
 * Return the number stored in the "type" file of the "msr" perf event
 * source, or (unsigned int)-1 if it cannot be read.
 */
static unsigned int read_msr_type(void)
{
	return read_perf_counter_info_n("/sys/bus/event_source/devices/msr/type", "%u");
}
3165
/*
 * Return the hex "event=" value of the msr/aperf perf event,
 * or (unsigned int)-1 if it cannot be read.
 */
static unsigned int read_aperf_config(void)
{
	return read_perf_counter_info_n("/sys/bus/event_source/devices/msr/events/aperf", "event=%x");
}
3173
/*
 * Return the hex "event=" value of the msr/mperf perf event,
 * or (unsigned int)-1 if it cannot be read.
 */
static unsigned int read_mperf_config(void)
{
	return read_perf_counter_info_n("/sys/bus/event_source/devices/msr/events/mperf", "event=%x");
}
3181
/*
 * Return the number stored in the "type" file of the perf event source
 * named @subsys, or (unsigned int)-1 if it cannot be read.
 */
static unsigned int read_perf_type(const char *subsys)
{
	char path[128];

	snprintf(path, sizeof(path), "/sys/bus/event_source/devices/%s/type", subsys);

	return read_perf_counter_info_n(path, "%u");
}
3192
/*
 * Return the hex "event=" value of event @event_name under the perf
 * event source @subsys, or (unsigned int)-1 if it cannot be read.
 */
static unsigned int read_rapl_config(const char *subsys, const char *event_name)
{
	char path[128];

	snprintf(path, sizeof(path), "/sys/bus/event_source/devices/%s/events/%s", subsys, event_name);

	return read_perf_counter_info_n(path, "event=%x");
}
3203
3204 static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_name)
3205 {
3206         const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.unit";
3207         const char *const format = "%s";
3208         char path[128];
3209         char unit_buffer[16];
3210
3211         snprintf(path, sizeof(path), path_format, subsys, event_name);
3212
3213         read_perf_counter_info(path, format, &unit_buffer);
3214         if (strcmp("Joules", unit_buffer) == 0)
3215                 return RAPL_UNIT_JOULES;
3216
3217         return RAPL_UNIT_INVALID;
3218 }
3219
/*
 * Read the "<event>.scale" file of a perf event as a double.
 * Returns 0.0 if the file cannot be read or parsed.
 */
static double read_perf_rapl_scale(const char *subsys, const char *event_name)
{
	double scale;
	char path[128];
	int failed;

	snprintf(path, sizeof(path), "/sys/bus/event_source/devices/%s/events/%s.scale", subsys, event_name);

	failed = read_perf_counter_info(path, "%lf", &scale);

	return failed ? 0.0 : scale;
}
3234
3235 static struct amperf_group_fd open_amperf_fd(int cpu)
3236 {
3237         const unsigned int msr_type = read_msr_type();
3238         const unsigned int aperf_config = read_aperf_config();
3239         const unsigned int mperf_config = read_mperf_config();
3240         struct amperf_group_fd fds = {.aperf = -1, .mperf = -1 };
3241
3242         fds.aperf = open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP);
3243         fds.mperf = open_perf_counter(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP);
3244
3245         return fds;
3246 }
3247
3248 static int get_amperf_fd(int cpu)
3249 {
3250         assert(fd_amperf_percpu);
3251
3252         if (fd_amperf_percpu[cpu].aperf)
3253                 return fd_amperf_percpu[cpu].aperf;
3254
3255         fd_amperf_percpu[cpu] = open_amperf_fd(cpu);
3256
3257         return fd_amperf_percpu[cpu].aperf;
3258 }
3259
3260 /* Read APERF, MPERF and TSC using the perf API. */
3261 static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu)
3262 {
3263         union {
3264                 struct {
3265                         unsigned long nr_entries;
3266                         unsigned long aperf;
3267                         unsigned long mperf;
3268                 };
3269
3270                 unsigned long as_array[3];
3271         } cnt;
3272
3273         const int fd_amperf = get_amperf_fd(cpu);
3274
3275         /*
3276          * Read the TSC with rdtsc, because we want the absolute value and not
3277          * the offset from the start of the counter.
3278          */
3279         t->tsc = rdtsc();
3280
3281         const int n = read(fd_amperf, &cnt.as_array[0], sizeof(cnt.as_array));
3282
3283         if (n != sizeof(cnt.as_array))
3284                 return -2;
3285
3286         t->aperf = cnt.aperf * aperf_mperf_multiplier;
3287         t->mperf = cnt.mperf * aperf_mperf_multiplier;
3288
3289         return 0;
3290 }
3291
3292 /* Read APERF, MPERF and TSC using the MSR driver and rdtsc instruction. */
3293 static int read_aperf_mperf_tsc_msr(struct thread_data *t, int cpu)
3294 {
3295         unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time;
3296         int aperf_mperf_retry_count = 0;
3297
3298         /*
3299          * The TSC, APERF and MPERF must be read together for
3300          * APERF/MPERF and MPERF/TSC to give accurate results.
3301          *
3302          * Unfortunately, APERF and MPERF are read by
3303          * individual system call, so delays may occur
3304          * between them.  If the time to read them
3305          * varies by a large amount, we re-read them.
3306          */
3307
3308         /*
3309          * This initial dummy APERF read has been seen to
3310          * reduce jitter in the subsequent reads.
3311          */
3312
3313         if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
3314                 return -3;
3315
3316 retry:
3317         t->tsc = rdtsc();       /* re-read close to APERF */
3318
3319         tsc_before = t->tsc;
3320
3321         if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
3322                 return -3;
3323
3324         tsc_between = rdtsc();
3325
3326         if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf))
3327                 return -4;
3328
3329         tsc_after = rdtsc();
3330
3331         aperf_time = tsc_between - tsc_before;
3332         mperf_time = tsc_after - tsc_between;
3333
3334         /*
3335          * If the system call latency to read APERF and MPERF
3336          * differ by more than 2x, then try again.
3337          */
3338         if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) {
3339                 aperf_mperf_retry_count++;
3340                 if (aperf_mperf_retry_count < 5)
3341                         goto retry;
3342                 else
3343                         warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time);
3344         }
3345         aperf_mperf_retry_count = 0;
3346
3347         t->aperf = t->aperf * aperf_mperf_multiplier;
3348         t->mperf = t->mperf * aperf_mperf_multiplier;
3349
3350         return 0;
3351 }
3352
3353 size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci)
3354 {
3355         size_t ret = 0;
3356
3357         for (int i = 0; i < NUM_RAPL_COUNTERS; ++i)
3358                 if (rci->source[i] == RAPL_SOURCE_PERF)
3359                         ++ret;
3360
3361         return ret;
3362 }
3363
3364 void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci, unsigned int idx)
3365 {
3366         rc->raw_value = rci->data[idx];
3367         rc->unit = rci->unit[idx];
3368         rc->scale = rci->scale[idx];
3369 }
3370
3371 int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data *p)
3372 {
3373         unsigned long long perf_data[NUM_RAPL_COUNTERS + 1];
3374         struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain];
3375
3376         if (debug)
3377                 fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain);
3378
3379         assert(rapl_counter_info_perdomain);
3380
3381         /*
3382          * If we have any perf counters to read, read them all now, in bulk
3383          */
3384         if (rci->fd_perf != -1) {
3385                 size_t num_perf_counters = rapl_counter_info_count_perf(rci);
3386                 const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long);
3387                 const ssize_t actual_read_size = read(rci->fd_perf, &perf_data[0], sizeof(perf_data));
3388
3389                 if (actual_read_size != expected_read_size)
3390                         err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size,
3391                             actual_read_size);
3392         }
3393
3394         for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) {
3395                 switch (rci->source[i]) {
3396                 case RAPL_SOURCE_NONE:
3397                         break;
3398
3399                 case RAPL_SOURCE_PERF:
3400                         assert(pi < ARRAY_SIZE(perf_data));
3401                         assert(rci->fd_perf != -1);
3402
3403                         if (debug)
3404                                 fprintf(stderr, "Reading rapl counter via perf at %u (%llu %e %lf)\n",
3405                                         i, perf_data[pi], rci->scale[i], perf_data[pi] * rci->scale[i]);
3406
3407                         rci->data[i] = perf_data[pi];
3408
3409                         ++pi;
3410                         break;
3411
3412                 case RAPL_SOURCE_MSR:
3413                         if (debug)
3414                                 fprintf(stderr, "Reading rapl counter via msr at %u\n", i);
3415
3416                         assert(!no_msr);
3417                         if (rci->flags[i] & RAPL_COUNTER_FLAG_USE_MSR_SUM) {
3418                                 if (get_msr_sum(cpu, rci->msr[i], &rci->data[i]))
3419                                         return -13 - i;
3420                         } else {
3421                                 if (get_msr(cpu, rci->msr[i], &rci->data[i]))
3422                                         return -13 - i;
3423                         }
3424
3425                         rci->data[i] &= rci->msr_mask[i];
3426                         if (rci->msr_shift[i] >= 0)
3427                                 rci->data[i] >>= abs(rci->msr_shift[i]);
3428                         else
3429                                 rci->data[i] <<= abs(rci->msr_shift[i]);
3430
3431                         break;
3432                 }
3433         }
3434
3435         _Static_assert(NUM_RAPL_COUNTERS == 7);
3436         write_rapl_counter(&p->energy_pkg, rci, RAPL_RCI_INDEX_ENERGY_PKG);
3437         write_rapl_counter(&p->energy_cores, rci, RAPL_RCI_INDEX_ENERGY_CORES);
3438         write_rapl_counter(&p->energy_dram, rci, RAPL_RCI_INDEX_DRAM);
3439         write_rapl_counter(&p->energy_gfx, rci, RAPL_RCI_INDEX_GFX);
3440         write_rapl_counter(&p->rapl_pkg_perf_status, rci, RAPL_RCI_INDEX_PKG_PERF_STATUS);
3441         write_rapl_counter(&p->rapl_dram_perf_status, rci, RAPL_RCI_INDEX_DRAM_PERF_STATUS);
3442         write_rapl_counter(&c->core_energy, rci, RAPL_RCI_INDEX_CORE_ENERGY);
3443
3444         return 0;
3445 }
3446
3447 /*
3448  * get_counters(...)
3449  * migrate to cpu
3450  * acquire and record local counters for that cpu
3451  */
3452 int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
3453 {
3454         int cpu = t->cpu_id;
3455         unsigned long long msr;
3456         struct msr_counter *mp;
3457         int i;
3458         int status;
3459
3460         if (cpu_migrate(cpu)) {
3461                 fprintf(outf, "%s: Could not migrate to CPU %d\n", __func__, cpu);
3462                 return -1;
3463         }
3464
3465         gettimeofday(&t->tv_begin, (struct timezone *)NULL);
3466
3467         if (first_counter_read)
3468                 get_apic_id(t);
3469
3470         t->tsc = rdtsc();       /* we are running on local CPU of interest */
3471
3472         if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC)
3473             || soft_c1_residency_display(BIC_Avg_MHz)) {
3474                 int status = -1;
3475
3476                 assert(!no_perf || !no_msr);
3477
3478                 switch (amperf_source) {
3479                 case AMPERF_SOURCE_PERF:
3480                         status = read_aperf_mperf_tsc_perf(t, cpu);
3481                         break;
3482                 case AMPERF_SOURCE_MSR:
3483                         status = read_aperf_mperf_tsc_msr(t, cpu);
3484                         break;
3485                 }
3486
3487                 if (status != 0)
3488                         return status;
3489         }
3490
3491         if (DO_BIC(BIC_IPC))
3492                 if (read(get_instr_count_fd(cpu), &t->instr_count, sizeof(long long)) != sizeof(long long))
3493                         return -4;
3494
3495         if (DO_BIC(BIC_IRQ))
3496                 t->irq_count = irqs_per_cpu[cpu];
3497         if (DO_BIC(BIC_SMI)) {
3498                 if (get_msr(cpu, MSR_SMI_COUNT, &msr))
3499                         return -5;
3500                 t->smi_count = msr & 0xFFFFFFFF;
3501         }
3502         if (DO_BIC(BIC_CPU_c1) && platform->has_msr_core_c1_res) {
3503                 if (get_msr(cpu, MSR_CORE_C1_RES, &t->c1))
3504                         return -6;
3505         }
3506
3507         for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
3508                 if (get_mp(cpu, mp, &t->counter[i]))
3509                         return -10;
3510         }
3511
3512         /* collect core counters only for 1st thread in core */
3513         if (!is_cpu_first_thread_in_core(t, c, p))
3514                 goto done;
3515
3516         if (platform->has_per_core_rapl) {
3517                 status = get_rapl_counters(cpu, c->core_id, c, p);
3518                 if (status != 0)
3519                         return status;
3520         }
3521
3522         if (DO_BIC(BIC_CPU_c3) || soft_c1_residency_display(BIC_CPU_c3)) {
3523                 if (get_msr(cpu, MSR_CORE_C3_RESIDENCY, &c->c3))
3524                         return -6;
3525         }
3526
3527         if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !platform->has_msr_knl_core_c6_residency) {
3528                 if (get_msr(cpu, MSR_CORE_C6_RESIDENCY, &c->c6))
3529                         return -7;
3530         } else if (platform->has_msr_knl_core_c6_residency && soft_c1_residency_display(BIC_CPU_c6)) {
3531                 if (get_msr(cpu, MSR_KNL_CORE_C6_RESIDENCY, &c->c6))
3532                         return -7;
3533         }
3534
3535         if (DO_BIC(BIC_CPU_c7) || soft_c1_residency_display(BIC_CPU_c7)) {
3536                 if (get_msr(cpu, MSR_CORE_C7_RESIDENCY, &c->c7))
3537                         return -8;
3538                 else if (t->is_atom) {
3539                         /*
3540                          * For Atom CPUs that has core cstate deeper than c6,
3541                          * MSR_CORE_C6_RESIDENCY returns residency of cc6 and deeper.
3542                          * Minus CC7 (and deeper cstates) residency to get
3543                          * accturate cc6 residency.
3544                          */
3545                         c->c6 -= c->c7;
3546                 }
3547         }
3548
3549         if (DO_BIC(BIC_Mod_c6))
3550                 if (get_msr(cpu, MSR_MODULE_C6_RES_MS, &c->mc6_us))
3551                         return -8;
3552
3553         if (DO_BIC(BIC_CoreTmp)) {
3554                 if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr))
3555                         return -9;
3556                 c->core_temp_c = tj_max - ((msr >> 16) & 0x7F);
3557         }
3558
3559         if (DO_BIC(BIC_CORE_THROT_CNT))
3560                 get_core_throt_cnt(cpu, &c->core_throt_cnt);
3561
3562         for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
3563                 if (get_mp(cpu, mp, &c->counter[i]))
3564                         return -10;
3565         }
3566
3567         /* collect package counters only for 1st core in package */
3568         if (!is_cpu_first_core_in_package(t, c, p))
3569                 goto done;
3570
3571         if (DO_BIC(BIC_Totl_c0)) {
3572                 if (get_msr(cpu, MSR_PKG_WEIGHTED_CORE_C0_RES, &p->pkg_wtd_core_c0))
3573                         return -10;
3574         }
3575         if (DO_BIC(BIC_Any_c0)) {
3576                 if (get_msr(cpu, MSR_PKG_ANY_CORE_C0_RES, &p->pkg_any_core_c0))
3577                         return -11;
3578         }
3579         if (DO_BIC(BIC_GFX_c0)) {
3580                 if (get_msr(cpu, MSR_PKG_ANY_GFXE_C0_RES, &p->pkg_any_gfxe_c0))
3581                         return -12;
3582         }
3583         if (DO_BIC(BIC_CPUGFX)) {
3584                 if (get_msr(cpu, MSR_PKG_BOTH_CORE_GFXE_C0_RES, &p->pkg_both_core_gfxe_c0))
3585                         return -13;
3586         }
3587         if (DO_BIC(BIC_Pkgpc3))
3588                 if (get_msr(cpu, MSR_PKG_C3_RESIDENCY, &p->pc3))
3589                         return -9;
3590         if (DO_BIC(BIC_Pkgpc6)) {
3591                 if (platform->has_msr_atom_pkg_c6_residency) {
3592                         if (get_msr(cpu, MSR_ATOM_PKG_C6_RESIDENCY, &p->pc6))
3593                                 return -10;
3594                 } else {
3595                         if (get_msr(cpu, MSR_PKG_C6_RESIDENCY, &p->pc6))
3596                                 return -10;
3597                 }
3598         }
3599
3600         if (DO_BIC(BIC_Pkgpc2))
3601                 if (get_msr(cpu, MSR_PKG_C2_RESIDENCY, &p->pc2))
3602                         return -11;
3603         if (DO_BIC(BIC_Pkgpc7))
3604                 if (get_msr(cpu, MSR_PKG_C7_RESIDENCY, &p->pc7))
3605                         return -12;
3606         if (DO_BIC(BIC_Pkgpc8))
3607                 if (get_msr(cpu, MSR_PKG_C8_RESIDENCY, &p->pc8))
3608                         return -13;
3609         if (DO_BIC(BIC_Pkgpc9))
3610                 if (get_msr(cpu, MSR_PKG_C9_RESIDENCY, &p->pc9))
3611                         return -13;
3612         if (DO_BIC(BIC_Pkgpc10))
3613                 if (get_msr(cpu, MSR_PKG_C10_RESIDENCY, &p->pc10))
3614                         return -13;
3615
3616         if (DO_BIC(BIC_CPU_LPI))
3617                 p->cpu_lpi = cpuidle_cur_cpu_lpi_us;
3618         if (DO_BIC(BIC_SYS_LPI))
3619                 p->sys_lpi = cpuidle_cur_sys_lpi_us;
3620
3621         if (!platform->has_per_core_rapl) {
3622                 status = get_rapl_counters(cpu, p->package_id, c, p);
3623                 if (status != 0)
3624                         return status;
3625         }
3626
3627         if (DO_BIC(BIC_PkgTmp)) {
3628                 if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr))
3629                         return -17;
3630                 p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F);
3631         }
3632
3633         /* n.b. assume die0 uncore frequency applies to whole package */
3634         if (DO_BIC(BIC_UNCORE_MHZ))
3635                 p->uncore_mhz = get_uncore_mhz(p->package_id, 0);
3636
3637         if (DO_BIC(BIC_GFX_rc6))
3638                 p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull;
3639
3640         if (DO_BIC(BIC_GFXMHz))
3641                 p->gfx_mhz = gfx_info[GFX_MHz].val;
3642
3643         if (DO_BIC(BIC_GFXACTMHz))
3644                 p->gfx_act_mhz = gfx_info[GFX_ACTMHz].val;
3645
3646         if (DO_BIC(BIC_SAM_mc6))
3647                 p->sam_mc6_ms = gfx_info[SAM_mc6].val_ull;
3648
3649         if (DO_BIC(BIC_SAMMHz))
3650                 p->sam_mhz = gfx_info[SAM_MHz].val;
3651
3652         if (DO_BIC(BIC_SAMACTMHz))
3653                 p->sam_act_mhz = gfx_info[SAM_ACTMHz].val;
3654
3655         for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
3656                 if (get_mp(cpu, mp, &p->counter[i]))
3657                         return -10;
3658         }
3659 done:
3660         gettimeofday(&t->tv_end, (struct timezone *)NULL);
3661
3662         return 0;
3663 }
3664
/*
 * MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit:
 * If you change the values, note they are used both in comparisons
 * (>= PCL__7) and to index pkg_cstate_limit_strings[].
 */

#define PCLUKN 0		/* Unknown */
#define PCLRSV 1		/* Reserved */
#define PCL__0 2		/* PC0 */
#define PCL__1 3		/* PC1 */
#define PCL__2 4		/* PC2 */
#define PCL__3 5		/* PC3 */
#define PCL__4 6		/* PC4 */
#define PCL__6 7		/* PC6 */
#define PCL_6N 8		/* PC6 No Retention */
#define PCL_6R 9		/* PC6 Retention */
#define PCL__7 10		/* PC7 */
#define PCL_7S 11		/* PC7 Shrink */
#define PCL__8 12		/* PC8 */
#define PCL__9 13		/* PC9 */
#define PCL_10 14		/* PC10 */
#define PCLUNL 15		/* Unlimited */

int pkg_cstate_limit = PCLUKN;

/*
 * Printable name of each PCL* value.  Designated initializers tie every
 * string to its constant, so PCLUKN (0) maps to "unknown" and PCLRSV (1)
 * to "reserved" regardless of the order the entries are written in.
 */
char *pkg_cstate_limit_strings[] = {
	[PCLUKN] = "unknown",
	[PCLRSV] = "reserved",
	[PCL__0] = "pc0",
	[PCL__1] = "pc1",
	[PCL__2] = "pc2",
	[PCL__3] = "pc3",
	[PCL__4] = "pc4",
	[PCL__6] = "pc6",
	[PCL_6N] = "pc6n",
	[PCL_6R] = "pc6r",
	[PCL__7] = "pc7",
	[PCL_7S] = "pc7s",
	[PCL__8] = "pc8",
	[PCL__9] = "pc9",
	[PCL_10] = "pc10",
	[PCLUNL] = "unlimited"
};
3692
3693 int nhm_pkg_cstate_limits[16] =
3694     { PCL__0, PCL__1, PCL__3, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
3695         PCLRSV, PCLRSV
3696 };
3697
3698 int snb_pkg_cstate_limits[16] =
3699     { PCL__0, PCL__2, PCL_6N, PCL_6R, PCL__7, PCL_7S, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
3700         PCLRSV, PCLRSV
3701 };
3702
3703 int hsw_pkg_cstate_limits[16] =
3704     { PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
3705         PCLRSV, PCLRSV
3706 };
3707
3708 int slv_pkg_cstate_limits[16] =
3709     { PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
3710         PCL__6, PCL__7
3711 };
3712
3713 int amt_pkg_cstate_limits[16] =
3714     { PCLUNL, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
3715         PCLRSV, PCLRSV
3716 };
3717
3718 int phi_pkg_cstate_limits[16] =
3719     { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
3720         PCLRSV, PCLRSV
3721 };
3722
3723 int glm_pkg_cstate_limits[16] =
3724     { PCLUNL, PCL__1, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCL_10, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
3725         PCLRSV, PCLRSV
3726 };
3727
3728 int skx_pkg_cstate_limits[16] =
3729     { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
3730         PCLRSV, PCLRSV
3731 };
3732
3733 int icx_pkg_cstate_limits[16] =
3734     { PCL__0, PCL__2, PCL__6, PCL__6, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
3735         PCLRSV, PCLRSV
3736 };
3737
3738 void probe_cst_limit(void)
3739 {
3740         unsigned long long msr;
3741         int *pkg_cstate_limits;
3742
3743         if (!platform->has_nhm_msrs || no_msr)
3744                 return;
3745
3746         switch (platform->cst_limit) {
3747         case CST_LIMIT_NHM:
3748                 pkg_cstate_limits = nhm_pkg_cstate_limits;
3749                 break;
3750         case CST_LIMIT_SNB:
3751                 pkg_cstate_limits = snb_pkg_cstate_limits;
3752                 break;
3753         case CST_LIMIT_HSW:
3754                 pkg_cstate_limits = hsw_pkg_cstate_limits;
3755                 break;
3756         case CST_LIMIT_SKX:
3757                 pkg_cstate_limits = skx_pkg_cstate_limits;
3758                 break;
3759         case CST_LIMIT_ICX:
3760                 pkg_cstate_limits = icx_pkg_cstate_limits;
3761                 break;
3762         case CST_LIMIT_SLV:
3763                 pkg_cstate_limits = slv_pkg_cstate_limits;
3764                 break;
3765         case CST_LIMIT_AMT:
3766                 pkg_cstate_limits = amt_pkg_cstate_limits;
3767                 break;
3768         case CST_LIMIT_KNL:
3769                 pkg_cstate_limits = phi_pkg_cstate_limits;
3770                 break;
3771         case CST_LIMIT_GMT:
3772                 pkg_cstate_limits = glm_pkg_cstate_limits;
3773                 break;
3774         default:
3775                 return;
3776         }
3777
3778         get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);
3779         pkg_cstate_limit = pkg_cstate_limits[msr & 0xF];
3780 }
3781
3782 static void dump_platform_info(void)
3783 {
3784         unsigned long long msr;
3785         unsigned int ratio;
3786
3787         if (!platform->has_nhm_msrs || no_msr)
3788                 return;
3789
3790         get_msr(base_cpu, MSR_PLATFORM_INFO, &msr);
3791
3792         fprintf(outf, "cpu%d: MSR_PLATFORM_INFO: 0x%08llx\n", base_cpu, msr);
3793
3794         ratio = (msr >> 40) & 0xFF;
3795         fprintf(outf, "%d * %.1f = %.1f MHz max efficiency frequency\n", ratio, bclk, ratio * bclk);
3796
3797         ratio = (msr >> 8) & 0xFF;
3798         fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk);
3799 }
3800
3801 static void dump_power_ctl(void)
3802 {
3803         unsigned long long msr;
3804
3805         if (!platform->has_nhm_msrs || no_msr)
3806                 return;
3807
3808         get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr);
3809         fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n",
3810                 base_cpu, msr, msr & 0x2 ? "EN" : "DIS");
3811
3812         /* C-state Pre-wake Disable (CSTATE_PREWAKE_DISABLE) */
3813         if (platform->has_cst_prewake_bit)
3814                 fprintf(outf, "C-state Pre-wake: %sabled\n", msr & 0x40000000 ? "DIS" : "EN");
3815
3816         return;
3817 }
3818
3819 static void dump_turbo_ratio_limit2(void)
3820 {
3821         unsigned long long msr;
3822         unsigned int ratio;
3823
3824         get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT2, &msr);
3825
3826         fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT2: 0x%08llx\n", base_cpu, msr);
3827
3828         ratio = (msr >> 8) & 0xFF;
3829         if (ratio)
3830                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 18 active cores\n", ratio, bclk, ratio * bclk);
3831
3832         ratio = (msr >> 0) & 0xFF;
3833         if (ratio)
3834                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 17 active cores\n", ratio, bclk, ratio * bclk);
3835         return;
3836 }
3837
3838 static void dump_turbo_ratio_limit1(void)
3839 {
3840         unsigned long long msr;
3841         unsigned int ratio;
3842
3843         get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &msr);
3844
3845         fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, msr);
3846
3847         ratio = (msr >> 56) & 0xFF;
3848         if (ratio)
3849                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 16 active cores\n", ratio, bclk, ratio * bclk);
3850
3851         ratio = (msr >> 48) & 0xFF;
3852         if (ratio)
3853                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 15 active cores\n", ratio, bclk, ratio * bclk);
3854
3855         ratio = (msr >> 40) & 0xFF;
3856         if (ratio)
3857                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 14 active cores\n", ratio, bclk, ratio * bclk);
3858
3859         ratio = (msr >> 32) & 0xFF;
3860         if (ratio)
3861                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 13 active cores\n", ratio, bclk, ratio * bclk);
3862
3863         ratio = (msr >> 24) & 0xFF;
3864         if (ratio)
3865                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 12 active cores\n", ratio, bclk, ratio * bclk);
3866
3867         ratio = (msr >> 16) & 0xFF;
3868         if (ratio)
3869                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 11 active cores\n", ratio, bclk, ratio * bclk);
3870
3871         ratio = (msr >> 8) & 0xFF;
3872         if (ratio)
3873                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 10 active cores\n", ratio, bclk, ratio * bclk);
3874
3875         ratio = (msr >> 0) & 0xFF;
3876         if (ratio)
3877                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 9 active cores\n", ratio, bclk, ratio * bclk);
3878         return;
3879 }
3880
3881 static void dump_turbo_ratio_limits(int trl_msr_offset)
3882 {
3883         unsigned long long msr, core_counts;
3884         int shift;
3885
3886         get_msr(base_cpu, trl_msr_offset, &msr);
3887         fprintf(outf, "cpu%d: MSR_%sTURBO_RATIO_LIMIT: 0x%08llx\n",
3888                 base_cpu, trl_msr_offset == MSR_SECONDARY_TURBO_RATIO_LIMIT ? "SECONDARY_" : "", msr);
3889
3890         if (platform->trl_msrs & TRL_CORECOUNT) {
3891                 get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &core_counts);
3892                 fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, core_counts);
3893         } else {
3894                 core_counts = 0x0807060504030201;
3895         }
3896
3897         for (shift = 56; shift >= 0; shift -= 8) {
3898                 unsigned int ratio, group_size;
3899
3900                 ratio = (msr >> shift) & 0xFF;
3901                 group_size = (core_counts >> shift) & 0xFF;
3902                 if (ratio)
3903                         fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n",
3904                                 ratio, bclk, ratio * bclk, group_size);
3905         }
3906
3907         return;
3908 }
3909
3910 static void dump_atom_turbo_ratio_limits(void)
3911 {
3912         unsigned long long msr;
3913         unsigned int ratio;
3914
3915         get_msr(base_cpu, MSR_ATOM_CORE_RATIOS, &msr);
3916         fprintf(outf, "cpu%d: MSR_ATOM_CORE_RATIOS: 0x%08llx\n", base_cpu, msr & 0xFFFFFFFF);
3917
3918         ratio = (msr >> 0) & 0x3F;
3919         if (ratio)
3920                 fprintf(outf, "%d * %.1f = %.1f MHz minimum operating frequency\n", ratio, bclk, ratio * bclk);
3921
3922         ratio = (msr >> 8) & 0x3F;
3923         if (ratio)
3924                 fprintf(outf, "%d * %.1f = %.1f MHz low frequency mode (LFM)\n", ratio, bclk, ratio * bclk);
3925
3926         ratio = (msr >> 16) & 0x3F;
3927         if (ratio)
3928                 fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk);
3929
3930         get_msr(base_cpu, MSR_ATOM_CORE_TURBO_RATIOS, &msr);
3931         fprintf(outf, "cpu%d: MSR_ATOM_CORE_TURBO_RATIOS: 0x%08llx\n", base_cpu, msr & 0xFFFFFFFF);
3932
3933         ratio = (msr >> 24) & 0x3F;
3934         if (ratio)
3935                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 4 active cores\n", ratio, bclk, ratio * bclk);
3936
3937         ratio = (msr >> 16) & 0x3F;
3938         if (ratio)
3939                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 3 active cores\n", ratio, bclk, ratio * bclk);
3940
3941         ratio = (msr >> 8) & 0x3F;
3942         if (ratio)
3943                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 2 active cores\n", ratio, bclk, ratio * bclk);
3944
3945         ratio = (msr >> 0) & 0x3F;
3946         if (ratio)
3947                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 1 active core\n", ratio, bclk, ratio * bclk);
3948 }
3949
3950 static void dump_knl_turbo_ratio_limits(void)
3951 {
3952         const unsigned int buckets_no = 7;
3953
3954         unsigned long long msr;
3955         int delta_cores, delta_ratio;
3956         int i, b_nr;
3957         unsigned int cores[buckets_no];
3958         unsigned int ratio[buckets_no];
3959
3960         get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT, &msr);
3961
3962         fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr);
3963
3964         /*
3965          * Turbo encoding in KNL is as follows:
3966          * [0] -- Reserved
3967          * [7:1] -- Base value of number of active cores of bucket 1.
3968          * [15:8] -- Base value of freq ratio of bucket 1.
3969          * [20:16] -- +ve delta of number of active cores of bucket 2.
3970          * i.e. active cores of bucket 2 =
3971          * active cores of bucket 1 + delta
3972          * [23:21] -- Negative delta of freq ratio of bucket 2.
3973          * i.e. freq ratio of bucket 2 =
3974          * freq ratio of bucket 1 - delta
3975          * [28:24]-- +ve delta of number of active cores of bucket 3.
3976          * [31:29]-- -ve delta of freq ratio of bucket 3.
3977          * [36:32]-- +ve delta of number of active cores of bucket 4.
3978          * [39:37]-- -ve delta of freq ratio of bucket 4.
3979          * [44:40]-- +ve delta of number of active cores of bucket 5.
3980          * [47:45]-- -ve delta of freq ratio of bucket 5.
3981          * [52:48]-- +ve delta of number of active cores of bucket 6.
3982          * [55:53]-- -ve delta of freq ratio of bucket 6.
3983          * [60:56]-- +ve delta of number of active cores of bucket 7.
3984          * [63:61]-- -ve delta of freq ratio of bucket 7.
3985          */
3986
3987         b_nr = 0;
3988         cores[b_nr] = (msr & 0xFF) >> 1;
3989         ratio[b_nr] = (msr >> 8) & 0xFF;
3990
3991         for (i = 16; i < 64; i += 8) {
3992                 delta_cores = (msr >> i) & 0x1F;
3993                 delta_ratio = (msr >> (i + 5)) & 0x7;
3994
3995                 cores[b_nr + 1] = cores[b_nr] + delta_cores;
3996                 ratio[b_nr + 1] = ratio[b_nr] - delta_ratio;
3997                 b_nr++;
3998         }
3999
4000         for (i = buckets_no - 1; i >= 0; i--)
4001                 if (i > 0 ? ratio[i] != ratio[i - 1] : 1)
4002                         fprintf(outf,
4003                                 "%d * %.1f = %.1f MHz max turbo %d active cores\n",
4004                                 ratio[i], bclk, ratio[i] * bclk, cores[i]);
4005 }
4006
4007 static void dump_cst_cfg(void)
4008 {
4009         unsigned long long msr;
4010
4011         if (!platform->has_nhm_msrs || no_msr)
4012                 return;
4013
4014         get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);
4015
4016         fprintf(outf, "cpu%d: MSR_PKG_CST_CONFIG_CONTROL: 0x%08llx", base_cpu, msr);
4017
4018         fprintf(outf, " (%s%s%s%s%slocked, pkg-cstate-limit=%d (%s)",
4019                 (msr & SNB_C3_AUTO_UNDEMOTE) ? "UNdemote-C3, " : "",
4020                 (msr & SNB_C1_AUTO_UNDEMOTE) ? "UNdemote-C1, " : "",
4021                 (msr & NHM_C3_AUTO_DEMOTE) ? "demote-C3, " : "",
4022                 (msr & NHM_C1_AUTO_DEMOTE) ? "demote-C1, " : "",
4023                 (msr & (1 << 15)) ? "" : "UN", (unsigned int)msr & 0xF, pkg_cstate_limit_strings[pkg_cstate_limit]);
4024
4025 #define AUTOMATIC_CSTATE_CONVERSION             (1UL << 16)
4026         if (platform->has_cst_auto_convension) {
4027                 fprintf(outf, ", automatic c-state conversion=%s", (msr & AUTOMATIC_CSTATE_CONVERSION) ? "on" : "off");
4028         }
4029
4030         fprintf(outf, ")\n");
4031
4032         return;
4033 }
4034
4035 static void dump_config_tdp(void)
4036 {
4037         unsigned long long msr;
4038
4039         get_msr(base_cpu, MSR_CONFIG_TDP_NOMINAL, &msr);
4040         fprintf(outf, "cpu%d: MSR_CONFIG_TDP_NOMINAL: 0x%08llx", base_cpu, msr);
4041         fprintf(outf, " (base_ratio=%d)\n", (unsigned int)msr & 0xFF);
4042
4043         get_msr(base_cpu, MSR_CONFIG_TDP_LEVEL_1, &msr);
4044         fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_1: 0x%08llx (", base_cpu, msr);
4045         if (msr) {
4046                 fprintf(outf, "PKG_MIN_PWR_LVL1=%d ", (unsigned int)(msr >> 48) & 0x7FFF);
4047                 fprintf(outf, "PKG_MAX_PWR_LVL1=%d ", (unsigned int)(msr >> 32) & 0x7FFF);
4048                 fprintf(outf, "LVL1_RATIO=%d ", (unsigned int)(msr >> 16) & 0xFF);
4049                 fprintf(outf, "PKG_TDP_LVL1=%d", (unsigned int)(msr) & 0x7FFF);
4050         }
4051         fprintf(outf, ")\n");
4052
4053         get_msr(base_cpu, MSR_CONFIG_TDP_LEVEL_2, &msr);
4054         fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_2: 0x%08llx (", base_cpu, msr);
4055         if (msr) {
4056                 fprintf(outf, "PKG_MIN_PWR_LVL2=%d ", (unsigned int)(msr >> 48) & 0x7FFF);
4057                 fprintf(outf, "PKG_MAX_PWR_LVL2=%d ", (unsigned int)(msr >> 32) & 0x7FFF);
4058                 fprintf(outf, "LVL2_RATIO=%d ", (unsigned int)(msr >> 16) & 0xFF);
4059                 fprintf(outf, "PKG_TDP_LVL2=%d", (unsigned int)(msr) & 0x7FFF);
4060         }
4061         fprintf(outf, ")\n");
4062
4063         get_msr(base_cpu, MSR_CONFIG_TDP_CONTROL, &msr);
4064         fprintf(outf, "cpu%d: MSR_CONFIG_TDP_CONTROL: 0x%08llx (", base_cpu, msr);
4065         if ((msr) & 0x3)
4066                 fprintf(outf, "TDP_LEVEL=%d ", (unsigned int)(msr) & 0x3);
4067         fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1);
4068         fprintf(outf, ")\n");
4069
4070         get_msr(base_cpu, MSR_TURBO_ACTIVATION_RATIO, &msr);
4071         fprintf(outf, "cpu%d: MSR_TURBO_ACTIVATION_RATIO: 0x%08llx (", base_cpu, msr);
4072         fprintf(outf, "MAX_NON_TURBO_RATIO=%d", (unsigned int)(msr) & 0xFF);
4073         fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1);
4074         fprintf(outf, ")\n");
4075 }
4076
4077 unsigned int irtl_time_units[] = { 1, 32, 1024, 32768, 1048576, 33554432, 0, 0 };
4078
4079 void print_irtl(void)
4080 {
4081         unsigned long long msr;
4082
4083         if (!platform->has_irtl_msrs || no_msr)
4084                 return;
4085
4086         if (platform->supported_cstates & PC3) {
4087                 get_msr(base_cpu, MSR_PKGC3_IRTL, &msr);
4088                 fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr);
4089                 fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
4090                         (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
4091         }
4092
4093         if (platform->supported_cstates & PC6) {
4094                 get_msr(base_cpu, MSR_PKGC6_IRTL, &msr);
4095                 fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", base_cpu, msr);
4096                 fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
4097                         (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
4098         }
4099
4100         if (platform->supported_cstates & PC7) {
4101                 get_msr(base_cpu, MSR_PKGC7_IRTL, &msr);
4102                 fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", base_cpu, msr);
4103                 fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
4104                         (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
4105         }
4106
4107         if (platform->supported_cstates & PC8) {
4108                 get_msr(base_cpu, MSR_PKGC8_IRTL, &msr);
4109                 fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", base_cpu, msr);
4110                 fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
4111                         (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
4112         }
4113
4114         if (platform->supported_cstates & PC9) {
4115                 get_msr(base_cpu, MSR_PKGC9_IRTL, &msr);
4116                 fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", base_cpu, msr);
4117                 fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
4118                         (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
4119         }
4120
4121         if (platform->supported_cstates & PC10) {
4122                 get_msr(base_cpu, MSR_PKGC10_IRTL, &msr);
4123                 fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", base_cpu, msr);
4124                 fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
4125                         (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
4126         }
4127 }
4128
4129 void free_fd_percpu(void)
4130 {
4131         int i;
4132
4133         if (!fd_percpu)
4134                 return;
4135
4136         for (i = 0; i < topo.max_cpu_num + 1; ++i) {
4137                 if (fd_percpu[i] != 0)
4138                         close(fd_percpu[i]);
4139         }
4140
4141         free(fd_percpu);
4142         fd_percpu = NULL;
4143 }
4144
4145 void free_fd_amperf_percpu(void)
4146 {
4147         int i;
4148
4149         if (!fd_amperf_percpu)
4150                 return;
4151
4152         for (i = 0; i < topo.max_cpu_num + 1; ++i) {
4153                 if (fd_amperf_percpu[i].mperf != 0)
4154                         close(fd_amperf_percpu[i].mperf);
4155
4156                 if (fd_amperf_percpu[i].aperf != 0)
4157                         close(fd_amperf_percpu[i].aperf);
4158         }
4159
4160         free(fd_amperf_percpu);
4161         fd_amperf_percpu = NULL;
4162 }
4163
4164 void free_fd_instr_count_percpu(void)
4165 {
4166         if (!fd_instr_count_percpu)
4167                 return;
4168
4169         for (int i = 0; i < topo.max_cpu_num + 1; ++i) {
4170                 if (fd_instr_count_percpu[i] != 0)
4171                         close(fd_instr_count_percpu[i]);
4172         }
4173
4174         free(fd_instr_count_percpu);
4175         fd_instr_count_percpu = NULL;
4176 }
4177
4178 void free_fd_rapl_percpu(void)
4179 {
4180         if (!rapl_counter_info_perdomain)
4181                 return;
4182
4183         const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages;
4184
4185         for (int domain_id = 0; domain_id < num_domains; ++domain_id) {
4186                 if (rapl_counter_info_perdomain[domain_id].fd_perf != -1)
4187                         close(rapl_counter_info_perdomain[domain_id].fd_perf);
4188         }
4189
4190         free(rapl_counter_info_perdomain);
4191 }
4192
4193 void free_all_buffers(void)
4194 {
4195         int i;
4196
4197         CPU_FREE(cpu_present_set);
4198         cpu_present_set = NULL;
4199         cpu_present_setsize = 0;
4200
4201         CPU_FREE(cpu_effective_set);
4202         cpu_effective_set = NULL;
4203         cpu_effective_setsize = 0;
4204
4205         CPU_FREE(cpu_allowed_set);
4206         cpu_allowed_set = NULL;
4207         cpu_allowed_setsize = 0;
4208
4209         CPU_FREE(cpu_affinity_set);
4210         cpu_affinity_set = NULL;
4211         cpu_affinity_setsize = 0;
4212
4213         free(thread_even);
4214         free(core_even);
4215         free(package_even);
4216
4217         thread_even = NULL;
4218         core_even = NULL;
4219         package_even = NULL;
4220
4221         free(thread_odd);
4222         free(core_odd);
4223         free(package_odd);
4224
4225         thread_odd = NULL;
4226         core_odd = NULL;
4227         package_odd = NULL;
4228
4229         free(output_buffer);
4230         output_buffer = NULL;
4231         outp = NULL;
4232
4233         free_fd_percpu();
4234         free_fd_instr_count_percpu();
4235         free_fd_amperf_percpu();
4236         free_fd_rapl_percpu();
4237
4238         free(irq_column_2_cpu);
4239         free(irqs_per_cpu);
4240
4241         for (i = 0; i <= topo.max_cpu_num; ++i) {
4242                 if (cpus[i].put_ids)
4243                         CPU_FREE(cpus[i].put_ids);
4244         }
4245         free(cpus);
4246 }
4247
/*
 * parse_int_file(fmt, ...)
 *
 * Build a path from the printf-style arguments and read one decimal int
 * from the start of that file.
 *
 * Returns the parsed value, or 0 when the file cannot be opened (so a
 * missing file is indistinguishable from a file containing 0).
 * Exits via err() when the file opens but does not start with an integer.
 */
int parse_int_file(const char *fmt, ...)
{
        char path[PATH_MAX];
        va_list ap;
        FILE *fp;
        int parsed;
        int nmatched;

        va_start(ap, fmt);
        vsnprintf(path, sizeof(path), fmt, ap);
        va_end(ap);

        fp = fopen(path, "r");
        if (fp == NULL)
                return 0;

        nmatched = fscanf(fp, "%d", &parsed);
        if (nmatched != 1)
                err(1, "%s: failed to parse number from file", path);

        fclose(fp);

        return parsed;
}
4271
/*
 * cpu_is_first_core_in_package(cpu)
 *
 * Return 1 when the first number in this cpu's core_siblings_list sysfs
 * file equals cpu, else 0. parse_int_file() stops at the first
 * non-digit, so only the leading entry of the list is compared.
 */
int cpu_is_first_core_in_package(int cpu)
{
        int first_sibling;

        first_sibling = parse_int_file("/sys/devices/system/cpu/cpu%d/topology/core_siblings_list", cpu);

        return cpu == first_sibling;
}
4280
/*
 * Return the integer stored in this cpu's topology/physical_package_id
 * sysfs file (0 when the file cannot be opened; see parse_int_file()).
 */
int get_physical_package_id(int cpu)
{
        int package_id;

        package_id = parse_int_file("/sys/devices/system/cpu/cpu%d/topology/physical_package_id", cpu);

        return package_id;
}
4285
/*
 * Return the integer stored in this cpu's topology/die_id sysfs file
 * (0 when the file cannot be opened; see parse_int_file()).
 */
int get_die_id(int cpu)
{
        int die_id;

        die_id = parse_int_file("/sys/devices/system/cpu/cpu%d/topology/die_id", cpu);

        return die_id;
}
4290
/*
 * Return the integer stored in this cpu's topology/core_id sysfs file
 * (0 when the file cannot be opened; see parse_int_file()).
 */
int get_core_id(int cpu)
{
        int core_id;

        core_id = parse_int_file("/sys/devices/system/cpu/cpu%d/topology/core_id", cpu);

        return core_id;
}
4295
4296 void set_node_data(void)
4297 {
4298         int pkg, node, lnode, cpu, cpux;
4299         int cpu_count;
4300
4301         /* initialize logical_node_id */
4302         for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu)
4303                 cpus[cpu].logical_node_id = -1;
4304
4305         cpu_count = 0;
4306         for (pkg = 0; pkg < topo.num_packages; pkg++) {
4307                 lnode = 0;
4308                 for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu) {
4309                         if (cpus[cpu].physical_package_id != pkg)
4310                                 continue;
4311                         /* find a cpu with an unset logical_node_id */
4312                         if (cpus[cpu].logical_node_id != -1)
4313                                 continue;
4314                         cpus[cpu].logical_node_id = lnode;
4315                         node = cpus[cpu].physical_node_id;
4316                         cpu_count++;
4317                         /*
4318                          * find all matching cpus on this pkg and set
4319                          * the logical_node_id
4320                          */
4321                         for (cpux = cpu; cpux <= topo.max_cpu_num; cpux++) {
4322                                 if ((cpus[cpux].physical_package_id == pkg) && (cpus[cpux].physical_node_id == node)) {
4323                                         cpus[cpux].logical_node_id = lnode;
4324                                         cpu_count++;
4325                                 }
4326                         }
4327                         lnode++;
4328                         if (lnode > topo.nodes_per_pkg)
4329                                 topo.nodes_per_pkg = lnode;
4330                 }
4331                 if (cpu_count >= topo.max_cpu_num)
4332                         break;
4333         }
4334 }
4335
4336 int get_physical_node_id(struct cpu_topology *thiscpu)
4337 {
4338         char path[80];
4339         FILE *filep;
4340         int i;
4341         int cpu = thiscpu->logical_cpu_id;
4342
4343         for (i = 0; i <= topo.max_cpu_num; i++) {
4344                 sprintf(path, "/sys/devices/system/cpu/cpu%d/node%i/cpulist", cpu, i);
4345                 filep = fopen(path, "r");
4346                 if (!filep)
4347                         continue;
4348                 fclose(filep);
4349                 return i;
4350         }
4351         return -1;
4352 }
4353
4354 static int parse_cpu_str(char *cpu_str, cpu_set_t *cpu_set, int cpu_set_size)
4355 {
4356         unsigned int start, end;
4357         char *next = cpu_str;
4358
4359         while (next && *next) {
4360
4361                 if (*next == '-')       /* no negative cpu numbers */
4362                         return 1;
4363
4364                 start = strtoul(next, &next, 10);
4365
4366                 if (start >= CPU_SUBSET_MAXCPUS)
4367                         return 1;
4368                 CPU_SET_S(start, cpu_set_size, cpu_set);
4369
4370                 if (*next == '\0' || *next == '\n')
4371                         break;
4372
4373                 if (*next == ',') {
4374                         next += 1;
4375                         continue;
4376                 }
4377
4378                 if (*next == '-') {
4379                         next += 1;      /* start range */
4380                 } else if (*next == '.') {
4381                         next += 1;
4382                         if (*next == '.')
4383                                 next += 1;      /* start range */
4384                         else
4385                                 return 1;
4386                 }
4387
4388                 end = strtoul(next, &next, 10);
4389                 if (end <= start)
4390                         return 1;
4391
4392                 while (++start <= end) {
4393                         if (start >= CPU_SUBSET_MAXCPUS)
4394                                 return 1;
4395                         CPU_SET_S(start, cpu_set_size, cpu_set);
4396                 }
4397
4398                 if (*next == ',')
4399                         next += 1;
4400                 else if (*next != '\0' && *next != '\n')
4401                         return 1;
4402         }
4403
4404         return 0;
4405 }
4406
/*
 * Build thiscpu->put_ids, the set of cpus listed in this cpu's
 * topology/thread_siblings sysfs mask whose core id equals
 * thiscpu->physical_core_id, and hand out thread_id values to this cpu
 * and to those siblings that do not have one yet (thread_id < 0).
 *
 * Returns the number of cpus in put_ids, or -1 when the set cannot be
 * allocated or the sysfs file cannot be opened. Exits via err() when the
 * file cannot be parsed.
 */
int get_thread_siblings(struct cpu_topology *thiscpu)
{
        char path[80], character;
        FILE *filep;
        unsigned long map;
        int so, shift, sib_core;
        int cpu = thiscpu->logical_cpu_id;
        int offset = topo.max_cpu_num + 1;
        size_t size;
        int thread_id = 0;

        thiscpu->put_ids = CPU_ALLOC((topo.max_cpu_num + 1));
        /* this cpu takes thread_id 0 unless it was already numbered */
        if (thiscpu->thread_id < 0)
                thiscpu->thread_id = thread_id++;
        if (!thiscpu->put_ids)
                return -1;

        size = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
        CPU_ZERO_S(size, thiscpu->put_ids);

        sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpu);
        filep = fopen(path, "r");

        if (!filep) {
                warnx("%s: open failed", path);
                return -1;
        }
        /*
         * The file is read as comma-separated hex words. offset starts one
         * past the highest cpu number and drops by BITMASK_SIZE per word,
         * so the first word read is mapped onto the highest cpu numbers.
         * NOTE(review): presumably BITMASK_SIZE is the number of cpus each
         * word describes -- confirm against its definition.
         */
        do {
                offset -= BITMASK_SIZE;
                if (fscanf(filep, "%lx%c", &map, &character) != 2)
                        err(1, "%s: failed to parse file", path);
                for (shift = 0; shift < BITMASK_SIZE; shift++) {
                        if ((map >> shift) & 0x1) {
                                /* so = cpu number of this set bit */
                                so = shift + offset;
                                sib_core = get_core_id(so);
                                if (sib_core == thiscpu->physical_core_id) {
                                        CPU_SET_S(so, size, thiscpu->put_ids);
                                        /* number siblings in the order they are found */
                                        if ((so != cpu) && (cpus[so].thread_id < 0))
                                                cpus[so].thread_id = thread_id++;
                                }
                        }
                }
        } while (character == ',');
        fclose(filep);

        return CPU_COUNT_S(size, thiscpu->put_ids);
}
4454
4455 /*
4456  * run func(thread, core, package) in topology order
4457  * skip non-present cpus
4458  */
4459
4460 int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *,
4461                                struct pkg_data *, struct thread_data *, struct core_data *,
4462                                struct pkg_data *), struct thread_data *thread_base,
4463                    struct core_data *core_base, struct pkg_data *pkg_base,
4464                    struct thread_data *thread_base2, struct core_data *core_base2, struct pkg_data *pkg_base2)
4465 {
4466         int retval, pkg_no, node_no, core_no, thread_no;
4467
4468         for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) {
4469                 for (node_no = 0; node_no < topo.nodes_per_pkg; ++node_no) {
4470                         for (core_no = 0; core_no < topo.cores_per_node; ++core_no) {
4471                                 for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) {
4472                                         struct thread_data *t, *t2;
4473                                         struct core_data *c, *c2;
4474                                         struct pkg_data *p, *p2;
4475
4476                                         t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no);
4477
4478                                         if (cpu_is_not_allowed(t->cpu_id))
4479                                                 continue;
4480
4481                                         t2 = GET_THREAD(thread_base2, thread_no, core_no, node_no, pkg_no);
4482
4483                                         c = GET_CORE(core_base, core_no, node_no, pkg_no);
4484                                         c2 = GET_CORE(core_base2, core_no, node_no, pkg_no);
4485
4486                                         p = GET_PKG(pkg_base, pkg_no);
4487                                         p2 = GET_PKG(pkg_base2, pkg_no);
4488
4489                                         retval = func(t, c, p, t2, c2, p2);
4490                                         if (retval)
4491                                                 return retval;
4492                                 }
4493                         }
4494                 }
4495         }
4496         return 0;
4497 }
4498
4499 /*
4500  * run func(cpu) on every cpu in /proc/stat
4501  * return max_cpu number
4502  */
4503 int for_all_proc_cpus(int (func) (int))
4504 {
4505         FILE *fp;
4506         int cpu_num;
4507         int retval;
4508
4509         fp = fopen_or_die(proc_stat, "r");
4510
4511         retval = fscanf(fp, "cpu %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d\n");
4512         if (retval != 0)
4513                 err(1, "%s: failed to parse format", proc_stat);
4514
4515         while (1) {
4516                 retval = fscanf(fp, "cpu%u %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d\n", &cpu_num);
4517                 if (retval != 1)
4518                         break;
4519
4520                 retval = func(cpu_num);
4521                 if (retval) {
4522                         fclose(fp);
4523                         return (retval);
4524                 }
4525         }
4526         fclose(fp);
4527         return 0;
4528 }
4529
#define PATH_EFFECTIVE_CPUS     "/sys/fs/cgroup/cpuset.cpus.effective"

/* last line read from PATH_EFFECTIVE_CPUS; empty until the first successful read */
static char cpu_effective_str[1024];

/*
 * Refresh cpu_effective_str from the first line of PATH_EFFECTIVE_CPUS.
 *
 * startup: when false, the file is consulted only if an earlier call
 *          already stored a non-empty string.
 *
 * Returns 1 when the stored string was replaced with new content, and 0
 * when nothing changed, the file could not be opened, or the call was
 * skipped. Exits via err() when the file opens but no line can be read.
 */
static int update_effective_str(bool startup)
{
        char line[1024];
        FILE *fp;

        if (!startup && cpu_effective_str[0] == '\0')
                return 0;

        fp = fopen(PATH_EFFECTIVE_CPUS, "r");
        if (fp == NULL)
                return 0;

        if (fgets(line, 1024, fp) == NULL)
                err(1, "%s: file read failed\n", PATH_EFFECTIVE_CPUS);

        fclose(fp);

        /* unchanged since the last snapshot */
        if (strncmp(cpu_effective_str, line, 1024) == 0)
                return 0;

        strncpy(cpu_effective_str, line, 1024);
        return 1;
}
4561
4562 static void update_effective_set(bool startup)
4563 {
4564         update_effective_str(startup);
4565
4566         if (parse_cpu_str(cpu_effective_str, cpu_effective_set, cpu_effective_setsize))
4567                 err(1, "%s: cpu str malformat %s\n", PATH_EFFECTIVE_CPUS, cpu_effective_str);
4568 }
4569
/* forward declarations for the perf setup routines called by re_initialize() */
void linux_perf_init(void);
void rapl_perf_init(void);

/*
 * Tear down and rebuild the run-time state: free every buffer, call
 * setup_all_buffers(false), re-run both perf init routines, then log the
 * resulting topo.num_cpus and topo.allowed_cpus to outf.
 */
void re_initialize(void)
{
        free_all_buffers();
        setup_all_buffers(false);
        linux_perf_init();
        rapl_perf_init();
        fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus,
                topo.allowed_cpus);
}
4582
4583 void set_max_cpu_num(void)
4584 {
4585         FILE *filep;
4586         int base_cpu;
4587         unsigned long dummy;
4588         char pathname[64];
4589
4590         base_cpu = sched_getcpu();
4591         if (base_cpu < 0)
4592                 err(1, "cannot find calling cpu ID");
4593         sprintf(pathname, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", base_cpu);
4594
4595         filep = fopen_or_die(pathname, "r");
4596         topo.max_cpu_num = 0;
4597         while (fscanf(filep, "%lx,", &dummy) == 1)
4598                 topo.max_cpu_num += BITMASK_SIZE;
4599         fclose(filep);
4600         topo.max_cpu_num--;     /* 0 based */
4601 }
4602
4603 /*
4604  * count_cpus()
4605  * remember the last one seen, it will be the max
4606  */
4607 int count_cpus(int cpu)
4608 {
4609         UNUSED(cpu);
4610
4611         topo.num_cpus++;
4612         return 0;
4613 }
4614
/*
 * Set the bit for cpu in cpu_present_set. Always returns 0.
 * NOTE(review): the int(int) shape matches the callback taken by
 * for_all_proc_cpus(); presumably that is how it is used -- confirm.
 */
int mark_cpu_present(int cpu)
{
        CPU_SET_S(cpu, cpu_present_setsize, cpu_present_set);
        return 0;
}
4620
/*
 * Mark cpus[cpu] as having no thread id yet by storing -1; negative
 * values are what get_thread_siblings() treats as "unassigned".
 * Always returns 0.
 */
int init_thread_id(int cpu)
{
        cpus[cpu].thread_id = -1;
        return 0;
}
4626
4627 /*
4628  * snapshot_proc_interrupts()
4629  *
4630  * read and record summary of /proc/interrupts
4631  *
4632  * return 1 if config change requires a restart, else return 0
4633  */
4634 int snapshot_proc_interrupts(void)
4635 {
4636         static FILE *fp;
4637         int column, retval;
4638
4639         if (fp == NULL)
4640                 fp = fopen_or_die("/proc/interrupts", "r");
4641         else
4642                 rewind(fp);
4643
4644         /* read 1st line of /proc/interrupts to get cpu* name for each column */
4645         for (column = 0; column < topo.num_cpus; ++column) {
4646                 int cpu_number;
4647
4648                 retval = fscanf(fp, " CPU%d", &cpu_number);
4649                 if (retval != 1)
4650                         break;
4651
4652                 if (cpu_number > topo.max_cpu_num) {
4653                         warn("/proc/interrupts: cpu%d: > %d", cpu_number, topo.max_cpu_num);
4654                         return 1;
4655                 }
4656
4657                 irq_column_2_cpu[column] = cpu_number;
4658                 irqs_per_cpu[cpu_number] = 0;
4659         }
4660
4661         /* read /proc/interrupt count lines and sum up irqs per cpu */
4662         while (1) {
4663                 int column;
4664                 char buf[64];
4665
4666                 retval = fscanf(fp, " %s:", buf);       /* flush irq# "N:" */
4667                 if (retval != 1)
4668                         break;
4669
4670                 /* read the count per cpu */
4671                 for (column = 0; column < topo.num_cpus; ++column) {
4672
4673                         int cpu_number, irq_count;
4674
4675                         retval = fscanf(fp, " %d", &irq_count);
4676                         if (retval != 1)
4677                                 break;
4678
4679                         cpu_number = irq_column_2_cpu[column];
4680                         irqs_per_cpu[cpu_number] += irq_count;
4681
4682                 }
4683
4684                 while (getc(fp) != '\n') ;      /* flush interrupt description */
4685
4686         }
4687         return 0;
4688 }
4689
4690 /*
4691  * snapshot_graphics()
4692  *
4693  * record snapshot of specified graphics sysfs knob
4694  *
4695  * return 1 if config change requires a restart, else return 0
4696  */
4697 int snapshot_graphics(int idx)
4698 {
4699         FILE *fp;
4700         int retval;
4701
4702         switch (idx) {
4703         case GFX_rc6:
4704         case SAM_mc6:
4705                 fp = fopen_or_die(gfx_info[idx].path, "r");
4706                 retval = fscanf(fp, "%lld", &gfx_info[idx].val_ull);
4707                 if (retval != 1)
4708                         err(1, "rc6");
4709                 fclose(fp);
4710                 return 0;
4711         case GFX_MHz:
4712         case GFX_ACTMHz:
4713         case SAM_MHz:
4714         case SAM_ACTMHz:
4715                 if (gfx_info[idx].fp == NULL) {
4716                         gfx_info[idx].fp = fopen_or_die(gfx_info[idx].path, "r");
4717                 } else {
4718                         rewind(gfx_info[idx].fp);
4719                         fflush(gfx_info[idx].fp);
4720                 }
4721                 retval = fscanf(gfx_info[idx].fp, "%d", &gfx_info[idx].val);
4722                 if (retval != 1)
4723                         err(1, "MHz");
4724                 return 0;
4725         default:
4726                 return -EINVAL;
4727         }
4728 }
4729
4730 /*
4731  * snapshot_cpu_lpi()
4732  *
4733  * record snapshot of
4734  * /sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us
4735  */
4736 int snapshot_cpu_lpi_us(void)
4737 {
4738         FILE *fp;
4739         int retval;
4740
4741         fp = fopen_or_die("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", "r");
4742
4743         retval = fscanf(fp, "%lld", &cpuidle_cur_cpu_lpi_us);
4744         if (retval != 1) {
4745                 fprintf(stderr, "Disabling Low Power Idle CPU output\n");
4746                 BIC_NOT_PRESENT(BIC_CPU_LPI);
4747                 fclose(fp);
4748                 return -1;
4749         }
4750
4751         fclose(fp);
4752
4753         return 0;
4754 }
4755
4756 /*
4757  * snapshot_sys_lpi()
4758  *
4759  * record snapshot of sys_lpi_file
4760  */
4761 int snapshot_sys_lpi_us(void)
4762 {
4763         FILE *fp;
4764         int retval;
4765
4766         fp = fopen_or_die(sys_lpi_file, "r");
4767
4768         retval = fscanf(fp, "%lld", &cpuidle_cur_sys_lpi_us);
4769         if (retval != 1) {
4770                 fprintf(stderr, "Disabling Low Power Idle System output\n");
4771                 BIC_NOT_PRESENT(BIC_SYS_LPI);
4772                 fclose(fp);
4773                 return -1;
4774         }
4775         fclose(fp);
4776
4777         return 0;
4778 }
4779
4780 /*
4781  * snapshot /proc and /sys files
4782  *
4783  * return 1 if configuration restart needed, else return 0
4784  */
4785 int snapshot_proc_sysfs_files(void)
4786 {
4787         if (DO_BIC(BIC_IRQ))
4788                 if (snapshot_proc_interrupts())
4789                         return 1;
4790
4791         if (DO_BIC(BIC_GFX_rc6))
4792                 snapshot_graphics(GFX_rc6);
4793
4794         if (DO_BIC(BIC_GFXMHz))
4795                 snapshot_graphics(GFX_MHz);
4796
4797         if (DO_BIC(BIC_GFXACTMHz))
4798                 snapshot_graphics(GFX_ACTMHz);
4799
4800         if (DO_BIC(BIC_SAM_mc6))
4801                 snapshot_graphics(SAM_mc6);
4802
4803         if (DO_BIC(BIC_SAMMHz))
4804                 snapshot_graphics(SAM_MHz);
4805
4806         if (DO_BIC(BIC_SAMACTMHz))
4807                 snapshot_graphics(SAM_ACTMHz);
4808
4809         if (DO_BIC(BIC_CPU_LPI))
4810                 snapshot_cpu_lpi_us();
4811
4812         if (DO_BIC(BIC_SYS_LPI))
4813                 snapshot_sys_lpi_us();
4814
4815         return 0;
4816 }
4817
4818 int exit_requested;
4819
4820 static void signal_handler(int signal)
4821 {
4822         switch (signal) {
4823         case SIGINT:
4824                 exit_requested = 1;
4825                 if (debug)
4826                         fprintf(stderr, " SIGINT\n");
4827                 break;
4828         case SIGUSR1:
4829                 if (debug > 1)
4830                         fprintf(stderr, "SIGUSR1\n");
4831                 break;
4832         }
4833 }
4834
4835 void setup_signal_handler(void)
4836 {
4837         struct sigaction sa;
4838
4839         memset(&sa, 0, sizeof(sa));
4840
4841         sa.sa_handler = &signal_handler;
4842
4843         if (sigaction(SIGINT, &sa, NULL) < 0)
4844                 err(1, "sigaction SIGINT");
4845         if (sigaction(SIGUSR1, &sa, NULL) < 0)
4846                 err(1, "sigaction SIGUSR1");
4847 }
4848
4849 void do_sleep(void)
4850 {
4851         struct timeval tout;
4852         struct timespec rest;
4853         fd_set readfds;
4854         int retval;
4855
4856         FD_ZERO(&readfds);
4857         FD_SET(0, &readfds);
4858
4859         if (ignore_stdin) {
4860                 nanosleep(&interval_ts, NULL);
4861                 return;
4862         }
4863
4864         tout = interval_tv;
4865         retval = select(1, &readfds, NULL, NULL, &tout);
4866
4867         if (retval == 1) {
4868                 switch (getc(stdin)) {
4869                 case 'q':
4870                         exit_requested = 1;
4871                         break;
4872                 case EOF:
4873                         /*
4874                          * 'stdin' is a pipe closed on the other end. There
4875                          * won't be any further input.
4876                          */
4877                         ignore_stdin = 1;
4878                         /* Sleep the rest of the time */
4879                         rest.tv_sec = (tout.tv_sec + tout.tv_usec / 1000000);
4880                         rest.tv_nsec = (tout.tv_usec % 1000000) * 1000;
4881                         nanosleep(&rest, NULL);
4882                 }
4883         }
4884 }
4885
4886 int get_msr_sum(int cpu, off_t offset, unsigned long long *msr)
4887 {
4888         int ret, idx;
4889         unsigned long long msr_cur, msr_last;
4890
4891         assert(!no_msr);
4892
4893         if (!per_cpu_msr_sum)
4894                 return 1;
4895
4896         idx = offset_to_idx(offset);
4897         if (idx < 0)
4898                 return idx;
4899         /* get_msr_sum() = sum + (get_msr() - last) */
4900         ret = get_msr(cpu, offset, &msr_cur);
4901         if (ret)
4902                 return ret;
4903         msr_last = per_cpu_msr_sum[cpu].entries[idx].last;
4904         DELTA_WRAP32(msr_cur, msr_last);
4905         *msr = msr_last + per_cpu_msr_sum[cpu].entries[idx].sum;
4906
4907         return 0;
4908 }
4909
4910 timer_t timerid;
4911
4912 /* Timer callback, update the sum of MSRs periodically. */
4913 static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg_data *p)
4914 {
4915         int i, ret;
4916         int cpu = t->cpu_id;
4917
4918         UNUSED(c);
4919         UNUSED(p);
4920
4921         assert(!no_msr);
4922
4923         for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) {
4924                 unsigned long long msr_cur, msr_last;
4925                 off_t offset;
4926
4927                 if (!idx_valid(i))
4928                         continue;
4929                 offset = idx_to_offset(i);
4930                 if (offset < 0)
4931                         continue;
4932                 ret = get_msr(cpu, offset, &msr_cur);
4933                 if (ret) {
4934                         fprintf(outf, "Can not update msr(0x%llx)\n", (unsigned long long)offset);
4935                         continue;
4936                 }
4937
4938                 msr_last = per_cpu_msr_sum[cpu].entries[i].last;
4939                 per_cpu_msr_sum[cpu].entries[i].last = msr_cur & 0xffffffff;
4940
4941                 DELTA_WRAP32(msr_cur, msr_last);
4942                 per_cpu_msr_sum[cpu].entries[i].sum += msr_last;
4943         }
4944         return 0;
4945 }
4946
4947 static void msr_record_handler(union sigval v)
4948 {
4949         UNUSED(v);
4950
4951         for_all_cpus(update_msr_sum, EVEN_COUNTERS);
4952 }
4953
4954 void msr_sum_record(void)
4955 {
4956         struct itimerspec its;
4957         struct sigevent sev;
4958
4959         per_cpu_msr_sum = calloc(topo.max_cpu_num + 1, sizeof(struct msr_sum_array));
4960         if (!per_cpu_msr_sum) {
4961                 fprintf(outf, "Can not allocate memory for long time MSR.\n");
4962                 return;
4963         }
4964         /*
4965          * Signal handler might be restricted, so use thread notifier instead.
4966          */
4967         memset(&sev, 0, sizeof(struct sigevent));
4968         sev.sigev_notify = SIGEV_THREAD;
4969         sev.sigev_notify_function = msr_record_handler;
4970
4971         sev.sigev_value.sival_ptr = &timerid;
4972         if (timer_create(CLOCK_REALTIME, &sev, &timerid) == -1) {
4973                 fprintf(outf, "Can not create timer.\n");
4974                 goto release_msr;
4975         }
4976
4977         its.it_value.tv_sec = 0;
4978         its.it_value.tv_nsec = 1;
4979         /*
4980          * A wraparound time has been calculated early.
4981          * Some sources state that the peak power for a
4982          * microprocessor is usually 1.5 times the TDP rating,
4983          * use 2 * TDP for safety.
4984          */
4985         its.it_interval.tv_sec = rapl_joule_counter_range / 2;
4986         its.it_interval.tv_nsec = 0;
4987
4988         if (timer_settime(timerid, 0, &its, NULL) == -1) {
4989                 fprintf(outf, "Can not set timer.\n");
4990                 goto release_timer;
4991         }
4992         return;
4993
4994 release_timer:
4995         timer_delete(timerid);
4996 release_msr:
4997         free(per_cpu_msr_sum);
4998 }
4999
/*
 * set_my_sched_priority(pri)
 *
 * change the calling process's priority (nice value) to pri and read it
 * back to confirm that the change took effect
 *
 * return previous priority on success
 * return value < -20 on failure
 */
int set_my_sched_priority(int priority)
{
        enum { PRIORITY_FAILURE = -21 };
        int previous, readback;

        /* -1 is a legal priority: only errno tells an error apart */
        errno = 0;
        previous = getpriority(PRIO_PROCESS, 0);
        if (previous == -1 && errno)
                return PRIORITY_FAILURE;

        if (setpriority(PRIO_PROCESS, 0, priority) != 0)
                return PRIORITY_FAILURE;

        errno = 0;
        readback = getpriority(PRIO_PROCESS, 0);
        if (readback != priority)
                return PRIORITY_FAILURE;

        return previous;
}
5026
/*
 * turbostat_loop()
 *
 * Interval-mode main loop.  Counters are sampled alternately into the
 * EVEN and ODD sets; after each sample the delta against the other set is
 * computed, averaged, formatted and flushed to the output.
 *
 * The loop returns when exit_requested is set or, if num_iterations is
 * non-zero, once that many samples have been printed.
 *
 * A counter read returning -1, a cpu found not present, a change reported
 * by update_effective_str() or a failing delta computation all lead to
 * re_initialize() and a jump back to "restart".  A counter read returning
 * less than -1 terminates the program with that value, as does an initial
 * read that keeps returning -1 for more than 10 consecutive attempts.
 */
void turbostat_loop()
{
        int retval;
        int restarted = 0;      /* consecutive passes through "restart" without a good initial read */
        unsigned int done_iters = 0;    /* samples printed since the last restart */

        setup_signal_handler();

        /*
         * elevate own priority for interval mode
         *
         * ignore on error - we probably don't have permission to set it, but
         * it's not a big deal
         */
        set_my_sched_priority(-20);

restart:
        restarted++;

        /* take the initial (EVEN) sample; the snapshot's return value is not checked here */
        snapshot_proc_sysfs_files();
        retval = for_all_cpus(get_counters, EVEN_COUNTERS);
        first_counter_read = 0;
        if (retval < -1) {
                exit(retval);
        } else if (retval == -1) {
                if (restarted > 10) {
                        exit(retval);
                }
                re_initialize();
                goto restart;
        }
        /* good initial sample: reset the retry and iteration counters */
        restarted = 0;
        done_iters = 0;
        gettimeofday(&tv_even, (struct timezone *)NULL);

        while (1) {
                if (for_all_proc_cpus(cpu_is_not_present)) {
                        re_initialize();
                        goto restart;
                }
                if (update_effective_str(false)) {
                        re_initialize();
                        goto restart;
                }
                /* first half of the iteration: sample into ODD */
                do_sleep();
                /* NOTE(review): this restart path does not call re_initialize() -- confirm that is intended */
                if (snapshot_proc_sysfs_files())
                        goto restart;
                retval = for_all_cpus(get_counters, ODD_COUNTERS);
                if (retval < -1) {
                        exit(retval);
                } else if (retval == -1) {
                        re_initialize();
                        goto restart;
                }
                gettimeofday(&tv_odd, (struct timezone *)NULL);
                timersub(&tv_odd, &tv_even, &tv_delta);
                /*
                 * presumably delta_cpu leaves the ODD - EVEN delta in the
                 * EVEN set, which is what gets averaged and printed below
                 * -- TODO confirm against delta_cpu
                 */
                if (for_all_cpus_2(delta_cpu, ODD_COUNTERS, EVEN_COUNTERS)) {
                        re_initialize();
                        goto restart;
                }
                compute_average(EVEN_COUNTERS);
                format_all_counters(EVEN_COUNTERS);
                flush_output_stdout();
                if (exit_requested)
                        break;
                if (num_iterations && ++done_iters >= num_iterations)
                        break;
                /* second half: same sequence with the roles of EVEN and ODD swapped */
                do_sleep();
                if (snapshot_proc_sysfs_files())
                        goto restart;
                retval = for_all_cpus(get_counters, EVEN_COUNTERS);
                if (retval < -1) {
                        exit(retval);
                } else if (retval == -1) {
                        re_initialize();
                        goto restart;
                }
                gettimeofday(&tv_even, (struct timezone *)NULL);
                timersub(&tv_even, &tv_odd, &tv_delta);
                if (for_all_cpus_2(delta_cpu, EVEN_COUNTERS, ODD_COUNTERS)) {
                        re_initialize();
                        goto restart;
                }
                compute_average(ODD_COUNTERS);
                format_all_counters(ODD_COUNTERS);
                flush_output_stdout();
                if (exit_requested)
                        break;
                if (num_iterations && ++done_iters >= num_iterations)
                        break;
        }
}
5119
5120 void check_dev_msr()
5121 {
5122         struct stat sb;
5123         char pathname[32];
5124
5125         if (no_msr)
5126                 return;
5127
5128         sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
5129         if (stat(pathname, &sb))
5130                 if (system("/sbin/modprobe msr > /dev/null 2>&1"))
5131                         no_msr = 1;
5132 }
5133
5134 /*
5135  * check for CAP_SYS_RAWIO
5136  * return 0 on success
5137  * return 1 on fail
5138  */
5139 int check_for_cap_sys_rawio(void)
5140 {
5141         cap_t caps;
5142         cap_flag_value_t cap_flag_value;
5143         int ret = 0;
5144
5145         caps = cap_get_proc();
5146         if (caps == NULL)
5147                 return 1;
5148
5149         if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value)) {
5150                 ret = 1;
5151                 goto free_and_exit;
5152         }
5153
5154         if (cap_flag_value != CAP_SET) {
5155                 ret = 1;
5156                 goto free_and_exit;
5157         }
5158
5159 free_and_exit:
5160         if (cap_free(caps) == -1)
5161                 err(-6, "cap_free\n");
5162
5163         return ret;
5164 }
5165
5166 void check_msr_permission(void)
5167 {
5168         int failed = 0;
5169         char pathname[32];
5170
5171         if (no_msr)
5172                 return;
5173
5174         /* check for CAP_SYS_RAWIO */
5175         failed += check_for_cap_sys_rawio();
5176
5177         /* test file permissions */
5178         sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
5179         if (euidaccess(pathname, R_OK)) {
5180                 failed++;
5181         }
5182
5183         if (failed) {
5184                 warnx("Failed to access %s. Some of the counters may not be available\n"
5185                       "\tRun as root to enable them or use %s to disable the access explicitly", pathname, "--no-msr");
5186                 no_msr = 1;
5187         }
5188 }
5189
5190 void probe_bclk(void)
5191 {
5192         unsigned long long msr;
5193         unsigned int base_ratio;
5194
5195         if (!platform->has_nhm_msrs || no_msr)
5196                 return;
5197
5198         if (platform->bclk_freq == BCLK_100MHZ)
5199                 bclk = 100.00;
5200         else if (platform->bclk_freq == BCLK_133MHZ)
5201                 bclk = 133.33;
5202         else if (platform->bclk_freq == BCLK_SLV)
5203                 bclk = slm_bclk();
5204         else
5205                 return;
5206
5207         get_msr(base_cpu, MSR_PLATFORM_INFO, &msr);
5208         base_ratio = (msr >> 8) & 0xFF;
5209
5210         base_hz = base_ratio * bclk * 1000000;
5211         has_base_hz = 1;
5212
5213         if (platform->enable_tsc_tweak)
5214                 tsc_tweak = base_hz / tsc_hz;
5215 }
5216
/*
 * remove_underbar()
 *
 * delete every '_' from the NUL-terminated string s, in place
 */
static void remove_underbar(char *s)
{
        const char *src;
        char *dst = s;

        for (src = s; *src != '\0'; src++) {
                if (*src == '_')
                        continue;
                *dst = *src;
                dst++;
        }

        *dst = '\0';
}
5229
5230 static void dump_turbo_ratio_info(void)
5231 {
5232         if (!has_turbo)
5233                 return;
5234
5235         if (!platform->has_nhm_msrs || no_msr)
5236                 return;
5237
5238         if (platform->trl_msrs & TRL_LIMIT2)
5239                 dump_turbo_ratio_limit2();
5240
5241         if (platform->trl_msrs & TRL_LIMIT1)
5242                 dump_turbo_ratio_limit1();
5243
5244         if (platform->trl_msrs & TRL_BASE) {
5245                 dump_turbo_ratio_limits(MSR_TURBO_RATIO_LIMIT);
5246
5247                 if (is_hybrid)
5248                         dump_turbo_ratio_limits(MSR_SECONDARY_TURBO_RATIO_LIMIT);
5249         }
5250
5251         if (platform->trl_msrs & TRL_ATOM)
5252                 dump_atom_turbo_ratio_limits();
5253
5254         if (platform->trl_msrs & TRL_KNL)
5255                 dump_knl_turbo_ratio_limits();
5256
5257         if (platform->has_config_tdp)
5258                 dump_config_tdp();
5259 }
5260
5261 static int read_sysfs_int(char *path)
5262 {
5263         FILE *input;
5264         int retval = -1;
5265
5266         input = fopen(path, "r");
5267         if (input == NULL) {
5268                 if (debug)
5269                         fprintf(outf, "NSFOD %s\n", path);
5270                 return (-1);
5271         }
5272         if (fscanf(input, "%d", &retval) != 1)
5273                 err(1, "%s: failed to read int from file", path);
5274         fclose(input);
5275
5276         return (retval);
5277 }
5278
5279 static void dump_sysfs_file(char *path)
5280 {
5281         FILE *input;
5282         char cpuidle_buf[64];
5283
5284         input = fopen(path, "r");
5285         if (input == NULL) {
5286                 if (debug)
5287                         fprintf(outf, "NSFOD %s\n", path);
5288                 return;
5289         }
5290         if (!fgets(cpuidle_buf, sizeof(cpuidle_buf), input))
5291                 err(1, "%s: failed to read file", path);
5292         fclose(input);
5293
5294         fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf);
5295 }
5296
5297 static void probe_intel_uncore_frequency(void)
5298 {
5299         int i, j;
5300         char path[256];
5301
5302         if (!genuine_intel)
5303                 return;
5304
5305         if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK))
5306                 goto probe_cluster;
5307
5308         BIC_PRESENT(BIC_UNCORE_MHZ);
5309
5310         if (quiet)
5311                 return;
5312
5313         for (i = 0; i < topo.num_packages; ++i) {
5314                 for (j = 0; j < topo.num_die; ++j) {
5315                         int k, l;
5316                         char path_base[128];
5317
5318                         sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i,
5319                                 j);
5320
5321                         sprintf(path, "%s/min_freq_khz", path_base);
5322                         k = read_sysfs_int(path);
5323                         sprintf(path, "%s/max_freq_khz", path_base);
5324                         l = read_sysfs_int(path);
5325                         fprintf(outf, "Uncore Frequency package%d die%d: %d - %d MHz ", i, j, k / 1000, l / 1000);
5326
5327                         sprintf(path, "%s/initial_min_freq_khz", path_base);
5328                         k = read_sysfs_int(path);
5329                         sprintf(path, "%s/initial_max_freq_khz", path_base);
5330                         l = read_sysfs_int(path);
5331                         fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000);
5332
5333                         sprintf(path, "%s/current_freq_khz", path_base);
5334                         k = read_sysfs_int(path);
5335                         fprintf(outf, " %d MHz\n", k / 1000);
5336                 }
5337         }
5338         return;
5339
5340 probe_cluster:
5341         if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00/current_freq_khz", R_OK))
5342                 return;
5343
5344         if (quiet)
5345                 return;
5346
5347         for (i = 0;; ++i) {
5348                 int k, l;
5349                 char path_base[128];
5350                 int package_id, domain_id, cluster_id;
5351
5352                 sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i);
5353
5354                 if (access(path_base, R_OK))
5355                         break;
5356
5357                 sprintf(path, "%s/package_id", path_base);
5358                 package_id = read_sysfs_int(path);
5359
5360                 sprintf(path, "%s/domain_id", path_base);
5361                 domain_id = read_sysfs_int(path);
5362
5363                 sprintf(path, "%s/fabric_cluster_id", path_base);
5364                 cluster_id = read_sysfs_int(path);
5365
5366                 sprintf(path, "%s/min_freq_khz", path_base);
5367                 k = read_sysfs_int(path);
5368                 sprintf(path, "%s/max_freq_khz", path_base);
5369                 l = read_sysfs_int(path);
5370                 fprintf(outf, "Uncore Frequency package%d domain%d cluster%d: %d - %d MHz ", package_id, domain_id,
5371                         cluster_id, k / 1000, l / 1000);
5372
5373                 sprintf(path, "%s/initial_min_freq_khz", path_base);
5374                 k = read_sysfs_int(path);
5375                 sprintf(path, "%s/initial_max_freq_khz", path_base);
5376                 l = read_sysfs_int(path);
5377                 fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000);
5378
5379                 sprintf(path, "%s/current_freq_khz", path_base);
5380                 k = read_sysfs_int(path);
5381                 fprintf(outf, " %d MHz\n", k / 1000);
5382         }
5383 }
5384
5385 static void probe_graphics(void)
5386 {
5387         /* Xe graphics sysfs knobs */
5388         if (!access("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms", R_OK)) {
5389                 FILE *fp;
5390                 char buf[8];
5391                 bool gt0_is_gt;
5392                 int idx;
5393
5394                 fp = fopen("/sys/class/drm/card0/device/tile0/gt0/gtidle/name", "r");
5395                 if (!fp)
5396                         goto next;
5397
5398                 if (!fread(buf, sizeof(char), 7, fp)) {
5399                         fclose(fp);
5400                         goto next;
5401                 }
5402                 fclose(fp);
5403
5404                 if (!strncmp(buf, "gt0-rc", strlen("gt0-rc")))
5405                         gt0_is_gt = true;
5406                 else if (!strncmp(buf, "gt0-mc", strlen("gt0-mc")))
5407                         gt0_is_gt = false;
5408                 else
5409                         goto next;
5410
5411                 idx = gt0_is_gt ? GFX_rc6 : SAM_mc6;
5412                 gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms";
5413
5414                 idx = gt0_is_gt ? GFX_MHz : SAM_MHz;
5415                 if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq", R_OK))
5416                         gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq";
5417
5418                 idx = gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz;
5419                 if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq", R_OK))
5420                         gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq";
5421
5422                 idx = gt0_is_gt ? SAM_mc6 : GFX_rc6;
5423                 if (!access("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms", R_OK))
5424                         gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms";
5425
5426                 idx = gt0_is_gt ? SAM_MHz : GFX_MHz;
5427                 if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq", R_OK))
5428                         gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq";
5429
5430                 idx = gt0_is_gt ? SAM_ACTMHz : GFX_ACTMHz;
5431                 if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq", R_OK))
5432                         gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq";
5433
5434                 goto end;
5435         }
5436
5437 next:
5438         /* New i915 graphics sysfs knobs */
5439         if (!access("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", R_OK)) {
5440                 gfx_info[GFX_rc6].path = "/sys/class/drm/card0/gt/gt0/rc6_residency_ms";
5441
5442                 if (!access("/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz", R_OK))
5443                         gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz";
5444
5445                 if (!access("/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz", R_OK))
5446                         gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz";
5447
5448                 if (!access("/sys/class/drm/card0/gt/gt1/rc6_residency_ms", R_OK))
5449                         gfx_info[SAM_mc6].path = "/sys/class/drm/card0/gt/gt1/rc6_residency_ms";
5450
5451                 if (!access("/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz", R_OK))
5452                         gfx_info[SAM_MHz].path = "/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz";
5453
5454                 if (!access("/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz", R_OK))
5455                         gfx_info[SAM_ACTMHz].path = "/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz";
5456
5457                 goto end;
5458         }
5459
5460         /* Fall back to traditional i915 graphics sysfs knobs */
5461         if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK))
5462                 gfx_info[GFX_rc6].path = "/sys/class/drm/card0/power/rc6_residency_ms";
5463
5464         if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK))
5465                 gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt_cur_freq_mhz";
5466         else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK))
5467                 gfx_info[GFX_MHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz";
5468
5469
5470         if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK))
5471                 gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt_act_freq_mhz";
5472         else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK))
5473                 gfx_info[GFX_ACTMHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz";
5474
5475 end:
5476         if (gfx_info[GFX_rc6].path)
5477                 BIC_PRESENT(BIC_GFX_rc6);
5478         if (gfx_info[GFX_MHz].path)
5479                 BIC_PRESENT(BIC_GFXMHz);
5480         if (gfx_info[GFX_ACTMHz].path)
5481                 BIC_PRESENT(BIC_GFXACTMHz);
5482         if (gfx_info[SAM_mc6].path)
5483                 BIC_PRESENT(BIC_SAM_mc6);
5484         if (gfx_info[SAM_MHz].path)
5485                 BIC_PRESENT(BIC_SAMMHz);
5486         if (gfx_info[SAM_ACTMHz].path)
5487                 BIC_PRESENT(BIC_SAMACTMHz);
5488 }
5489
5490 static void dump_sysfs_cstate_config(void)
5491 {
5492         char path[64];
5493         char name_buf[16];
5494         char desc[64];
5495         FILE *input;
5496         int state;
5497         char *sp;
5498
5499         if (access("/sys/devices/system/cpu/cpuidle", R_OK)) {
5500                 fprintf(outf, "cpuidle not loaded\n");
5501                 return;
5502         }
5503
5504         dump_sysfs_file("/sys/devices/system/cpu/cpuidle/current_driver");
5505         dump_sysfs_file("/sys/devices/system/cpu/cpuidle/current_governor");
5506         dump_sysfs_file("/sys/devices/system/cpu/cpuidle/current_governor_ro");
5507
5508         for (state = 0; state < 10; ++state) {
5509
5510                 sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
5511                 input = fopen(path, "r");
5512                 if (input == NULL)
5513                         continue;
5514                 if (!fgets(name_buf, sizeof(name_buf), input))
5515                         err(1, "%s: failed to read file", path);
5516
5517                 /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
5518                 sp = strchr(name_buf, '-');
5519                 if (!sp)
5520                         sp = strchrnul(name_buf, '\n');
5521                 *sp = '\0';
5522                 fclose(input);
5523
5524                 remove_underbar(name_buf);
5525
5526                 sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/desc", base_cpu, state);
5527                 input = fopen(path, "r");
5528                 if (input == NULL)
5529                         continue;
5530                 if (!fgets(desc, sizeof(desc), input))
5531                         err(1, "%s: failed to read file", path);
5532
5533                 fprintf(outf, "cpu%d: %s: %s", base_cpu, name_buf, desc);
5534                 fclose(input);
5535         }
5536 }
5537
5538 static void dump_sysfs_pstate_config(void)
5539 {
5540         char path[64];
5541         char driver_buf[64];
5542         char governor_buf[64];
5543         FILE *input;
5544         int turbo;
5545
5546         sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_driver", base_cpu);
5547         input = fopen(path, "r");
5548         if (input == NULL) {
5549                 fprintf(outf, "NSFOD %s\n", path);
5550                 return;
5551         }
5552         if (!fgets(driver_buf, sizeof(driver_buf), input))
5553                 err(1, "%s: failed to read file", path);
5554         fclose(input);
5555
5556         sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", base_cpu);
5557         input = fopen(path, "r");
5558         if (input == NULL) {
5559                 fprintf(outf, "NSFOD %s\n", path);
5560                 return;
5561         }
5562         if (!fgets(governor_buf, sizeof(governor_buf), input))
5563                 err(1, "%s: failed to read file", path);
5564         fclose(input);
5565
5566         fprintf(outf, "cpu%d: cpufreq driver: %s", base_cpu, driver_buf);
5567         fprintf(outf, "cpu%d: cpufreq governor: %s", base_cpu, governor_buf);
5568
5569         sprintf(path, "/sys/devices/system/cpu/cpufreq/boost");
5570         input = fopen(path, "r");
5571         if (input != NULL) {
5572                 if (fscanf(input, "%d", &turbo) != 1)
5573                         err(1, "%s: failed to parse number from file", path);
5574                 fprintf(outf, "cpufreq boost: %d\n", turbo);
5575                 fclose(input);
5576         }
5577
5578         sprintf(path, "/sys/devices/system/cpu/intel_pstate/no_turbo");
5579         input = fopen(path, "r");
5580         if (input != NULL) {
5581                 if (fscanf(input, "%d", &turbo) != 1)
5582                         err(1, "%s: failed to parse number from file", path);
5583                 fprintf(outf, "cpufreq intel_pstate no_turbo: %d\n", turbo);
5584                 fclose(input);
5585         }
5586 }
5587
5588 /*
5589  * print_epb()
5590  * Decode the ENERGY_PERF_BIAS MSR
5591  */
5592 int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p)
5593 {
5594         char *epb_string;
5595         int cpu, epb;
5596
5597         UNUSED(c);
5598         UNUSED(p);
5599
5600         if (!has_epb)
5601                 return 0;
5602
5603         cpu = t->cpu_id;
5604
5605         /* EPB is per-package */
5606         if (!is_cpu_first_thread_in_package(t, c, p))
5607                 return 0;
5608
5609         if (cpu_migrate(cpu)) {
5610                 fprintf(outf, "print_epb: Could not migrate to CPU %d\n", cpu);
5611                 return -1;
5612         }
5613
5614         epb = get_epb(cpu);
5615         if (epb < 0)
5616                 return 0;
5617
5618         switch (epb) {
5619         case ENERGY_PERF_BIAS_PERFORMANCE:
5620                 epb_string = "performance";
5621                 break;
5622         case ENERGY_PERF_BIAS_NORMAL:
5623                 epb_string = "balanced";
5624                 break;
5625         case ENERGY_PERF_BIAS_POWERSAVE:
5626                 epb_string = "powersave";
5627                 break;
5628         default:
5629                 epb_string = "custom";
5630                 break;
5631         }
5632         fprintf(outf, "cpu%d: EPB: %d (%s)\n", cpu, epb, epb_string);
5633
5634         return 0;
5635 }
5636
5637 /*
5638  * print_hwp()
5639  * Decode the MSR_HWP_CAPABILITIES
5640  */
5641 int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
5642 {
5643         unsigned long long msr;
5644         int cpu;
5645
5646         UNUSED(c);
5647         UNUSED(p);
5648
5649         if (no_msr)
5650                 return 0;
5651
5652         if (!has_hwp)
5653                 return 0;
5654
5655         cpu = t->cpu_id;
5656
5657         /* MSR_HWP_CAPABILITIES is per-package */
5658         if (!is_cpu_first_thread_in_package(t, c, p))
5659                 return 0;
5660
5661         if (cpu_migrate(cpu)) {
5662                 fprintf(outf, "print_hwp: Could not migrate to CPU %d\n", cpu);
5663                 return -1;
5664         }
5665
5666         if (get_msr(cpu, MSR_PM_ENABLE, &msr))
5667                 return 0;
5668
5669         fprintf(outf, "cpu%d: MSR_PM_ENABLE: 0x%08llx (%sHWP)\n", cpu, msr, (msr & (1 << 0)) ? "" : "No-");
5670
5671         /* MSR_PM_ENABLE[1] == 1 if HWP is enabled and MSRs visible */
5672         if ((msr & (1 << 0)) == 0)
5673                 return 0;
5674
5675         if (get_msr(cpu, MSR_HWP_CAPABILITIES, &msr))
5676                 return 0;
5677
5678         fprintf(outf, "cpu%d: MSR_HWP_CAPABILITIES: 0x%08llx "
5679                 "(high %d guar %d eff %d low %d)\n",
5680                 cpu, msr,
5681                 (unsigned int)HWP_HIGHEST_PERF(msr),
5682                 (unsigned int)HWP_GUARANTEED_PERF(msr),
5683                 (unsigned int)HWP_MOSTEFFICIENT_PERF(msr), (unsigned int)HWP_LOWEST_PERF(msr));
5684
5685         if (get_msr(cpu, MSR_HWP_REQUEST, &msr))
5686                 return 0;
5687
5688         fprintf(outf, "cpu%d: MSR_HWP_REQUEST: 0x%08llx "
5689                 "(min %d max %d des %d epp 0x%x window 0x%x pkg 0x%x)\n",
5690                 cpu, msr,
5691                 (unsigned int)(((msr) >> 0) & 0xff),
5692                 (unsigned int)(((msr) >> 8) & 0xff),
5693                 (unsigned int)(((msr) >> 16) & 0xff),
5694                 (unsigned int)(((msr) >> 24) & 0xff),
5695                 (unsigned int)(((msr) >> 32) & 0xff3), (unsigned int)(((msr) >> 42) & 0x1));
5696
5697         if (has_hwp_pkg) {
5698                 if (get_msr(cpu, MSR_HWP_REQUEST_PKG, &msr))
5699                         return 0;
5700
5701                 fprintf(outf, "cpu%d: MSR_HWP_REQUEST_PKG: 0x%08llx "
5702                         "(min %d max %d des %d epp 0x%x window 0x%x)\n",
5703                         cpu, msr,
5704                         (unsigned int)(((msr) >> 0) & 0xff),
5705                         (unsigned int)(((msr) >> 8) & 0xff),
5706                         (unsigned int)(((msr) >> 16) & 0xff),
5707                         (unsigned int)(((msr) >> 24) & 0xff), (unsigned int)(((msr) >> 32) & 0xff3));
5708         }
5709         if (has_hwp_notify) {
5710                 if (get_msr(cpu, MSR_HWP_INTERRUPT, &msr))
5711                         return 0;
5712
5713                 fprintf(outf, "cpu%d: MSR_HWP_INTERRUPT: 0x%08llx "
5714                         "(%s_Guaranteed_Perf_Change, %s_Excursion_Min)\n",
5715                         cpu, msr, ((msr) & 0x1) ? "EN" : "Dis", ((msr) & 0x2) ? "EN" : "Dis");
5716         }
5717         if (get_msr(cpu, MSR_HWP_STATUS, &msr))
5718                 return 0;
5719
5720         fprintf(outf, "cpu%d: MSR_HWP_STATUS: 0x%08llx "
5721                 "(%sGuaranteed_Perf_Change, %sExcursion_Min)\n",
5722                 cpu, msr, ((msr) & 0x1) ? "" : "No-", ((msr) & 0x4) ? "" : "No-");
5723
5724         return 0;
5725 }
5726
5727 /*
5728  * print_perf_limit()
5729  */
5730 int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data *p)
5731 {
5732         unsigned long long msr;
5733         int cpu;
5734
5735         UNUSED(c);
5736         UNUSED(p);
5737
5738         if (no_msr)
5739                 return 0;
5740
5741         cpu = t->cpu_id;
5742
5743         /* per-package */
5744         if (!is_cpu_first_thread_in_package(t, c, p))
5745                 return 0;
5746
5747         if (cpu_migrate(cpu)) {
5748                 fprintf(outf, "print_perf_limit: Could not migrate to CPU %d\n", cpu);
5749                 return -1;
5750         }
5751
5752         if (platform->plr_msrs & PLR_CORE) {
5753                 get_msr(cpu, MSR_CORE_PERF_LIMIT_REASONS, &msr);
5754                 fprintf(outf, "cpu%d: MSR_CORE_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
5755                 fprintf(outf, " (Active: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)",
5756                         (msr & 1 << 15) ? "bit15, " : "",
5757                         (msr & 1 << 14) ? "bit14, " : "",
5758                         (msr & 1 << 13) ? "Transitions, " : "",
5759                         (msr & 1 << 12) ? "MultiCoreTurbo, " : "",
5760                         (msr & 1 << 11) ? "PkgPwrL2, " : "",
5761                         (msr & 1 << 10) ? "PkgPwrL1, " : "",
5762                         (msr & 1 << 9) ? "CorePwr, " : "",
5763                         (msr & 1 << 8) ? "Amps, " : "",
5764                         (msr & 1 << 6) ? "VR-Therm, " : "",
5765                         (msr & 1 << 5) ? "Auto-HWP, " : "",
5766                         (msr & 1 << 4) ? "Graphics, " : "",
5767                         (msr & 1 << 2) ? "bit2, " : "",
5768                         (msr & 1 << 1) ? "ThermStatus, " : "", (msr & 1 << 0) ? "PROCHOT, " : "");
5769                 fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n",
5770                         (msr & 1 << 31) ? "bit31, " : "",
5771                         (msr & 1 << 30) ? "bit30, " : "",
5772                         (msr & 1 << 29) ? "Transitions, " : "",
5773                         (msr & 1 << 28) ? "MultiCoreTurbo, " : "",
5774                         (msr & 1 << 27) ? "PkgPwrL2, " : "",
5775                         (msr & 1 << 26) ? "PkgPwrL1, " : "",
5776                         (msr & 1 << 25) ? "CorePwr, " : "",
5777                         (msr & 1 << 24) ? "Amps, " : "",
5778                         (msr & 1 << 22) ? "VR-Therm, " : "",
5779                         (msr & 1 << 21) ? "Auto-HWP, " : "",
5780                         (msr & 1 << 20) ? "Graphics, " : "",
5781                         (msr & 1 << 18) ? "bit18, " : "",
5782                         (msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 16) ? "PROCHOT, " : "");
5783
5784         }
5785         if (platform->plr_msrs & PLR_GFX) {
5786                 get_msr(cpu, MSR_GFX_PERF_LIMIT_REASONS, &msr);
5787                 fprintf(outf, "cpu%d: MSR_GFX_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
5788                 fprintf(outf, " (Active: %s%s%s%s%s%s%s%s)",
5789                         (msr & 1 << 0) ? "PROCHOT, " : "",
5790                         (msr & 1 << 1) ? "ThermStatus, " : "",
5791                         (msr & 1 << 4) ? "Graphics, " : "",
5792                         (msr & 1 << 6) ? "VR-Therm, " : "",
5793                         (msr & 1 << 8) ? "Amps, " : "",
5794                         (msr & 1 << 9) ? "GFXPwr, " : "",
5795                         (msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : "");
5796                 fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s)\n",
5797                         (msr & 1 << 16) ? "PROCHOT, " : "",
5798                         (msr & 1 << 17) ? "ThermStatus, " : "",
5799                         (msr & 1 << 20) ? "Graphics, " : "",
5800                         (msr & 1 << 22) ? "VR-Therm, " : "",
5801                         (msr & 1 << 24) ? "Amps, " : "",
5802                         (msr & 1 << 25) ? "GFXPwr, " : "",
5803                         (msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : "");
5804         }
5805         if (platform->plr_msrs & PLR_RING) {
5806                 get_msr(cpu, MSR_RING_PERF_LIMIT_REASONS, &msr);
5807                 fprintf(outf, "cpu%d: MSR_RING_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
5808                 fprintf(outf, " (Active: %s%s%s%s%s%s)",
5809                         (msr & 1 << 0) ? "PROCHOT, " : "",
5810                         (msr & 1 << 1) ? "ThermStatus, " : "",
5811                         (msr & 1 << 6) ? "VR-Therm, " : "",
5812                         (msr & 1 << 8) ? "Amps, " : "",
5813                         (msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : "");
5814                 fprintf(outf, " (Logged: %s%s%s%s%s%s)\n",
5815                         (msr & 1 << 16) ? "PROCHOT, " : "",
5816                         (msr & 1 << 17) ? "ThermStatus, " : "",
5817                         (msr & 1 << 22) ? "VR-Therm, " : "",
5818                         (msr & 1 << 24) ? "Amps, " : "",
5819                         (msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : "");
5820         }
5821         return 0;
5822 }
5823
/* Bit masks applied to the RAPL power-info MSR fields after shifting */
#define RAPL_POWER_GRANULARITY	0x7FFF	/* power fields are 15 bits wide */
#define RAPL_TIME_GRANULARITY	0x3F	/* time field is 6 bits wide */
5826
5827 double get_quirk_tdp(void)
5828 {
5829         if (platform->rapl_quirk_tdp)
5830                 return platform->rapl_quirk_tdp;
5831
5832         return 135.0;
5833 }
5834
5835 double get_tdp_intel(void)
5836 {
5837         unsigned long long msr;
5838
5839         if (platform->rapl_msrs & RAPL_PKG_POWER_INFO)
5840                 if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr))
5841                         return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units;
5842         return get_quirk_tdp();
5843 }
5844
/*
 * get_tdp_amd()
 * The AMD path reads no MSR for TDP; it always uses get_quirk_tdp().
 */
double get_tdp_amd(void)
{
	const double tdp = get_quirk_tdp();

	return tdp;
}
5849
5850 void rapl_probe_intel(void)
5851 {
5852         unsigned long long msr;
5853         unsigned int time_unit;
5854         double tdp;
5855         const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt | BIC_RAMWatt | BIC_GFXWatt;
5856         const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J | BIC_RAM_J | BIC_GFX_J;
5857
5858         if (rapl_joules)
5859                 bic_enabled &= ~bic_watt_bits;
5860         else
5861                 bic_enabled &= ~bic_joules_bits;
5862
5863         if (!(platform->rapl_msrs & RAPL_PKG_PERF_STATUS))
5864                 bic_enabled &= ~BIC_PKG__;
5865         if (!(platform->rapl_msrs & RAPL_DRAM_PERF_STATUS))
5866                 bic_enabled &= ~BIC_RAM__;
5867
5868         /* units on package 0, verify later other packages match */
5869         if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr))
5870                 return;
5871
5872         rapl_power_units = 1.0 / (1 << (msr & 0xF));
5873         if (platform->has_rapl_divisor)
5874                 rapl_energy_units = 1.0 * (1 << (msr >> 8 & 0x1F)) / 1000000;
5875         else
5876                 rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F));
5877
5878         if (platform->has_fixed_rapl_unit)
5879                 rapl_dram_energy_units = (15.3 / 1000000);
5880         else
5881                 rapl_dram_energy_units = rapl_energy_units;
5882
5883         time_unit = msr >> 16 & 0xF;
5884         if (time_unit == 0)
5885                 time_unit = 0xA;
5886
5887         rapl_time_units = 1.0 / (1 << (time_unit));
5888
5889         tdp = get_tdp_intel();
5890
5891         rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp;
5892         if (!quiet)
5893                 fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp);
5894 }
5895
5896 void rapl_probe_amd(void)
5897 {
5898         unsigned long long msr;
5899         double tdp;
5900         const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt;
5901         const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J;
5902
5903         if (rapl_joules)
5904                 bic_enabled &= ~bic_watt_bits;
5905         else
5906                 bic_enabled &= ~bic_joules_bits;
5907
5908         if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr))
5909                 return;
5910
5911         rapl_time_units = ldexp(1.0, -(msr >> 16 & 0xf));
5912         rapl_energy_units = ldexp(1.0, -(msr >> 8 & 0x1f));
5913         rapl_power_units = ldexp(1.0, -(msr & 0xf));
5914
5915         tdp = get_tdp_amd();
5916
5917         rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp;
5918         if (!quiet)
5919                 fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp);
5920 }
5921
5922 void print_power_limit_msr(int cpu, unsigned long long msr, char *label)
5923 {
5924         fprintf(outf, "cpu%d: %s: %sabled (%0.3f Watts, %f sec, clamp %sabled)\n",
5925                 cpu, label,
5926                 ((msr >> 15) & 1) ? "EN" : "DIS",
5927                 ((msr >> 0) & 0x7FFF) * rapl_power_units,
5928                 (1.0 + (((msr >> 22) & 0x3) / 4.0)) * (1 << ((msr >> 17) & 0x1F)) * rapl_time_units,
5929                 (((msr >> 16) & 1) ? "EN" : "DIS"));
5930
5931         return;
5932 }
5933
5934 int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
5935 {
5936         unsigned long long msr;
5937         const char *msr_name;
5938         int cpu;
5939
5940         UNUSED(c);
5941         UNUSED(p);
5942
5943         if (!platform->rapl_msrs)
5944                 return 0;
5945
5946         /* RAPL counters are per package, so print only for 1st thread/package */
5947         if (!is_cpu_first_thread_in_package(t, c, p))
5948                 return 0;
5949
5950         cpu = t->cpu_id;
5951         if (cpu_migrate(cpu)) {
5952                 fprintf(outf, "print_rapl: Could not migrate to CPU %d\n", cpu);
5953                 return -1;
5954         }
5955
5956         if (platform->rapl_msrs & RAPL_AMD_F17H) {
5957                 msr_name = "MSR_RAPL_PWR_UNIT";
5958                 if (get_msr(cpu, MSR_RAPL_PWR_UNIT, &msr))
5959                         return -1;
5960         } else {
5961                 msr_name = "MSR_RAPL_POWER_UNIT";
5962                 if (get_msr(cpu, MSR_RAPL_POWER_UNIT, &msr))
5963                         return -1;
5964         }
5965
5966         fprintf(outf, "cpu%d: %s: 0x%08llx (%f Watts, %f Joules, %f sec.)\n", cpu, msr_name, msr,
5967                 rapl_power_units, rapl_energy_units, rapl_time_units);
5968
5969         if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) {
5970
5971                 if (get_msr(cpu, MSR_PKG_POWER_INFO, &msr))
5972                         return -5;
5973
5974                 fprintf(outf, "cpu%d: MSR_PKG_POWER_INFO: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n",
5975                         cpu, msr,
5976                         ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units,
5977                         ((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units,
5978                         ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units,
5979                         ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units);
5980
5981         }
5982         if (platform->rapl_msrs & RAPL_PKG) {
5983
5984                 if (get_msr(cpu, MSR_PKG_POWER_LIMIT, &msr))
5985                         return -9;
5986
5987                 fprintf(outf, "cpu%d: MSR_PKG_POWER_LIMIT: 0x%08llx (%slocked)\n",
5988                         cpu, msr, (msr >> 63) & 1 ? "" : "UN");
5989
5990                 print_power_limit_msr(cpu, msr, "PKG Limit #1");
5991                 fprintf(outf, "cpu%d: PKG Limit #2: %sabled (%0.3f Watts, %f* sec, clamp %sabled)\n",
5992                         cpu,
5993                         ((msr >> 47) & 1) ? "EN" : "DIS",
5994                         ((msr >> 32) & 0x7FFF) * rapl_power_units,
5995                         (1.0 + (((msr >> 54) & 0x3) / 4.0)) * (1 << ((msr >> 49) & 0x1F)) * rapl_time_units,
5996                         ((msr >> 48) & 1) ? "EN" : "DIS");
5997
5998                 if (get_msr(cpu, MSR_VR_CURRENT_CONFIG, &msr))
5999                         return -9;
6000
6001                 fprintf(outf, "cpu%d: MSR_VR_CURRENT_CONFIG: 0x%08llx\n", cpu, msr);
6002                 fprintf(outf, "cpu%d: PKG Limit #4: %f Watts (%slocked)\n",
6003                         cpu, ((msr >> 0) & 0x1FFF) * rapl_power_units, (msr >> 31) & 1 ? "" : "UN");
6004         }
6005
6006         if (platform->rapl_msrs & RAPL_DRAM_POWER_INFO) {
6007                 if (get_msr(cpu, MSR_DRAM_POWER_INFO, &msr))
6008                         return -6;
6009
6010                 fprintf(outf, "cpu%d: MSR_DRAM_POWER_INFO,: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n",
6011                         cpu, msr,
6012                         ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units,
6013                         ((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units,
6014                         ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units,
6015                         ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units);
6016         }
6017         if (platform->rapl_msrs & RAPL_DRAM) {
6018                 if (get_msr(cpu, MSR_DRAM_POWER_LIMIT, &msr))
6019                         return -9;
6020                 fprintf(outf, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n",
6021                         cpu, msr, (msr >> 31) & 1 ? "" : "UN");
6022
6023                 print_power_limit_msr(cpu, msr, "DRAM Limit");
6024         }
6025         if (platform->rapl_msrs & RAPL_CORE_POLICY) {
6026                 if (get_msr(cpu, MSR_PP0_POLICY, &msr))
6027                         return -7;
6028
6029                 fprintf(outf, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF);
6030         }
6031         if (platform->rapl_msrs & RAPL_CORE_POWER_LIMIT) {
6032                 if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr))
6033                         return -9;
6034                 fprintf(outf, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n",
6035                         cpu, msr, (msr >> 31) & 1 ? "" : "UN");
6036                 print_power_limit_msr(cpu, msr, "Cores Limit");
6037         }
6038         if (platform->rapl_msrs & RAPL_GFX) {
6039                 if (get_msr(cpu, MSR_PP1_POLICY, &msr))
6040                         return -8;
6041
6042                 fprintf(outf, "cpu%d: MSR_PP1_POLICY: %lld\n", cpu, msr & 0xF);
6043
6044                 if (get_msr(cpu, MSR_PP1_POWER_LIMIT, &msr))
6045                         return -9;
6046                 fprintf(outf, "cpu%d: MSR_PP1_POWER_LIMIT: 0x%08llx (%slocked)\n",
6047                         cpu, msr, (msr >> 31) & 1 ? "" : "UN");
6048                 print_power_limit_msr(cpu, msr, "GFX Limit");
6049         }
6050         return 0;
6051 }
6052
6053 /*
6054  * probe_rapl()
6055  *
6056  * sets rapl_power_units, rapl_energy_units, rapl_time_units
6057  */
6058 void probe_rapl(void)
6059 {
6060         if (!platform->rapl_msrs || no_msr)
6061                 return;
6062
6063         if (genuine_intel)
6064                 rapl_probe_intel();
6065         if (authentic_amd || hygon_genuine)
6066                 rapl_probe_amd();
6067
6068         if (quiet)
6069                 return;
6070
6071         for_all_cpus(print_rapl, ODD_COUNTERS);
6072 }
6073
6074 /*
6075  * MSR_IA32_TEMPERATURE_TARGET indicates the temperature where
6076  * the Thermal Control Circuit (TCC) activates.
6077  * This is usually equal to tjMax.
6078  *
6079  * Older processors do not have this MSR, so there we guess,
6080  * but also allow cmdline over-ride with -T.
6081  *
6082  * Several MSR temperature values are in units of degrees-C
6083  * below this value, including the Digital Thermal Sensor (DTS),
6084  * Package Thermal Management Sensor (PTM), and thermal event thresholds.
6085  */
6086 int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p)
6087 {
6088         unsigned long long msr;
6089         unsigned int tcc_default, tcc_offset;
6090         int cpu;
6091
6092         UNUSED(c);
6093         UNUSED(p);
6094
6095         /* tj_max is used only for dts or ptm */
6096         if (!(do_dts || do_ptm))
6097                 return 0;
6098
6099         /* this is a per-package concept */
6100         if (!is_cpu_first_thread_in_package(t, c, p))
6101                 return 0;
6102
6103         cpu = t->cpu_id;
6104         if (cpu_migrate(cpu)) {
6105                 fprintf(outf, "Could not migrate to CPU %d\n", cpu);
6106                 return -1;
6107         }
6108
6109         if (tj_max_override != 0) {
6110                 tj_max = tj_max_override;
6111                 fprintf(outf, "cpu%d: Using cmdline TCC Target (%d C)\n", cpu, tj_max);
6112                 return 0;
6113         }
6114
6115         /* Temperature Target MSR is Nehalem and newer only */
6116         if (!platform->has_nhm_msrs || no_msr)
6117                 goto guess;
6118
6119         if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr))
6120                 goto guess;
6121
6122         tcc_default = (msr >> 16) & 0xFF;
6123
6124         if (!quiet) {
6125                 int bits = platform->tcc_offset_bits;
6126                 unsigned long long enabled = 0;
6127
6128                 if (bits && !get_msr(base_cpu, MSR_PLATFORM_INFO, &enabled))
6129                         enabled = (enabled >> 30) & 1;
6130
6131                 if (bits && enabled) {
6132                         tcc_offset = (msr >> 24) & GENMASK(bits - 1, 0);
6133                         fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n",
6134                                 cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset);
6135                 } else {
6136                         fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", cpu, msr, tcc_default);
6137                 }
6138         }
6139
6140         if (!tcc_default)
6141                 goto guess;
6142
6143         tj_max = tcc_default;
6144
6145         return 0;
6146
6147 guess:
6148         tj_max = TJMAX_DEFAULT;
6149         fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", cpu, tj_max);
6150
6151         return 0;
6152 }
6153
6154 int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p)
6155 {
6156         unsigned long long msr;
6157         unsigned int dts, dts2;
6158         int cpu;
6159
6160         UNUSED(c);
6161         UNUSED(p);
6162
6163         if (no_msr)
6164                 return 0;
6165
6166         if (!(do_dts || do_ptm))
6167                 return 0;
6168
6169         cpu = t->cpu_id;
6170
6171         /* DTS is per-core, no need to print for each thread */
6172         if (!is_cpu_first_thread_in_core(t, c, p))
6173                 return 0;
6174
6175         if (cpu_migrate(cpu)) {
6176                 fprintf(outf, "print_thermal: Could not migrate to CPU %d\n", cpu);
6177                 return -1;
6178         }
6179
6180         if (do_ptm && is_cpu_first_core_in_package(t, c, p)) {
6181                 if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr))
6182                         return 0;
6183
6184                 dts = (msr >> 16) & 0x7F;
6185                 fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", cpu, msr, tj_max - dts);
6186
6187                 if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr))
6188                         return 0;
6189
6190                 dts = (msr >> 16) & 0x7F;
6191                 dts2 = (msr >> 8) & 0x7F;
6192                 fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n",
6193                         cpu, msr, tj_max - dts, tj_max - dts2);
6194         }
6195
6196         if (do_dts && debug) {
6197                 unsigned int resolution;
6198
6199                 if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr))
6200                         return 0;
6201
6202                 dts = (msr >> 16) & 0x7F;
6203                 resolution = (msr >> 27) & 0xF;
6204                 fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n",
6205                         cpu, msr, tj_max - dts, resolution);
6206
6207                 if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr))
6208                         return 0;
6209
6210                 dts = (msr >> 16) & 0x7F;
6211                 dts2 = (msr >> 8) & 0x7F;
6212                 fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n",
6213                         cpu, msr, tj_max - dts, tj_max - dts2);
6214         }
6215
6216         return 0;
6217 }
6218
6219 void probe_thermal(void)
6220 {
6221         if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK))
6222                 BIC_PRESENT(BIC_CORE_THROT_CNT);
6223         else
6224                 BIC_NOT_PRESENT(BIC_CORE_THROT_CNT);
6225
6226         for_all_cpus(set_temperature_target, ODD_COUNTERS);
6227
6228         if (quiet)
6229                 return;
6230
6231         for_all_cpus(print_thermal, ODD_COUNTERS);
6232 }
6233
6234 int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p)
6235 {
6236         unsigned int eax, ebx, ecx, edx;
6237
6238         UNUSED(c);
6239         UNUSED(p);
6240
6241         if (!genuine_intel)
6242                 return 0;
6243
6244         if (cpu_migrate(t->cpu_id)) {
6245                 fprintf(outf, "Could not migrate to CPU %d\n", t->cpu_id);
6246                 return -1;
6247         }
6248
6249         if (max_level < 0x1a)
6250                 return 0;
6251
6252         __cpuid(0x1a, eax, ebx, ecx, edx);
6253         eax = (eax >> 24) & 0xFF;
6254         if (eax == 0x20)
6255                 t->is_atom = true;
6256         return 0;
6257 }
6258
6259 void decode_feature_control_msr(void)
6260 {
6261         unsigned long long msr;
6262
6263         if (no_msr)
6264                 return;
6265
6266         if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr))
6267                 fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n",
6268                         base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : "");
6269 }
6270
6271 void decode_misc_enable_msr(void)
6272 {
6273         unsigned long long msr;
6274
6275         if (no_msr)
6276                 return;
6277
6278         if (!genuine_intel)
6279                 return;
6280
6281         if (!get_msr(base_cpu, MSR_IA32_MISC_ENABLE, &msr))
6282                 fprintf(outf, "cpu%d: MSR_IA32_MISC_ENABLE: 0x%08llx (%sTCC %sEIST %sMWAIT %sPREFETCH %sTURBO)\n",
6283                         base_cpu, msr,
6284                         msr & MSR_IA32_MISC_ENABLE_TM1 ? "" : "No-",
6285                         msr & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP ? "" : "No-",
6286                         msr & MSR_IA32_MISC_ENABLE_MWAIT ? "" : "No-",
6287                         msr & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE ? "No-" : "",
6288                         msr & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ? "No-" : "");
6289 }
6290
6291 void decode_misc_feature_control(void)
6292 {
6293         unsigned long long msr;
6294
6295         if (no_msr)
6296                 return;
6297
6298         if (!platform->has_msr_misc_feature_control)
6299                 return;
6300
6301         if (!get_msr(base_cpu, MSR_MISC_FEATURE_CONTROL, &msr))
6302                 fprintf(outf,
6303                         "cpu%d: MSR_MISC_FEATURE_CONTROL: 0x%08llx (%sL2-Prefetch %sL2-Prefetch-pair %sL1-Prefetch %sL1-IP-Prefetch)\n",
6304                         base_cpu, msr, msr & (0 << 0) ? "No-" : "", msr & (1 << 0) ? "No-" : "",
6305                         msr & (2 << 0) ? "No-" : "", msr & (3 << 0) ? "No-" : "");
6306 }
6307
6308 /*
6309  * Decode MSR_MISC_PWR_MGMT
6310  *
6311  * Decode the bits according to the Nehalem documentation
6312  * bit[0] seems to continue to have same meaning going forward
6313  * bit[1] less so...
6314  */
6315 void decode_misc_pwr_mgmt_msr(void)
6316 {
6317         unsigned long long msr;
6318
6319         if (no_msr)
6320                 return;
6321
6322         if (!platform->has_msr_misc_pwr_mgmt)
6323                 return;
6324
6325         if (!get_msr(base_cpu, MSR_MISC_PWR_MGMT, &msr))
6326                 fprintf(outf, "cpu%d: MSR_MISC_PWR_MGMT: 0x%08llx (%sable-EIST_Coordination %sable-EPB %sable-OOB)\n",
6327                         base_cpu, msr,
6328                         msr & (1 << 0) ? "DIS" : "EN", msr & (1 << 1) ? "EN" : "DIS", msr & (1 << 8) ? "EN" : "DIS");
6329 }
6330
6331 /*
6332  * Decode MSR_CC6_DEMOTION_POLICY_CONFIG, MSR_MC6_DEMOTION_POLICY_CONFIG
6333  *
6334  * This MSRs are present on Silvermont processors,
6335  * Intel Atom processor E3000 series (Baytrail), and friends.
6336  */
6337 void decode_c6_demotion_policy_msr(void)
6338 {
6339         unsigned long long msr;
6340
6341         if (no_msr)
6342                 return;
6343
6344         if (!platform->has_msr_c6_demotion_policy_config)
6345                 return;
6346
6347         if (!get_msr(base_cpu, MSR_CC6_DEMOTION_POLICY_CONFIG, &msr))
6348                 fprintf(outf, "cpu%d: MSR_CC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-CC6-Demotion)\n",
6349                         base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
6350
6351         if (!get_msr(base_cpu, MSR_MC6_DEMOTION_POLICY_CONFIG, &msr))
6352                 fprintf(outf, "cpu%d: MSR_MC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-MC6-Demotion)\n",
6353                         base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
6354 }
6355
6356 void print_dev_latency(void)
6357 {
6358         char *path = "/dev/cpu_dma_latency";
6359         int fd;
6360         int value;
6361         int retval;
6362
6363         fd = open(path, O_RDONLY);
6364         if (fd < 0) {
6365                 if (debug)
6366                         warnx("Read %s failed", path);
6367                 return;
6368         }
6369
6370         retval = read(fd, (void *)&value, sizeof(int));
6371         if (retval != sizeof(int)) {
6372                 warn("read failed %s", path);
6373                 close(fd);
6374                 return;
6375         }
6376         fprintf(outf, "/dev/cpu_dma_latency: %d usec (%s)\n", value, value == 2000000000 ? "default" : "constrained");
6377
6378         close(fd);
6379 }
6380
6381 static int has_instr_count_access(void)
6382 {
6383         int fd;
6384         int has_access;
6385
6386         if (no_perf)
6387                 return 0;
6388
6389         fd = open_perf_counter(base_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0);
6390         has_access = fd != -1;
6391
6392         if (fd != -1)
6393                 close(fd);
6394
6395         if (!has_access)
6396                 warnx("Failed to access %s. Some of the counters may not be available\n"
6397                       "\tRun as root to enable them or use %s to disable the access explicitly",
6398                       "instructions retired perf counter", "--no-perf");
6399
6400         return has_access;
6401 }
6402
6403 bool is_aperf_access_required(void)
6404 {
6405         return BIC_IS_ENABLED(BIC_Avg_MHz)
6406             || BIC_IS_ENABLED(BIC_Busy)
6407             || BIC_IS_ENABLED(BIC_Bzy_MHz)
6408             || BIC_IS_ENABLED(BIC_IPC);
6409 }
6410
6411 int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai,
6412                            double *scale_, enum rapl_unit *unit_)
6413 {
6414         if (no_perf)
6415                 return -1;
6416
6417         const double scale = read_perf_rapl_scale(cai->perf_subsys, cai->perf_name);
6418
6419         if (scale == 0.0)
6420                 return -1;
6421
6422         const enum rapl_unit unit = read_perf_rapl_unit(cai->perf_subsys, cai->perf_name);
6423
6424         if (unit == RAPL_UNIT_INVALID)
6425                 return -1;
6426
6427         const unsigned int rapl_type = read_perf_type(cai->perf_subsys);
6428         const unsigned int rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name);
6429
6430         const int fd_counter =
6431             open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP);
6432         if (fd_counter == -1)
6433                 return -1;
6434
6435         /* If it's the first counter opened, make it a group descriptor */
6436         if (rci->fd_perf == -1)
6437                 rci->fd_perf = fd_counter;
6438
6439         *scale_ = scale;
6440         *unit_ = unit;
6441         return fd_counter;
6442 }
6443
6444 int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai,
6445                           double *scale, enum rapl_unit *unit)
6446 {
6447         int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit);
6448
6449         if (debug)
6450                 fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu);
6451
6452         return ret;
6453 }
6454
6455 /*
6456  * Linux-perf manages the HW instructions-retired counter
6457  * by enabling when requested, and hiding rollover
6458  */
6459 void linux_perf_init(void)
6460 {
6461         if (access("/proc/sys/kernel/perf_event_paranoid", F_OK))
6462                 return;
6463
6464         if (BIC_IS_ENABLED(BIC_IPC) && has_aperf) {
6465                 fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
6466                 if (fd_instr_count_percpu == NULL)
6467                         err(-1, "calloc fd_instr_count_percpu");
6468         }
6469
6470         const bool aperf_required = is_aperf_access_required();
6471
6472         if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) {
6473                 fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu));
6474                 if (fd_amperf_percpu == NULL)
6475                         err(-1, "calloc fd_amperf_percpu");
6476         }
6477 }
6478
6479 void rapl_perf_init(void)
6480 {
6481         const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages;
6482         bool *domain_visited = calloc(num_domains, sizeof(bool));
6483
6484         rapl_counter_info_perdomain = calloc(num_domains, sizeof(*rapl_counter_info_perdomain));
6485         if (rapl_counter_info_perdomain == NULL)
6486                 err(-1, "calloc rapl_counter_info_percpu");
6487
6488         /*
6489          * Initialize rapl_counter_info_percpu
6490          */
6491         for (int domain_id = 0; domain_id < num_domains; ++domain_id) {
6492                 struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain_id];
6493
6494                 rci->fd_perf = -1;
6495                 for (size_t i = 0; i < NUM_RAPL_COUNTERS; ++i) {
6496                         rci->data[i] = 0;
6497                         rci->source[i] = RAPL_SOURCE_NONE;
6498                 }
6499         }
6500
6501         /*
6502          * Open/probe the counters
6503          * If can't get it via perf, fallback to MSR
6504          */
6505         for (size_t i = 0; i < ARRAY_SIZE(rapl_counter_arch_infos); ++i) {
6506
6507                 const struct rapl_counter_arch_info *const cai = &rapl_counter_arch_infos[i];
6508                 bool has_counter = 0;
6509                 double scale;
6510                 enum rapl_unit unit;
6511                 int next_domain;
6512
6513                 memset(domain_visited, 0, num_domains * sizeof(*domain_visited));
6514
6515                 for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) {
6516
6517                         if (cpu_is_not_allowed(cpu))
6518                                 continue;
6519
6520                         /* Skip already seen and handled RAPL domains */
6521                         next_domain =
6522                             platform->has_per_core_rapl ? cpus[cpu].physical_core_id : cpus[cpu].physical_package_id;
6523
6524                         if (domain_visited[next_domain])
6525                                 continue;
6526
6527                         domain_visited[next_domain] = 1;
6528
6529                         struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[next_domain];
6530
6531                         /* Check if the counter is enabled and accessible */
6532                         if (BIC_IS_ENABLED(cai->bic) && (platform->rapl_msrs & cai->feature_mask)) {
6533
6534                                 /* Use perf API for this counter */
6535                                 if (!no_perf && cai->perf_name
6536                                     && add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) {
6537                                         rci->source[cai->rci_index] = RAPL_SOURCE_PERF;
6538                                         rci->scale[cai->rci_index] = scale * cai->compat_scale;
6539                                         rci->unit[cai->rci_index] = unit;
6540                                         rci->flags[cai->rci_index] = cai->flags;
6541
6542                                         /* Use MSR for this counter */
6543                                 } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) {
6544                                         rci->source[cai->rci_index] = RAPL_SOURCE_MSR;
6545                                         rci->msr[cai->rci_index] = cai->msr;
6546                                         rci->msr_mask[cai->rci_index] = cai->msr_mask;
6547                                         rci->msr_shift[cai->rci_index] = cai->msr_shift;
6548                                         rci->unit[cai->rci_index] = RAPL_UNIT_JOULES;
6549                                         rci->scale[cai->rci_index] = *cai->platform_rapl_msr_scale * cai->compat_scale;
6550                                         rci->flags[cai->rci_index] = cai->flags;
6551                                 }
6552                         }
6553
6554                         if (rci->source[cai->rci_index] != RAPL_SOURCE_NONE)
6555                                 has_counter = 1;
6556                 }
6557
6558                 /* If any CPU has access to the counter, make it present */
6559                 if (has_counter)
6560                         BIC_PRESENT(cai->bic);
6561         }
6562
6563         free(domain_visited);
6564 }
6565
6566 static int has_amperf_access_via_msr(void)
6567 {
6568         if (no_msr)
6569                 return 0;
6570
6571         if (probe_msr(base_cpu, MSR_IA32_APERF))
6572                 return 0;
6573
6574         if (probe_msr(base_cpu, MSR_IA32_MPERF))
6575                 return 0;
6576
6577         return 1;
6578 }
6579
6580 static int has_amperf_access_via_perf(void)
6581 {
6582         struct amperf_group_fd fds;
6583
6584         /*
6585          * Cache the last result, so we don't warn the user multiple times
6586          *
6587          * Negative means cached, no access
6588          * Zero means not cached
6589          * Positive means cached, has access
6590          */
6591         static int has_access_cached;
6592
6593         if (no_perf)
6594                 return 0;
6595
6596         if (has_access_cached != 0)
6597                 return has_access_cached > 0;
6598
6599         fds = open_amperf_fd(base_cpu);
6600         has_access_cached = (fds.aperf != -1) && (fds.mperf != -1);
6601
6602         if (fds.aperf == -1)
6603                 warnx("Failed to access %s. Some of the counters may not be available\n"
6604                       "\tRun as root to enable them or use %s to disable the access explicitly",
6605                       "APERF perf counter", "--no-perf");
6606         else
6607                 close(fds.aperf);
6608
6609         if (fds.mperf == -1)
6610                 warnx("Failed to access %s. Some of the counters may not be available\n"
6611                       "\tRun as root to enable them or use %s to disable the access explicitly",
6612                       "MPERF perf counter", "--no-perf");
6613         else
6614                 close(fds.mperf);
6615
6616         if (has_access_cached == 0)
6617                 has_access_cached = -1;
6618
6619         return has_access_cached > 0;
6620 }
6621
6622 /* Check if we can access APERF and MPERF */
6623 static int has_amperf_access(void)
6624 {
6625         if (!is_aperf_access_required())
6626                 return 0;
6627
6628         if (!no_msr && has_amperf_access_via_msr())
6629                 return 1;
6630
6631         if (!no_perf && has_amperf_access_via_perf())
6632                 return 1;
6633
6634         return 0;
6635 }
6636
6637 void probe_cstates(void)
6638 {
6639         probe_cst_limit();
6640
6641         if (platform->supported_cstates & CC1)
6642                 BIC_PRESENT(BIC_CPU_c1);
6643
6644         if (platform->supported_cstates & CC3)
6645                 BIC_PRESENT(BIC_CPU_c3);
6646
6647         if (platform->supported_cstates & CC6)
6648                 BIC_PRESENT(BIC_CPU_c6);
6649
6650         if (platform->supported_cstates & CC7)
6651                 BIC_PRESENT(BIC_CPU_c7);
6652
6653         if (platform->supported_cstates & PC2 && (pkg_cstate_limit >= PCL__2))
6654                 BIC_PRESENT(BIC_Pkgpc2);
6655
6656         if (platform->supported_cstates & PC3 && (pkg_cstate_limit >= PCL__3))
6657                 BIC_PRESENT(BIC_Pkgpc3);
6658
6659         if (platform->supported_cstates & PC6 && (pkg_cstate_limit >= PCL__6))
6660                 BIC_PRESENT(BIC_Pkgpc6);
6661
6662         if (platform->supported_cstates & PC7 && (pkg_cstate_limit >= PCL__7))
6663                 BIC_PRESENT(BIC_Pkgpc7);
6664
6665         if (platform->supported_cstates & PC8 && (pkg_cstate_limit >= PCL__8))
6666                 BIC_PRESENT(BIC_Pkgpc8);
6667
6668         if (platform->supported_cstates & PC9 && (pkg_cstate_limit >= PCL__9))
6669                 BIC_PRESENT(BIC_Pkgpc9);
6670
6671         if (platform->supported_cstates & PC10 && (pkg_cstate_limit >= PCL_10))
6672                 BIC_PRESENT(BIC_Pkgpc10);
6673
6674         if (platform->has_msr_module_c6_res_ms)
6675                 BIC_PRESENT(BIC_Mod_c6);
6676
6677         if (platform->has_ext_cst_msrs && !no_msr) {
6678                 BIC_PRESENT(BIC_Totl_c0);
6679                 BIC_PRESENT(BIC_Any_c0);
6680                 BIC_PRESENT(BIC_GFX_c0);
6681                 BIC_PRESENT(BIC_CPUGFX);
6682         }
6683
6684         if (quiet)
6685                 return;
6686
6687         dump_power_ctl();
6688         dump_cst_cfg();
6689         decode_c6_demotion_policy_msr();
6690         print_dev_latency();
6691         dump_sysfs_cstate_config();
6692         print_irtl();
6693 }
6694
6695 void probe_lpi(void)
6696 {
6697         if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK))
6698                 BIC_PRESENT(BIC_CPU_LPI);
6699         else
6700                 BIC_NOT_PRESENT(BIC_CPU_LPI);
6701
6702         if (!access(sys_lpi_file_sysfs, R_OK)) {
6703                 sys_lpi_file = sys_lpi_file_sysfs;
6704                 BIC_PRESENT(BIC_SYS_LPI);
6705         } else if (!access(sys_lpi_file_debugfs, R_OK)) {
6706                 sys_lpi_file = sys_lpi_file_debugfs;
6707                 BIC_PRESENT(BIC_SYS_LPI);
6708         } else {
6709                 sys_lpi_file_sysfs = NULL;
6710                 BIC_NOT_PRESENT(BIC_SYS_LPI);
6711         }
6712
6713 }
6714
6715 void probe_pstates(void)
6716 {
6717         probe_bclk();
6718
6719         if (quiet)
6720                 return;
6721
6722         dump_platform_info();
6723         dump_turbo_ratio_info();
6724         dump_sysfs_pstate_config();
6725         decode_misc_pwr_mgmt_msr();
6726
6727         for_all_cpus(print_hwp, ODD_COUNTERS);
6728         for_all_cpus(print_epb, ODD_COUNTERS);
6729         for_all_cpus(print_perf_limit, ODD_COUNTERS);
6730 }
6731
6732 void process_cpuid()
6733 {
6734         unsigned int eax, ebx, ecx, edx;
6735         unsigned int fms, family, model, stepping, ecx_flags, edx_flags;
6736         unsigned long long ucode_patch = 0;
6737         bool ucode_patch_valid = false;
6738
6739         eax = ebx = ecx = edx = 0;
6740
6741         __cpuid(0, max_level, ebx, ecx, edx);
6742
6743         if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
6744                 genuine_intel = 1;
6745         else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
6746                 authentic_amd = 1;
6747         else if (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)
6748                 hygon_genuine = 1;
6749
6750         if (!quiet)
6751                 fprintf(outf, "CPUID(0): %.4s%.4s%.4s 0x%x CPUID levels\n",
6752                         (char *)&ebx, (char *)&edx, (char *)&ecx, max_level);
6753
6754         __cpuid(1, fms, ebx, ecx, edx);
6755         family = (fms >> 8) & 0xf;
6756         model = (fms >> 4) & 0xf;
6757         stepping = fms & 0xf;
6758         if (family == 0xf)
6759                 family += (fms >> 20) & 0xff;
6760         if (family >= 6)
6761                 model += ((fms >> 16) & 0xf) << 4;
6762         ecx_flags = ecx;
6763         edx_flags = edx;
6764
6765         if (!no_msr) {
6766                 if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch))
6767                         warnx("get_msr(UCODE)");
6768                 else
6769                         ucode_patch_valid = true;
6770         }
6771
6772         /*
6773          * check max extended function levels of CPUID.
6774          * This is needed to check for invariant TSC.
6775          * This check is valid for both Intel and AMD.
6776          */
6777         ebx = ecx = edx = 0;
6778         __cpuid(0x80000000, max_extended_level, ebx, ecx, edx);
6779
6780         if (!quiet) {
6781                 fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d)",
6782                         family, model, stepping, family, model, stepping);
6783                 if (ucode_patch_valid)
6784                         fprintf(outf, " microcode 0x%x", (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF));
6785                 fputc('\n', outf);
6786
6787                 fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level);
6788                 fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n",
6789                         ecx_flags & (1 << 0) ? "SSE3" : "-",
6790                         ecx_flags & (1 << 3) ? "MONITOR" : "-",
6791                         ecx_flags & (1 << 6) ? "SMX" : "-",
6792                         ecx_flags & (1 << 7) ? "EIST" : "-",
6793                         ecx_flags & (1 << 8) ? "TM2" : "-",
6794                         edx_flags & (1 << 4) ? "TSC" : "-",
6795                         edx_flags & (1 << 5) ? "MSR" : "-",
6796                         edx_flags & (1 << 22) ? "ACPI-TM" : "-",
6797                         edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-");
6798         }
6799
6800         probe_platform_features(family, model);
6801
6802         if (!(edx_flags & (1 << 5)))
6803                 errx(1, "CPUID: no MSR");
6804
6805         if (max_extended_level >= 0x80000007) {
6806
6807                 /*
6808                  * Non-Stop TSC is advertised by CPUID.EAX=0x80000007: EDX.bit8
6809                  * this check is valid for both Intel and AMD
6810                  */
6811                 __cpuid(0x80000007, eax, ebx, ecx, edx);
6812                 has_invariant_tsc = edx & (1 << 8);
6813         }
6814
6815         /*
6816          * APERF/MPERF is advertised by CPUID.EAX=0x6: ECX.bit0
6817          * this check is valid for both Intel and AMD
6818          */
6819
6820         __cpuid(0x6, eax, ebx, ecx, edx);
6821         has_aperf = ecx & (1 << 0);
6822         if (has_aperf && has_amperf_access()) {
6823                 BIC_PRESENT(BIC_Avg_MHz);
6824                 BIC_PRESENT(BIC_Busy);
6825                 BIC_PRESENT(BIC_Bzy_MHz);
6826                 BIC_PRESENT(BIC_IPC);
6827         }
6828         do_dts = eax & (1 << 0);
6829         if (do_dts)
6830                 BIC_PRESENT(BIC_CoreTmp);
6831         has_turbo = eax & (1 << 1);
6832         do_ptm = eax & (1 << 6);
6833         if (do_ptm)
6834                 BIC_PRESENT(BIC_PkgTmp);
6835         has_hwp = eax & (1 << 7);
6836         has_hwp_notify = eax & (1 << 8);
6837         has_hwp_activity_window = eax & (1 << 9);
6838         has_hwp_epp = eax & (1 << 10);
6839         has_hwp_pkg = eax & (1 << 11);
6840         has_epb = ecx & (1 << 3);
6841
6842         if (!quiet)
6843                 fprintf(outf, "CPUID(6): %sAPERF, %sTURBO, %sDTS, %sPTM, %sHWP, "
6844                         "%sHWPnotify, %sHWPwindow, %sHWPepp, %sHWPpkg, %sEPB\n",
6845                         has_aperf ? "" : "No-",
6846                         has_turbo ? "" : "No-",
6847                         do_dts ? "" : "No-",
6848                         do_ptm ? "" : "No-",
6849                         has_hwp ? "" : "No-",
6850                         has_hwp_notify ? "" : "No-",
6851                         has_hwp_activity_window ? "" : "No-",
6852                         has_hwp_epp ? "" : "No-", has_hwp_pkg ? "" : "No-", has_epb ? "" : "No-");
6853
6854         if (!quiet)
6855                 decode_misc_enable_msr();
6856
6857         if (max_level >= 0x7 && !quiet) {
6858                 int has_sgx;
6859
6860                 ecx = 0;
6861
6862                 __cpuid_count(0x7, 0, eax, ebx, ecx, edx);
6863
6864                 has_sgx = ebx & (1 << 2);
6865
6866                 is_hybrid = edx & (1 << 15);
6867
6868                 fprintf(outf, "CPUID(7): %sSGX %sHybrid\n", has_sgx ? "" : "No-", is_hybrid ? "" : "No-");
6869
6870                 if (has_sgx)
6871                         decode_feature_control_msr();
6872         }
6873
6874         if (max_level >= 0x15) {
6875                 unsigned int eax_crystal;
6876                 unsigned int ebx_tsc;
6877
6878                 /*
6879                  * CPUID 15H TSC/Crystal ratio, possibly Crystal Hz
6880                  */
6881                 eax_crystal = ebx_tsc = crystal_hz = edx = 0;
6882                 __cpuid(0x15, eax_crystal, ebx_tsc, crystal_hz, edx);
6883
6884                 if (ebx_tsc != 0) {
6885                         if (!quiet && (ebx != 0))
6886                                 fprintf(outf, "CPUID(0x15): eax_crystal: %d ebx_tsc: %d ecx_crystal_hz: %d\n",
6887                                         eax_crystal, ebx_tsc, crystal_hz);
6888
6889                         if (crystal_hz == 0)
6890                                 crystal_hz = platform->crystal_freq;
6891
6892                         if (crystal_hz) {
6893                                 tsc_hz = (unsigned long long)crystal_hz *ebx_tsc / eax_crystal;
6894                                 if (!quiet)
6895                                         fprintf(outf, "TSC: %lld MHz (%d Hz * %d / %d / 1000000)\n",
6896                                                 tsc_hz / 1000000, crystal_hz, ebx_tsc, eax_crystal);
6897                         }
6898                 }
6899         }
6900         if (max_level >= 0x16) {
6901                 unsigned int base_mhz, max_mhz, bus_mhz, edx;
6902
6903                 /*
6904                  * CPUID 16H Base MHz, Max MHz, Bus MHz
6905                  */
6906                 base_mhz = max_mhz = bus_mhz = edx = 0;
6907
6908                 __cpuid(0x16, base_mhz, max_mhz, bus_mhz, edx);
6909
6910                 bclk = bus_mhz;
6911
6912                 base_hz = base_mhz * 1000000;
6913                 has_base_hz = 1;
6914
6915                 if (platform->enable_tsc_tweak)
6916                         tsc_tweak = base_hz / tsc_hz;
6917
6918                 if (!quiet)
6919                         fprintf(outf, "CPUID(0x16): base_mhz: %d max_mhz: %d bus_mhz: %d\n",
6920                                 base_mhz, max_mhz, bus_mhz);
6921         }
6922
6923         if (has_aperf)
6924                 aperf_mperf_multiplier = platform->need_perf_multiplier ? 1024 : 1;
6925
6926         BIC_PRESENT(BIC_IRQ);
6927         BIC_PRESENT(BIC_TSC_MHz);
6928 }
6929
6930 void probe_pm_features(void)
6931 {
6932         probe_pstates();
6933
6934         probe_cstates();
6935
6936         probe_lpi();
6937
6938         probe_intel_uncore_frequency();
6939
6940         probe_graphics();
6941
6942         probe_rapl();
6943
6944         probe_thermal();
6945
6946         if (platform->has_nhm_msrs && !no_msr)
6947                 BIC_PRESENT(BIC_SMI);
6948
6949         if (!quiet)
6950                 decode_misc_feature_control();
6951 }
6952
/*
 * in /dev/cpu/ return success for names that start with a digit
 * ie. filter out ".", "..", "microcode".
 *
 * Only the first character of d_name is examined.
 * Returns 1 to keep the entry, 0 to drop it.
 */
int dir_filter(const struct dirent *dirp)
{
	/*
	 * <ctype.h> functions take an int that must be representable as
	 * unsigned char (or EOF); a plain char may be negative.
	 */
	return isdigit((unsigned char)dirp->d_name[0]) ? 1 : 0;
}
6964
6965 void topology_probe(bool startup)
6966 {
6967         int i;
6968         int max_core_id = 0;
6969         int max_package_id = 0;
6970         int max_die_id = 0;
6971         int max_siblings = 0;
6972
6973         /* Initialize num_cpus, max_cpu_num */
6974         set_max_cpu_num();
6975         topo.num_cpus = 0;
6976         for_all_proc_cpus(count_cpus);
6977         if (!summary_only && topo.num_cpus > 1)
6978                 BIC_PRESENT(BIC_CPU);
6979
6980         if (debug > 1)
6981                 fprintf(outf, "num_cpus %d max_cpu_num %d\n", topo.num_cpus, topo.max_cpu_num);
6982
6983         cpus = calloc(1, (topo.max_cpu_num + 1) * sizeof(struct cpu_topology));
6984         if (cpus == NULL)
6985                 err(1, "calloc cpus");
6986
6987         /*
6988          * Allocate and initialize cpu_present_set
6989          */
6990         cpu_present_set = CPU_ALLOC((topo.max_cpu_num + 1));
6991         if (cpu_present_set == NULL)
6992                 err(3, "CPU_ALLOC");
6993         cpu_present_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
6994         CPU_ZERO_S(cpu_present_setsize, cpu_present_set);
6995         for_all_proc_cpus(mark_cpu_present);
6996
6997         /*
6998          * Allocate and initialize cpu_effective_set
6999          */
7000         cpu_effective_set = CPU_ALLOC((topo.max_cpu_num + 1));
7001         if (cpu_effective_set == NULL)
7002                 err(3, "CPU_ALLOC");
7003         cpu_effective_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
7004         CPU_ZERO_S(cpu_effective_setsize, cpu_effective_set);
7005         update_effective_set(startup);
7006
7007         /*
7008          * Allocate and initialize cpu_allowed_set
7009          */
7010         cpu_allowed_set = CPU_ALLOC((topo.max_cpu_num + 1));
7011         if (cpu_allowed_set == NULL)
7012                 err(3, "CPU_ALLOC");
7013         cpu_allowed_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
7014         CPU_ZERO_S(cpu_allowed_setsize, cpu_allowed_set);
7015
7016         /*
7017          * Validate and update cpu_allowed_set.
7018          *
7019          * Make sure all cpus in cpu_subset are also in cpu_present_set during startup.
7020          * Give a warning when cpus in cpu_subset become unavailable at runtime.
7021          * Give a warning when cpus are not effective because of cgroup setting.
7022          *
7023          * cpu_allowed_set is the intersection of cpu_present_set/cpu_effective_set/cpu_subset.
7024          */
7025         for (i = 0; i < CPU_SUBSET_MAXCPUS; ++i) {
7026                 if (cpu_subset && !CPU_ISSET_S(i, cpu_subset_size, cpu_subset))
7027                         continue;
7028
7029                 if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) {
7030                         if (cpu_subset) {
7031                                 /* cpus in cpu_subset must be in cpu_present_set during startup */
7032                                 if (startup)
7033                                         err(1, "cpu%d not present", i);
7034                                 else
7035                                         fprintf(stderr, "cpu%d not present\n", i);
7036                         }
7037                         continue;
7038                 }
7039
7040                 if (CPU_COUNT_S(cpu_effective_setsize, cpu_effective_set)) {
7041                         if (!CPU_ISSET_S(i, cpu_effective_setsize, cpu_effective_set)) {
7042                                 fprintf(stderr, "cpu%d not effective\n", i);
7043                                 continue;
7044                         }
7045                 }
7046
7047                 CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set);
7048         }
7049
7050         if (!CPU_COUNT_S(cpu_allowed_setsize, cpu_allowed_set))
7051                 err(-ENODEV, "No valid cpus found");
7052         sched_setaffinity(0, cpu_allowed_setsize, cpu_allowed_set);
7053
7054         /*
7055          * Allocate and initialize cpu_affinity_set
7056          */
7057         cpu_affinity_set = CPU_ALLOC((topo.max_cpu_num + 1));
7058         if (cpu_affinity_set == NULL)
7059                 err(3, "CPU_ALLOC");
7060         cpu_affinity_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
7061         CPU_ZERO_S(cpu_affinity_setsize, cpu_affinity_set);
7062
7063         for_all_proc_cpus(init_thread_id);
7064
7065         /*
7066          * For online cpus
7067          * find max_core_id, max_package_id
7068          */
7069         for (i = 0; i <= topo.max_cpu_num; ++i) {
7070                 int siblings;
7071
7072                 if (cpu_is_not_present(i)) {
7073                         if (debug > 1)
7074                                 fprintf(outf, "cpu%d NOT PRESENT\n", i);
7075                         continue;
7076                 }
7077
7078                 cpus[i].logical_cpu_id = i;
7079
7080                 /* get package information */
7081                 cpus[i].physical_package_id = get_physical_package_id(i);
7082                 if (cpus[i].physical_package_id > max_package_id)
7083                         max_package_id = cpus[i].physical_package_id;
7084
7085                 /* get die information */
7086                 cpus[i].die_id = get_die_id(i);
7087                 if (cpus[i].die_id > max_die_id)
7088                         max_die_id = cpus[i].die_id;
7089
7090                 /* get numa node information */
7091                 cpus[i].physical_node_id = get_physical_node_id(&cpus[i]);
7092                 if (cpus[i].physical_node_id > topo.max_node_num)
7093                         topo.max_node_num = cpus[i].physical_node_id;
7094
7095                 /* get core information */
7096                 cpus[i].physical_core_id = get_core_id(i);
7097                 if (cpus[i].physical_core_id > max_core_id)
7098                         max_core_id = cpus[i].physical_core_id;
7099
7100                 /* get thread information */
7101                 siblings = get_thread_siblings(&cpus[i]);
7102                 if (siblings > max_siblings)
7103                         max_siblings = siblings;
7104                 if (cpus[i].thread_id == 0)
7105                         topo.num_cores++;
7106         }
7107
7108         topo.cores_per_node = max_core_id + 1;
7109         if (debug > 1)
7110                 fprintf(outf, "max_core_id %d, sizing for %d cores per package\n", max_core_id, topo.cores_per_node);
7111         if (!summary_only && topo.cores_per_node > 1)
7112                 BIC_PRESENT(BIC_Core);
7113
7114         topo.num_die = max_die_id + 1;
7115         if (debug > 1)
7116                 fprintf(outf, "max_die_id %d, sizing for %d die\n", max_die_id, topo.num_die);
7117         if (!summary_only && topo.num_die > 1)
7118                 BIC_PRESENT(BIC_Die);
7119
7120         topo.num_packages = max_package_id + 1;
7121         if (debug > 1)
7122                 fprintf(outf, "max_package_id %d, sizing for %d packages\n", max_package_id, topo.num_packages);
7123         if (!summary_only && topo.num_packages > 1)
7124                 BIC_PRESENT(BIC_Package);
7125
7126         set_node_data();
7127         if (debug > 1)
7128                 fprintf(outf, "nodes_per_pkg %d\n", topo.nodes_per_pkg);
7129         if (!summary_only && topo.nodes_per_pkg > 1)
7130                 BIC_PRESENT(BIC_Node);
7131
7132         topo.threads_per_core = max_siblings;
7133         if (debug > 1)
7134                 fprintf(outf, "max_siblings %d\n", max_siblings);
7135
7136         if (debug < 1)
7137                 return;
7138
7139         for (i = 0; i <= topo.max_cpu_num; ++i) {
7140                 if (cpu_is_not_present(i))
7141                         continue;
7142                 fprintf(outf,
7143                         "cpu %d pkg %d die %d node %d lnode %d core %d thread %d\n",
7144                         i, cpus[i].physical_package_id, cpus[i].die_id,
7145                         cpus[i].physical_node_id, cpus[i].logical_node_id, cpus[i].physical_core_id, cpus[i].thread_id);
7146         }
7147
7148 }
7149
7150 void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_data **p)
7151 {
7152         int i;
7153         int num_cores = topo.cores_per_node * topo.nodes_per_pkg * topo.num_packages;
7154         int num_threads = topo.threads_per_core * num_cores;
7155
7156         *t = calloc(num_threads, sizeof(struct thread_data));
7157         if (*t == NULL)
7158                 goto error;
7159
7160         for (i = 0; i < num_threads; i++)
7161                 (*t)[i].cpu_id = -1;
7162
7163         *c = calloc(num_cores, sizeof(struct core_data));
7164         if (*c == NULL)
7165                 goto error;
7166
7167         for (i = 0; i < num_cores; i++) {
7168                 (*c)[i].core_id = -1;
7169                 (*c)[i].base_cpu = -1;
7170         }
7171
7172         *p = calloc(topo.num_packages, sizeof(struct pkg_data));
7173         if (*p == NULL)
7174                 goto error;
7175
7176         for (i = 0; i < topo.num_packages; i++) {
7177                 (*p)[i].package_id = i;
7178                 (*p)[i].base_cpu = -1;
7179         }
7180
7181         return;
7182 error:
7183         err(1, "calloc counters");
7184 }
7185
7186 /*
7187  * init_counter()
7188  *
7189  * set FIRST_THREAD_IN_CORE and FIRST_CORE_IN_PACKAGE
7190  */
7191 void init_counter(struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base, int cpu_id)
7192 {
7193         int pkg_id = cpus[cpu_id].physical_package_id;
7194         int node_id = cpus[cpu_id].logical_node_id;
7195         int core_id = cpus[cpu_id].physical_core_id;
7196         int thread_id = cpus[cpu_id].thread_id;
7197         struct thread_data *t;
7198         struct core_data *c;
7199         struct pkg_data *p;
7200
7201         /* Workaround for systems where physical_node_id==-1
7202          * and logical_node_id==(-1 - topo.num_cpus)
7203          */
7204         if (node_id < 0)
7205                 node_id = 0;
7206
7207         t = GET_THREAD(thread_base, thread_id, core_id, node_id, pkg_id);
7208         c = GET_CORE(core_base, core_id, node_id, pkg_id);
7209         p = GET_PKG(pkg_base, pkg_id);
7210
7211         t->cpu_id = cpu_id;
7212         if (!cpu_is_not_allowed(cpu_id)) {
7213                 if (c->base_cpu < 0)
7214                         c->base_cpu = t->cpu_id;
7215                 if (p->base_cpu < 0)
7216                         p->base_cpu = t->cpu_id;
7217         }
7218
7219         c->core_id = core_id;
7220         p->package_id = pkg_id;
7221 }
7222
7223 int initialize_counters(int cpu_id)
7224 {
7225         init_counter(EVEN_COUNTERS, cpu_id);
7226         init_counter(ODD_COUNTERS, cpu_id);
7227         return 0;
7228 }
7229
7230 void allocate_output_buffer()
7231 {
7232         output_buffer = calloc(1, (1 + topo.num_cpus) * 2048);
7233         outp = output_buffer;
7234         if (outp == NULL)
7235                 err(-1, "calloc output buffer");
7236 }
7237
7238 void allocate_fd_percpu(void)
7239 {
7240         fd_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
7241         if (fd_percpu == NULL)
7242                 err(-1, "calloc fd_percpu");
7243 }
7244
7245 void allocate_irq_buffers(void)
7246 {
7247         irq_column_2_cpu = calloc(topo.num_cpus, sizeof(int));
7248         if (irq_column_2_cpu == NULL)
7249                 err(-1, "calloc %d", topo.num_cpus);
7250
7251         irqs_per_cpu = calloc(topo.max_cpu_num + 1, sizeof(int));
7252         if (irqs_per_cpu == NULL)
7253                 err(-1, "calloc %d", topo.max_cpu_num + 1);
7254 }
7255
7256 int update_topo(struct thread_data *t, struct core_data *c, struct pkg_data *p)
7257 {
7258         topo.allowed_cpus++;
7259         if ((int)t->cpu_id == c->base_cpu)
7260                 topo.allowed_cores++;
7261         if ((int)t->cpu_id == p->base_cpu)
7262                 topo.allowed_packages++;
7263
7264         return 0;
7265 }
7266
7267 void topology_update(void)
7268 {
7269         topo.allowed_cpus = 0;
7270         topo.allowed_cores = 0;
7271         topo.allowed_packages = 0;
7272         for_all_cpus(update_topo, ODD_COUNTERS);
7273 }
7274
7275 void setup_all_buffers(bool startup)
7276 {
7277         topology_probe(startup);
7278         allocate_irq_buffers();
7279         allocate_fd_percpu();
7280         allocate_counters(&thread_even, &core_even, &package_even);
7281         allocate_counters(&thread_odd, &core_odd, &package_odd);
7282         allocate_output_buffer();
7283         for_all_proc_cpus(initialize_counters);
7284         topology_update();
7285 }
7286
7287 void set_base_cpu(void)
7288 {
7289         int i;
7290
7291         for (i = 0; i < topo.max_cpu_num + 1; ++i) {
7292                 if (cpu_is_not_allowed(i))
7293                         continue;
7294                 base_cpu = i;
7295                 if (debug > 1)
7296                         fprintf(outf, "base_cpu = %d\n", base_cpu);
7297                 return;
7298         }
7299         err(-ENODEV, "No valid cpus found");
7300 }
7301
7302 static void set_amperf_source(void)
7303 {
7304         amperf_source = AMPERF_SOURCE_PERF;
7305
7306         const bool aperf_required = is_aperf_access_required();
7307
7308         if (no_perf || !aperf_required || !has_amperf_access_via_perf())
7309                 amperf_source = AMPERF_SOURCE_MSR;
7310
7311         if (quiet || !debug)
7312                 return;
7313
7314         fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf");
7315 }
7316
7317 bool has_added_counters(void)
7318 {
7319         /*
7320          * It only makes sense to call this after the command line is parsed,
7321          * otherwise sys structure is not populated.
7322          */
7323
7324         return sys.added_core_counters | sys.added_thread_counters | sys.added_package_counters;
7325 }
7326
7327 bool is_msr_access_required(void)
7328 {
7329         if (no_msr)
7330                 return false;
7331
7332         if (has_added_counters())
7333                 return true;
7334
7335         return BIC_IS_ENABLED(BIC_SMI)
7336             || BIC_IS_ENABLED(BIC_CPU_c1)
7337             || BIC_IS_ENABLED(BIC_CPU_c3)
7338             || BIC_IS_ENABLED(BIC_CPU_c6)
7339             || BIC_IS_ENABLED(BIC_CPU_c7)
7340             || BIC_IS_ENABLED(BIC_Mod_c6)
7341             || BIC_IS_ENABLED(BIC_CoreTmp)
7342             || BIC_IS_ENABLED(BIC_Totl_c0)
7343             || BIC_IS_ENABLED(BIC_Any_c0)
7344             || BIC_IS_ENABLED(BIC_GFX_c0)
7345             || BIC_IS_ENABLED(BIC_CPUGFX)
7346             || BIC_IS_ENABLED(BIC_Pkgpc3)
7347             || BIC_IS_ENABLED(BIC_Pkgpc6)
7348             || BIC_IS_ENABLED(BIC_Pkgpc2)
7349             || BIC_IS_ENABLED(BIC_Pkgpc7)
7350             || BIC_IS_ENABLED(BIC_Pkgpc8)
7351             || BIC_IS_ENABLED(BIC_Pkgpc9)
7352             || BIC_IS_ENABLED(BIC_Pkgpc10)
7353             /* TODO: Multiplex access with perf */
7354             || BIC_IS_ENABLED(BIC_CorWatt)
7355             || BIC_IS_ENABLED(BIC_Cor_J)
7356             || BIC_IS_ENABLED(BIC_PkgWatt)
7357             || BIC_IS_ENABLED(BIC_CorWatt)
7358             || BIC_IS_ENABLED(BIC_GFXWatt)
7359             || BIC_IS_ENABLED(BIC_RAMWatt)
7360             || BIC_IS_ENABLED(BIC_Pkg_J)
7361             || BIC_IS_ENABLED(BIC_Cor_J)
7362             || BIC_IS_ENABLED(BIC_GFX_J)
7363             || BIC_IS_ENABLED(BIC_RAM_J)
7364             || BIC_IS_ENABLED(BIC_PKG__)
7365             || BIC_IS_ENABLED(BIC_RAM__)
7366             || BIC_IS_ENABLED(BIC_PkgTmp)
7367             || (is_aperf_access_required() && !has_amperf_access_via_perf());
7368 }
7369
7370 void check_msr_access(void)
7371 {
7372         if (!is_msr_access_required())
7373                 no_msr = 1;
7374
7375         check_dev_msr();
7376         check_msr_permission();
7377
7378         if (no_msr)
7379                 bic_disable_msr_access();
7380 }
7381
7382 void check_perf_access(void)
7383 {
7384         const bool intrcount_required = BIC_IS_ENABLED(BIC_IPC);
7385
7386         if (no_perf || !intrcount_required || !has_instr_count_access())
7387                 bic_enabled &= ~BIC_IPC;
7388
7389         const bool aperf_required = is_aperf_access_required();
7390
7391         if (!aperf_required || !has_amperf_access()) {
7392                 bic_enabled &= ~BIC_Avg_MHz;
7393                 bic_enabled &= ~BIC_Busy;
7394                 bic_enabled &= ~BIC_Bzy_MHz;
7395                 bic_enabled &= ~BIC_IPC;
7396         }
7397 }
7398
7399 void turbostat_init()
7400 {
7401         setup_all_buffers(true);
7402         set_base_cpu();
7403         check_msr_access();
7404         check_perf_access();
7405         process_cpuid();
7406         probe_pm_features();
7407         set_amperf_source();
7408         linux_perf_init();
7409         rapl_perf_init();
7410
7411         for_all_cpus(get_cpu_type, ODD_COUNTERS);
7412         for_all_cpus(get_cpu_type, EVEN_COUNTERS);
7413
7414         if (DO_BIC(BIC_IPC))
7415                 (void)get_instr_count_fd(base_cpu);
7416
7417         /*
7418          * If TSC tweak is needed, but couldn't get it,
7419          * disable more BICs, since it can't be reported accurately.
7420          */
7421         if (platform->enable_tsc_tweak && !has_base_hz) {
7422                 bic_enabled &= ~BIC_Busy;
7423                 bic_enabled &= ~BIC_Bzy_MHz;
7424         }
7425 }
7426
7427 int fork_it(char **argv)
7428 {
7429         pid_t child_pid;
7430         int status;
7431
7432         snapshot_proc_sysfs_files();
7433         status = for_all_cpus(get_counters, EVEN_COUNTERS);
7434         first_counter_read = 0;
7435         if (status)
7436                 exit(status);
7437         gettimeofday(&tv_even, (struct timezone *)NULL);
7438
7439         child_pid = fork();
7440         if (!child_pid) {
7441                 /* child */
7442                 execvp(argv[0], argv);
7443                 err(errno, "exec %s", argv[0]);
7444         } else {
7445
7446                 /* parent */
7447                 if (child_pid == -1)
7448                         err(1, "fork");
7449
7450                 signal(SIGINT, SIG_IGN);
7451                 signal(SIGQUIT, SIG_IGN);
7452                 if (waitpid(child_pid, &status, 0) == -1)
7453                         err(status, "waitpid");
7454
7455                 if (WIFEXITED(status))
7456                         status = WEXITSTATUS(status);
7457         }
7458         /*
7459          * n.b. fork_it() does not check for errors from for_all_cpus()
7460          * because re-starting is problematic when forking
7461          */
7462         snapshot_proc_sysfs_files();
7463         for_all_cpus(get_counters, ODD_COUNTERS);
7464         gettimeofday(&tv_odd, (struct timezone *)NULL);
7465         timersub(&tv_odd, &tv_even, &tv_delta);
7466         if (for_all_cpus_2(delta_cpu, ODD_COUNTERS, EVEN_COUNTERS))
7467                 fprintf(outf, "%s: Counter reset detected\n", progname);
7468         else {
7469                 compute_average(EVEN_COUNTERS);
7470                 format_all_counters(EVEN_COUNTERS);
7471         }
7472
7473         fprintf(outf, "%.6f sec\n", tv_delta.tv_sec + tv_delta.tv_usec / 1000000.0);
7474
7475         flush_output_stderr();
7476
7477         return status;
7478 }
7479
7480 int get_and_dump_counters(void)
7481 {
7482         int status;
7483
7484         snapshot_proc_sysfs_files();
7485         status = for_all_cpus(get_counters, ODD_COUNTERS);
7486         if (status)
7487                 return status;
7488
7489         status = for_all_cpus(dump_counters, ODD_COUNTERS);
7490         if (status)
7491                 return status;
7492
7493         flush_output_stdout();
7494
7495         return status;
7496 }
7497
7498 void print_version()
7499 {
7500         fprintf(outf, "turbostat version 2024.04.08 - Len Brown <lenb@kernel.org>\n");
7501 }
7502
7503 #define COMMAND_LINE_SIZE 2048
7504
7505 void print_bootcmd(void)
7506 {
7507         char bootcmd[COMMAND_LINE_SIZE];
7508         FILE *fp;
7509         int ret;
7510
7511         memset(bootcmd, 0, COMMAND_LINE_SIZE);
7512         fp = fopen("/proc/cmdline", "r");
7513         if (!fp)
7514                 return;
7515
7516         ret = fread(bootcmd, sizeof(char), COMMAND_LINE_SIZE - 1, fp);
7517         if (ret) {
7518                 bootcmd[ret] = '\0';
7519                 /* the last character is already '\n' */
7520                 fprintf(outf, "Kernel command line: %s", bootcmd);
7521         }
7522
7523         fclose(fp);
7524 }
7525
7526 int add_counter(unsigned int msr_num, char *path, char *name,
7527                 unsigned int width, enum counter_scope scope,
7528                 enum counter_type type, enum counter_format format, int flags)
7529 {
7530         struct msr_counter *msrp;
7531
7532         if (no_msr && msr_num)
7533                 errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num);
7534
7535         msrp = calloc(1, sizeof(struct msr_counter));
7536         if (msrp == NULL) {
7537                 perror("calloc");
7538                 exit(1);
7539         }
7540
7541         msrp->msr_num = msr_num;
7542         strncpy(msrp->name, name, NAME_BYTES - 1);
7543         if (path)
7544                 strncpy(msrp->path, path, PATH_BYTES - 1);
7545         msrp->width = width;
7546         msrp->type = type;
7547         msrp->format = format;
7548         msrp->flags = flags;
7549
7550         switch (scope) {
7551
7552         case SCOPE_CPU:
7553                 msrp->next = sys.tp;
7554                 sys.tp = msrp;
7555                 sys.added_thread_counters++;
7556                 if (sys.added_thread_counters > MAX_ADDED_THREAD_COUNTERS) {
7557                         fprintf(stderr, "exceeded max %d added thread counters\n", MAX_ADDED_COUNTERS);
7558                         exit(-1);
7559                 }
7560                 break;
7561
7562         case SCOPE_CORE:
7563                 msrp->next = sys.cp;
7564                 sys.cp = msrp;
7565                 sys.added_core_counters++;
7566                 if (sys.added_core_counters > MAX_ADDED_COUNTERS) {
7567                         fprintf(stderr, "exceeded max %d added core counters\n", MAX_ADDED_COUNTERS);
7568                         exit(-1);
7569                 }
7570                 break;
7571
7572         case SCOPE_PACKAGE:
7573                 msrp->next = sys.pp;
7574                 sys.pp = msrp;
7575                 sys.added_package_counters++;
7576                 if (sys.added_package_counters > MAX_ADDED_COUNTERS) {
7577                         fprintf(stderr, "exceeded max %d added package counters\n", MAX_ADDED_COUNTERS);
7578                         exit(-1);
7579                 }
7580                 break;
7581         }
7582
7583         return 0;
7584 }
7585
7586 void parse_add_command(char *add_command)
7587 {
7588         int msr_num = 0;
7589         char *path = NULL;
7590         char name_buffer[NAME_BYTES] = "";
7591         int width = 64;
7592         int fail = 0;
7593         enum counter_scope scope = SCOPE_CPU;
7594         enum counter_type type = COUNTER_CYCLES;
7595         enum counter_format format = FORMAT_DELTA;
7596
7597         while (add_command) {
7598
7599                 if (sscanf(add_command, "msr0x%x", &msr_num) == 1)
7600                         goto next;
7601
7602                 if (sscanf(add_command, "msr%d", &msr_num) == 1)
7603                         goto next;
7604
7605                 if (*add_command == '/') {
7606                         path = add_command;
7607                         goto next;
7608                 }
7609
7610                 if (sscanf(add_command, "u%d", &width) == 1) {
7611                         if ((width == 32) || (width == 64))
7612                                 goto next;
7613                         width = 64;
7614                 }
7615                 if (!strncmp(add_command, "cpu", strlen("cpu"))) {
7616                         scope = SCOPE_CPU;
7617                         goto next;
7618                 }
7619                 if (!strncmp(add_command, "core", strlen("core"))) {
7620                         scope = SCOPE_CORE;
7621                         goto next;
7622                 }
7623                 if (!strncmp(add_command, "package", strlen("package"))) {
7624                         scope = SCOPE_PACKAGE;
7625                         goto next;
7626                 }
7627                 if (!strncmp(add_command, "cycles", strlen("cycles"))) {
7628                         type = COUNTER_CYCLES;
7629                         goto next;
7630                 }
7631                 if (!strncmp(add_command, "seconds", strlen("seconds"))) {
7632                         type = COUNTER_SECONDS;
7633                         goto next;
7634                 }
7635                 if (!strncmp(add_command, "usec", strlen("usec"))) {
7636                         type = COUNTER_USEC;
7637                         goto next;
7638                 }
7639                 if (!strncmp(add_command, "raw", strlen("raw"))) {
7640                         format = FORMAT_RAW;
7641                         goto next;
7642                 }
7643                 if (!strncmp(add_command, "delta", strlen("delta"))) {
7644                         format = FORMAT_DELTA;
7645                         goto next;
7646                 }
7647                 if (!strncmp(add_command, "percent", strlen("percent"))) {
7648                         format = FORMAT_PERCENT;
7649                         goto next;
7650                 }
7651
7652                 if (sscanf(add_command, "%18s,%*s", name_buffer) == 1) {        /* 18 < NAME_BYTES */
7653                         char *eos;
7654
7655                         eos = strchr(name_buffer, ',');
7656                         if (eos)
7657                                 *eos = '\0';
7658                         goto next;
7659                 }
7660
7661 next:
7662                 add_command = strchr(add_command, ',');
7663                 if (add_command) {
7664                         *add_command = '\0';
7665                         add_command++;
7666                 }
7667
7668         }
7669         if ((msr_num == 0) && (path == NULL)) {
7670                 fprintf(stderr, "--add: (msrDDD | msr0xXXX | /path_to_counter ) required\n");
7671                 fail++;
7672         }
7673
7674         /* generate default column header */
7675         if (*name_buffer == '\0') {
7676                 if (width == 32)
7677                         sprintf(name_buffer, "M0x%x%s", msr_num, format == FORMAT_PERCENT ? "%" : "");
7678                 else
7679                         sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : "");
7680         }
7681
7682         if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0))
7683                 fail++;
7684
7685         if (fail) {
7686                 help();
7687                 exit(1);
7688         }
7689 }
7690
7691 int is_deferred_add(char *name)
7692 {
7693         int i;
7694
7695         for (i = 0; i < deferred_add_index; ++i)
7696                 if (!strcmp(name, deferred_add_names[i]))
7697                         return 1;
7698         return 0;
7699 }
7700
7701 int is_deferred_skip(char *name)
7702 {
7703         int i;
7704
7705         for (i = 0; i < deferred_skip_index; ++i)
7706                 if (!strcmp(name, deferred_skip_names[i]))
7707                         return 1;
7708         return 0;
7709 }
7710
7711 void probe_sysfs(void)
7712 {
7713         char path[64];
7714         char name_buf[16];
7715         FILE *input;
7716         int state;
7717         char *sp;
7718
7719         for (state = 10; state >= 0; --state) {
7720
7721                 sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
7722                 input = fopen(path, "r");
7723                 if (input == NULL)
7724                         continue;
7725                 if (!fgets(name_buf, sizeof(name_buf), input))
7726                         err(1, "%s: failed to read file", path);
7727
7728                 /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
7729                 sp = strchr(name_buf, '-');
7730                 if (!sp)
7731                         sp = strchrnul(name_buf, '\n');
7732                 *sp = '%';
7733                 *(sp + 1) = '\0';
7734
7735                 remove_underbar(name_buf);
7736
7737                 fclose(input);
7738
7739                 sprintf(path, "cpuidle/state%d/time", state);
7740
7741                 if (!DO_BIC(BIC_sysfs) && !is_deferred_add(name_buf))
7742                         continue;
7743
7744                 if (is_deferred_skip(name_buf))
7745                         continue;
7746
7747                 add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU);
7748         }
7749
7750         for (state = 10; state >= 0; --state) {
7751
7752                 sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
7753                 input = fopen(path, "r");
7754                 if (input == NULL)
7755                         continue;
7756                 if (!fgets(name_buf, sizeof(name_buf), input))
7757                         err(1, "%s: failed to read file", path);
7758                 /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
7759                 sp = strchr(name_buf, '-');
7760                 if (!sp)
7761                         sp = strchrnul(name_buf, '\n');
7762                 *sp = '\0';
7763                 fclose(input);
7764
7765                 remove_underbar(name_buf);
7766
7767                 sprintf(path, "cpuidle/state%d/usage", state);
7768
7769                 if (!DO_BIC(BIC_sysfs) && !is_deferred_add(name_buf))
7770                         continue;
7771
7772                 if (is_deferred_skip(name_buf))
7773                         continue;
7774
7775                 add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU);
7776         }
7777
7778 }
7779
7780 /*
7781  * parse cpuset with following syntax
7782  * 1,2,4..6,8-10 and set bits in cpu_subset
7783  */
7784 void parse_cpu_command(char *optarg)
7785 {
7786         if (!strcmp(optarg, "core")) {
7787                 if (cpu_subset)
7788                         goto error;
7789                 show_core_only++;
7790                 return;
7791         }
7792         if (!strcmp(optarg, "package")) {
7793                 if (cpu_subset)
7794                         goto error;
7795                 show_pkg_only++;
7796                 return;
7797         }
7798         if (show_core_only || show_pkg_only)
7799                 goto error;
7800
7801         cpu_subset = CPU_ALLOC(CPU_SUBSET_MAXCPUS);
7802         if (cpu_subset == NULL)
7803                 err(3, "CPU_ALLOC");
7804         cpu_subset_size = CPU_ALLOC_SIZE(CPU_SUBSET_MAXCPUS);
7805
7806         CPU_ZERO_S(cpu_subset_size, cpu_subset);
7807
7808         if (parse_cpu_str(optarg, cpu_subset, cpu_subset_size))
7809                 goto error;
7810
7811         return;
7812
7813 error:
7814         fprintf(stderr, "\"--cpu %s\" malformed\n", optarg);
7815         help();
7816         exit(-1);
7817 }
7818
7819 void cmdline(int argc, char **argv)
7820 {
7821         int opt;
7822         int option_index = 0;
7823         static struct option long_options[] = {
7824                 { "add", required_argument, 0, 'a' },
7825                 { "cpu", required_argument, 0, 'c' },
7826                 { "Dump", no_argument, 0, 'D' },
7827                 { "debug", no_argument, 0, 'd' },       /* internal, not documented */
7828                 { "enable", required_argument, 0, 'e' },
7829                 { "interval", required_argument, 0, 'i' },
7830                 { "IPC", no_argument, 0, 'I' },
7831                 { "num_iterations", required_argument, 0, 'n' },
7832                 { "header_iterations", required_argument, 0, 'N' },
7833                 { "help", no_argument, 0, 'h' },
7834                 { "hide", required_argument, 0, 'H' },  // meh, -h taken by --help
7835                 { "Joules", no_argument, 0, 'J' },
7836                 { "list", no_argument, 0, 'l' },
7837                 { "out", required_argument, 0, 'o' },
7838                 { "quiet", no_argument, 0, 'q' },
7839                 { "no-msr", no_argument, 0, 'M' },
7840                 { "no-perf", no_argument, 0, 'P' },
7841                 { "show", required_argument, 0, 's' },
7842                 { "Summary", no_argument, 0, 'S' },
7843                 { "TCC", required_argument, 0, 'T' },
7844                 { "version", no_argument, 0, 'v' },
7845                 { 0, 0, 0, 0 }
7846         };
7847
7848         progname = argv[0];
7849
7850         /*
7851          * Parse some options early, because they may make other options invalid,
7852          * like adding the MSR counter with --add and at the same time using --no-msr.
7853          */
7854         while ((opt = getopt_long_only(argc, argv, "MP", long_options, &option_index)) != -1) {
7855                 switch (opt) {
7856                 case 'M':
7857                         no_msr = 1;
7858                         break;
7859                 case 'P':
7860                         no_perf = 1;
7861                         break;
7862                 default:
7863                         break;
7864                 }
7865         }
7866         optind = 0;
7867
7868         while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qMST:v", long_options, &option_index)) != -1) {
7869                 switch (opt) {
7870                 case 'a':
7871                         parse_add_command(optarg);
7872                         break;
7873                 case 'c':
7874                         parse_cpu_command(optarg);
7875                         break;
7876                 case 'D':
7877                         dump_only++;
7878                         break;
7879                 case 'e':
7880                         /* --enable specified counter */
7881                         bic_enabled = bic_enabled | bic_lookup(optarg, SHOW_LIST);
7882                         break;
7883                 case 'd':
7884                         debug++;
7885                         ENABLE_BIC(BIC_DISABLED_BY_DEFAULT);
7886                         break;
7887                 case 'H':
7888                         /*
7889                          * --hide: do not show those specified
7890                          *  multiple invocations simply clear more bits in enabled mask
7891                          */
7892                         bic_enabled &= ~bic_lookup(optarg, HIDE_LIST);
7893                         break;
7894                 case 'h':
7895                 default:
7896                         help();
7897                         exit(1);
7898                 case 'i':
7899                         {
7900                                 double interval = strtod(optarg, NULL);
7901
7902                                 if (interval < 0.001) {
7903                                         fprintf(outf, "interval %f seconds is too small\n", interval);
7904                                         exit(2);
7905                                 }
7906
7907                                 interval_tv.tv_sec = interval_ts.tv_sec = interval;
7908                                 interval_tv.tv_usec = (interval - interval_tv.tv_sec) * 1000000;
7909                                 interval_ts.tv_nsec = (interval - interval_ts.tv_sec) * 1000000000;
7910                         }
7911                         break;
7912                 case 'J':
7913                         rapl_joules++;
7914                         break;
7915                 case 'l':
7916                         ENABLE_BIC(BIC_DISABLED_BY_DEFAULT);
7917                         list_header_only++;
7918                         quiet++;
7919                         break;
7920                 case 'o':
7921                         outf = fopen_or_die(optarg, "w");
7922                         break;
7923                 case 'q':
7924                         quiet = 1;
7925                         break;
7926                 case 'M':
7927                 case 'P':
7928                         /* Parsed earlier */
7929                         break;
7930                 case 'n':
7931                         num_iterations = strtod(optarg, NULL);
7932
7933                         if (num_iterations <= 0) {
7934                                 fprintf(outf, "iterations %d should be positive number\n", num_iterations);
7935                                 exit(2);
7936                         }
7937                         break;
7938                 case 'N':
7939                         header_iterations = strtod(optarg, NULL);
7940
7941                         if (header_iterations <= 0) {
7942                                 fprintf(outf, "iterations %d should be positive number\n", header_iterations);
7943                                 exit(2);
7944                         }
7945                         break;
7946                 case 's':
7947                         /*
7948                          * --show: show only those specified
7949                          *  The 1st invocation will clear and replace the enabled mask
7950                          *  subsequent invocations can add to it.
7951                          */
7952                         if (shown == 0)
7953                                 bic_enabled = bic_lookup(optarg, SHOW_LIST);
7954                         else
7955                                 bic_enabled |= bic_lookup(optarg, SHOW_LIST);
7956                         shown = 1;
7957                         break;
7958                 case 'S':
7959                         summary_only++;
7960                         break;
7961                 case 'T':
7962                         tj_max_override = atoi(optarg);
7963                         break;
7964                 case 'v':
7965                         print_version();
7966                         exit(0);
7967                         break;
7968                 }
7969         }
7970 }
7971
7972 void set_rlimit(void)
7973 {
7974         struct rlimit limit;
7975
7976         if (getrlimit(RLIMIT_NOFILE, &limit) < 0)
7977                 err(1, "Failed to get rlimit");
7978
7979         if (limit.rlim_max < MAX_NOFILE)
7980                 limit.rlim_max = MAX_NOFILE;
7981         if (limit.rlim_cur < MAX_NOFILE)
7982                 limit.rlim_cur = MAX_NOFILE;
7983
7984         if (setrlimit(RLIMIT_NOFILE, &limit) < 0)
7985                 err(1, "Failed to set rlimit");
7986 }
7987
7988 int main(int argc, char **argv)
7989 {
7990         int fd, ret;
7991
7992         fd = open("/sys/fs/cgroup/cgroup.procs", O_WRONLY);
7993         if (fd < 0)
7994                 goto skip_cgroup_setting;
7995
7996         ret = write(fd, "0\n", 2);
7997         if (ret == -1)
7998                 perror("Can't update cgroup\n");
7999
8000         close(fd);
8001
8002 skip_cgroup_setting:
8003         outf = stderr;
8004         cmdline(argc, argv);
8005
8006         if (!quiet) {
8007                 print_version();
8008                 print_bootcmd();
8009         }
8010
8011         probe_sysfs();
8012
8013         if (!getuid())
8014                 set_rlimit();
8015
8016         turbostat_init();
8017
8018         if (!no_msr)
8019                 msr_sum_record();
8020
8021         /* dump counters and exit */
8022         if (dump_only)
8023                 return get_and_dump_counters();
8024
8025         /* list header and exit */
8026         if (list_header_only) {
8027                 print_header(",");
8028                 flush_output_stdout();
8029                 return 0;
8030         }
8031
8032         /*
8033          * if any params left, it must be a command to fork
8034          */
8035         if (argc - optind)
8036                 return fork_it(argv + optind);
8037         else
8038                 turbostat_loop();
8039
8040         return 0;
8041 }