// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2024 Rivos Inc.
 */

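/*
 * Runtime probing of misaligned (unaligned) access performance.
 *
 * Each CPU times a word-granule copy and a byte-granule copy over
 * deliberately misaligned buffers and records the result in
 * misaligned_access_speed, which the hwprobe interface reports to userspace.
 * A static key is flipped on only while every online CPU measured as "fast".
 */
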
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/jump_label.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/types.h>
#include <asm/cpufeature.h>
#include <asm/hwprobe.h>

#include "copy-unaligned.h"

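/*
 * Measurement parameters: each timed loop runs for 2^MISALIGNED_ACCESS_JIFFIES_LG2
 * jiffies.  The 16KiB buffer is split in half between destination and source,
 * and MISALIGNED_COPY_SIZE leaves 0x80 bytes of slack so the copies stay inside
 * their halves even after the +1/+3 misalignment offsets applied below.
 */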
#define MISALIGNED_ACCESS_JIFFIES_LG2 1
#define MISALIGNED_BUFFER_SIZE 0x4000
#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)

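/*
 * Per-CPU result of the probe, one of the RISCV_HWPROBE_MISALIGNED_* values
 * (UNKNOWN, i.e. zero, until measured).  The hwprobe syscall exposes this to
 * userspace through the RISCV_HWPROBE_KEY_CPUPERF_0 key.
 */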
DEFINE_PER_CPU(long, misaligned_access_speed);

#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
static cpumask_t fast_misaligned_access;
static int check_unaligned_access(void *param)
{
        int cpu = smp_processor_id();
        u64 start_cycles, end_cycles;
        u64 word_cycles;
        u64 byte_cycles;
        int ratio;
        unsigned long start_jiffies, now;
        struct page *page = param;
        void *dst;
        void *src;
        long speed = RISCV_HWPROBE_MISALIGNED_SLOW;

        if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
                return 0;

        /* Make an unaligned destination buffer. */
        dst = (void *)((unsigned long)page_address(page) | 0x1);
        /* Unalign src as well, but differently (off by 1 + 2 = 3). */
        src = dst + (MISALIGNED_BUFFER_SIZE / 2);
        src += 2;
        word_cycles = -1ULL;
        /* Do a warmup. */
        __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
        preempt_disable();
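        /*
         * Spin until jiffies ticks over so the timed window below starts on a
         * jiffy boundary and covers the full measurement interval.
         */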
        start_jiffies = jiffies;
        while ((now = jiffies) == start_jiffies)
                cpu_relax();

        /*
         * For a fixed amount of time, repeatedly try the function, and take
         * the best time in cycles as the measurement.
         */
        while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
                start_cycles = get_cycles64();
                /* Ensure the CSR read can't reorder WRT to the copy. */
                mb();
                __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
                /* Ensure the copy ends before the end time is snapped. */
                mb();
                end_cycles = get_cycles64();
                if ((end_cycles - start_cycles) < word_cycles)
                        word_cycles = end_cycles - start_cycles;
        }

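        /* Repeat the experiment with the byte-at-a-time copy routine. */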
        byte_cycles = -1ULL;
        __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
        start_jiffies = jiffies;
        while ((now = jiffies) == start_jiffies)
                cpu_relax();

        while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
                start_cycles = get_cycles64();
                mb();
                __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
                mb();
                end_cycles = get_cycles64();
                if ((end_cycles - start_cycles) < byte_cycles)
                        byte_cycles = end_cycles - start_cycles;
        }

        preempt_enable();

        /* Don't divide by zero. */
        if (!word_cycles || !byte_cycles) {
                pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
                        cpu);

                return 0;
        }

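        /*
         * The CPU is "fast" if a misaligned word copy beats doing the same
         * work one byte at a time; otherwise report it as "slow".
         */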
        if (word_cycles < byte_cycles)
                speed = RISCV_HWPROBE_MISALIGNED_FAST;

        ratio = div_u64((byte_cycles * 100), word_cycles);
        pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
                cpu,
                ratio / 100,
                ratio % 100,
                (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow");

        per_cpu(misaligned_access_speed, cpu) = speed;

        /*
         * Set the value of fast_misaligned_access of a CPU. These operations
         * are atomic to avoid race conditions.
         */
        if (speed == RISCV_HWPROBE_MISALIGNED_FAST)
                cpumask_set_cpu(cpu, &fast_misaligned_access);
        else
                cpumask_clear_cpu(cpu, &fast_misaligned_access);

        return 0;
}

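/*
 * on_each_cpu() helper: probe every CPU except the boot CPU, which stays
 * behind to keep jiffies advancing while the others measure.
 */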
static void check_unaligned_access_nonboot_cpu(void *param)
{
        unsigned int cpu = smp_processor_id();
        struct page **pages = param;

        if (smp_processor_id() != 0)
                check_unaligned_access(pages[cpu]);
}

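/*
 * Enabled only while every online CPU has measured as "fast"; consumed via
 * static_branch_likely() by callers such as has_fast_unaligned_accesses().
 */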
DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);

static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
{
        if (cpumask_weight(mask) == weight)
                static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
        else
                static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
}

static void set_unaligned_access_static_branches_except_cpu(int cpu)
{
        /*
         * Same as set_unaligned_access_static_branches, except excludes the
         * given CPU from the result. When a CPU is hotplugged into an offline
         * state, this function is called before the CPU is set to offline in
         * the cpumask, and thus the CPU needs to be explicitly excluded.
         */

        cpumask_t fast_except_me;

        cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
        cpumask_clear_cpu(cpu, &fast_except_me);

        modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
}

static void set_unaligned_access_static_branches(void)
{
        /*
         * This will be called after check_unaligned_access_all_cpus so the
         * result of unaligned access speed for all CPUs will be available.
         *
         * To avoid the number of online cpus changing between reading
         * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
         * held before calling this function.
         */

        cpumask_t fast_and_online;

        cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);

        modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
}

static int lock_and_set_unaligned_access_static_branch(void)
{
        cpus_read_lock();
        set_unaligned_access_static_branches();
        cpus_read_unlock();

        return 0;
}

arch_initcall_sync(lock_and_set_unaligned_access_static_branch);

static int riscv_online_cpu(unsigned int cpu)
{
        static struct page *buf;

        /* We are already set since the last check */
        if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
                goto exit;

        buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
        if (!buf) {
                pr_warn("Allocation failure, not measuring misaligned performance\n");
                return -ENOMEM;
        }

        check_unaligned_access(buf);
        __free_pages(buf, MISALIGNED_BUFFER_ORDER);

exit:
        set_unaligned_access_static_branches();

        return 0;
}

static int riscv_offline_cpu(unsigned int cpu)
{
        set_unaligned_access_static_branches_except_cpu(cpu);

        return 0;
}

/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static int check_unaligned_access_speed_all_cpus(void)
{
        unsigned int cpu;
        unsigned int cpu_count = num_possible_cpus();
        struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);

        if (!bufs) {
                pr_warn("Allocation failure, not measuring misaligned performance\n");
                return 0;
        }

        /*
         * Allocate separate buffers for each CPU so there's no fighting over
         * cache lines.
         */
        for_each_cpu(cpu, cpu_online_mask) {
                bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
                if (!bufs[cpu]) {
                        pr_warn("Allocation failure, not measuring misaligned performance\n");
                        goto out;
                }
        }

        /* Check everybody except 0, who stays behind to tend jiffies. */
        on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);

        /* Check core 0. */
        smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);

        /*
         * Setup hotplug callbacks for any new CPUs that come online or go
         * offline.
         */
        cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
                                  riscv_online_cpu, riscv_offline_cpu);

out:
        for_each_cpu(cpu, cpu_online_mask) {
                if (bufs[cpu])
                        __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
        }

        kfree(bufs);
        return 0;
}

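/*
 * Entry point, run at arch_initcall time: if the trap-based probe (in
 * traps_misaligned.c) already found that misaligned accesses are handled via
 * emulation on every CPU, skip the timing run entirely.
 */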
static int check_unaligned_access_all_cpus(void)
{
        bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus();

        if (!all_cpus_emulated)
                return check_unaligned_access_speed_all_cpus();

        return 0;
}
#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
static int check_unaligned_access_all_cpus(void)
{
        check_unaligned_access_emulated_all_cpus();

        return 0;
}
#endif

arch_initcall(check_unaligned_access_all_cpus);
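
/*
 * A minimal userspace sketch (not part of this file) of how the probed value
 * can be read back through the riscv_hwprobe() syscall.  Illustrative only,
 * assuming installed kernel uapi headers provide <asm/hwprobe.h> and
 * __NR_riscv_hwprobe:
 *
 *	#include <asm/hwprobe.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };
 *
 *		// cpusetsize == 0 and cpus == NULL query all online CPUs.
 *		if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0))
 *			return 1;
 *		printf("misaligned perf: %llu\n",
 *		       (unsigned long long)(pair.value & RISCV_HWPROBE_MISALIGNED_MASK));
 *		return 0;
 *	}
 */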