1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
|
// SPDX-License-Identifier: GPL-2.0+
// Copyright 2017 IBM Corp.
#include <linux/sched/mm.h>
#include <linux/mutex.h>
#include <linux/mmu_context.h>
#include <asm/copro.h>
#include <asm/pnv-ocxl.h>
#include "ocxl_internal.h"
/*
 * A process element (PE) handle is the PASID masked with SPA_PE_MASK;
 * it is used as the index of the PE inside the Shared Process Area
 * (SPA) and as the key of the per-link radix tree of PE data.
 */
#define SPA_PASID_BITS 15
#define SPA_PASID_MAX ((1 << SPA_PASID_BITS) - 1)
#define SPA_PE_MASK SPA_PASID_MAX
#define SPA_SPA_SIZE_LOG 22 /* log2 of the SPA size: each SPA is 4 MB */
/*
 * Flags of the config_state field of a process element, as built by
 * calculate_cfg_state(). The (63-n) shifts number the bits starting
 * from the most significant bit of the 64-bit word.
 */
#define SPA_CFG_SF (1ull << (63-0)) /* set for 64-bit contexts */
#define SPA_CFG_TA (1ull << (63-1)) /* not used by calculate_cfg_state() */
#define SPA_CFG_HV (1ull << (63-3)) /* always set by calculate_cfg_state() */
#define SPA_CFG_UV (1ull << (63-4)) /* not used by calculate_cfg_state() */
#define SPA_CFG_XLAT_hpt (0ull << (63-6)) /* Hashed page table (HPT) mode */
#define SPA_CFG_XLAT_roh (2ull << (63-6)) /* Radix on HPT mode */
#define SPA_CFG_XLAT_ror (3ull << (63-6)) /* Radix on Radix mode */
#define SPA_CFG_PR (1ull << (63-49)) /* set for non-kernel contexts */
#define SPA_CFG_TC (1ull << (63-54)) /* mirrors LPCR_TC of the LPCR register */
#define SPA_CFG_DR (1ull << (63-59)) /* always set by calculate_cfg_state() */
/* Bits tested in the dsisr value read when a translation interrupt fires */
#define SPA_XSL_TF (1ull << (63-3)) /* Translation fault */
#define SPA_XSL_S (1ull << (63-38)) /* Store operation */
/* Written (big-endian) to a PE's software_state to mark the entry in use */
#define SPA_PE_VALID 0x80000000
/*
 * Per process element bookkeeping. One instance is stored in the SPA
 * radix tree under the PE handle when a PE is added, and it is released
 * with kfree_rcu() when the PE is removed, since the fault interrupt
 * handler reads it under rcu_read_lock().
 */
struct pe_data {
/* memory context the translation faults of this PE are resolved against */
struct mm_struct *mm;
/* callback to trigger when a translation fault occurs */
void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr);
/* opaque pointer to be passed to the above callback */
void *xsl_err_data;
/* used by kfree_rcu() to defer the release of this structure */
struct rcu_head rcu;
};
/*
 * State of the Shared Process Area (SPA) of a link: the table of
 * process elements itself, the lookup structure for the matching
 * software data, and everything needed to service the translation
 * fault interrupt.
 */
struct spa {
/* array of process elements, indexed by PE handle */
struct ocxl_process_element *spa_mem;
/* page allocation order of spa_mem, needed to free it */
int spa_order;
/* serializes the addition and removal of process elements */
struct mutex spa_lock;
struct radix_tree_root pe_tree; /* Maps PE handles to pe_data */
/* name given to the translation fault interrupt (allocated string) */
char *irq_name;
/* Linux interrupt number of the translation fault interrupt */
int virq;
/* mapped XSL registers: fault status, fault address, ack, PE handle */
void __iomem *reg_dsisr;
void __iomem *reg_dar;
void __iomem *reg_tfc;
void __iomem *reg_pe_handle;
/*
 * The following fields are used by the memory fault
 * interrupt handler. We can only have one interrupt at a
 * time. The NPU won't raise another interrupt until the
 * previous one has been ack'd by writing to the TFC register
 */
struct xsl_fault {
/* bottom half which resolves the fault (xsl_fault_handler_bh) */
struct work_struct fault_work;
/* handle of the faulting process element */
u64 pe;
/* values of the dsisr and dar registers read in the interrupt handler */
u64 dsisr;
u64 dar;
/* copy of the PE data taken in the interrupt handler */
struct pe_data pe_data;
} xsl_fault;
};
/*
 * An opencapi link can be used by several PCI functions. We have
 * one link per device slot.
 *
 * A linked list of opencapi links should suffice, as there's a
 * limited number of opencapi slots on a system and lookup is only
 * done when the device is probed
 */
struct link {
/* entry in links_list */
struct list_head list;
/* reference count: one reference per successful ocxl_link_setup() */
struct kref ref;
/* PCI domain, bus number and slot identifying the link */
int domain;
int bus;
int dev;
/* number of interrupts which can still be allocated on this link */
atomic_t irq_available;
/* Shared Process Area of the link */
struct spa *spa;
/* opaque data returned by pnv_ocxl_spa_setup() for the platform code */
void *platform_data;
};
/* All the links currently set up; walked and modified under links_list_lock */
static struct list_head links_list = LIST_HEAD_INIT(links_list);
static DEFINE_MUTEX(links_list_lock);
/*
 * Possible answers to a translation fault interrupt, see ack_irq().
 * Only ADDRESS_ERROR and RESTART are accepted by ack_irq(); CONTINUE
 * triggers a warning there and nothing is written to the hardware.
 */
enum xsl_response {
CONTINUE,
ADDRESS_ERROR,
RESTART,
};
/*
 * Read the registers describing the pending translation fault: the
 * fault status (dsisr), the fault address (dar) and the handle of the
 * process element which faulted. The handle is masked with SPA_PE_MASK
 * so that it is always a valid index in the SPA.
 */
static void read_irq(struct spa *spa, u64 *dsisr, u64 *dar, u64 *pe)
{
	*dsisr = in_be64(spa->reg_dsisr);
	*dar = in_be64(spa->reg_dar);
	*pe = in_be64(spa->reg_pe_handle) & SPA_PE_MASK;
}
/*
 * Acknowledge the pending translation fault by writing the response to
 * the TFC register. Any response other than RESTART or ADDRESS_ERROR
 * only triggers a warning and leaves the register untouched.
 */
static void ack_irq(struct spa *spa, enum xsl_response r)
{
	switch (r) {
	case RESTART:
		out_be64(spa->reg_tfc, PPC_BIT(31));
		break;
	case ADDRESS_ERROR:
		out_be64(spa->reg_tfc, PPC_BIT(30));
		break;
	default:
		/* continue is not supported */
		WARN(1, "Invalid irq response %d\n", r);
		break;
	}
}
static void xsl_fault_handler_bh(struct work_struct *fault_work)
{
unsigned int flt = 0;
unsigned long access, flags, inv_flags = 0;
enum xsl_response r;
struct xsl_fault *fault = container_of(fault_work, struct xsl_fault,
fault_work);
struct spa *spa = container_of(fault, struct spa, xsl_fault);
int rc;
/*
* We need to release a reference on the mm whenever exiting this
* function (taken in the memory fault interrupt handler)
*/
rc = copro_handle_mm_fault(fault->pe_data.mm, fault->dar, fault->dsisr,
&flt);
if (rc) {
pr_debug("copro_handle_mm_fault failed: %d\n", rc);
if (fault->pe_data.xsl_err_cb) {
fault->pe_data.xsl_err_cb(
fault->pe_data.xsl_err_data,
fault->dar, fault->dsisr);
}
r = ADDRESS_ERROR;
goto ack;
}
if (!radix_enabled()) {
/*
* update_mmu_cache() will not have loaded the hash
* since current->trap is not a 0x400 or 0x300, so
* just call hash_page_mm() here.
*/
access = _PAGE_PRESENT | _PAGE_READ;
if (fault->dsisr & SPA_XSL_S)
access |= _PAGE_WRITE;
if (REGION_ID(fault->dar) != USER_REGION_ID)
access |= _PAGE_PRIVILEGED;
local_irq_save(flags);
hash_page_mm(fault->pe_data.mm, fault->dar, access, 0x300,
inv_flags);
local_irq_restore(flags);
}
r = RESTART;
ack:
mmdrop(fault->pe_data.mm);
ack_irq(spa, r);
}
/*
 * Interrupt handler for translation faults. It reads the fault
 * registers, looks up the data of the faulting process element and
 * defers the resolution of the fault to xsl_fault_handler_bh().
 *
 * When the fault cannot be handled, the interrupt is acknowledged
 * immediately with an address error. The interrupt is always reported
 * as handled.
 */
static irqreturn_t xsl_fault_handler(int irq, void *data)
{
	struct link *link = data;
	struct spa *spa = link->spa;
	struct ocxl_process_element *pe;
	struct pe_data *pe_data;
	u64 dsisr, dar, pe_handle;
	int lpid, pid, tid;

	read_irq(spa, &dsisr, &dar, &pe_handle);

	WARN_ON(pe_handle > SPA_PE_MASK);
	pe = &spa->spa_mem[pe_handle];
	lpid = be32_to_cpu(pe->lpid);
	pid = be32_to_cpu(pe->pid);
	tid = be32_to_cpu(pe->tid);
	/*
	 * All the values read above may be null if the PE is being
	 * removed while the interrupt fires. That is not expected when
	 * the driver asked the AFU to terminate the PASID and the AFU
	 * waited for its pending operations before acknowledging. If
	 * it happens anyway, no memory context is found below and we
	 * fail silently, which is fine.
	 */
	if (!(dsisr & SPA_XSL_TF)) {
		WARN(1, "Invalid xsl interrupt fault register %#llx\n", dsisr);
		goto address_error;
	}

	rcu_read_lock();
	pe_data = radix_tree_lookup(&spa->pe_tree, pe_handle);
	if (!pe_data) {
		/*
		 * The PE was removed without the driver notifying the
		 * AFU about the PASID termination, or the AFU didn't
		 * wait for all its memory accesses to complete.
		 *
		 * Fail early but don't log an error: the scenario is
		 * unexpected, yet valid.
		 */
		rcu_read_unlock();
		pr_debug("Unknown mm context for xsl interrupt\n");
		goto address_error;
	}
	WARN_ON(pe_data->mm->context.id != pid);

	/* Save everything the bottom half needs to resolve the fault */
	spa->xsl_fault.pe = pe_handle;
	spa->xsl_fault.dar = dar;
	spa->xsl_fault.dsisr = dsisr;
	spa->xsl_fault.pe_data = *pe_data;
	mmgrab(pe_data->mm); /* mm count is released by bottom half */
	rcu_read_unlock();

	schedule_work(&spa->xsl_fault.fault_work);
	return IRQ_HANDLED;

address_error:
	ack_irq(spa, ADDRESS_ERROR);
	return IRQ_HANDLED;
}
/* Unmap the four XSL registers mapped by map_irq_registers() */
static void unmap_irq_registers(struct spa *spa)
{
	pnv_ocxl_unmap_xsl_regs(spa->reg_dsisr,
				spa->reg_dar,
				spa->reg_tfc,
				spa->reg_pe_handle);
}
/*
 * Map the XSL registers used to handle translation faults and store
 * their addresses in the SPA. Returns the result of the platform call.
 */
static int map_irq_registers(struct pci_dev *dev, struct spa *spa)
{
	int rc;

	rc = pnv_ocxl_map_xsl_regs(dev,
				   &spa->reg_dsisr,
				   &spa->reg_dar,
				   &spa->reg_tfc,
				   &spa->reg_pe_handle);
	return rc;
}
/*
 * Set up the translation fault interrupt of a link: find the hardware
 * interrupt, map the XSL registers, allocate the interrupt name, create
 * the interrupt mapping and install xsl_fault_handler().
 *
 * Returns 0 on success. On failure, everything acquired so far is
 * released and a negative error code is returned: the error of the
 * platform calls as is, -ENOMEM if the name cannot be allocated,
 * -EINVAL if the mapping or the interrupt request fails.
 */
static int setup_xsl_irq(struct pci_dev *dev, struct link *link)
{
	struct spa *spa = link->spa;
	int rc;
	int hwirq;

	rc = pnv_ocxl_get_xsl_irq(dev, &hwirq);
	if (rc)
		return rc;

	rc = map_irq_registers(dev, spa);
	if (rc)
		return rc;

	spa->irq_name = kasprintf(GFP_KERNEL, "ocxl-xsl-%x-%x-%x",
				link->domain, link->bus, link->dev);
	if (!spa->irq_name) {
		dev_err(&dev->dev, "Can't allocate name for xsl interrupt\n");
		rc = -ENOMEM;
		goto err_xsl;
	}
	/*
	 * At some point, we'll need to look into allowing a higher
	 * number of interrupts. Could we have an IRQ domain per link?
	 */
	spa->virq = irq_create_mapping(NULL, hwirq);
	if (!spa->virq) {
		dev_err(&dev->dev,
			"irq_create_mapping failed for translation interrupt\n");
		rc = -EINVAL;
		goto err_name;
	}

	dev_dbg(&dev->dev, "hwirq %d mapped to virq %d\n", hwirq, spa->virq);

	rc = request_irq(spa->virq, xsl_fault_handler, 0, spa->irq_name,
			link);
	if (rc) {
		dev_err(&dev->dev,
			"request_irq failed for translation interrupt: %d\n",
			rc);
		rc = -EINVAL;
		goto err_mapping;
	}
	return 0;

	/* Unwind in the reverse order of the acquisitions */
err_mapping:
	irq_dispose_mapping(spa->virq);
err_name:
	kfree(spa->irq_name);
err_xsl:
	unmap_irq_registers(spa);
	return rc;
}
/*
 * Undo setup_xsl_irq(): free the interrupt and its mapping if one was
 * created, then release the interrupt name and unmap the XSL registers.
 */
static void release_xsl_irq(struct link *link)
{
	struct spa *spa = link->spa;
	int virq = spa->virq;

	if (virq) {
		free_irq(virq, link);
		irq_dispose_mapping(virq);
	}

	kfree(spa->irq_name);
	unmap_irq_registers(spa);
}
/*
 * Allocate and initialize the SPA structure of a link, together with
 * the zeroed pages backing the Shared Process Area itself.
 *
 * On success, the structure is attached to link->spa and 0 is returned.
 * Returns -ENOMEM if either allocation fails.
 */
static int alloc_spa(struct pci_dev *dev, struct link *link)
{
	struct spa *spa = kzalloc(sizeof(*spa), GFP_KERNEL);
	unsigned long pages;

	if (!spa)
		return -ENOMEM;

	mutex_init(&spa->spa_lock);
	INIT_RADIX_TREE(&spa->pe_tree, GFP_KERNEL);
	INIT_WORK(&spa->xsl_fault.fault_work, xsl_fault_handler_bh);

	/* The size of the SPA is fixed, convert it to a page order */
	spa->spa_order = SPA_SPA_SIZE_LOG - PAGE_SHIFT;
	pages = __get_free_pages(GFP_KERNEL | __GFP_ZERO, spa->spa_order);
	spa->spa_mem = (struct ocxl_process_element *) pages;
	if (!spa->spa_mem) {
		dev_err(&dev->dev, "Can't allocate Shared Process Area\n");
		kfree(spa);
		return -ENOMEM;
	}

	pr_debug("Allocated SPA for %x:%x:%x at %p\n", link->domain, link->bus,
		link->dev, spa->spa_mem);

	link->spa = spa;
	return 0;
}
/*
 * Release the Shared Process Area of a link and the structure
 * describing it, then clear link->spa. Nothing is released if the link
 * has no SPA, or if the SPA has no memory attached.
 */
static void free_spa(struct link *link)
{
	struct spa *spa = link->spa;

	pr_debug("Freeing SPA for %x:%x:%x\n", link->domain, link->bus,
		link->dev);

	if (!spa || !spa->spa_mem)
		return;

	free_pages((unsigned long) spa->spa_mem, spa->spa_order);
	kfree(spa);
	link->spa = NULL;
}
/*
 * Create the link of the slot hosting the PCI function 'dev': allocate
 * the link and its SPA, set up the translation fault interrupt and let
 * the platform code configure the SPA.
 *
 * On success, 0 is returned and the new link, holding one reference, is
 * stored in *out_link. On failure, a negative error code is returned
 * and everything allocated here is released.
 */
static int alloc_link(struct pci_dev *dev, int PE_mask, struct link **out_link)
{
	struct link *new_link;
	int rc;

	new_link = kzalloc(sizeof(*new_link), GFP_KERNEL);
	if (!new_link)
		return -ENOMEM;

	kref_init(&new_link->ref);
	new_link->domain = pci_domain_nr(dev->bus);
	new_link->bus = dev->bus->number;
	new_link->dev = PCI_SLOT(dev->devfn);
	atomic_set(&new_link->irq_available, MAX_IRQ_PER_LINK);

	rc = alloc_spa(dev, new_link);
	if (rc)
		goto out_free_link;

	rc = setup_xsl_irq(dev, new_link);
	if (rc)
		goto out_free_spa;

	/* platform specific hook */
	rc = pnv_ocxl_spa_setup(dev, new_link->spa->spa_mem, PE_mask,
				&new_link->platform_data);
	if (rc)
		goto out_release_irq;

	*out_link = new_link;
	return 0;

out_release_irq:
	release_xsl_irq(new_link);
out_free_spa:
	free_spa(new_link);
out_free_link:
	kfree(new_link);
	return rc;
}
/*
 * Release a link: its translation fault interrupt first, then its SPA,
 * and finally the link structure itself.
 */
static void free_link(struct link *link)
{
	release_xsl_irq(link);

	free_spa(link);

	kfree(link);
}
/*
 * Get the link of the slot hosting the PCI function 'dev'.
 *
 * If a link already exists for the same PCI domain, bus and slot, a
 * reference is taken on it. Otherwise a new link is allocated and added
 * to the list of links. The lookup and the creation are both done
 * under links_list_lock.
 *
 * Returns 0 and stores the link in *link_handle on success, or the
 * error code of alloc_link() if a new link could not be created.
 */
int ocxl_link_setup(struct pci_dev *dev, int PE_mask, void **link_handle)
{
	struct link *link;
	int rc = 0;

	mutex_lock(&links_list_lock);

	list_for_each_entry(link, &links_list, list) {
		if (link->domain != pci_domain_nr(dev->bus))
			continue;
		if (link->bus != dev->bus->number)
			continue;
		if (link->dev != PCI_SLOT(dev->devfn))
			continue;

		/* The functions of a device all share the same link */
		kref_get(&link->ref);
		*link_handle = link;
		goto unlock;
	}

	rc = alloc_link(dev, PE_mask, &link);
	if (!rc) {
		list_add(&link->list, &links_list);
		*link_handle = link;
	}

unlock:
	mutex_unlock(&links_list_lock);
	return rc;
}
/*
 * kref release callback of a link, called when its last reference is
 * dropped: unlink it from the list of links and free it.
 */
static void release_xsl(struct kref *ref)
{
	struct link *dead_link = container_of(ref, struct link, ref);

	list_del(&dead_link->list);

	/* call platform code before releasing data */
	pnv_ocxl_spa_release(dead_link->platform_data);

	free_link(dead_link);
}
/*
 * Drop a reference on a link obtained with ocxl_link_setup(). The link
 * is released when its last reference goes away. The reference is
 * dropped under links_list_lock, since the release removes the link
 * from the list of links.
 */
void ocxl_link_release(struct pci_dev *dev, void *link_handle)
{
	struct link *link = link_handle;

	mutex_lock(&links_list_lock);

	kref_put(&link->ref, release_xsl);

	mutex_unlock(&links_list_lock);
}
/*
 * Build the config_state value of a process element.
 *
 * DR and HV are always set. TC follows the LPCR_TC bit of the LPCR
 * register, and the translation mode follows radix_enabled(). A kernel
 * context is 64-bit if MSR_SF is set in the MSR; any other context gets
 * the PR flag and is 64-bit unless the current task has TIF_32BIT set.
 */
static u64 calculate_cfg_state(bool kernel)
{
	u64 cfg = SPA_CFG_DR;
	bool is_64bit;

	if (mfspr(SPRN_LPCR) & LPCR_TC)
		cfg |= SPA_CFG_TC;

	cfg |= radix_enabled() ? SPA_CFG_XLAT_ror : SPA_CFG_XLAT_hpt;
	cfg |= SPA_CFG_HV;

	if (kernel) {
		is_64bit = !!(mfmsr() & MSR_SF);
	} else {
		cfg |= SPA_CFG_PR;
		is_64bit = !test_tsk_thread_flag(current, TIF_32BIT);
	}

	if (is_64bit)
		cfg |= SPA_CFG_SF;

	return cfg;
}
/*
 * Add a process element (PE) to the Shared Process Area of a link.
 *
 * @link_handle: link returned by ocxl_link_setup()
 * @pasid: PASID of the context; its handle is the index of the PE
 * @pidr, @tidr, @amr: values written to the pid, tid and amr fields of
 *	the PE. A null pidr makes the PE a kernel context.
 * @mm: memory context of the PE. A reference is taken on its mm_count
 *	and kept until the PE is removed.
 * @xsl_err_cb, @xsl_err_data: callback, and its opaque argument,
 *	triggered when a translation fault cannot be resolved
 *
 * Returns 0 on success, -EINVAL if the PASID is too large, -EBUSY if
 * the PE is already in use, -ENOMEM if the PE data cannot be allocated,
 * or the error returned by radix_tree_insert().
 */
int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
		u64 amr, struct mm_struct *mm,
		void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr),
		void *xsl_err_data)
{
	struct link *link = (struct link *) link_handle;
	struct spa *spa = link->spa;
	struct ocxl_process_element *pe;
	int pe_handle, rc = 0;
	struct pe_data *pe_data;

	BUILD_BUG_ON(sizeof(struct ocxl_process_element) != 128);
	if (pasid > SPA_PASID_MAX)
		return -EINVAL;

	mutex_lock(&spa->spa_lock);
	pe_handle = pasid & SPA_PE_MASK;
	pe = spa->spa_mem + pe_handle;

	if (pe->software_state) {
		rc = -EBUSY;
		goto unlock;
	}

	pe_data = kmalloc(sizeof(*pe_data), GFP_KERNEL);
	if (!pe_data) {
		rc = -ENOMEM;
		goto unlock;
	}

	pe_data->mm = mm;
	pe_data->xsl_err_cb = xsl_err_cb;
	pe_data->xsl_err_data = xsl_err_data;

	/*
	 * Register the PE data before making the PE valid: the
	 * insertion can fail, and failing here leaves nothing to undo
	 * in the SPA. Otherwise we would end up with a valid PE whose
	 * faults cannot be handled and with leaked PE data.
	 */
	rc = radix_tree_insert(&spa->pe_tree, pe_handle, pe_data);
	if (rc) {
		/* pe_data was never published, no need to wait for RCU */
		kfree(pe_data);
		goto unlock;
	}

	memset(pe, 0, sizeof(struct ocxl_process_element));
	pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0));
	pe->lpid = cpu_to_be32(mfspr(SPRN_LPID));
	pe->pid = cpu_to_be32(pidr);
	pe->tid = cpu_to_be32(tidr);
	pe->amr = cpu_to_be64(amr);
	pe->software_state = cpu_to_be32(SPA_PE_VALID);

	mm_context_add_copro(mm);
	/*
	 * Barrier is to make sure PE is visible in the SPA before it
	 * is used by the device. It also helps with the global TLBI
	 * invalidation
	 */
	mb();

	/*
	 * The mm must stay valid for as long as the device uses it. We
	 * lower the count when the context is removed from the SPA.
	 *
	 * We grab mm_count (and not mm_users), as we don't want to
	 * end up in a circular dependency if a process mmaps its
	 * mmio, therefore incrementing the file ref count when
	 * calling mmap(), and forgets to unmap before exiting. In
	 * that scenario, when the kernel handles the death of the
	 * process, the file is not cleaned because unmap was not
	 * called, and the mm wouldn't be freed because we would still
	 * have a reference on mm_users. Incrementing mm_count solves
	 * the problem.
	 */
	mmgrab(mm);

unlock:
	mutex_unlock(&spa->spa_lock);
	return rc;
}
/*
 * Remove a process element (PE) from the Shared Process Area of a link
 * and release the data attached to it.
 *
 * Returns -EINVAL if the PASID is too large or if the PE is not valid,
 * otherwise the result of pnv_ocxl_spa_remove_pe(). The PE data is
 * released even if the platform call fails.
 */
int ocxl_link_remove_pe(void *link_handle, int pasid)
{
	struct link *link = link_handle;
	struct spa *spa = link->spa;
	struct ocxl_process_element *pe;
	struct pe_data *pe_data;
	int pe_handle, rc;

	if (pasid > SPA_PASID_MAX)
		return -EINVAL;

	/*
	 * Synchronization with the memory fault handler:
	 *
	 * The driver is supposed to notify the AFU before removing
	 * the PE, and the AFU is supposed to clean up and make sure
	 * the PASID is no longer in use, pending interrupts included.
	 * We cannot be sure that it happened, though.
	 *
	 * The PE is cleared and the context is removed from the radix
	 * tree. Any later interrupt for that context fails silently,
	 * which is fine: it is unexpected, but possible if the driver
	 * or the AFU misbehaved.
	 *
	 * A bottom half may still be running. There's no need to wait
	 * for it or flush it, since it manages a reference count on
	 * the mm it read from the radix tree.
	 */
	pe_handle = pasid & SPA_PE_MASK;
	pe = &spa->spa_mem[pe_handle];

	mutex_lock(&spa->spa_lock);

	if (!(be32_to_cpu(pe->software_state) & SPA_PE_VALID)) {
		rc = -EINVAL;
		goto unlock;
	}

	memset(pe, 0, sizeof(*pe));
	/*
	 * Make sure the PE is gone from the SPA before the NPU context
	 * cache is cleared below, or the old PE could be reloaded
	 * erroneously.
	 */
	mb();

	/*
	 * Platform hook: on powerpc, the entry must be cleared from
	 * the context cache of the NPU.
	 */
	rc = pnv_ocxl_spa_remove_pe(link->platform_data, pe_handle);
	WARN_ON(rc);

	pe_data = radix_tree_delete(&spa->pe_tree, pe_handle);
	if (pe_data) {
		mm_context_remove_copro(pe_data->mm);
		mmdrop(pe_data->mm);
		kfree_rcu(pe_data, rcu);
	} else {
		WARN(1, "Couldn't find pe data when removing PE\n");
	}

unlock:
	mutex_unlock(&spa->spa_lock);
	return rc;
}
/*
 * Allocate an interrupt on a link.
 *
 * The number of interrupts of a link is limited: -ENOSPC is returned
 * when none is available. If the platform allocation fails, the
 * interrupt is given back to the link and the platform error is
 * returned. On success, 0 is returned and the hardware interrupt number
 * and its trigger address are stored in *hw_irq and *trigger_addr.
 */
int ocxl_link_irq_alloc(void *link_handle, int *hw_irq, u64 *trigger_addr)
{
	struct link *link = link_handle;
	u64 addr;
	int irq, rc;

	if (atomic_dec_if_positive(&link->irq_available) < 0)
		return -ENOSPC;

	rc = pnv_ocxl_alloc_xive_irq(&irq, &addr);
	if (!rc) {
		*hw_irq = irq;
		*trigger_addr = addr;
		return 0;
	}

	/* Give the interrupt we reserved back to the link */
	atomic_inc(&link->irq_available);
	return rc;
}
/*
 * Free an interrupt allocated with ocxl_link_irq_alloc() and make it
 * available again on the link.
 */
void ocxl_link_free_irq(void *link_handle, int hw_irq)
{
	struct link *link = link_handle;

	pnv_ocxl_free_xive_irq(hw_irq);

	atomic_inc(&link->irq_available);
}
|