patches.xen/xen-blkfront-multi-page-ring
1 From: Ben Howard <behoward@amazon.com>
2 Subject: blkfront: allow using a multi-page backend communication ring
3 Patch-mainline: n/a
4
5 [jb: adjust xenstore interface to match backend, some other cleanup]
6 [jb: fix ring cleanup]
7 [jb: fix negotiation]
8 [jb: use individually allocated pages and vmap() for ring]
9 [jb: re-work resume to (hopefully) cope with ring size shrinking across suspend/resume]
10 Acked-by: Jan Beulich <jbeulich@suse.com>
11
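The mechanism, in brief: blkfront negotiates the ring size with the backend over
xenstore by reading the backend's advertised max-ring-pages / max-ring-page-order,
clamping the result to its own BLK_MAX_RING_PAGE_ORDER, allocating that many
individual pages, vmap()ing them into one virtually contiguous shared ring,
granting each page, and publishing ring-page-order, num-ring-pages and
ring-ref0..ring-ref<N-1> in place of the old single ring-ref. The sketch below
condenses that flow purely for illustration; it is not part of the patch, the
function name is made up, and the transaction handling, num-ring-pages key,
single-page fallback and error cleanup of the real code are omitted. It assumes
blkfront.c's headers (<linux/vmalloc.h>, <xen/xenbus.h>, ...) and the
blkfront_info fields added by the hunks that follow; xenbus_multi_grant_ring()
is the helper from this tree that the patch itself uses.

/* Illustration only -- not part of the patch. */
static int example_multi_page_ring_setup(struct xenbus_device *dev,
                                         struct blkfront_info *info)
{
        unsigned int order, i, nr;
        blkif_sring_t *sring;
        char node[16];
        int err;

        /* 1) Negotiate: take the backend's limit, clamp it to our own. */
        if (xenbus_scanf(XBT_NIL, dev->otherend,
                         "max-ring-page-order", "%u", &order) != 1)
                order = 0;                              /* single page */
        if (order > BLK_MAX_RING_PAGE_ORDER)
                order = BLK_MAX_RING_PAGE_ORDER;
        info->ring_size = nr = 1U << order;

        /* 2) Allocate individual pages and map them virtually contiguous. */
        for (i = 0; i < nr; i++) {
                info->ring_pages[i] = alloc_page(GFP_NOIO | __GFP_HIGH
                                                 | __GFP_HIGHMEM);
                if (!info->ring_pages[i])
                        return -ENOMEM;                 /* cleanup omitted */
        }
        sring = vmap(info->ring_pages, nr, VM_MAP, PAGE_KERNEL);
        if (!sring)
                return -ENOMEM;                         /* cleanup omitted */
        SHARED_RING_INIT(sring);
        FRONT_RING_INIT(&info->ring, sring, (unsigned long)nr << PAGE_SHIFT);

        /* 3) Grant every ring page and publish the references. */
        err = xenbus_multi_grant_ring(dev, nr, info->ring_pages,
                                      info->ring_refs);
        if (err < 0)
                return err;
        err = xenbus_printf(XBT_NIL, dev->nodename, "ring-page-order",
                            "%u", order);
        for (i = 0; !err && i < nr; i++) {
                snprintf(node, sizeof(node), "ring-ref%u", i);
                err = xenbus_printf(XBT_NIL, dev->nodename, node, "%u",
                                    info->ring_refs[i]);
        }
        return err;
}
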
12 --- 13.1.orig/drivers/xen/blkfront/blkfront.c   2014-02-10 16:16:40.000000000 +0100
13 +++ 13.1/drivers/xen/blkfront/blkfront.c        2014-02-10 16:25:55.000000000 +0100
14 @@ -40,7 +40,9 @@
15  #include <linux/cdrom.h>
16  #include <linux/sched.h>
17  #include <linux/interrupt.h>
18 +#include <linux/log2.h>
19  #include <linux/scatterlist.h>
20 +#include <linux/vmalloc.h>
21  #include <scsi/scsi.h>
22  #include <xen/evtchn.h>
23  #include <xen/xenbus.h>
24 @@ -54,12 +56,15 @@
25  #include <xen/platform-compat.h>
26  #endif
27  
28 +struct blk_resume_entry {
29 +       struct list_head list;
30 +       struct blk_shadow copy;
31 +};
32 +
33  #define BLKIF_STATE_DISCONNECTED 0
34  #define BLKIF_STATE_CONNECTED    1
35  #define BLKIF_STATE_SUSPENDED    2
36  
37 -#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
38 -    (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
39  
40  static void connect(struct blkfront_info *);
41  static void blkfront_closing(struct blkfront_info *);
42 @@ -71,7 +76,8 @@ static void kick_pending_request_queues(
43  
44  static irqreturn_t blkif_int(int irq, void *dev_id);
45  static void blkif_restart_queue(struct work_struct *arg);
46 -static int blkif_recover(struct blkfront_info *);
47 +static int blkif_recover(struct blkfront_info *, unsigned int old_ring_size,
48 +                        unsigned int new_ring_size);
49  static void blkif_completion(struct blk_shadow *);
50  static void blkif_free(struct blkfront_info *, int);
51  
52 @@ -85,8 +91,9 @@ static void blkif_free(struct blkfront_i
53  static int blkfront_probe(struct xenbus_device *dev,
54                           const struct xenbus_device_id *id)
55  {
56 -       int err, vdevice, i;
57 +       int err, vdevice;
58         struct blkfront_info *info;
59 +       enum xenbus_state backend_state;
60  
61         /* FIXME: Use dynamic device id if this is not set. */
62         err = xenbus_scanf(XBT_NIL, dev->nodename,
63 @@ -113,15 +120,21 @@ static int blkfront_probe(struct xenbus_
64         info->vdevice = vdevice;
65         info->connected = BLKIF_STATE_DISCONNECTED;
66         INIT_WORK(&info->work, blkif_restart_queue);
67 -
68 -       for (i = 0; i < BLK_RING_SIZE; i++)
69 -               info->shadow[i].req.id = i+1;
70 -       info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
71 +       INIT_LIST_HEAD(&info->resume_list);
72  
73         /* Front end dir is a number, which is used as the id. */
74         info->handle = simple_strtoul(strrchr(dev->nodename,'/')+1, NULL, 0);
75         dev_set_drvdata(&dev->dev, info);
76  
77 +       backend_state = xenbus_read_driver_state(dev->otherend);
78 +       /*
79 +        * XenbusStateInitWait would be the correct state to enter here,
80 +        * but (at least) blkback considers this a fatal error.
81 +        */
82 +       xenbus_switch_state(dev, XenbusStateInitialising);
83 +       if (backend_state != XenbusStateInitWait)
84 +               return 0;
85 +
86         err = talk_to_backend(dev, info);
87         if (err) {
88                 kfree(info);
89 @@ -142,28 +155,73 @@ static int blkfront_probe(struct xenbus_
90  static int blkfront_resume(struct xenbus_device *dev)
91  {
92         struct blkfront_info *info = dev_get_drvdata(&dev->dev);
93 -       int err;
94 +       enum xenbus_state backend_state;
95  
96         DPRINTK("blkfront_resume: %s\n", dev->nodename);
97  
98         blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
99  
100 -       err = talk_to_backend(dev, info);
101 -       if (info->connected == BLKIF_STATE_SUSPENDED && !err)
102 -               err = blkif_recover(info);
103 +       backend_state = xenbus_read_driver_state(dev->otherend);
104 +       /* See respective comment in blkfront_probe(). */
105 +       xenbus_switch_state(dev, XenbusStateInitialising);
106 +       if (backend_state != XenbusStateInitWait)
107 +               return 0;
108  
109 -       return err;
110 +       return talk_to_backend(dev, info);
111  }
112  
113  
114 +static void shadow_init(struct blk_shadow *shadow, unsigned int ring_size)
115 +{
116 +       unsigned int i = 0;
117 +
118 +       WARN_ON(!ring_size);
119 +       while (++i < ring_size)
120 +               shadow[i - 1].req.id = i;
121 +       shadow[i - 1].req.id = 0x0fffffff;
122 +}
123 +
124  /* Common code used when first setting up, and when resuming. */
125  static int talk_to_backend(struct xenbus_device *dev,
126                            struct blkfront_info *info)
127  {
128 -       const char *message = NULL;
129 +       unsigned int ring_size, ring_order;
130 +       unsigned int old_ring_size = RING_SIZE(&info->ring);
131 +       const char *what = NULL;
132         struct xenbus_transaction xbt;
133         int err;
134  
135 +       if (dev->state >= XenbusStateInitialised)
136 +               return 0;
137 +
138 +       err = xenbus_scanf(XBT_NIL, dev->otherend,
139 +                          "max-ring-pages", "%u", &ring_size);
140 +       if (err != 1)
141 +               ring_size = 0;
142 +       else if (!ring_size)
143 +               pr_warn("blkfront: %s: zero max-ring-pages\n", dev->nodename);
144 +       err = xenbus_scanf(XBT_NIL, dev->otherend,
145 +                          "max-ring-page-order", "%u", &ring_order);
146 +       if (err != 1)
147 +               ring_order = ring_size ? ilog2(ring_size) : 0;
148 +       else if (!ring_size)
149 +               /* nothing */;
150 +       else if ((ring_size - 1) >> ring_order)
151 +               pr_warn("blkfront: %s: max-ring-pages (%#x) inconsistent with"
152 +                       " max-ring-page-order (%u)\n",
153 +                       dev->nodename, ring_size, ring_order);
154 +       else
155 +               ring_order = ilog2(ring_size);
156 +
157 +       if (ring_order > BLK_MAX_RING_PAGE_ORDER)
158 +               ring_order = BLK_MAX_RING_PAGE_ORDER;
159 +       /*
160 +        * While for larger rings not all pages are actually used, be on the
161 +        * safe side and set up a full power of two to please as many backends
162 +        * as possible.
163 +        */
164 +       info->ring_size = ring_size = 1U << ring_order;
165 +
166         /* Create shared ring, alloc event channel. */
167         err = setup_blkring(dev, info);
168         if (err)
169 @@ -176,24 +234,45 @@ again:
170                 goto destroy_blkring;
171         }
172  
173 -       err = xenbus_printf(xbt, dev->nodename,
174 -                           "ring-ref","%u", info->ring_ref);
175 -       if (err) {
176 -               message = "writing ring-ref";
177 -               goto abort_transaction;
178 -       }
179 -       err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
180 +       if (ring_size == 1) {
181 +               what = "ring-ref";
182 +               err = xenbus_printf(xbt, dev->nodename, what, "%u",
183 +                                   info->ring_refs[0]);
184 +               if (err)
185 +                       goto abort_transaction;
186 +       } else {
187 +               unsigned int i;
188 +               char buf[16];
189 +
190 +               what = "ring-page-order";
191 +               err = xenbus_printf(xbt, dev->nodename, what, "%u",
192 +                                   ring_order);
193 +               if (err)
194 +                       goto abort_transaction;
195 +               what = "num-ring-pages";
196 +               err = xenbus_printf(xbt, dev->nodename, what, "%u", ring_size);
197 +               if (err)
198 +                       goto abort_transaction;
199 +               what = buf;
200 +               for (i = 0; i < ring_size; i++) {
201 +                       snprintf(buf, sizeof(buf), "ring-ref%u", i);
202 +                       err = xenbus_printf(xbt, dev->nodename, what, "%u",
203 +                                           info->ring_refs[i]);
204 +                       if (err)
205 +                               goto abort_transaction;
206 +               }
207 +       }
208 +
209 +       what = "event-channel";
210 +       err = xenbus_printf(xbt, dev->nodename, what, "%u",
211                             irq_to_evtchn_port(info->irq));
212 -       if (err) {
213 -               message = "writing event-channel";
214 +       if (err)
215                 goto abort_transaction;
216 -       }
217 -       err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
218 +       what = "protocol";
219 +       err = xenbus_printf(xbt, dev->nodename, what, "%s",
220                             XEN_IO_PROTO_ABI_NATIVE);
221 -       if (err) {
222 -               message = "writing protocol";
223 +       if (err)
224                 goto abort_transaction;
225 -       }
226  
227         err = xenbus_transaction_end(xbt, 0);
228         if (err) {
229 @@ -205,12 +284,27 @@ again:
230  
231         xenbus_switch_state(dev, XenbusStateInitialised);
232  
233 +       ring_size = RING_SIZE(&info->ring);
234 +       switch (info->connected) {
235 +       case BLKIF_STATE_DISCONNECTED:
236 +               shadow_init(info->shadow, ring_size);
237 +               break;
238 +       case BLKIF_STATE_SUSPENDED:
239 +               err = blkif_recover(info, old_ring_size, ring_size);
240 +               if (err)
241 +                       goto out;
242 +               break;
243 +       }
244 +
245 +       pr_info("blkfront: %s: ring-pages=%u nr_ents=%u\n",
246 +               dev->nodename, info->ring_size, ring_size);
247 +
248         return 0;
249  
250   abort_transaction:
251         xenbus_transaction_end(xbt, 1);
252 -       if (message)
253 -               xenbus_dev_fatal(dev, err, "%s", message);
254 +       if (what)
255 +               xenbus_dev_fatal(dev, err, "writing %s", what);
256   destroy_blkring:
257         blkif_free(info, 0);
258   out:
259 @@ -223,26 +317,35 @@ static int setup_blkring(struct xenbus_d
260  {
261         blkif_sring_t *sring;
262         int err;
263 +       unsigned int nr;
264  
265 -       info->ring_ref = GRANT_INVALID_REF;
266 +       for (nr = 0; nr < info->ring_size; nr++) {
267 +               info->ring_refs[nr] = GRANT_INVALID_REF;
268 +               info->ring_pages[nr] = alloc_page(GFP_NOIO | __GFP_HIGH
269 +                                                | __GFP_HIGHMEM);
270 +               if (!info->ring_pages[nr])
271 +                       break;
272 +       }
273  
274 -       sring = (blkif_sring_t *)__get_free_page(GFP_NOIO | __GFP_HIGH);
275 +       sring = nr == info->ring_size
276 +               ? vmap(info->ring_pages, nr, VM_MAP, PAGE_KERNEL)
277 +               : NULL;
278         if (!sring) {
279 +               while (nr--)
280 +                       __free_page(info->ring_pages[nr]);
281                 xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
282                 return -ENOMEM;
283         }
284         SHARED_RING_INIT(sring);
285 -       FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
286 +       FRONT_RING_INIT(&info->ring, sring,
287 +                       (unsigned long)info->ring_size << PAGE_SHIFT);
288  
289         sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
290  
291 -       err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
292 -       if (err < 0) {
293 -               free_page((unsigned long)sring);
294 -               info->ring.sring = NULL;
295 +       err = xenbus_multi_grant_ring(dev, nr, info->ring_pages,
296 +                                     info->ring_refs);
297 +       if (err < 0)
298                 goto fail;
299 -       }
300 -       info->ring_ref = err;
301  
302         err = bind_listening_port_to_irqhandler(
303                 dev->otherend_id, blkif_int, 0, "blkif", info);
304 @@ -273,13 +376,19 @@ static void backend_changed(struct xenbu
305  
306         switch (backend_state) {
307         case XenbusStateInitialising:
308 -       case XenbusStateInitWait:
309         case XenbusStateInitialised:
310         case XenbusStateReconfiguring:
311         case XenbusStateReconfigured:
312         case XenbusStateUnknown:
313                 break;
314  
315 +       case XenbusStateInitWait:
316 +               if (talk_to_backend(dev, info)) {
317 +                       dev_set_drvdata(&dev->dev, NULL);
318 +                       kfree(info);
319 +               }
320 +               break;
321 +
322         case XenbusStateConnected:
323                 connect(info);
324                 break;
325 @@ -566,7 +675,7 @@ static inline int GET_ID_FROM_FREELIST(
326         struct blkfront_info *info)
327  {
328         unsigned long free = info->shadow_free;
329 -       BUG_ON(free >= BLK_RING_SIZE);
330 +       BUG_ON(free >= RING_SIZE(&info->ring));
331         info->shadow_free = info->shadow[free].req.id;
332         info->shadow[free].req.id = 0x0fffffee; /* debug */
333         return free;
334 @@ -613,6 +722,44 @@ static inline void flush_requests(struct
335  
336  static void kick_pending_request_queues(struct blkfront_info *info)
337  {
338 +       bool queued = false;
339 +
340 +       /* Recover stage 3: Re-queue pending requests. */
341 +       while (!list_empty(&info->resume_list) && !RING_FULL(&info->ring)) {
342 +               /* Grab a request slot and copy shadow state into it. */
343 +               struct blk_resume_entry *ent =
344 +                       list_first_entry(&info->resume_list,
345 +                                        struct blk_resume_entry, list);
346 +               blkif_request_t *req =
347 +                       RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
348 +               unsigned int i;
349 +
350 +               *req = ent->copy.req;
351 +
352 +               /* We get a new request id, and must reset the shadow state. */
353 +               req->id = GET_ID_FROM_FREELIST(info);
354 +               info->shadow[req->id] = ent->copy;
355 +               info->shadow[req->id].req.id = req->id;
356 +
357 +               /* Rewrite any grant references invalidated by susp/resume. */
358 +               for (i = 0; i < req->nr_segments; i++)
359 +                       gnttab_grant_foreign_access_ref(req->seg[i].gref,
360 +                               info->xbdev->otherend_id,
361 +                               pfn_to_mfn(ent->copy.frame[i]),
362 +                               rq_data_dir(ent->copy.request) ?
363 +                               GTF_readonly : 0);
364 +
365 +               info->ring.req_prod_pvt++;
366 +               queued = true;
367 +
368 +               __list_del_entry(&ent->list);
369 +               kfree(ent);
370 +       }
371 +
372 +       /* Send off requeued requests */
373 +       if (queued)
374 +               flush_requests(info);
375 +
376         if (!RING_FULL(&info->ring)) {
377                 /* Re-enable calldowns. */
378                 blk_start_queue(info->rq);
379 @@ -980,11 +1127,11 @@ static irqreturn_t blkif_int(int irq, vo
380                 int ret;
381  
382                 bret = RING_GET_RESPONSE(&info->ring, i);
383 -               if (unlikely(bret->id >= BLK_RING_SIZE)) {
384 +               if (unlikely(bret->id >= RING_SIZE(&info->ring))) {
385                         /*
386                          * The backend has messed up and given us an id that
387                          * we would never have given to it (we stamp it up to
388 -                        * BLK_RING_SIZE - see GET_ID_FROM_FREELIST()).
389 +                        * RING_SIZE() - see GET_ID_FROM_FREELIST()).
390                          */
391                         pr_warning("%s: response to %s has incorrect id (%#Lx)\n",
392                                    info->gd->disk_name,
393 @@ -1096,12 +1243,10 @@ static void blkif_free(struct blkfront_i
394         flush_work(&info->work);
395  
396         /* Free resources associated with old device channel. */
397 -       if (info->ring_ref != GRANT_INVALID_REF) {
398 -               gnttab_end_foreign_access(info->ring_ref, 
399 -                                         (unsigned long)info->ring.sring);
400 -               info->ring_ref = GRANT_INVALID_REF;
401 -               info->ring.sring = NULL;
402 -       }
403 +       vunmap(info->ring.sring);
404 +       info->ring.sring = NULL;
405 +       gnttab_multi_end_foreign_access(info->ring_size,
406 +                                       info->ring_refs, info->ring_pages);
407         if (info->irq)
408                 unbind_from_irqhandler(info->irq, info);
409         info->irq = 0;
410 @@ -1117,55 +1262,41 @@ static void blkif_completion(struct blk_
411                 gnttab_end_foreign_access(s->req.seg[i].gref, 0UL);
412  }
413  
414 -static int blkif_recover(struct blkfront_info *info)
415 -{
416 -       int i;
417 -       blkif_request_t *req;
418 -       struct blk_shadow *copy;
419 -       int j;
420 +static int blkif_recover(struct blkfront_info *info,
421 +                        unsigned int old_ring_size,
422 +                        unsigned int ring_size)
423 +{
424 +       unsigned int i;
425 +       struct blk_resume_entry *ent;
426 +       LIST_HEAD(list);
427  
428         /* Stage 1: Make a safe copy of the shadow state. */
429 -       copy = kmemdup(info->shadow, sizeof(info->shadow),
430 -                      GFP_NOIO | __GFP_NOFAIL | __GFP_HIGH);
431 -       if (!copy)
432 +       for (i = 0; i < old_ring_size; i++) {
433 +               /* Not in use? */
434 +               if (!info->shadow[i].request)
435 +                       continue;
436 +               ent = kmalloc(sizeof(*ent),
437 +                             GFP_NOIO | __GFP_NOFAIL | __GFP_HIGH);
438 +               if (!ent)
439 +                       break;
440 +               ent->copy = info->shadow[i];
441 +               list_add_tail(&ent->list, &list);
442 +       }
443 +       if (i < old_ring_size) {
444 +               while (!list_empty(&list)) {
445 +                       ent = list_first_entry(&list, struct blk_resume_entry,
446 +                                              list);
447 +                       __list_del_entry(&ent->list);
448 +                       kfree(ent);
449 +               }
450                 return -ENOMEM;
451 +       }
452 +       list_splice_tail(&list, &info->resume_list);
453  
454         /* Stage 2: Set up free list. */
455         memset(&info->shadow, 0, sizeof(info->shadow));
456 -       for (i = 0; i < BLK_RING_SIZE; i++)
457 -               info->shadow[i].req.id = i+1;
458 +       shadow_init(info->shadow, ring_size);
459         info->shadow_free = info->ring.req_prod_pvt;
460 -       info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
461 -
462 -       /* Stage 3: Find pending requests and requeue them. */
463 -       for (i = 0; i < BLK_RING_SIZE; i++) {
464 -               /* Not in use? */
465 -               if (!copy[i].request)
466 -                       continue;
467 -
468 -               /* Grab a request slot and copy shadow state into it. */
469 -               req = RING_GET_REQUEST(
470 -                       &info->ring, info->ring.req_prod_pvt);
471 -               *req = copy[i].req;
472 -
473 -               /* We get a new request id, and must reset the shadow state. */
474 -               req->id = GET_ID_FROM_FREELIST(info);
475 -               memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
476 -
477 -               /* Rewrite any grant references invalidated by susp/resume. */
478 -               for (j = 0; j < req->nr_segments; j++)
479 -                       gnttab_grant_foreign_access_ref(
480 -                               req->seg[j].gref,
481 -                               info->xbdev->otherend_id,
482 -                               pfn_to_mfn(info->shadow[req->id].frame[j]),
483 -                               rq_data_dir(info->shadow[req->id].request) ?
484 -                               GTF_readonly : 0);
485 -               info->shadow[req->id].req = *req;
486 -
487 -               info->ring.req_prod_pvt++;
488 -       }
489 -
490 -       kfree(copy);
491  
492         (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
493  
494 @@ -1174,9 +1305,6 @@ static int blkif_recover(struct blkfront
495         /* Now safe for us to use the shared ring */
496         info->connected = BLKIF_STATE_CONNECTED;
497  
498 -       /* Send off requeued requests */
499 -       flush_requests(info);
500 -
501         /* Kick any other new requests queued since we resumed */
502         kick_pending_request_queues(info);
503  
504 --- 13.1.orig/drivers/xen/blkfront/block.h      2013-05-24 14:28:50.000000000 +0200
505 +++ 13.1/drivers/xen/blkfront/block.h   2012-06-08 10:50:19.000000000 +0200
506 @@ -81,7 +81,10 @@ struct blk_shadow {
507         unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
508  };
509  
510 -#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
511 +#define BLK_MAX_RING_PAGE_ORDER 4U
512 +#define BLK_MAX_RING_PAGES (1U << BLK_MAX_RING_PAGE_ORDER)
513 +#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, \
514 +                                           BLK_MAX_RING_PAGES * PAGE_SIZE)
515  
516  /*
517   * We have one of these per vbd, whether ide, scsi or 'other'.  They
518 @@ -96,7 +99,7 @@ struct blkfront_info
519         int vdevice;
520         blkif_vdev_t handle;
521         int connected;
522 -       int ring_ref;
523 +       unsigned int ring_size;
524         blkif_front_ring_t ring;
525         spinlock_t io_lock;
526         struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
527 @@ -105,7 +108,10 @@ struct blkfront_info
528         struct request_queue *rq;
529         struct work_struct work;
530         struct gnttab_free_callback callback;
531 -       struct blk_shadow shadow[BLK_RING_SIZE];
532 +       struct blk_shadow shadow[BLK_MAX_RING_SIZE];
533 +       struct list_head resume_list;
534 +       grant_ref_t ring_refs[BLK_MAX_RING_PAGES];
535 +       struct page *ring_pages[BLK_MAX_RING_PAGES];
536         unsigned long shadow_free;
537         unsigned int feature_flush;
538         unsigned int flush_op;