Annotation of src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c, Revision 1.6.14.2
1.1 haad 1: /*
2: * CDDL HEADER START
3: *
4: * The contents of this file are subject to the terms of the
5: * Common Development and Distribution License (the "License").
6: * You may not use this file except in compliance with the License.
7: *
8: * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9: * or http://www.opensolaris.org/os/licensing.
10: * See the License for the specific language governing permissions
11: * and limitations under the License.
12: *
13: * When distributing Covered Code, include this CDDL HEADER in each
14: * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15: * If applicable, add the following below this CDDL HEADER, with the
16: * fields enclosed by brackets "[]" replaced with your own identifying
17: * information: Portions Copyright [yyyy] [name of copyright owner]
18: *
19: * CDDL HEADER END
20: */
21: /*
1.6.14.1 pgoyette 22: * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23: * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24: * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25: * Copyright (c) 2013 Joyent, Inc. All rights reserved.
1.1 haad 26: */
27:
28: #include <sys/zfs_context.h>
29: #include <sys/spa.h>
30: #include <sys/refcount.h>
31: #include <sys/vdev_disk.h>
32: #include <sys/vdev_impl.h>
33: #include <sys/fs/zfs.h>
34: #include <sys/zio.h>
35: #include <sys/sunldi.h>
36: #include <sys/fm/fs/zfs.h>
1.2 haad 37: #include <sys/disklabel.h>
38: #include <sys/dkio.h>
39: #include <sys/workqueue.h>
1.1 haad 40:
#ifdef __NetBSD__
/*
 * Return the error code stored in the buffer's b_error field.
 * Only compiled on NetBSD.
 */
static int
geterror(struct buf *bp)
{
	int error = bp->b_error;

	return (error);
}
#endif
49:
1.1 haad 50: /*
51: * Virtual device vector for disks.
52: */
53:
1.2 haad 54: static void vdev_disk_io_intr(buf_t *);
1.1 haad 55:
1.2 haad 56: static void
1.6.14.1 pgoyette 57: vdev_disk_alloc(vdev_t *vd)
58: {
59: vdev_disk_t *dvd;
60:
61: dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
62:
63: #ifdef illumos
64: /*
65: * Create the LDI event callback list.
66: */
67: list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
68: offsetof(vdev_disk_ldi_cb_t, lcb_next));
69: #endif
70: }
71:
72:
73: static void
74: vdev_disk_free(vdev_t *vd)
75: {
76: vdev_disk_t *dvd = vd->vdev_tsd;
77: #ifdef illumos
78: vdev_disk_ldi_cb_t *lcb;
79: #endif
80:
81: if (dvd == NULL)
82: return;
83:
84: #ifdef illumos
85: /*
86: * We have already closed the LDI handle. Clean up the LDI event
87: * callbacks and free vd->vdev_tsd.
88: */
89: while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
90: list_remove(&dvd->vd_ldi_cbs, lcb);
91: (void) ldi_ev_remove_callbacks(lcb->lcb_id);
92: kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
93: }
94: list_destroy(&dvd->vd_ldi_cbs);
95: #endif
96: kmem_free(dvd, sizeof (vdev_disk_t));
97: vd->vdev_tsd = NULL;
98: }
99:
100:
101: /*
102: * It's not clear what these hold/rele functions are supposed to do.
103: */
104: static void
105: vdev_disk_hold(vdev_t *vd)
106: {
107:
108: ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
109:
110: }
111:
112: static void
113: vdev_disk_rele(vdev_t *vd)
114: {
115:
116: ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
117:
118: }
119:
120: static void
1.2 haad 121: vdev_disk_flush(struct work *work, void *cookie)
122: {
123: vdev_disk_t *dvd;
124: int error, cmd;
125: buf_t *bp;
126: vnode_t *vp;
127:
128: bp = (struct buf *)work;
129: vp = bp->b_vp;
130: dvd = cookie;
131:
1.6.14.1 pgoyette 132: KASSERT(vp == dvd->vd_vp);
133:
1.2 haad 134: cmd = 1;
1.6.14.1 pgoyette 135: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
136: error = VOP_IOCTL(vp, DIOCCACHESYNC, &cmd, FREAD|FWRITE, kcred);
137: VOP_UNLOCK(vp, 0);
1.2 haad 138: bp->b_error = error;
139: vdev_disk_io_intr(bp);
140: }
1.1 haad 141:
142: static int
1.6.14.1 pgoyette 143: vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
144: uint64_t *ashift, uint64_t *pashift)
1.1 haad 145: {
1.3 haad 146: spa_t *spa = vd->vdev_spa;
1.1 haad 147: vdev_disk_t *dvd;
1.2 haad 148: vnode_t *vp;
149: int error, cmd;
1.3 haad 150: struct partinfo pinfo;
1.1 haad 151:
152: /*
153: * We must have a pathname, and it must be absolute.
154: */
155: if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
156: vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1.6.14.1 pgoyette 157: return (SET_ERROR(EINVAL));
1.1 haad 158: }
159:
1.3 haad 160: /*
161: * Reopen the device if it's not currently open. Otherwise,
162: * just update the physical size of the device.
163: */
164: if (vd->vdev_tsd != NULL) {
165: ASSERT(vd->vdev_reopening);
166: dvd = vd->vdev_tsd;
1.6.14.1 pgoyette 167: vp = dvd->vd_vp;
168: KASSERT(vp != NULL);
1.3 haad 169: goto skip_open;
170: }
171:
1.6.14.1 pgoyette 172: /*
173: * Create vd->vdev_tsd.
174: */
175: vdev_disk_alloc(vd);
176: dvd = vd->vdev_tsd;
1.1 haad 177:
178: /*
179: * When opening a disk device, we want to preserve the user's original
180: * intent. We always want to open the device by the path the user gave
181: * us, even if it is one of multiple paths to the save device. But we
182: * also want to be able to survive disks being removed/recabled.
183: * Therefore the sequence of opening devices is:
184: *
185: * 1. Try opening the device by path. For legacy pools without the
186: * 'whole_disk' property, attempt to fix the path by appending 's0'.
187: *
188: * 2. If the devid of the device matches the stored value, return
189: * success.
190: *
191: * 3. Otherwise, the device may have moved. Try opening the device
192: * by the devid instead.
193: */
194: if (vd->vdev_devid != NULL) {
1.2 haad 195: /* XXXNETBSD wedges */
1.6.14.1 pgoyette 196: #ifdef illumos
197: if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
198: &dvd->vd_minor) != 0) {
199: vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
200: return (SET_ERROR(EINVAL));
201: }
202: #endif
1.1 haad 203: }
204:
205: error = EINVAL; /* presume failure */
206:
1.2 haad 207: error = vn_open(vd->vdev_path, UIO_SYSSPACE, FREAD|FWRITE, 0,
208: &vp, CRCREAT, 0);
209: if (error != 0) {
210: vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
1.6.14.1 pgoyette 211: return (SET_ERROR(error));
1.1 haad 212: }
1.2 haad 213: if (vp->v_type != VBLK) {
214: vrele(vp);
1.1 haad 215: vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
1.6.14.1 pgoyette 216: return (SET_ERROR(EINVAL));
1.1 haad 217: }
218:
219: /*
1.2 haad 220: * XXXNETBSD Compare the devid to the stored value.
1.1 haad 221: */
222:
1.6.14.1 pgoyette 223: /*
224: * Create a workqueue to process cache-flushes concurrently.
225: */
226: error = workqueue_create(&dvd->vd_wq, "vdevsync",
227: vdev_disk_flush, dvd, PRI_NONE, IPL_NONE, WQ_MPSAFE);
228: if (error != 0) {
229: vrele(vp);
230: return (SET_ERROR(error));
231: }
232:
233: dvd->vd_vp = vp;
234:
1.3 haad 235: skip_open:
1.1 haad 236: /*
237: * Determine the actual size of the device.
1.2 haad 238: * XXXNETBSD wedges.
1.1 haad 239: */
1.6.14.1 pgoyette 240: error = VOP_IOCTL(vp, DIOCGPARTINFO, &pinfo, FREAD|FWRITE, kcred);
1.2 haad 241: if (error != 0) {
1.1 haad 242: vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
1.6.14.1 pgoyette 243: return (SET_ERROR(error));
1.1 haad 244: }
1.6 christos 245: *psize = pinfo.pi_size * pinfo.pi_secsize;
1.6.14.1 pgoyette 246: *max_psize = *psize;
247:
1.6 christos 248: *ashift = highbit(MAX(pinfo.pi_secsize, SPA_MINBLOCKSIZE)) - 1;
1.6.14.1 pgoyette 249: *pashift = *ashift;
1.6 christos 250: vd->vdev_wholedisk = (pinfo.pi_offset == 0); /* XXXNETBSD */
1.1 haad 251:
252: /*
253: * Clear the nowritecache bit, so that on a vdev_reopen() we will
254: * try again.
255: */
256: vd->vdev_nowritecache = B_FALSE;
257:
1.6.14.1 pgoyette 258: return (0);
1.1 haad 259: }
260:
261: static void
262: vdev_disk_close(vdev_t *vd)
263: {
264: vdev_disk_t *dvd = vd->vdev_tsd;
265:
1.3 haad 266: if (vd->vdev_reopening || dvd == NULL)
1.1 haad 267: return;
268:
1.6.14.1 pgoyette 269: #ifdef illumos
270: if (dvd->vd_minor != NULL) {
271: ddi_devid_str_free(dvd->vd_minor);
272: dvd->vd_minor = NULL;
273: }
274:
275: if (dvd->vd_devid != NULL) {
276: ddi_devid_free(dvd->vd_devid);
277: dvd->vd_devid = NULL;
278: }
279:
280: if (dvd->vd_lh != NULL) {
281: (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
282: dvd->vd_lh = NULL;
283: }
284: #endif
285:
286: #ifdef __NetBSD__
287: if (dvd->vd_vp != NULL) {
288: vn_close(dvd->vd_vp, FREAD|FWRITE, kcred);
289: dvd->vd_vp = NULL;
290: }
291: if (dvd->vd_wq != NULL) {
1.2 haad 292: workqueue_destroy(dvd->vd_wq);
1.6.14.1 pgoyette 293: dvd->vd_wq = NULL;
1.2 haad 294: }
1.6.14.1 pgoyette 295: #endif
1.3 haad 296:
1.6.14.1 pgoyette 297: vd->vdev_delayed_close = B_FALSE;
298: #ifdef illumos
299: /*
300: * If we closed the LDI handle due to an offline notify from LDI,
301: * don't free vd->vdev_tsd or unregister the callbacks here;
302: * the offline finalize callback or a reopen will take care of it.
303: */
304: if (dvd->vd_ldi_offline)
305: return;
306: #endif
307:
308: vdev_disk_free(vd);
309: }
310:
311: int
312: vdev_disk_physio(vdev_t *vd, caddr_t data,
313: size_t size, uint64_t offset, int flags, boolean_t isdump)
314: {
315: #ifdef illumos
316: vdev_disk_t *dvd = vd->vdev_tsd;
317:
318: /*
319: * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
320: * Nothing to be done here but return failure.
321: */
322: if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL))
323: return (EIO);
324:
325: ASSERT(vd->vdev_ops == &vdev_disk_ops);
326:
327: /*
328: * If in the context of an active crash dump, use the ldi_dump(9F)
329: * call instead of ldi_strategy(9F) as usual.
330: */
331: if (isdump) {
332: ASSERT3P(dvd, !=, NULL);
333: return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
334: lbtodb(size)));
335: }
336:
337: return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
338: #endif
339: #ifdef __NetBSD__
340: return (EIO);
341: #endif
1.1 haad 342: }
343:
344: static void
345: vdev_disk_io_intr(buf_t *bp)
346: {
1.2 haad 347: zio_t *zio = bp->b_private;
1.1 haad 348:
349: /*
350: * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
351: * Rather than teach the rest of the stack about other error
352: * possibilities (EFAULT, etc), we normalize the error value here.
353: */
1.6.14.1 pgoyette 354: zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0);
1.1 haad 355:
1.6.14.1 pgoyette 356: if (zio->io_error == 0 && bp->b_resid != 0)
357: zio->io_error = SET_ERROR(EIO);
1.3 haad 358:
1.2 haad 359: putiobuf(bp);
1.6.14.1 pgoyette 360: zio_delay_interrupt(zio);
1.1 haad 361: }
362:
1.3 haad 363: static void
364: vdev_disk_ioctl_free(zio_t *zio)
365: {
366: kmem_free(zio->io_vsd, sizeof (struct dk_callback));
367: }
368:
369: static const zio_vsd_ops_t vdev_disk_vsd_ops = {
370: vdev_disk_ioctl_free,
371: zio_vsd_default_cksum_report
372: };
373:
374: static void
375: vdev_disk_ioctl_done(void *zio_arg, int error)
376: {
377: zio_t *zio = zio_arg;
378:
379: zio->io_error = error;
380:
381: zio_interrupt(zio);
382: }
383:
1.6.14.1 pgoyette 384: static void
1.1 haad 385: vdev_disk_io_start(zio_t *zio)
386: {
387: vdev_t *vd = zio->io_vd;
388: vdev_disk_t *dvd = vd->vdev_tsd;
1.2 haad 389: vnode_t *vp;
390: buf_t *bp, *nbp;
391: int error, size, off, resid;
1.1 haad 392:
1.6.14.1 pgoyette 393: /*
394: * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
395: * Nothing to be done here but return failure.
396: */
397: #ifdef illumos
398: if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
399: zio->io_error = SET_ERROR(ENXIO);
400: zio_interrupt(zio);
401: return;
402: }
403: #endif
404: #ifdef __NetBSD__
405: if (dvd == NULL) {
406: zio->io_error = SET_ERROR(ENXIO);
407: zio_interrupt(zio);
408: return;
409: }
410: vp = dvd->vd_vp;
411: #endif
412:
1.1 haad 413: if (zio->io_type == ZIO_TYPE_IOCTL) {
414: /* XXPOLICY */
415: if (!vdev_readable(vd)) {
1.6.14.1 pgoyette 416: zio->io_error = SET_ERROR(ENXIO);
417: zio_interrupt(zio);
418: return;
1.1 haad 419: }
420:
421: switch (zio->io_cmd) {
422: case DKIOCFLUSHWRITECACHE:
423:
424: if (zfs_nocacheflush)
425: break;
426:
427: if (vd->vdev_nowritecache) {
428: zio->io_error = ENOTSUP;
429: break;
430: }
431:
1.2 haad 432: bp = getiobuf(vp, true);
433: bp->b_private = zio;
434: workqueue_enqueue(dvd->vd_wq, &bp->b_work, NULL);
1.6.14.1 pgoyette 435: return;
1.1 haad 436:
437: default:
1.6.14.1 pgoyette 438: zio->io_error = SET_ERROR(ENOTSUP);
1.2 haad 439: break;
1.1 haad 440: }
441:
1.6.14.1 pgoyette 442: zio_execute(zio);
443: return;
1.1 haad 444: }
445:
1.2 haad 446: bp = getiobuf(vp, true);
447: bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
448: bp->b_cflags = BC_BUSY | BC_NOCACHE;
449: bp->b_data = zio->io_data;
450: bp->b_blkno = btodb(zio->io_offset);
1.1 haad 451: bp->b_bcount = zio->io_size;
1.2 haad 452: bp->b_resid = zio->io_size;
453: bp->b_iodone = vdev_disk_io_intr;
454: bp->b_private = zio;
455:
456: if (!(bp->b_flags & B_READ)) {
1.5 christos 457: mutex_enter(vp->v_interlock);
1.2 haad 458: vp->v_numoutput++;
1.5 christos 459: mutex_exit(vp->v_interlock);
1.2 haad 460: }
1.6.14.1 pgoyette 461:
1.2 haad 462: if (bp->b_bcount <= MAXPHYS) {
463: /* We can do this I/O in one pass. */
464: (void)VOP_STRATEGY(vp, bp);
465: } else {
466: /*
467: * The I/O is larger than we can process in one pass.
468: * Split it into smaller pieces.
469: */
470: resid = zio->io_size;
471: off = 0;
472: while (resid != 0) {
1.6.14.2! pgoyette 473: size = uimin(resid, MAXPHYS);
1.2 haad 474: nbp = getiobuf(vp, true);
475: nbp->b_blkno = btodb(zio->io_offset + off);
476: /* Below call increments v_numoutput. */
477: nestiobuf_setup(bp, nbp, off, size);
478: (void)VOP_STRATEGY(vp, nbp);
479: resid -= size;
480: off += size;
481: }
482: }
1.1 haad 483: }
484:
485: static void
486: vdev_disk_io_done(zio_t *zio)
487: {
1.6.14.1 pgoyette 488: #ifdef illumos
489: vdev_t *vd = zio->io_vd;
1.1 haad 490:
1.6.14.1 pgoyette 491: /*
492: * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
493: * the device has been removed. If this is the case, then we trigger an
494: * asynchronous removal of the device. Otherwise, probe the device and
495: * make sure it's still accessible.
496: */
497: if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
498: vdev_disk_t *dvd = vd->vdev_tsd;
499: int state = DKIO_NONE;
500:
501: if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
502: FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
503: /*
504: * We post the resource as soon as possible, instead of
505: * when the async removal actually happens, because the
506: * DE is using this information to discard previous I/O
507: * errors.
508: */
509: zfs_post_remove(zio->io_spa, vd);
510: vd->vdev_remove_wanted = B_TRUE;
511: spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
512: } else if (!vd->vdev_delayed_close) {
513: vd->vdev_delayed_close = B_TRUE;
514: }
515: }
516: #endif
1.1 haad 517: }
518:
519: vdev_ops_t vdev_disk_ops = {
520: vdev_disk_open,
521: vdev_disk_close,
522: vdev_default_asize,
523: vdev_disk_io_start,
524: vdev_disk_io_done,
525: NULL,
1.6.14.1 pgoyette 526: vdev_disk_hold,
527: vdev_disk_rele,
1.1 haad 528: VDEV_TYPE_DISK, /* name of this vdev type */
529: B_TRUE /* leaf vdev */
530: };
531:
532: /*
533: * Given the root disk device devid or pathname, read the label from
534: * the device, and construct a configuration nvlist.
535: */
536: int
537: vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
538: {
1.6.14.1 pgoyette 539: #ifdef __NetBSD__
540: return (ENOTSUP);
541: #else
542: ldi_handle_t vd_lh;
543: vdev_label_t *label;
544: uint64_t s, size;
545: int l;
546: ddi_devid_t tmpdevid;
547: int error = -1;
548: char *minor_name;
549:
550: /*
551: * Read the device label and build the nvlist.
552: */
553: if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
554: &minor_name) == 0) {
555: error = ldi_open_by_devid(tmpdevid, minor_name,
556: FREAD, kcred, &vd_lh, zfs_li);
557: ddi_devid_free(tmpdevid);
558: ddi_devid_str_free(minor_name);
559: }
560:
561: if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
562: zfs_li)))
563: return (error);
564:
565: if (ldi_get_size(vd_lh, &s)) {
566: (void) ldi_close(vd_lh, FREAD, kcred);
567: return (SET_ERROR(EIO));
568: }
569:
570: size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
571: label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
572:
573: *config = NULL;
574: for (l = 0; l < VDEV_LABELS; l++) {
575: uint64_t offset, state, txg = 0;
576:
577: /* read vdev label */
578: offset = vdev_label_offset(size, l, 0);
579: if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
580: VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
581: continue;
582:
583: if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
584: sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
585: *config = NULL;
586: continue;
587: }
588:
589: if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
590: &state) != 0 || state >= POOL_STATE_DESTROYED) {
591: nvlist_free(*config);
592: *config = NULL;
593: continue;
594: }
595:
596: if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
597: &txg) != 0 || txg == 0) {
598: nvlist_free(*config);
599: *config = NULL;
600: continue;
601: }
602:
603: break;
604: }
605:
606: kmem_free(label, sizeof (vdev_label_t));
607: (void) ldi_close(vd_lh, FREAD, kcred);
608: if (*config == NULL)
609: error = SET_ERROR(EIDRM);
1.1 haad 610:
1.6.14.1 pgoyette 611: return (error);
612: #endif
1.1 haad 613: }
CVSweb <webmaster@jp.NetBSD.org>