summaryrefslogtreecommitdiff
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/00-INDEX24
-rw-r--r--Documentation/CodingStyle7
-rw-r--r--Documentation/DMA-API.txt3
-rw-r--r--Documentation/DMA-mapping.txt24
-rw-r--r--Documentation/DocBook/Makefile4
-rw-r--r--Documentation/DocBook/deviceiobook.tmpl12
-rw-r--r--Documentation/DocBook/filesystems.tmpl36
-rw-r--r--Documentation/DocBook/gadget.tmpl6
-rw-r--r--Documentation/DocBook/kernel-api.tmpl31
-rw-r--r--Documentation/DocBook/kernel-hacking.tmpl4
-rw-r--r--Documentation/DocBook/mcabook.tmpl2
-rw-r--r--Documentation/DocBook/mtdnand.tmpl11
-rw-r--r--Documentation/DocBook/s390-drivers.tmpl149
-rw-r--r--Documentation/HOWTO4
-rw-r--r--Documentation/IPMI.txt25
-rw-r--r--Documentation/Intel-IOMMU.txt115
-rw-r--r--Documentation/MSI-HOWTO.txt69
-rw-r--r--Documentation/RCU/00-INDEX22
-rw-r--r--Documentation/SM501.txt5
-rw-r--r--Documentation/SubmitChecklist2
-rw-r--r--Documentation/SubmittingDrivers3
-rw-r--r--Documentation/accounting/cgroupstats.txt27
-rw-r--r--Documentation/accounting/getdelays.c1
-rw-r--r--Documentation/arm/00-INDEX22
-rw-r--r--Documentation/arm/Samsung-S3C24XX/DMA.txt18
-rw-r--r--Documentation/atomic_ops.txt73
-rw-r--r--Documentation/block/00-INDEX20
-rw-r--r--Documentation/block/as-iosched.txt21
-rw-r--r--Documentation/block/biodoc.txt24
-rw-r--r--Documentation/block/deadline-iosched.txt25
-rw-r--r--Documentation/block/ioprio.txt13
-rw-r--r--Documentation/block/request.txt2
-rw-r--r--Documentation/block/switching-sched.txt21
-rw-r--r--Documentation/cachetlb.txt33
-rw-r--r--Documentation/cdrom/cdrom-standard.tex2
-rw-r--r--Documentation/cgroups.txt545
-rw-r--r--Documentation/cpu-hotplug.txt4
-rw-r--r--Documentation/cpusets.txt250
-rw-r--r--Documentation/device-mapper/dm-uevent.txt97
-rw-r--r--Documentation/devices.txt2
-rw-r--r--Documentation/dontdiff8
-rw-r--r--Documentation/driver-model/devres.txt4
-rw-r--r--Documentation/dvb/faq.txt2
-rw-r--r--Documentation/early-userspace/README6
-rw-r--r--Documentation/email-clients.txt217
-rw-r--r--Documentation/fb/00-INDEX46
-rw-r--r--Documentation/fb/deferred_io.txt4
-rw-r--r--Documentation/fb/uvesafb.txt188
-rw-r--r--Documentation/feature-removal-schedule.txt112
-rw-r--r--Documentation/filesystems/00-INDEX10
-rw-r--r--Documentation/filesystems/9p.txt32
-rw-r--r--Documentation/filesystems/Exporting115
-rw-r--r--Documentation/filesystems/Locking11
-rw-r--r--Documentation/filesystems/ext3.txt14
-rw-r--r--Documentation/filesystems/files.txt6
-rw-r--r--Documentation/filesystems/locks.txt (renamed from Documentation/locks.txt)10
-rw-r--r--Documentation/filesystems/mandatory-locking.txt (renamed from Documentation/mandatory.txt)21
-rw-r--r--Documentation/filesystems/ntfs.txt4
-rw-r--r--Documentation/filesystems/proc.txt37
-rw-r--r--Documentation/filesystems/quota.txt59
-rw-r--r--Documentation/filesystems/ramfs-rootfs-initramfs.txt14
-rw-r--r--Documentation/filesystems/sysfs.txt2
-rw-r--r--Documentation/filesystems/vfs.txt53
-rw-r--r--Documentation/firmware_class/firmware_sample_firmware_class.c10
-rw-r--r--Documentation/hwmon/coretemp2
-rw-r--r--Documentation/hwmon/dme173733
-rw-r--r--Documentation/hwmon/f71805f7
-rw-r--r--Documentation/hwmon/it873
-rw-r--r--Documentation/hwmon/lm7810
-rw-r--r--Documentation/hwmon/lm93126
-rw-r--r--Documentation/hwmon/sysfs-interface131
-rw-r--r--Documentation/hwmon/w83791d96
-rw-r--r--Documentation/i2c/busses/i2c-i8013
-rw-r--r--Documentation/i2c/chips/pcf85748
-rw-r--r--Documentation/i2c/dev-interface11
-rw-r--r--Documentation/i2c/i2c-protocol2
-rw-r--r--Documentation/i2c/i2c-stub18
-rw-r--r--Documentation/i386/boot.txt34
-rw-r--r--Documentation/ia64/err_inject.txt6
-rw-r--r--Documentation/ide.txt6
-rw-r--r--Documentation/infiniband/user_mad.txt14
-rw-r--r--Documentation/initrd.txt12
-rw-r--r--Documentation/input/atarikbd.txt8
-rw-r--r--Documentation/input/ff.txt2
-rw-r--r--Documentation/input/iforce-protocol.txt20
-rw-r--r--Documentation/input/input-programming.txt17
-rw-r--r--Documentation/isdn/CREDITS2
-rw-r--r--Documentation/isdn/README.concap2
-rw-r--r--Documentation/ja_JP/HOWTO84
-rw-r--r--Documentation/java.txt2
-rw-r--r--Documentation/kbuild/kconfig-language.txt14
-rw-r--r--Documentation/kbuild/makefiles.txt84
-rw-r--r--Documentation/kdump/kdump.txt89
-rw-r--r--Documentation/kernel-docs.txt4
-rw-r--r--Documentation/kernel-parameters.txt159
-rw-r--r--Documentation/keys-request-key.txt25
-rw-r--r--Documentation/keys.txt93
-rw-r--r--Documentation/kobject.txt22
-rw-r--r--Documentation/lguest/Makefile26
-rw-r--r--Documentation/lguest/lguest.c1721
-rw-r--r--Documentation/lguest/lguest.txt72
-rw-r--r--Documentation/local_ops.txt25
-rw-r--r--Documentation/m68k/kernel-options.txt9
-rw-r--r--Documentation/make/headers_install.txt46
-rw-r--r--Documentation/markers.txt81
-rw-r--r--Documentation/memory-barriers.txt14
-rw-r--r--Documentation/memory-hotplug.txt58
-rw-r--r--Documentation/mips/00-INDEX6
-rw-r--r--Documentation/mips/AU1xxx_IDE.README2
-rw-r--r--Documentation/mips/time.README173
-rw-r--r--Documentation/mutex-design.txt7
-rw-r--r--Documentation/networking/00-INDEX2
-rw-r--r--Documentation/networking/NAPI_HOWTO.txt766
-rw-r--r--Documentation/networking/bcm43xx.txt2
-rw-r--r--Documentation/networking/bonding.txt33
-rw-r--r--Documentation/networking/dccp.txt21
-rw-r--r--Documentation/networking/dgrs.txt52
-rw-r--r--Documentation/networking/ip-sysctl.txt19
-rw-r--r--Documentation/networking/mac80211-injection.txt32
-rw-r--r--Documentation/networking/net-modules.txt315
-rw-r--r--Documentation/networking/netconsole.txt99
-rw-r--r--Documentation/networking/netdevices.txt15
-rw-r--r--Documentation/networking/proc_net_tcp.txt5
-rw-r--r--Documentation/networking/rxrpc.txt9
-rw-r--r--Documentation/networking/tc-actions-env-rules.txt29
-rw-r--r--Documentation/networking/udplite.txt6
-rw-r--r--Documentation/parport-lowlevel.txt29
-rw-r--r--Documentation/power/00-INDEX34
-rw-r--r--Documentation/power/basic-pm-debugging.txt4
-rw-r--r--Documentation/power/drivers-testing.txt4
-rw-r--r--Documentation/power/freezing-of-tasks.txt44
-rw-r--r--Documentation/power/interface.txt2
-rw-r--r--Documentation/power/swsusp-and-swap-files.txt2
-rw-r--r--Documentation/powerpc/00-INDEX4
-rw-r--r--Documentation/powerpc/booting-without-of.txt490
-rw-r--r--Documentation/powerpc/eeh-pci-error-recovery.txt4
-rw-r--r--Documentation/powerpc/mpc52xx-device-tree-bindings.txt16
-rw-r--r--Documentation/ramdisk.txt18
-rw-r--r--Documentation/rfkill.txt89
-rw-r--r--Documentation/s390/00-INDEX26
-rw-r--r--Documentation/s390/CommonIO51
-rw-r--r--Documentation/s390/cds.txt8
-rw-r--r--Documentation/sched-design-CFS.txt67
-rw-r--r--Documentation/scsi/00-INDEX34
-rw-r--r--Documentation/scsi/ChangeLog.arcmsr17
-rw-r--r--Documentation/scsi/ChangeLog.ncr53c8xx6
-rw-r--r--Documentation/scsi/aacraid.txt8
-rw-r--r--Documentation/scsi/advansys.txt243
-rw-r--r--Documentation/scsi/aic79xx.txt2
-rw-r--r--Documentation/scsi/aic7xxx.txt6
-rw-r--r--Documentation/scsi/arcmsr_spec.txt2
-rw-r--r--Documentation/scsi/ibmmca.txt60
-rw-r--r--Documentation/scsi/link_power_management_policy.txt19
-rw-r--r--Documentation/scsi/ncr53c8xx.txt14
-rw-r--r--Documentation/scsi/sym53c8xx_2.txt21
-rw-r--r--Documentation/sharedsubtree.txt1
-rw-r--r--Documentation/sound/alsa/ALSA-Configuration.txt115
-rw-r--r--Documentation/sound/alsa/CMIPCI.txt17
-rw-r--r--Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl184
-rw-r--r--Documentation/sound/alsa/OSS-Emulation.txt7
-rw-r--r--Documentation/sound/alsa/hda_codec.txt49
-rw-r--r--Documentation/sound/alsa/powersave.txt41
-rw-r--r--Documentation/sound/alsa/soc/DAI.txt8
-rw-r--r--Documentation/sound/alsa/soc/clocking.txt10
-rw-r--r--Documentation/sound/alsa/soc/codec.txt6
-rw-r--r--Documentation/sound/alsa/soc/dapm.txt4
-rw-r--r--Documentation/sound/alsa/soc/overview.txt17
-rw-r--r--Documentation/sound/alsa/soc/platform.txt2
-rw-r--r--Documentation/sound/alsa/soc/pops_clicks.txt6
-rw-r--r--Documentation/sound/oss/es137164
-rw-r--r--Documentation/sparc/sbus_drivers.txt4
-rw-r--r--Documentation/spi/pxa2xx2
-rw-r--r--Documentation/spi/spi-summary25
-rw-r--r--Documentation/spi/spidev_test.c6
-rw-r--r--Documentation/sysctl/00-INDEX16
-rw-r--r--Documentation/sysctl/kernel.txt8
-rw-r--r--Documentation/sysctl/vm.txt28
-rw-r--r--Documentation/telephony/00-INDEX4
-rw-r--r--Documentation/thinkpad-acpi.txt29
-rw-r--r--Documentation/usb/authorization.txt92
-rw-r--r--Documentation/usb/power-management.txt517
-rw-r--r--Documentation/usb/usb-serial.txt13
-rw-r--r--Documentation/usb/usbmon.txt9
-rw-r--r--Documentation/video4linux/CARDLIST.bttv1
-rw-r--r--Documentation/video4linux/CARDLIST.cx238855
-rw-r--r--Documentation/video4linux/CARDLIST.saa71345
-rw-r--r--Documentation/vm/00-INDEX20
-rw-r--r--Documentation/vm/numa_memory_policy.txt33
-rw-r--r--Documentation/vm/slabinfo.c27
-rw-r--r--Documentation/w1/00-INDEX8
-rw-r--r--Documentation/w1/masters/00-INDEX6
-rw-r--r--Documentation/w1/masters/ds24822
-rw-r--r--Documentation/w1/masters/ds24902
-rw-r--r--Documentation/watchdog/src/watchdog-simple.c18
-rw-r--r--Documentation/x86_64/mm.txt1
-rw-r--r--Documentation/xterm-linux.xpm61
196 files changed, 6659 insertions, 3675 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 43e89b1537d9..299615d821ac 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -22,6 +22,8 @@ CodingStyle
- how the boss likes the C code in the kernel to look.
DMA-API.txt
- DMA API, pci_ API & extensions for non-consistent memory machines.
+DMA-ISA-LPC.txt
+ - How to do DMA with ISA (and LPC) devices.
DMA-mapping.txt
- info for PCI drivers using DMA portably across all platforms.
DocBook/
@@ -50,6 +52,8 @@ README.cycladesZ
- info on Cyclades-Z firmware loading.
SAK.txt
- info on Secure Attention Keys.
+SM501.txt
+ - Silicon Motion SM501 multimedia companion chip
SecurityBugs
- procedure for reporting security bugs found in the kernel.
SubmitChecklist
@@ -145,7 +149,7 @@ fb/
feature-removal-schedule.txt
- list of files and features that are going to be removed.
filesystems/
- - directory with info on the various filesystems that Linux supports.
+ - info on the vfs and the various filesystems that Linux supports.
firmware_class/
- request_firmware() hotplug interface info.
floppy.txt
@@ -230,8 +234,6 @@ local_ops.txt
- semantics and behavior of local atomic operations.
lockdep-design.txt
- documentation on the runtime locking correctness validator.
-locks.txt
- - info on file locking implementations, flock() vs. fcntl(), etc.
logo.gif
- full colour GIF image of Linux logo (penguin - Tux).
logo.txt
@@ -240,14 +242,14 @@ m68k/
- directory with info about Linux on Motorola 68k architecture.
magic-number.txt
- list of magic numbers used to mark/protect kernel data structures.
-mandatory.txt
- - info on the Linux implementation of Sys V mandatory file locking.
mca.txt
- info on supporting Micro Channel Architecture (e.g. PS/2) systems.
md.txt
- info on boot arguments for the multiple devices driver.
memory-barriers.txt
- info on Linux kernel memory barriers.
+memory-hotplug.txt
+ - Hotpluggable memory support, how to use and current status.
memory.txt
- info on typical Linux memory problems.
mips/
@@ -298,6 +300,8 @@ pm.txt
- info on Linux power management support.
pnp.txt
- Linux Plug and Play documentation.
+power_supply_class.txt
+ - Tells userspace about battery, UPS, AC or DC power supply properties
power/
- directory with info on Linux PCI power management.
powerpc/
@@ -334,8 +338,12 @@ sched-coding.txt
- reference for various scheduler-related methods in the O(1) scheduler.
sched-design.txt
- goals, design and implementation of the Linux O(1) scheduler.
+sched-design-CFS.txt
+ - goals, design and implementation of the Complete Fair Scheduler.
sched-domains.txt
- information on scheduling domains.
+sched-nice-design.txt
+ - How and why the scheduler's nice levels are implemented.
sched-stats.txt
- information on schedstats (Linux Scheduler Statistics).
scsi/
@@ -380,6 +388,8 @@ stallion.txt
- info on using the Stallion multiport serial driver.
svga.txt
- short guide on selecting video modes at boot via VGA BIOS.
+sysfs-rules.txt
+ - How not to use sysfs.
sx.txt
- info on the Specialix SX/SI multiport serial driver.
sysctl/
@@ -410,6 +420,8 @@ video4linux/
- directory with info regarding video/TV/radio cards and linux.
vm/
- directory with info on the Linux vm code.
+volatile-considered-harmful.txt
+ - Why the "volatile" type class should not be used
voyager.txt
- guide to running Linux on the Voyager architecture.
w1/
@@ -418,7 +430,5 @@ watchdog/
- how to auto-reboot Linux if it has "fallen and can't get up". ;-)
x86_64/
- directory with info on Linux support for AMD x86-64 (Hammer) machines.
-xterm-linux.xpm
- - XPM image of penguin logo (see logo.txt) sitting on an xterm.
zorro.txt
- info on writing drivers for Zorro bus devices found on Amigas.
diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle
index 7f1730f1a1ae..6caa14615578 100644
--- a/Documentation/CodingStyle
+++ b/Documentation/CodingStyle
@@ -77,12 +77,15 @@ Get a decent editor and don't leave whitespace at the end of lines.
Coding style is all about readability and maintainability using commonly
available tools.
-The limit on the length of lines is 80 columns and this is a hard limit.
+The limit on the length of lines is 80 columns and this is a strongly
+preferred limit.
Statements longer than 80 columns will be broken into sensible chunks.
Descendants are always substantially shorter than the parent and are placed
substantially to the right. The same applies to function headers with a long
-argument list. Long strings are as well broken into shorter strings.
+argument list. Long strings are as well broken into shorter strings. The
+only exception to this is where exceeding 80 columns significantly increases
+readability and does not hide information.
void fun(int a, int b, int c)
{
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index cc7a8c39fb6f..b939ebb62871 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -68,6 +68,9 @@ size and dma_handle must all be the same as those passed into the
consistent allocate. cpu_addr must be the virtual address returned by
the consistent allocate.
+Note that unlike their sibling allocation calls, these routines
+may only be called with IRQs enabled.
+
Part Ib - Using small dma-coherent buffers
------------------------------------------
diff --git a/Documentation/DMA-mapping.txt b/Documentation/DMA-mapping.txt
index e07f2530326b..d84f89dbf921 100644
--- a/Documentation/DMA-mapping.txt
+++ b/Documentation/DMA-mapping.txt
@@ -189,12 +189,6 @@ smaller mask as pci_set_dma_mask(). However for the rare case that a
device driver only uses consistent allocations, one would have to
check the return value from pci_set_consistent_dma_mask().
-If your 64-bit device is going to be an enormous consumer of DMA
-mappings, this can be problematic since the DMA mappings are a
-finite resource on many platforms. Please see the "DAC Addressing
-for Address Space Hungry Devices" section near the end of this
-document for how to handle this case.
-
Finally, if your device can only drive the low 24-bits of
address during PCI bus mastering you might do something like:
@@ -203,8 +197,6 @@ address during PCI bus mastering you might do something like:
"mydev: 24-bit DMA addressing not available.\n");
goto ignore_this_device;
}
-[Better use DMA_24BIT_MASK instead of 0x00ffffff.
-See linux/include/dma-mapping.h for reference.]
When pci_set_dma_mask() is successful, and returns zero, the PCI layer
saves away this mask you have provided. The PCI layer will use this
@@ -514,7 +506,7 @@ With scatterlists, you map a region gathered from several regions by:
int i, count = pci_map_sg(dev, sglist, nents, direction);
struct scatterlist *sg;
- for (i = 0, sg = sglist; i < count; i++, sg++) {
+ for_each_sg(sglist, sg, count, i) {
hw_address[i] = sg_dma_address(sg);
hw_len[i] = sg_dma_len(sg);
}
@@ -652,18 +644,6 @@ It is planned to completely remove virt_to_bus() and bus_to_virt() as
they are entirely deprecated. Some ports already do not provide these
as it is impossible to correctly support them.
- 64-bit DMA and DAC cycle support
-
-Do you understand all of the text above? Great, then you already
-know how to use 64-bit DMA addressing under Linux. Simply make
-the appropriate pci_set_dma_mask() calls based upon your cards
-capabilities, then use the mapping APIs above.
-
-It is that simple.
-
-Well, not for some odd devices. See the next section for information
-about that.
-
Optimizing Unmap State Space Consumption
On many platforms, pci_unmap_{single,page}() is simply a nop.
@@ -782,5 +762,5 @@ following people:
Jay Estabrook <Jay.Estabrook@compaq.com>
Thomas Sailer <sailer@ife.ee.ethz.ch>
Andrea Arcangeli <andrea@suse.de>
- Jens Axboe <axboe@suse.de>
+ Jens Axboe <jens.axboe@oracle.com>
David Mosberger-Tang <davidm@hpl.hp.com>
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
index 08687e45e19d..054a7ecf64c6 100644
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -11,7 +11,7 @@ DOCBOOKS := wanbook.xml z8530book.xml mcabook.xml videobook.xml \
procfs-guide.xml writing_usb_driver.xml \
kernel-api.xml filesystems.xml lsm.xml usb.xml \
gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \
- genericirq.xml
+ genericirq.xml s390-drivers.xml
###
# The build process is as follows (targets):
@@ -165,7 +165,7 @@ quiet_cmd_db2man = MAN $@
@touch $@
###
-# Rules to generate postscripts and PNG imgages from .fig format files
+# Rules to generate postscripts and PNG images from .fig format files
quiet_cmd_fig2eps = FIG2EPS $@
cmd_fig2eps = fig2dev -Leps $< $@
diff --git a/Documentation/DocBook/deviceiobook.tmpl b/Documentation/DocBook/deviceiobook.tmpl
index c917de681ccd..9ee6f3cbb414 100644
--- a/Documentation/DocBook/deviceiobook.tmpl
+++ b/Documentation/DocBook/deviceiobook.tmpl
@@ -85,7 +85,7 @@
<chapter id="mmio">
<title>Memory Mapped IO</title>
- <sect1>
+ <sect1 id="getting_access_to_the_device">
<title>Getting Access to the Device</title>
<para>
The most widely supported form of IO is memory mapped IO.
@@ -114,7 +114,7 @@
</para>
</sect1>
- <sect1>
+ <sect1 id="accessing_the_device">
<title>Accessing the device</title>
<para>
The part of the interface most used by drivers is reading and
@@ -272,9 +272,9 @@ CPU B: spin_unlock_irqrestore(&amp;dev_lock, flags)
</chapter>
- <chapter>
+ <chapter id="port_space_accesses">
<title>Port Space Accesses</title>
- <sect1>
+ <sect1 id="port_space_explained">
<title>Port Space Explained</title>
<para>
@@ -291,7 +291,7 @@ CPU B: spin_unlock_irqrestore(&amp;dev_lock, flags)
</para>
</sect1>
- <sect1>
+ <sect1 id="accessing_port_space">
<title>Accessing Port Space</title>
<para>
Accesses to this space are provided through a set of functions
@@ -316,7 +316,7 @@ CPU B: spin_unlock_irqrestore(&amp;dev_lock, flags)
<chapter id="pubfunctions">
<title>Public Functions Provided</title>
-!Iinclude/asm-i386/io.h
+!Iinclude/asm-x86/io_32.h
!Elib/iomap.c
</chapter>
diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl
index 39fa2aba7f9b..5eaef87e8f1b 100644
--- a/Documentation/DocBook/filesystems.tmpl
+++ b/Documentation/DocBook/filesystems.tmpl
@@ -40,25 +40,25 @@
<chapter id="vfs">
<title>The Linux VFS</title>
- <sect1><title>The Filesystem types</title>
+ <sect1 id="the_filesystem_types"><title>The Filesystem types</title>
!Iinclude/linux/fs.h
</sect1>
- <sect1><title>The Directory Cache</title>
+ <sect1 id="the_directory_cache"><title>The Directory Cache</title>
!Efs/dcache.c
!Iinclude/linux/dcache.h
</sect1>
- <sect1><title>Inode Handling</title>
+ <sect1 id="inode_handling"><title>Inode Handling</title>
!Efs/inode.c
!Efs/bad_inode.c
</sect1>
- <sect1><title>Registration and Superblocks</title>
+ <sect1 id="registration_and_superblocks"><title>Registration and Superblocks</title>
!Efs/super.c
</sect1>
- <sect1><title>File Locks</title>
+ <sect1 id="file_locks"><title>File Locks</title>
!Efs/locks.c
!Ifs/locks.c
</sect1>
- <sect1><title>Other Functions</title>
+ <sect1 id="other_functions"><title>Other Functions</title>
!Efs/mpage.c
!Efs/namei.c
!Efs/buffer.c
@@ -73,11 +73,11 @@
<chapter id="proc">
<title>The proc filesystem</title>
- <sect1><title>sysctl interface</title>
+ <sect1 id="sysctl_interface"><title>sysctl interface</title>
!Ekernel/sysctl.c
</sect1>
- <sect1><title>proc filesystem interface</title>
+ <sect1 id="proc_filesystem_interface"><title>proc filesystem interface</title>
!Ifs/proc/base.c
</sect1>
</chapter>
@@ -92,7 +92,7 @@
<chapter id="debugfs">
<title>The debugfs filesystem</title>
- <sect1><title>debugfs interface</title>
+ <sect1 id="debugfs_interface"><title>debugfs interface</title>
!Efs/debugfs/inode.c
!Efs/debugfs/file.c
</sect1>
@@ -134,9 +134,9 @@
<title>The Linux Journalling API</title>
- <sect1>
+ <sect1 id="journaling_overview">
<title>Overview</title>
- <sect2>
+ <sect2 id="journaling_details">
<title>Details</title>
<para>
The journalling layer is easy to use. You need to
@@ -307,7 +307,7 @@ particular inode.
</sect2>
- <sect2>
+ <sect2 id="jbd_summary">
<title>Summary</title>
<para>
Using the journal is a matter of wrapping the different context changes,
@@ -349,7 +349,7 @@ an example.
</sect1>
- <sect1>
+ <sect1 id="data_types">
<title>Data Types</title>
<para>
The journalling layer uses typedefs to 'hide' the concrete definitions
@@ -358,27 +358,27 @@ an example.
Obviously the hiding is not enforced as this is 'C'.
</para>
- <sect2><title>Structures</title>
+ <sect2 id="structures"><title>Structures</title>
!Iinclude/linux/jbd.h
</sect2>
</sect1>
- <sect1>
+ <sect1 id="functions">
<title>Functions</title>
<para>
The functions here are split into two groups those that
affect a journal as a whole, and those which are used to
manage transactions
</para>
- <sect2><title>Journal Level</title>
+ <sect2 id="journal_level"><title>Journal Level</title>
!Efs/jbd/journal.c
!Ifs/jbd/recovery.c
</sect2>
- <sect2><title>Transasction Level</title>
+ <sect2 id="transaction_level"><title>Transasction Level</title>
!Efs/jbd/transaction.c
</sect2>
</sect1>
- <sect1>
+ <sect1 id="see_also">
<title>See also</title>
<para>
<citation>
diff --git a/Documentation/DocBook/gadget.tmpl b/Documentation/DocBook/gadget.tmpl
index 6996d977bf8f..5a8ffa761e09 100644
--- a/Documentation/DocBook/gadget.tmpl
+++ b/Documentation/DocBook/gadget.tmpl
@@ -144,7 +144,7 @@ with the lowest level (which directly handles hardware).
<para>This is the lowest software level.
It is the only layer that talks to hardware,
through registers, fifos, dma, irqs, and the like.
- The <filename>&lt;linux/usb_gadget.h&gt;</filename> API abstracts
+ The <filename>&lt;linux/usb/gadget.h&gt;</filename> API abstracts
the peripheral controller endpoint hardware.
That hardware is exposed through endpoint objects, which accept
streams of IN/OUT buffers, and through callbacks that interact
@@ -494,7 +494,7 @@ side drivers (and usbcore).
<sect1 id="core"><title>Core Objects and Methods</title>
<para>These are declared in
-<filename>&lt;linux/usb_gadget.h&gt;</filename>,
+<filename>&lt;linux/usb/gadget.h&gt;</filename>,
and are used by gadget drivers to interact with
USB peripheral controller drivers.
</para>
@@ -509,7 +509,7 @@ USB peripheral controller drivers.
unless the explanations are trivial.
-->
-!Iinclude/linux/usb_gadget.h
+!Iinclude/linux/usb/gadget.h
</sect1>
<sect1 id="utils"><title>Optional Utilities</title>
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index b886f52a9aac..aa38cc5692a0 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -45,8 +45,8 @@
</sect1>
<sect1><title>Atomic and pointer manipulation</title>
-!Iinclude/asm-i386/atomic.h
-!Iinclude/asm-i386/unaligned.h
+!Iinclude/asm-x86/atomic_32.h
+!Iinclude/asm-x86/unaligned.h
</sect1>
<sect1><title>Delaying, scheduling, and timer routines</title>
@@ -119,7 +119,7 @@ X!Ilib/string.c
!Elib/string.c
</sect1>
<sect1><title>Bit Operations</title>
-!Iinclude/asm-i386/bitops.h
+!Iinclude/asm-x86/bitops_32.h
</sect1>
</chapter>
@@ -155,8 +155,8 @@ X!Ilib/string.c
!Emm/slab.c
</sect1>
<sect1><title>User Space Memory Access</title>
-!Iinclude/asm-i386/uaccess.h
-!Earch/i386/lib/usercopy.c
+!Iinclude/asm-x86/uaccess_32.h
+!Earch/x86/lib/usercopy_32.c
</sect1>
<sect1><title>More Memory Management Functions</title>
!Emm/readahead.c
@@ -240,17 +240,23 @@ X!Ilib/string.c
<sect1><title>Driver Support</title>
!Enet/core/dev.c
!Enet/ethernet/eth.c
+!Enet/sched/sch_generic.c
!Iinclude/linux/etherdevice.h
+!Iinclude/linux/netdevice.h
+ </sect1>
+ <sect1><title>PHY Support</title>
!Edrivers/net/phy/phy.c
!Idrivers/net/phy/phy.c
!Edrivers/net/phy/phy_device.c
!Idrivers/net/phy/phy_device.c
!Edrivers/net/phy/mdio_bus.c
!Idrivers/net/phy/mdio_bus.c
+ </sect1>
<!-- FIXME: Removed for now since no structured comments in source
+ <sect1><title>Wireless</title>
X!Enet/core/wireless.c
--->
</sect1>
+-->
<sect1><title>Synchronous PPP</title>
!Edrivers/net/wan/syncppp.c
</sect1>
@@ -287,7 +293,7 @@ X!Ekernel/module.c
</sect1>
<sect1><title>MTRR Handling</title>
-!Earch/i386/kernel/cpu/mtrr/main.c
+!Earch/x86/kernel/cpu/mtrr/main.c
</sect1>
<sect1><title>PCI Support Library</title>
@@ -310,14 +316,14 @@ X!Edrivers/pci/hotplug.c
<sect1><title>MCA Architecture</title>
<sect2><title>MCA Device Functions</title>
<para>
- Refer to the file arch/i386/kernel/mca.c for more information.
+ Refer to the file arch/x86/kernel/mca_32.c for more information.
</para>
<!-- FIXME: Removed for now since no structured comments in source
-X!Earch/i386/kernel/mca.c
+X!Earch/x86/kernel/mca_32.c
-->
</sect2>
<sect2><title>MCA Bus DMA</title>
-!Iinclude/asm-i386/mca_dma.h
+!Iinclude/asm-x86/mca_dma.h
</sect2>
</sect1>
</chapter>
@@ -334,7 +340,7 @@ X!Earch/i386/kernel/mca.c
<chapter id="security">
<title>Security Framework</title>
-!Esecurity/security.c
+!Isecurity/security.c
</chapter>
<chapter id="audit">
@@ -380,8 +386,7 @@ X!Edrivers/base/interface.c
!Edrivers/base/bus.c
</sect1>
<sect1><title>Device Drivers Power Management</title>
-!Edrivers/base/power/resume.c
-!Edrivers/base/power/suspend.c
+!Edrivers/base/power/main.c
</sect1>
<sect1><title>Device Drivers ACPI Support</title>
<!-- Internal functions only
diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl
index 582032eea872..4c63e5864160 100644
--- a/Documentation/DocBook/kernel-hacking.tmpl
+++ b/Documentation/DocBook/kernel-hacking.tmpl
@@ -1239,7 +1239,7 @@ static struct block_device_operations opt_fops = {
</para>
<para>
- <filename>include/asm-i386/delay.h:</filename>
+ <filename>include/asm-x86/delay_32.h:</filename>
</para>
<programlisting>
#define ndelay(n) (__builtin_constant_p(n) ? \
@@ -1265,7 +1265,7 @@ static struct block_device_operations opt_fops = {
</programlisting>
<para>
- <filename>include/asm-i386/uaccess.h:</filename>
+ <filename>include/asm-x86/uaccess_32.h:</filename>
</para>
<programlisting>
diff --git a/Documentation/DocBook/mcabook.tmpl b/Documentation/DocBook/mcabook.tmpl
index 42a760cd7467..529a53dc1389 100644
--- a/Documentation/DocBook/mcabook.tmpl
+++ b/Documentation/DocBook/mcabook.tmpl
@@ -101,7 +101,7 @@
<chapter id="dmafunctions">
<title>DMA Functions Provided</title>
-!Iinclude/asm-i386/mca_dma.h
+!Iinclude/asm-x86/mca_dma.h
</chapter>
</book>
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl
index a8c8cce50633..957cf5c26831 100644
--- a/Documentation/DocBook/mtdnand.tmpl
+++ b/Documentation/DocBook/mtdnand.tmpl
@@ -275,17 +275,14 @@ int __init board_init (void)
int err = 0;
/* Allocate memory for MTD device structure and private data */
- board_mtd = kmalloc (sizeof(struct mtd_info) + sizeof (struct nand_chip), GFP_KERNEL);
+ board_mtd = kzalloc(sizeof(struct mtd_info) + sizeof(struct nand_chip), GFP_KERNEL);
if (!board_mtd) {
printk ("Unable to allocate NAND MTD device structure.\n");
err = -ENOMEM;
goto out;
}
- /* Initialize structures */
- memset ((char *) board_mtd, 0, sizeof(struct mtd_info) + sizeof(struct nand_chip));
-
- /* map physical adress */
+ /* map physical address */
baseaddr = (unsigned long)ioremap(CHIP_PHYSICAL_ADDRESS, 1024);
if(!baseaddr){
printk("Ioremap to access NAND chip failed\n");
@@ -309,7 +306,7 @@ int __init board_init (void)
this->dev_ready = board_dev_ready;
this->eccmode = NAND_ECC_SOFT;
- /* Scan to find existance of the device */
+ /* Scan to find existence of the device */
if (nand_scan (board_mtd, 1)) {
err = -ENXIO;
goto out_ior;
@@ -343,7 +340,7 @@ static void __exit board_cleanup (void)
/* Release resources, unregister device */
nand_release (board_mtd);
- /* unmap physical adress */
+ /* unmap physical address */
iounmap((void *)baseaddr);
/* Free the MTD device structure */
diff --git a/Documentation/DocBook/s390-drivers.tmpl b/Documentation/DocBook/s390-drivers.tmpl
new file mode 100644
index 000000000000..254e769282a4
--- /dev/null
+++ b/Documentation/DocBook/s390-drivers.tmpl
@@ -0,0 +1,149 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="s390drivers">
+ <bookinfo>
+ <title>Writing s390 channel device drivers</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Cornelia</firstname>
+ <surname>Huck</surname>
+ <affiliation>
+ <address>
+ <email>cornelia.huck@de.ibm.com</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2007</year>
+ <holder>IBM Corp.</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ This document describes the interfaces available for device drivers that
+ drive s390 based channel attached devices. This includes interfaces for
+ interaction with the hardware and interfaces for interacting with the
+ common driver core. Those interfaces are provided by the s390 common I/O
+ layer.
+ </para>
+ <para>
+ The document assumes a familarity with the technical terms associated
+ with the s390 channel I/O architecture. For a description of this
+ architecture, please refer to the "z/Architecture: Principles of
+ Operation", IBM publication no. SA22-7832.
+ </para>
+ <para>
+ While most I/O devices on a s390 system are typically driven through the
+ channel I/O mechanism described here, there are various other methods
+ (like the diag interface). These are out of the scope of this document.
+ </para>
+ <para>
+ Some additional information can also be found in the kernel source
+ under Documentation/s390/driver-model.txt.
+ </para>
+ </chapter>
+ <chapter id="ccw">
+ <title>The ccw bus</title>
+ <para>
+ The ccw bus typically contains the majority of devices available to
+ a s390 system. Named after the channel command word (ccw), the basic
+ command structure used to address its devices, the ccw bus contains
+ so-called channel attached devices. They are addressed via subchannels,
+ visible on the css bus. A device driver, however, will never interact
+ with the subchannel directly, but only via the device on the ccw bus,
+ the ccw device.
+ </para>
+ <sect1 id="channelIO">
+ <title>I/O functions for channel-attached devices</title>
+ <para>
+ Some hardware structures have been translated into C structures for use
+ by the common I/O layer and device drivers. For more information on
+ the hardware structures represented here, please consult the Principles
+ of Operation.
+ </para>
+!Iinclude/asm-s390/cio.h
+ </sect1>
+ <sect1 id="ccwdev">
+ <title>ccw devices</title>
+ <para>
+ Devices that want to initiate channel I/O need to attach to the ccw bus.
+ Interaction with the driver core is done via the common I/O layer, which
+ provides the abstractions of ccw devices and ccw device drivers.
+ </para>
+ <para>
+ The functions that initiate or terminate channel I/O all act upon a
+ ccw device structure. Device drivers must not bypass those functions
+ or strange side effects may happen.
+ </para>
+!Iinclude/asm-s390/ccwdev.h
+!Edrivers/s390/cio/device.c
+!Edrivers/s390/cio/device_ops.c
+ </sect1>
+ <sect1 id="cmf">
+ <title>The channel-measurement facility</title>
+ <para>
+ The channel-measurement facility provides a means to collect
+ measurement data which is made available by the channel subsystem
+ for each channel attached device.
+ </para>
+!Iinclude/asm-s390/cmb.h
+!Edrivers/s390/cio/cmf.c
+ </sect1>
+ </chapter>
+
+ <chapter id="ccwgroup">
+ <title>The ccwgroup bus</title>
+ <para>
+ The ccwgroup bus only contains artificial devices, created by the user.
+ Many networking devices (e.g. qeth) are in fact composed of several
+ ccw devices (like read, write and data channel for qeth). The
+ ccwgroup bus provides a mechanism to create a meta-device which
+ contains those ccw devices as slave devices and can be associated
+ with the netdevice.
+ </para>
+ <sect1 id="ccwgroupdevices">
+ <title>ccw group devices</title>
+!Iinclude/asm-s390/ccwgroup.h
+!Edrivers/s390/cio/ccwgroup.c
+ </sect1>
+ </chapter>
+
+</book>
diff --git a/Documentation/HOWTO b/Documentation/HOWTO
index c64e969dc33b..54835610b3d6 100644
--- a/Documentation/HOWTO
+++ b/Documentation/HOWTO
@@ -77,7 +77,7 @@ documentation files are also added which explain how to use the feature.
When a kernel change causes the interface that the kernel exposes to
userspace to change, it is recommended that you send the information or
a patch to the manual pages explaining the change to the manual pages
-maintainer at mtk-manpages@gmx.net.
+maintainer at mtk.manpages@gmail.com.
Here is a list of files that are in the kernel source tree that are
required reading:
@@ -330,7 +330,7 @@ Here is a list of some of the different kernel trees available:
- ACPI development tree, Len Brown <len.brown@intel.com>
git.kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git
- - Block development tree, Jens Axboe <axboe@suse.de>
+ - Block development tree, Jens Axboe <jens.axboe@oracle.com>
git.kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git
- DRM development tree, Dave Airlie <airlied@linux.ie>
diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt
index 24dc3fcf1594..bc38283379f0 100644
--- a/Documentation/IPMI.txt
+++ b/Documentation/IPMI.txt
@@ -441,17 +441,20 @@ ACPI, and if none of those then a KCS device at the spec-specified
0xca2. If you want to turn this off, set the "trydefaults" option to
false.
-If you have high-res timers compiled into the kernel, the driver will
-use them to provide much better performance. Note that if you do not
-have high-res timers enabled in the kernel and you don't have
-interrupts enabled, the driver will run VERY slowly. Don't blame me,
+If your IPMI interface does not support interrupts and is a KCS or
+SMIC interface, the IPMI driver will start a kernel thread for the
+interface to help speed things up. This is a low-priority kernel
+thread that constantly polls the IPMI driver while an IPMI operation
+is in progress. The force_kipmid module parameter will all the user to
+force this thread on or off. If you force it off and don't have
+interrupts, the driver will run VERY slowly. Don't blame me,
these interfaces suck.
The driver supports a hot add and remove of interfaces. This way,
interfaces can be added or removed after the kernel is up and running.
-This is done using /sys/modules/ipmi_si/hotmod, which is a write-only
-parameter. You write a string to this interface. The string has the
-format:
+This is done using /sys/modules/ipmi_si/parameters/hotmod, which is a
+write-only parameter. You write a string to this interface. The string
+has the format:
<op1>[:op2[:op3...]]
The "op"s are:
add|remove,kcs|bt|smic,mem|i/o,<address>[,<opt1>[,<opt2>[,...]]]
@@ -581,9 +584,11 @@ The watchdog will panic and start a 120 second reset timeout if it
gets a pre-action. During a panic or a reboot, the watchdog will
start a 120 timer if it is running to make sure the reboot occurs.
-Note that if you use the NMI preaction for the watchdog, you MUST
-NOT use nmi watchdog mode 1. If you use the NMI watchdog, you
-must use mode 2.
+Note that if you use the NMI preaction for the watchdog, you MUST NOT
+use the nmi watchdog. There is no reasonable way to tell if an NMI
+comes from the IPMI controller, so it must assume that if it gets an
+otherwise unhandled NMI, it must be from IPMI and it will panic
+immediately.
Once you open the watchdog timer, you must write a 'V' character to the
device to close it, or the timer will not stop. This is a new semantic
diff --git a/Documentation/Intel-IOMMU.txt b/Documentation/Intel-IOMMU.txt
new file mode 100644
index 000000000000..c2321903aa09
--- /dev/null
+++ b/Documentation/Intel-IOMMU.txt
@@ -0,0 +1,115 @@
+Linux IOMMU Support
+===================
+
+The architecture spec can be obtained from the below location.
+
+http://www.intel.com/technology/virtualization/
+
+This guide gives a quick cheat sheet for some basic understanding.
+
+Some Keywords
+
+DMAR - DMA remapping
+DRHD - DMA Engine Reporting Structure
+RMRR - Reserved memory Region Reporting Structure
+ZLR - Zero length reads from PCI devices
+IOVA - IO Virtual address.
+
+Basic stuff
+-----------
+
+ACPI enumerates and lists the different DMA engines in the platform, and
+device scope relationships between PCI devices and which DMA engine controls
+them.
+
+What is RMRR?
+-------------
+
+There are some devices the BIOS controls, for e.g USB devices to perform
+PS2 emulation. The regions of memory used for these devices are marked
+reserved in the e820 map. When we turn on DMA translation, DMA to those
+regions will fail. Hence BIOS uses RMRR to specify these regions along with
+devices that need to access these regions. OS is expected to setup
+unity mappings for these regions for these devices to access these regions.
+
+How is IOVA generated?
+---------------------
+
+Well behaved drivers call pci_map_*() calls before sending command to device
+that needs to perform DMA. Once DMA is completed and mapping is no longer
+required, device performs a pci_unmap_*() calls to unmap the region.
+
+The Intel IOMMU driver allocates a virtual address per domain. Each PCIE
+device has its own domain (hence protection). Devices under p2p bridges
+share the virtual address with all devices under the p2p bridge due to
+transaction id aliasing for p2p bridges.
+
+IOVA generation is pretty generic. We used the same technique as vmalloc()
+but these are not global address spaces, but separate for each domain.
+Different DMA engines may support different number of domains.
+
+We also allocate gaurd pages with each mapping, so we can attempt to catch
+any overflow that might happen.
+
+
+Graphics Problems?
+------------------
+If you encounter issues with graphics devices, you can try adding
+option intel_iommu=igfx_off to turn off the integrated graphics engine.
+
+If it happens to be a PCI device included in the INCLUDE_ALL Engine,
+then try enabling CONFIG_DMAR_GFX_WA to setup a 1-1 map. We hear
+graphics drivers may be in process of using DMA api's in the near
+future and at that time this option can be yanked out.
+
+Some exceptions to IOVA
+-----------------------
+Interrupt ranges are not address translated, (0xfee00000 - 0xfeefffff).
+The same is true for peer to peer transactions. Hence we reserve the
+address from PCI MMIO ranges so they are not allocated for IOVA addresses.
+
+
+Fault reporting
+---------------
+When errors are reported, the DMA engine signals via an interrupt. The fault
+reason and device that caused it with fault reason is printed on console.
+
+See below for sample.
+
+
+Boot Message Sample
+-------------------
+
+Something like this gets printed indicating presence of DMAR tables
+in ACPI.
+
+ACPI: DMAR (v001 A M I OEMDMAR 0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0
+
+When DMAR is being processed and initialized by ACPI, prints DMAR locations
+and any RMRR's processed.
+
+ACPI DMAR:Host address width 36
+ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000
+ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000
+ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000
+ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff
+ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff
+
+When DMAR is enabled for use, you will notice..
+
+PCI-DMA: Using DMAR IOMMU
+
+Fault reporting
+---------------
+
+DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
+DMAR:[fault reason 05] PTE Write access is not set
+DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
+DMAR:[fault reason 05] PTE Write access is not set
+
+TBD
+----
+
+- For compatibility testing, could use unity map domain for all devices, just
+ provide a 1-1 for all useful memory under a single domain for all devices.
+- API for paravirt ops for abstracting functionlity for VMM folks.
diff --git a/Documentation/MSI-HOWTO.txt b/Documentation/MSI-HOWTO.txt
index 0d8240774fca..a51f693c1541 100644
--- a/Documentation/MSI-HOWTO.txt
+++ b/Documentation/MSI-HOWTO.txt
@@ -241,68 +241,7 @@ address space of the MSI-X table/MSI-X PBA. Otherwise, the PCI subsystem
will fail enabling MSI-X on its hardware device when it calls the function
pci_enable_msix().
-5.3.2 Handling MSI-X allocation
-
-Determining the number of MSI-X vectors allocated to a function is
-dependent on the number of MSI capable devices and MSI-X capable
-devices populated in the system. The policy of allocating MSI-X
-vectors to a function is defined as the following:
-
-#of MSI-X vectors allocated to a function = (x - y)/z where
-
-x = The number of available PCI vector resources by the time
- the device driver calls pci_enable_msix(). The PCI vector
- resources is the sum of the number of unassigned vectors
- (new) and the number of released vectors when any MSI/MSI-X
- device driver switches its hardware device back to a legacy
- mode or is hot-removed. The number of unassigned vectors
- may exclude some vectors reserved, as defined in parameter
- NR_HP_RESERVED_VECTORS, for the case where the system is
- capable of supporting hot-add/hot-remove operations. Users
- may change the value defined in NR_HR_RESERVED_VECTORS to
- meet their specific needs.
-
-y = The number of MSI capable devices populated in the system.
- This policy ensures that each MSI capable device has its
- vector reserved to avoid the case where some MSI-X capable
- drivers may attempt to claim all available vector resources.
-
-z = The number of MSI-X capable devices populated in the system.
- This policy ensures that maximum (x - y) is distributed
- evenly among MSI-X capable devices.
-
-Note that the PCI subsystem scans y and z during a bus enumeration.
-When the PCI subsystem completes configuring MSI/MSI-X capability
-structure of a device as requested by its device driver, y/z is
-decremented accordingly.
-
-5.3.3 Handling MSI-X shortages
-
-For the case where fewer MSI-X vectors are allocated to a function
-than requested, the function pci_enable_msix() will return the
-maximum number of MSI-X vectors available to the caller. A device
-driver may re-send its request with fewer or equal vectors indicated
-in the return. For example, if a device driver requests 5 vectors, but
-the number of available vectors is 3 vectors, a value of 3 will be
-returned as a result of pci_enable_msix() call. A function could be
-designed for its driver to use only 3 MSI-X table entries as
-different combinations as ABC--, A-B-C, A--CB, etc. Note that this
-patch does not support multiple entries with the same vector. Such
-attempt by a device driver to use 5 MSI-X table entries with 3 vectors
-as ABBCC, AABCC, BCCBA, etc will result as a failure by the function
-pci_enable_msix(). Below are the reasons why supporting multiple
-entries with the same vector is an undesirable solution.
-
- - The PCI subsystem cannot determine the entry that
- generated the message to mask/unmask MSI while handling
- software driver ISR. Attempting to walk through all MSI-X
- table entries (2048 max) to mask/unmask any match vector
- is an undesirable solution.
-
- - Walking through all MSI-X table entries (2048 max) to handle
- SMP affinity of any match vector is an undesirable solution.
-
-5.3.4 API pci_enable_msix
+5.3.2 API pci_enable_msix
int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
@@ -339,7 +278,7 @@ a failure. This failure may be a result of duplicate entries
specified in second argument, or a result of no available vector,
or a result of failing to initialize MSI-X table entries.
-5.3.5 API pci_disable_msix
+5.3.3 API pci_disable_msix
void pci_disable_msix(struct pci_dev *dev)
@@ -349,7 +288,7 @@ always call free_irq() on all MSI-X vectors it has done request_irq()
on before calling this API. Failure to do so results in a BUG_ON() and
a device will be left with MSI-X enabled and leaks its vectors.
-5.3.6 MSI-X mode vs. legacy mode diagram
+5.3.4 MSI-X mode vs. legacy mode diagram
The below diagram shows the events which switch the interrupt
mode on the MSI-X capable device function between MSI-X mode and
@@ -407,7 +346,7 @@ between MSI mod MSI-X mode during a run-time.
MSI/MSI-X support requires support from both system hardware and
individual hardware device functions.
-5.5.1 System hardware support
+5.5.1 Required x86 hardware support
Since the target of MSI address is the local APIC CPU, enabling
MSI/MSI-X support in the Linux kernel is dependent on whether existing
diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX
new file mode 100644
index 000000000000..461481dfb7c3
--- /dev/null
+++ b/Documentation/RCU/00-INDEX
@@ -0,0 +1,22 @@
+00-INDEX
+ - This file
+arrayRCU.txt
+ - Using RCU to Protect Read-Mostly Arrays
+checklist.txt
+ - Review Checklist for RCU Patches
+listRCU.txt
+ - Using RCU to Protect Read-Mostly Linked Lists
+NMI-RCU.txt
+ - Using RCU to Protect Dynamic NMI Handlers
+rcuref.txt
+ - Reference-count design for elements of lists/arrays protected by RCU
+rcu.txt
+ - RCU Concepts
+RTFP.txt
+ - List of RCU papers (bibliography) going back to 1980.
+torture.txt
+ - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
+UP.txt
+ - RCU on Uniprocessor Systems
+whatisRCU.txt
+ - What is RCU?
diff --git a/Documentation/SM501.txt b/Documentation/SM501.txt
index 3a1bd95d3767..6fc656035925 100644
--- a/Documentation/SM501.txt
+++ b/Documentation/SM501.txt
@@ -3,6 +3,11 @@
Copyright 2006, 2007 Simtec Electronics
+The Silicon Motion SM501 multimedia companion chip is a multifunction device
+which may provide numerous interfaces including USB host controller USB gadget,
+Asyncronous Serial ports, Audio functions and a dual display video interface.
+The device may be connected by PCI or local bus with varying functions enabled.
+
Core
----
diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist
index 19e7f65c269f..34e06d2f194f 100644
--- a/Documentation/SubmitChecklist
+++ b/Documentation/SubmitChecklist
@@ -67,7 +67,7 @@ kernel patches.
20: Check that it all passes `make headers_check'.
21: Has been checked with injection of at least slab and page-allocation
- fauilures. See Documentation/fault-injection/.
+ failures. See Documentation/fault-injection/.
If the new code is substantial, addition of subsystem-specific fault
injection might be appropriate.
diff --git a/Documentation/SubmittingDrivers b/Documentation/SubmittingDrivers
index d7e26427e426..24f2eb40cae5 100644
--- a/Documentation/SubmittingDrivers
+++ b/Documentation/SubmittingDrivers
@@ -36,8 +36,7 @@ Linux 2.4:
If the code area has a general maintainer then please submit it to
the maintainer listed in MAINTAINERS in the kernel file. If the
maintainer does not respond or you cannot find the appropriate
- maintainer then please contact Marcelo Tosatti
- <marcelo.tosatti@cyclades.com>.
+ maintainer then please contact Willy Tarreau <w@1wt.eu>.
Linux 2.6:
The same rules apply as 2.4 except that you should follow linux-kernel
diff --git a/Documentation/accounting/cgroupstats.txt b/Documentation/accounting/cgroupstats.txt
new file mode 100644
index 000000000000..eda40fd39cad
--- /dev/null
+++ b/Documentation/accounting/cgroupstats.txt
@@ -0,0 +1,27 @@
+Control Groupstats is inspired by the discussion at
+http://lkml.org/lkml/2007/4/11/187 and implements per cgroup statistics as
+suggested by Andrew Morton in http://lkml.org/lkml/2007/4/11/263.
+
+Per cgroup statistics infrastructure re-uses code from the taskstats
+interface. A new set of cgroup operations are registered with commands
+and attributes specific to cgroups. It should be very easy to
+extend per cgroup statistics, by adding members to the cgroupstats
+structure.
+
+The current model for cgroupstats is a pull, a push model (to post
+statistics on interesting events), should be very easy to add. Currently
+user space requests for statistics by passing the cgroup path.
+Statistics about the state of all the tasks in the cgroup is returned to
+user space.
+
+NOTE: We currently rely on delay accounting for extracting information
+about tasks blocked on I/O. If CONFIG_TASK_DELAY_ACCT is disabled, this
+information will not be available.
+
+To extract cgroup statistics a utility very similar to getdelays.c
+has been developed, the sample output of the utility is shown below
+
+~/balbir/cgroupstats # ./getdelays -C "/cgroup/a"
+sleeping 1, blocked 0, running 1, stopped 0, uninterruptible 0
+~/balbir/cgroupstats # ./getdelays -C "/cgroup"
+sleeping 155, blocked 0, running 1, stopped 0, uninterruptible 2
diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c
index cbee3a27f768..ab82b7f53312 100644
--- a/Documentation/accounting/getdelays.c
+++ b/Documentation/accounting/getdelays.c
@@ -21,7 +21,6 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
-#include <sys/types.h>
#include <signal.h>
#include <linux/genetlink.h>
diff --git a/Documentation/arm/00-INDEX b/Documentation/arm/00-INDEX
index 2c6a3b38967e..82e418d648d0 100644
--- a/Documentation/arm/00-INDEX
+++ b/Documentation/arm/00-INDEX
@@ -4,19 +4,29 @@ Booting
- requirements for booting
Interrupts
- ARM Interrupt subsystem documentation
+IXP2000
+ - Release Notes for Linux on Intel's IXP2000 Network Processor
Netwinder
- Netwinder specific documentation
+Porting
+ - Symbol definitions for porting Linux to a new ARM machine.
+Setup
+ - Kernel initialization parameters on ARM Linux
README
- General ARM documentation
-SA1100
+SA1100/
- SA1100 documentation
-XScale
- - XScale documentation
-empeg
- - Empeg documentation
+Samsung-S3C24XX
+ - S3C24XX ARM Linux Overview
+Sharp-LH
+ - Linux on Sharp LH79524 and LH7A40X System On a Chip (SOC)
+VFP/
+ - Release notes for Linux Kernel Vector Floating Point support code
+empeg/
+ - Ltd's Empeg MP3 Car Audio Player
mem_alignment
- alignment abort handler documentation
memory.txt
- description of the virtual memory layout
-nwfpe
+nwfpe/
- NWFPE floating point emulator documentation
diff --git a/Documentation/arm/Samsung-S3C24XX/DMA.txt b/Documentation/arm/Samsung-S3C24XX/DMA.txt
index 37f4edcc5d87..3ed82383efea 100644
--- a/Documentation/arm/Samsung-S3C24XX/DMA.txt
+++ b/Documentation/arm/Samsung-S3C24XX/DMA.txt
@@ -5,7 +5,7 @@ Introduction
------------
The kernel provides an interface to manage DMA transfers
- using the DMA channels in the cpu, so that the central
+ using the DMA channels in the CPU, so that the central
duty of managing channel mappings, and programming the
channel generators is in one place.
@@ -17,24 +17,24 @@ DMA Channel Ordering
channels to all sources, which means that some devices
have a restricted number of channels that can be used.
- To allow flexibilty for each cpu type and board, the
- dma code can be given an dma ordering structure which
+ To allow flexibility for each CPU type and board, the
+ DMA code can be given a DMA ordering structure which
allows the order of channel search to be specified, as
well as allowing the prohibition of certain claims.
struct s3c24xx_dma_order has a list of channels, and
- each channel within has a slot for a list of dma
- channel numbers. The slots are searched in order, for
- the presence of a dma channel number with DMA_CH_VALID
- orred in.
+ each channel within has a slot for a list of DMA
+ channel numbers. The slots are searched in order for
+ the presence of a DMA channel number with DMA_CH_VALID
+ or-ed in.
If the order has the flag DMA_CH_NEVER set, then after
checking the channel list, the system will return no
found channel, thus denying the request.
A board support file can call s3c24xx_dma_order_set()
- to register an complete ordering set. The routine will
- copy the data, so the original can be discared with
+ to register a complete ordering set. The routine will
+ copy the data, so the original can be discarded with
__initdata.
diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt
index 05851e9982ed..f20c10c2858f 100644
--- a/Documentation/atomic_ops.txt
+++ b/Documentation/atomic_ops.txt
@@ -14,8 +14,15 @@ suffice:
typedef struct { volatile int counter; } atomic_t;
- The first operations to implement for atomic_t's are the
-initializers and plain reads.
+Historically, counter has been declared volatile. This is now discouraged.
+See Documentation/volatile-considered-harmful.txt for the complete rationale.
+
+local_t is very similar to atomic_t. If the counter is per CPU and only
+updated by one CPU, local_t is probably more appropriate. Please see
+Documentation/local_ops.txt for the semantics of local_t.
+
+The first operations to implement for atomic_t's are the initializers and
+plain reads.
#define ATOMIC_INIT(i) { (i) }
#define atomic_set(v, i) ((v)->counter = (i))
@@ -24,6 +31,12 @@ The first macro is used in definitions, such as:
static atomic_t my_counter = ATOMIC_INIT(1);
+The initializer is atomic in that the return values of the atomic operations
+are guaranteed to be correct reflecting the initialized value if the
+initializer is used before runtime. If the initializer is used at runtime, a
+proper implicit or explicit read memory barrier is needed before reading the
+value with atomic_read from another thread.
+
The second interface can be used at runtime, as in:
struct foo { atomic_t counter; };
@@ -36,13 +49,43 @@ The second interface can be used at runtime, as in:
return -ENOMEM;
atomic_set(&k->counter, 0);
+The setting is atomic in that the return values of the atomic operations by
+all threads are guaranteed to be correct reflecting either the value that has
+been set with this operation or set with another operation. A proper implicit
+or explicit memory barrier is needed before the value set with the operation
+is guaranteed to be readable with atomic_read from another thread.
+
Next, we have:
#define atomic_read(v) ((v)->counter)
-which simply reads the current value of the counter.
-
-Now, we move onto the actual atomic operation interfaces.
+which simply reads the counter value currently visible to the calling thread.
+The read is atomic in that the return value is guaranteed to be one of the
+values initialized or modified with the interface operations if a proper
+implicit or explicit memory barrier is used after possible runtime
+initialization by any other thread and the value is modified only with the
+interface operations. atomic_read does not guarantee that the runtime
+initialization by any other thread is visible yet, so the user of the
+interface must take care of that with a proper implicit or explicit memory
+barrier.
+
+*** WARNING: atomic_read() and atomic_set() DO NOT IMPLY BARRIERS! ***
+
+Some architectures may choose to use the volatile keyword, barriers, or inline
+assembly to guarantee some degree of immediacy for atomic_read() and
+atomic_set(). This is not uniformly guaranteed, and may change in the future,
+so all users of atomic_t should treat atomic_read() and atomic_set() as simple
+C statements that may be reordered or optimized away entirely by the compiler
+or processor, and explicitly invoke the appropriate compiler and/or memory
+barrier for each use case. Failure to do so will result in code that may
+suddenly break when used with different architectures or compiler
+optimizations, or even changes in unrelated code which changes how the
+compiler optimizes the section accessing atomic_t variables.
+
+*** YOU HAVE BEEN WARNED! ***
+
+Now, we move onto the atomic operation interfaces typically implemented with
+the help of assembly code.
void atomic_add(int i, atomic_t *v);
void atomic_sub(int i, atomic_t *v);
@@ -117,6 +160,12 @@ operation.
Then:
+ int atomic_xchg(atomic_t *v, int new);
+
+This performs an atomic exchange operation on the atomic variable v, setting
+the given new value. It returns the old value that the atomic variable v had
+just before the operation.
+
int atomic_cmpxchg(atomic_t *v, int old, int new);
This performs an atomic compare exchange operation on the atomic value v,
@@ -369,6 +418,20 @@ brothers:
*/
smp_mb__after_clear_bit();
+There are two special bitops with lock barrier semantics (acquire/release,
+same as spinlocks). These operate in the same way as their non-_lock/unlock
+postfixed variants, except that they are to provide acquire/release semantics,
+respectively. This means they can be used for bit_spin_trylock and
+bit_spin_unlock type operations without specifying any more barriers.
+
+ int test_and_set_bit_lock(unsigned long nr, unsigned long *addr);
+ void clear_bit_unlock(unsigned long nr, unsigned long *addr);
+ void __clear_bit_unlock(unsigned long nr, unsigned long *addr);
+
+The __clear_bit_unlock version is non-atomic, however it still implements
+unlock barrier semantics. This can be useful if the lock itself is protecting
+the other bits in the word.
+
Finally, there are non-atomic versions of the bitmask operations
provided. They are used in contexts where some other higher-level SMP
locking scheme is being used to protect the bitmask, and thus less
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
new file mode 100644
index 000000000000..961a0513f8c3
--- /dev/null
+++ b/Documentation/block/00-INDEX
@@ -0,0 +1,20 @@
+00-INDEX
+ - This file
+as-iosched.txt
+ - Anticipatory IO scheduler
+barrier.txt
+ - I/O Barriers
+biodoc.txt
+ - Notes on the Generic Block Layer Rewrite in Linux 2.5
+capability.txt
+ - Generic Block Device Capability (/sys/block/<disk>/capability)
+deadline-iosched.txt
+ - Deadline IO scheduler tunables
+ioprio.txt
+ - Block io priorities (in CFQ scheduler)
+request.txt
+ - The members of struct request (in include/linux/blkdev.h)
+stat.txt
+ - Block layer statistics in /sys/block/<dev>/stat
+switching-sched.txt
+ - Switching I/O schedulers at runtime
diff --git a/Documentation/block/as-iosched.txt b/Documentation/block/as-iosched.txt
index a598fe10a297..738b72be128e 100644
--- a/Documentation/block/as-iosched.txt
+++ b/Documentation/block/as-iosched.txt
@@ -20,15 +20,10 @@ actually has a head for each physical device in the logical RAID device.
However, setting the antic_expire (see tunable parameters below) produces
very similar behavior to the deadline IO scheduler.
-
Selecting IO schedulers
-----------------------
-To choose IO schedulers at boot time, use the argument 'elevator=deadline'.
-'noop', 'as' and 'cfq' (the default) are also available. IO schedulers are
-assigned globally at boot time only presently. It's also possible to change
-the IO scheduler for a determined device on the fly, as described in
-Documentation/block/switching-sched.txt.
-
+Refer to Documentation/block/switching-sched.txt for information on
+selecting an io scheduler on a per-device basis.
Anticipatory IO scheduler Policies
----------------------------------
@@ -115,7 +110,7 @@ statistics (average think time, average seek distance) on the process
that submitted the just completed request are examined. If it seems
likely that that process will submit another request soon, and that
request is likely to be near the just completed request, then the IO
-scheduler will stop dispatching more read requests for up time (antic_expire)
+scheduler will stop dispatching more read requests for up to (antic_expire)
milliseconds, hoping that process will submit a new request near the one
that just completed. If such a request is made, then it is dispatched
immediately. If the antic_expire wait time expires, then the IO scheduler
@@ -165,3 +160,13 @@ The parameters are:
for big seek time devices though not a linear correspondence - most
processes have only a few ms thinktime.
+In addition to the tunables above there is a read-only file named est_time
+which, when read, will show:
+
+ - The probability of a task exiting without a cooperating task
+ submitting an anticipated IO.
+
+ - The current mean think time.
+
+ - The seek distance used to determine if an incoming IO is better.
+
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 8af392fc6ef0..93f223b9723f 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -2,7 +2,7 @@
=====================================================
Notes Written on Jan 15, 2002:
- Jens Axboe <axboe@suse.de>
+ Jens Axboe <jens.axboe@oracle.com>
Suparna Bhattacharya <suparna@in.ibm.com>
Last Updated May 2, 2002
@@ -21,7 +21,7 @@ Credits:
---------
2.5 bio rewrite:
- Jens Axboe <axboe@suse.de>
+ Jens Axboe <jens.axboe@oracle.com>
Many aspects of the generic block layer redesign were driven by and evolved
over discussions, prior patches and the collective experience of several
@@ -477,9 +477,9 @@ With this multipage bio design:
the same bi_io_vec array, but with the index and size accordingly modified)
- A linked list of bios is used as before for unrelated merges (*) - this
avoids reallocs and makes independent completions easier to handle.
-- Code that traverses the req list needs to make a distinction between
- segments of a request (bio_for_each_segment) and the distinct completion
- units/bios (rq_for_each_bio).
+- Code that traverses the req list can find all the segments of a bio
+ by using rq_for_each_segment. This handles the fact that a request
+ has multiple bios, each of which can have multiple segments.
- Drivers which can't process a large bio in one shot can use the bi_idx
field to keep track of the next bio_vec entry to process.
(e.g a 1MB bio_vec needs to be handled in max 128kB chunks for IDE)
@@ -664,14 +664,14 @@ in lvm or md.
3.2.1 Traversing segments and completion units in a request
-The macros bio_for_each_segment() and rq_for_each_bio() should be used for
-traversing the bios in the request list (drivers should avoid directly
-trying to do it themselves). Using these helpers should also make it easier
-to cope with block changes in the future.
+The macro rq_for_each_segment() should be used for traversing the bios
+in the request list (drivers should avoid directly trying to do it
+themselves). Using these helpers should also make it easier to cope
+with block changes in the future.
- rq_for_each_bio(bio, rq)
- bio_for_each_segment(bio_vec, bio, i)
- /* bio_vec is now current segment */
+ struct req_iterator iter;
+ rq_for_each_segment(bio_vec, rq, iter)
+ /* bio_vec is now current segment */
I/O completion callbacks are per-bio rather than per-segment, so drivers
that traverse bio chains on completion need to keep that in mind. Drivers
diff --git a/Documentation/block/deadline-iosched.txt b/Documentation/block/deadline-iosched.txt
index be08ffd1e9b8..c23cab13c3d1 100644
--- a/Documentation/block/deadline-iosched.txt
+++ b/Documentation/block/deadline-iosched.txt
@@ -5,16 +5,10 @@ This little file attempts to document how the deadline io scheduler works.
In particular, it will clarify the meaning of the exposed tunables that may be
of interest to power users.
-Each io queue has a set of io scheduler tunables associated with it. These
-tunables control how the io scheduler works. You can find these entries
-in:
-
-/sys/block/<device>/queue/iosched
-
-assuming that you have sysfs mounted on /sys. If you don't have sysfs mounted,
-you can do so by typing:
-
-# mount none /sys -t sysfs
+Selecting IO schedulers
+-----------------------
+Refer to Documentation/block/switching-sched.txt for information on
+selecting an io scheduler on a per-device basis.
********************************************************************************
@@ -41,14 +35,11 @@ fifo_batch
When a read request expires its deadline, we must move some requests from
the sorted io scheduler list to the block device dispatch queue. fifo_batch
-controls how many requests we move, based on the cost of each request. A
-request is either qualified as a seek or a stream. The io scheduler knows
-the last request that was serviced by the drive (or will be serviced right
-before this one). See seek_cost and stream_unit.
+controls how many requests we move.
-write_starved (number of dispatches)
--------------
+writes_starved (number of dispatches)
+--------------
When we have to move requests from the io scheduler queue to the block
device dispatch queue, we always give a preference to reads. However, we
@@ -73,6 +64,6 @@ that comes at basically 0 cost we leave that on. We simply disable the
rbtree front sector lookup when the io scheduler merge function is called.
-Nov 11 2002, Jens Axboe <axboe@suse.de>
+Nov 11 2002, Jens Axboe <jens.axboe@oracle.com>
diff --git a/Documentation/block/ioprio.txt b/Documentation/block/ioprio.txt
index 1b930ef5a079..8ed8c59380b4 100644
--- a/Documentation/block/ioprio.txt
+++ b/Documentation/block/ioprio.txt
@@ -86,8 +86,15 @@ extern int sys_ioprio_get(int, int);
#error "Unsupported arch"
#endif
-_syscall3(int, ioprio_set, int, which, int, who, int, ioprio);
-_syscall2(int, ioprio_get, int, which, int, who);
+static inline int ioprio_set(int which, int who, int ioprio)
+{
+ return syscall(__NR_ioprio_set, which, who, ioprio);
+}
+
+static inline int ioprio_get(int which, int who)
+{
+ return syscall(__NR_ioprio_get, which, who);
+}
enum {
IOPRIO_CLASS_NONE,
@@ -173,4 +180,4 @@ int main(int argc, char *argv[])
---> snip ionice.c tool <---
-March 11 2005, Jens Axboe <axboe@suse.de>
+March 11 2005, Jens Axboe <jens.axboe@oracle.com>
diff --git a/Documentation/block/request.txt b/Documentation/block/request.txt
index fff58acb40a3..754e104ed369 100644
--- a/Documentation/block/request.txt
+++ b/Documentation/block/request.txt
@@ -1,7 +1,7 @@
struct request documentation
-Jens Axboe <axboe@suse.de> 27/05/02
+Jens Axboe <jens.axboe@oracle.com> 27/05/02
1.0
Index
diff --git a/Documentation/block/switching-sched.txt b/Documentation/block/switching-sched.txt
index 5fa130a67531..634c952e1964 100644
--- a/Documentation/block/switching-sched.txt
+++ b/Documentation/block/switching-sched.txt
@@ -1,3 +1,18 @@
+To choose IO schedulers at boot time, use the argument 'elevator=deadline'.
+'noop', 'as' and 'cfq' (the default) are also available. IO schedulers are
+assigned globally at boot time only presently.
+
+Each io queue has a set of io scheduler tunables associated with it. These
+tunables control how the io scheduler works. You can find these entries
+in:
+
+/sys/block/<device>/queue/iosched
+
+assuming that you have sysfs mounted on /sys. If you don't have sysfs mounted,
+you can do so by typing:
+
+# mount none /sys -t sysfs
+
As of the Linux 2.6.10 kernel, it is now possible to change the
IO scheduler for a given block device on the fly (thus making it possible,
for instance, to set the CFQ scheduler for the system default, but
@@ -20,3 +35,9 @@ noop anticipatory deadline [cfq]
# echo anticipatory > /sys/block/hda/queue/scheduler
# cat /sys/block/hda/queue/scheduler
noop [anticipatory] deadline cfq
+
+Each io queue has a set of io scheduler tunables associated with it. These
+tunables control how the io scheduler works. You can find these entries
+in:
+
+/sys/block/<device>/queue/iosched
diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt
index 866b76139420..da42ab414c48 100644
--- a/Documentation/cachetlb.txt
+++ b/Documentation/cachetlb.txt
@@ -87,30 +87,7 @@ changes occur:
This is used primarily during fault processing.
-5) void flush_tlb_pgtables(struct mm_struct *mm,
- unsigned long start, unsigned long end)
-
- The software page tables for address space 'mm' for virtual
- addresses in the range 'start' to 'end-1' are being torn down.
-
- Some platforms cache the lowest level of the software page tables
- in a linear virtually mapped array, to make TLB miss processing
- more efficient. On such platforms, since the TLB is caching the
- software page table structure, it needs to be flushed when parts
- of the software page table tree are unlinked/freed.
-
- Sparc64 is one example of a platform which does this.
-
- Usually, when munmap()'ing an area of user virtual address
- space, the kernel leaves the page table parts around and just
- marks the individual pte's as invalid. However, if very large
- portions of the address space are unmapped, the kernel frees up
- those portions of the software page tables to prevent potential
- excessive kernel memory usage caused by erratic mmap/mmunmap
- sequences. It is at these times that flush_tlb_pgtables will
- be invoked.
-
-6) void update_mmu_cache(struct vm_area_struct *vma,
+5) void update_mmu_cache(struct vm_area_struct *vma,
unsigned long address, pte_t pte)
At the end of every page fault, this routine is invoked to
@@ -123,7 +100,7 @@ changes occur:
translations for software managed TLB configurations.
The sparc64 port currently does this.
-7) void tlb_migrate_finish(struct mm_struct *mm)
+6) void tlb_migrate_finish(struct mm_struct *mm)
This interface is called at the end of an explicit
process migration. This interface provides a hook
@@ -133,12 +110,6 @@ changes occur:
The ia64 sn2 platform is one example of a platform
that uses this interface.
-8) void lazy_mmu_prot_update(pte_t pte)
- This interface is called whenever the protection on
- any user PTEs change. This interface provides a notification
- to architecture specific code to take appropriate action.
-
-
Next, we have the cache flushing interfaces. In general, when Linux
is changing an existing virtual-->physical mapping to a new value,
the sequence will be in one of the following forms:
diff --git a/Documentation/cdrom/cdrom-standard.tex b/Documentation/cdrom/cdrom-standard.tex
index 92f94e597582..c713aeb020c4 100644
--- a/Documentation/cdrom/cdrom-standard.tex
+++ b/Documentation/cdrom/cdrom-standard.tex
@@ -1009,7 +1009,7 @@ taken over the torch in maintaining \cdromc\ and integrating much
\cdrom-related code in the 2.1-kernel. Thanks to Scott Snyder and
Gerd Knorr, who were the first to implement this interface for SCSI
and IDE-CD drivers and added many ideas for extension of the data
-structures relative to kernel~2.0. Further thanks to Heiko Eissfeldt,
+structures relative to kernel~2.0. Further thanks to Heiko Ei{\sz}feldt,
Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard M\"onkeberg and Andrew
Kroll, the \linux\ \cdrom\ device driver developers who were kind
enough to give suggestions and criticisms during the writing. Finally
diff --git a/Documentation/cgroups.txt b/Documentation/cgroups.txt
new file mode 100644
index 000000000000..98a26f81fa75
--- /dev/null
+++ b/Documentation/cgroups.txt
@@ -0,0 +1,545 @@
+ CGROUPS
+ -------
+
+Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt
+
+Original copyright statements from cpusets.txt:
+Portions Copyright (C) 2004 BULL SA.
+Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+Modified by Paul Jackson <pj@sgi.com>
+Modified by Christoph Lameter <clameter@sgi.com>
+
+CONTENTS:
+=========
+
+1. Control Groups
+ 1.1 What are cgroups ?
+ 1.2 Why are cgroups needed ?
+ 1.3 How are cgroups implemented ?
+ 1.4 What does notify_on_release do ?
+ 1.5 How do I use cgroups ?
+2. Usage Examples and Syntax
+ 2.1 Basic Usage
+ 2.2 Attaching processes
+3. Kernel API
+ 3.1 Overview
+ 3.2 Synchronization
+ 3.3 Subsystem API
+4. Questions
+
+1. Control Groups
+==========
+
+1.1 What are cgroups ?
+----------------------
+
+Control Groups provide a mechanism for aggregating/partitioning sets of
+tasks, and all their future children, into hierarchical groups with
+specialized behaviour.
+
+Definitions:
+
+A *cgroup* associates a set of tasks with a set of parameters for one
+or more subsystems.
+
+A *subsystem* is a module that makes use of the task grouping
+facilities provided by cgroups to treat groups of tasks in
+particular ways. A subsystem is typically a "resource controller" that
+schedules a resource or applies per-cgroup limits, but it may be
+anything that wants to act on a group of processes, e.g. a
+virtualization subsystem.
+
+A *hierarchy* is a set of cgroups arranged in a tree, such that
+every task in the system is in exactly one of the cgroups in the
+hierarchy, and a set of subsystems; each subsystem has system-specific
+state attached to each cgroup in the hierarchy. Each hierarchy has
+an instance of the cgroup virtual filesystem associated with it.
+
+At any one time there may be multiple active hierachies of task
+cgroups. Each hierarchy is a partition of all tasks in the system.
+
+User level code may create and destroy cgroups by name in an
+instance of the cgroup virtual file system, specify and query to
+which cgroup a task is assigned, and list the task pids assigned to
+a cgroup. Those creations and assignments only affect the hierarchy
+associated with that instance of the cgroup file system.
+
+On their own, the only use for cgroups is for simple job
+tracking. The intention is that other subsystems hook into the generic
+cgroup support to provide new attributes for cgroups, such as
+accounting/limiting the resources which processes in a cgroup can
+access. For example, cpusets (see Documentation/cpusets.txt) allows
+you to associate a set of CPUs and a set of memory nodes with the
+tasks in each cgroup.
+
+1.2 Why are cgroups needed ?
+----------------------------
+
+There are multiple efforts to provide process aggregations in the
+Linux kernel, mainly for resource tracking purposes. Such efforts
+include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
+namespaces. These all require the basic notion of a
+grouping/partitioning of processes, with newly forked processes ending
+in the same group (cgroup) as their parent process.
+
+The kernel cgroup patch provides the minimum essential kernel
+mechanisms required to efficiently implement such groups. It has
+minimal impact on the system fast paths, and provides hooks for
+specific subsystems such as cpusets to provide additional behaviour as
+desired.
+
+Multiple hierarchy support is provided to allow for situations where
+the division of tasks into cgroups is distinctly different for
+different subsystems - having parallel hierarchies allows each
+hierarchy to be a natural division of tasks, without having to handle
+complex combinations of tasks that would be present if several
+unrelated subsystems needed to be forced into the same tree of
+cgroups.
+
+At one extreme, each resource controller or subsystem could be in a
+separate hierarchy; at the other extreme, all subsystems
+would be attached to the same hierarchy.
+
+As an example of a scenario (originally proposed by vatsa@in.ibm.com)
+that can benefit from multiple hierarchies, consider a large
+university server with various users - students, professors, system
+tasks etc. The resource planning for this server could be along the
+following lines:
+
+ CPU : Top cpuset
+ / \
+ CPUSet1 CPUSet2
+ | |
+ (Profs) (Students)
+
+ In addition (system tasks) are attached to topcpuset (so
+ that they can run anywhere) with a limit of 20%
+
+ Memory : Professors (50%), students (30%), system (20%)
+
+ Disk : Prof (50%), students (30%), system (20%)
+
+ Network : WWW browsing (20%), Network File System (60%), others (20%)
+ / \
+ Prof (15%) students (5%)
+
+Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go
+into NFS network class.
+
+At the same time firefox/lynx will share an appropriate CPU/Memory class
+depending on who launched it (prof/student).
+
+With the ability to classify tasks differently for different resources
+(by putting those resource subsystems in different hierarchies) then
+the admin can easily set up a script which receives exec notifications
+and depending on who is launching the browser he can
+
+ # echo browser_pid > /mnt/<restype>/<userclass>/tasks
+
+With only a single hierarchy, he now would potentially have to create
+a separate cgroup for every browser launched and associate it with
+approp network and other resource class. This may lead to
+proliferation of such cgroups.
+
+Also lets say that the administrator would like to give enhanced network
+access temporarily to a student's browser (since it is night and the user
+wants to do online gaming :) OR give one of the students simulation
+apps enhanced CPU power,
+
+With ability to write pids directly to resource classes, its just a
+matter of :
+
+ # echo pid > /mnt/network/<new_class>/tasks
+ (after some time)
+ # echo pid > /mnt/network/<orig_class>/tasks
+
+Without this ability, he would have to split the cgroup into
+multiple separate ones and then associate the new cgroups with the
+new resource classes.
+
+
+
+1.3 How are cgroups implemented ?
+---------------------------------
+
+Control Groups extends the kernel as follows:
+
+ - Each task in the system has a reference-counted pointer to a
+ css_set.
+
+ - A css_set contains a set of reference-counted pointers to
+ cgroup_subsys_state objects, one for each cgroup subsystem
+ registered in the system. There is no direct link from a task to
+ the cgroup of which it's a member in each hierarchy, but this
+ can be determined by following pointers through the
+ cgroup_subsys_state objects. This is because accessing the
+ subsystem state is something that's expected to happen frequently
+ and in performance-critical code, whereas operations that require a
+ task's actual cgroup assignments (in particular, moving between
+ cgroups) are less common. A linked list runs through the cg_list
+ field of each task_struct using the css_set, anchored at
+ css_set->tasks.
+
+ - A cgroup hierarchy filesystem can be mounted for browsing and
+ manipulation from user space.
+
+ - You can list all the tasks (by pid) attached to any cgroup.
+
+The implementation of cgroups requires a few, simple hooks
+into the rest of the kernel, none in performance critical paths:
+
+ - in init/main.c, to initialize the root cgroups and initial
+ css_set at system boot.
+
+ - in fork and exit, to attach and detach a task from its css_set.
+
+In addition a new file system, of type "cgroup" may be mounted, to
+enable browsing and modifying the cgroups presently known to the
+kernel. When mounting a cgroup hierarchy, you may specify a
+comma-separated list of subsystems to mount as the filesystem mount
+options. By default, mounting the cgroup filesystem attempts to
+mount a hierarchy containing all registered subsystems.
+
+If an active hierarchy with exactly the same set of subsystems already
+exists, it will be reused for the new mount. If no existing hierarchy
+matches, and any of the requested subsystems are in use in an existing
+hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
+is activated, associated with the requested subsystems.
+
+It's not currently possible to bind a new subsystem to an active
+cgroup hierarchy, or to unbind a subsystem from an active cgroup
+hierarchy. This may be possible in future, but is fraught with nasty
+error-recovery issues.
+
+When a cgroup filesystem is unmounted, if there are any
+child cgroups created below the top-level cgroup, that hierarchy
+will remain active even though unmounted; if there are no
+child cgroups then the hierarchy will be deactivated.
+
+No new system calls are added for cgroups - all support for
+querying and modifying cgroups is via this cgroup file system.
+
+Each task under /proc has an added file named 'cgroup' displaying,
+for each active hierarchy, the subsystem names and the cgroup name
+as the path relative to the root of the cgroup file system.
+
+Each cgroup is represented by a directory in the cgroup file system
+containing the following files describing that cgroup:
+
+ - tasks: list of tasks (by pid) attached to that cgroup
+ - notify_on_release flag: run /sbin/cgroup_release_agent on exit?
+
+Other subsystems such as cpusets may add additional files in each
+cgroup dir
+
+New cgroups are created using the mkdir system call or shell
+command. The properties of a cgroup, such as its flags, are
+modified by writing to the appropriate file in that cgroups
+directory, as listed above.
+
+The named hierarchical structure of nested cgroups allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cgroup allows organizing the work load
+on a system into related sets of tasks. A task may be re-attached to
+any other cgroup, if allowed by the permissions on the necessary
+cgroup file system directories.
+
+When a task is moved from one cgroup to another, it gets a new
+css_set pointer - if there's an already existing css_set with the
+desired collection of cgroups then that group is reused, else a new
+css_set is allocated. Note that the current implementation uses a
+linear search to locate an appropriate existing css_set, so isn't
+very efficient. A future version will use a hash table for better
+performance.
+
+To allow access from a cgroup to the css_sets (and hence tasks)
+that comprise it, a set of cg_cgroup_link objects form a lattice;
+each cg_cgroup_link is linked into a list of cg_cgroup_links for
+a single cgroup on its cont_link_list field, and a list of
+cg_cgroup_links for a single css_set on its cg_link_list.
+
+Thus the set of tasks in a cgroup can be listed by iterating over
+each css_set that references the cgroup, and sub-iterating over
+each css_set's task set.
+
+The use of a Linux virtual file system (vfs) to represent the
+cgroup hierarchy provides for a familiar permission and name space
+for cgroups, with a minimum of additional kernel code.
+
+1.4 What does notify_on_release do ?
+------------------------------------
+
+*** notify_on_release is disabled in the current patch set. It will be
+*** reactivated in a future patch in a less-intrusive manner
+
+If the notify_on_release flag is enabled (1) in a cgroup, then
+whenever the last task in the cgroup leaves (exits or attaches to
+some other cgroup) and the last child cgroup of that cgroup
+is removed, then the kernel runs the command specified by the contents
+of the "release_agent" file in that hierarchy's root directory,
+supplying the pathname (relative to the mount point of the cgroup
+file system) of the abandoned cgroup. This enables automatic
+removal of abandoned cgroups. The default value of
+notify_on_release in the root cgroup at system boot is disabled
+(0). The default value of other cgroups at creation is the current
+value of their parents notify_on_release setting. The default value of
+a cgroup hierarchy's release_agent path is empty.
+
+1.5 How do I use cgroups ?
+--------------------------
+
+To start a new job that is to be contained within a cgroup, using
+the "cpuset" cgroup subsystem, the steps are something like:
+
+ 1) mkdir /dev/cgroup
+ 2) mount -t cgroup -ocpuset cpuset /dev/cgroup
+ 3) Create the new cgroup by doing mkdir's and write's (or echo's) in
+ the /dev/cgroup virtual file system.
+ 4) Start a task that will be the "founding father" of the new job.
+ 5) Attach that task to the new cgroup by writing its pid to the
+ /dev/cgroup tasks file for that cgroup.
+ 6) fork, exec or clone the job tasks from this founding father task.
+
+For example, the following sequence of commands will setup a cgroup
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cgroup:
+
+ mount -t cgroup cpuset -ocpuset /dev/cgroup
+ cd /dev/cgroup
+ mkdir Charlie
+ cd Charlie
+ /bin/echo 2-3 > cpus
+ /bin/echo 1 > mems
+ /bin/echo $$ > tasks
+ sh
+ # The subshell 'sh' is now running in cgroup Charlie
+ # The next line should display '/Charlie'
+ cat /proc/self/cgroup
+
+2. Usage Examples and Syntax
+============================
+
+2.1 Basic Usage
+---------------
+
+Creating, modifying, using the cgroups can be done through the cgroup
+virtual filesystem.
+
+To mount a cgroup hierarchy will all available subsystems, type:
+# mount -t cgroup xxx /dev/cgroup
+
+The "xxx" is not interpreted by the cgroup code, but will appear in
+/proc/mounts so may be any useful identifying string that you like.
+
+To mount a cgroup hierarchy with just the cpuset and numtasks
+subsystems, type:
+# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup
+
+To change the set of subsystems bound to a mounted hierarchy, just
+remount with different options:
+
+# mount -o remount,cpuset,ns /dev/cgroup
+
+Note that changing the set of subsystems is currently only supported
+when the hierarchy consists of a single (root) cgroup. Supporting
+the ability to arbitrarily bind/unbind subsystems from an existing
+cgroup hierarchy is intended to be implemented in the future.
+
+Then under /dev/cgroup you can find a tree that corresponds to the
+tree of the cgroups in the system. For instance, /dev/cgroup
+is the cgroup that holds the whole system.
+
+If you want to create a new cgroup under /dev/cgroup:
+# cd /dev/cgroup
+# mkdir my_cgroup
+
+Now you want to do something with this cgroup.
+# cd my_cgroup
+
+In this directory you can find several files:
+# ls
+notify_on_release release_agent tasks
+(plus whatever files are added by the attached subsystems)
+
+Now attach your shell to this cgroup:
+# /bin/echo $$ > tasks
+
+You can also create cgroups inside your cgroup by using mkdir in this
+directory.
+# mkdir my_sub_cs
+
+To remove a cgroup, just use rmdir:
+# rmdir my_sub_cs
+
+This will fail if the cgroup is in use (has cgroups inside, or
+has processes attached, or is held alive by other subsystem-specific
+reference).
+
+2.2 Attaching processes
+-----------------------
+
+# /bin/echo PID > tasks
+
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another:
+
+# /bin/echo PID1 > tasks
+# /bin/echo PID2 > tasks
+ ...
+# /bin/echo PIDn > tasks
+
+3. Kernel API
+=============
+
+3.1 Overview
+------------
+
+Each kernel subsystem that wants to hook into the generic cgroup
+system needs to create a cgroup_subsys object. This contains
+various methods, which are callbacks from the cgroup system, along
+with a subsystem id which will be assigned by the cgroup system.
+
+Other fields in the cgroup_subsys object include:
+
+- subsys_id: a unique array index for the subsystem, indicating which
+ entry in cgroup->subsys[] this subsystem should be
+ managing. Initialized by cgroup_register_subsys(); prior to this
+ it should be initialized to -1
+
+- hierarchy: an index indicating which hierarchy, if any, this
+ subsystem is currently attached to. If this is -1, then the
+ subsystem is not attached to any hierarchy, and all tasks should be
+ considered to be members of the subsystem's top_cgroup. It should
+ be initialized to -1.
+
+- name: should be initialized to a unique subsystem name prior to
+ calling cgroup_register_subsystem. Should be no longer than
+ MAX_CGROUP_TYPE_NAMELEN
+
+Each cgroup object created by the system has an array of pointers,
+indexed by subsystem id; this pointer is entirely managed by the
+subsystem; the generic cgroup code will never touch this pointer.
+
+3.2 Synchronization
+-------------------
+
+There is a global mutex, cgroup_mutex, used by the cgroup
+system. This should be taken by anything that wants to modify a
+cgroup. It may also be taken to prevent cgroups from being
+modified, but more specific locks may be more appropriate in that
+situation.
+
+See kernel/cgroup.c for more details.
+
+Subsystems can take/release the cgroup_mutex via the functions
+cgroup_lock()/cgroup_unlock(), and can
+take/release the callback_mutex via the functions
+cgroup_lock()/cgroup_unlock().
+
+Accessing a task's cgroup pointer may be done in the following ways:
+- while holding cgroup_mutex
+- while holding the task's alloc_lock (via task_lock())
+- inside an rcu_read_lock() section via rcu_dereference()
+
+3.3 Subsystem API
+--------------------------
+
+Each subsystem should:
+
+- add an entry in linux/cgroup_subsys.h
+- define a cgroup_subsys object called <name>_subsys
+
+Each subsystem may export the following methods. The only mandatory
+methods are create/destroy. Any others that are null are presumed to
+be successful no-ops.
+
+struct cgroup_subsys_state *create(struct cgroup *cont)
+LL=cgroup_mutex
+
+Called to create a subsystem state object for a cgroup. The
+subsystem should allocate its subsystem state object for the passed
+cgroup, returning a pointer to the new object on success or a
+negative error code. On success, the subsystem pointer should point to
+a structure of type cgroup_subsys_state (typically embedded in a
+larger subsystem-specific object), which will be initialized by the
+cgroup system. Note that this will be called at initialization to
+create the root subsystem state for this subsystem; this case can be
+identified by the passed cgroup object having a NULL parent (since
+it's the root of the hierarchy) and may be an appropriate place for
+initialization code.
+
+void destroy(struct cgroup *cont)
+LL=cgroup_mutex
+
+The cgroup system is about to destroy the passed cgroup; the
+subsystem should do any necessary cleanup
+
+int can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
+ struct task_struct *task)
+LL=cgroup_mutex
+
+Called prior to moving a task into a cgroup; if the subsystem
+returns an error, this will abort the attach operation. If a NULL
+task is passed, then a successful result indicates that *any*
+unspecified task can be moved into the cgroup. Note that this isn't
+called on a fork. If this method returns 0 (success) then this should
+remain valid while the caller holds cgroup_mutex.
+
+void attach(struct cgroup_subsys *ss, struct cgroup *cont,
+ struct cgroup *old_cont, struct task_struct *task)
+LL=cgroup_mutex
+
+
+Called after the task has been attached to the cgroup, to allow any
+post-attachment activity that requires memory allocations or blocking.
+
+void fork(struct cgroup_subsy *ss, struct task_struct *task)
+LL=callback_mutex, maybe read_lock(tasklist_lock)
+
+Called when a task is forked into a cgroup. Also called during
+registration for all existing tasks.
+
+void exit(struct cgroup_subsys *ss, struct task_struct *task)
+LL=callback_mutex
+
+Called during task exit
+
+int populate(struct cgroup_subsys *ss, struct cgroup *cont)
+LL=none
+
+Called after creation of a cgroup to allow a subsystem to populate
+the cgroup directory with file entries. The subsystem should make
+calls to cgroup_add_file() with objects of type cftype (see
+include/linux/cgroup.h for details). Note that although this
+method can return an error code, the error code is currently not
+always handled well.
+
+void post_clone(struct cgroup_subsys *ss, struct cgroup *cont)
+
+Called at the end of cgroup_clone() to do any paramater
+initialization which might be required before a task could attach. For
+example in cpusets, no task may attach before 'cpus' and 'mems' are set
+up.
+
+void bind(struct cgroup_subsys *ss, struct cgroup *root)
+LL=callback_mutex
+
+Called when a cgroup subsystem is rebound to a different hierarchy
+and root cgroup. Currently this will only involve movement between
+the default hierarchy (which never has sub-cgroups) and a hierarchy
+that is being created/destroyed (and hence has no sub-cgroups).
+
+4. Questions
+============
+
+Q: what's up with this '/bin/echo' ?
+A: bash's builtin 'echo' command does not check calls to write() against
+ errors. If you use it in the cgroup file system, you won't be
+ able to tell whether a command succeeded or failed.
+
+Q: When I attach processes, only the first of the line gets really attached !
+A: We can only return one error code per call to write(). So you should also
+ put only ONE pid.
+
diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt
index b6d24c22274b..a741f658a3c9 100644
--- a/Documentation/cpu-hotplug.txt
+++ b/Documentation/cpu-hotplug.txt
@@ -220,7 +220,9 @@ A: The following happen, listed in no particular order :-)
CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the
CPU is being offlined while tasks are frozen due to a suspend operation in
progress
-- All process is migrated away from this outgoing CPU to a new CPU
+- All processes are migrated away from this outgoing CPU to new CPUs.
+ The new CPU is chosen from each process' current cpuset, which may be
+ a subset of all online CPUs.
- All interrupts targeted to this CPU is migrated to a new CPU
- timers/bottom half/task lets are also migrated to a new CPU
- Once all services are migrated, kernel calls an arch specific routine
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index f2c0a6842930..141bef1c8599 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -7,6 +7,7 @@ Written by Simon.Derr@bull.net
Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
Modified by Paul Jackson <pj@sgi.com>
Modified by Christoph Lameter <clameter@sgi.com>
+Modified by Paul Menage <menage@google.com>
CONTENTS:
=========
@@ -16,9 +17,9 @@ CONTENTS:
1.2 Why are cpusets needed ?
1.3 How are cpusets implemented ?
1.4 What are exclusive cpusets ?
- 1.5 What does notify_on_release do ?
- 1.6 What is memory_pressure ?
- 1.7 What is memory spread ?
+ 1.5 What is memory_pressure ?
+ 1.6 What is memory spread ?
+ 1.7 What is sched_load_balance ?
1.8 How do I use cpusets ?
2. Usage Examples and Syntax
2.1 Basic Usage
@@ -35,7 +36,8 @@ CONTENTS:
----------------------
Cpusets provide a mechanism for assigning a set of CPUs and Memory
-Nodes to a set of tasks.
+Nodes to a set of tasks. In this document "Memory Node" refers to
+an on-line node that contains memory.
Cpusets constrain the CPU and Memory placement of tasks to only
the resources within a tasks current cpuset. They form a nested
@@ -43,18 +45,19 @@ hierarchy visible in a virtual file system. These are the essential
hooks, beyond what is already present, required to manage dynamic
job placement on large systems.
-Each task has a pointer to a cpuset. Multiple tasks may reference
-the same cpuset. Requests by a task, using the sched_setaffinity(2)
-system call to include CPUs in its CPU affinity mask, and using the
-mbind(2) and set_mempolicy(2) system calls to include Memory Nodes
-in its memory policy, are both filtered through that tasks cpuset,
-filtering out any CPUs or Memory Nodes not in that cpuset. The
-scheduler will not schedule a task on a CPU that is not allowed in
-its cpus_allowed vector, and the kernel page allocator will not
-allocate a page on a node that is not allowed in the requesting tasks
-mems_allowed vector.
-
-User level code may create and destroy cpusets by name in the cpuset
+Cpusets use the generic cgroup subsystem described in
+Documentation/cgroup.txt.
+
+Requests by a task, using the sched_setaffinity(2) system call to
+include CPUs in its CPU affinity mask, and using the mbind(2) and
+set_mempolicy(2) system calls to include Memory Nodes in its memory
+policy, are both filtered through that tasks cpuset, filtering out any
+CPUs or Memory Nodes not in that cpuset. The scheduler will not
+schedule a task on a CPU that is not allowed in its cpus_allowed
+vector, and the kernel page allocator will not allocate a page on a
+node that is not allowed in the requesting tasks mems_allowed vector.
+
+User level code may create and destroy cpusets by name in the cgroup
virtual file system, manage the attributes and permissions of these
cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
specify and query to which cpuset a task is assigned, and list the
@@ -86,9 +89,6 @@ This can be especially valuable on:
and a database), or
* NUMA systems running large HPC applications with demanding
performance characteristics.
- * Also cpu_exclusive cpusets are useful for servers running orthogonal
- workloads such as RT applications requiring low latency and HPC
- applications that are throughput sensitive
These subsets, or "soft partitions" must be able to be dynamically
adjusted, as the job mix changes, without impacting other concurrently
@@ -117,7 +117,7 @@ Cpusets extends these two mechanisms as follows:
- Cpusets are sets of allowed CPUs and Memory Nodes, known to the
kernel.
- Each task in the system is attached to a cpuset, via a pointer
- in the task structure to a reference counted cpuset structure.
+ in the task structure to a reference counted cgroup structure.
- Calls to sched_setaffinity are filtered to just those CPUs
allowed in that tasks cpuset.
- Calls to mbind and set_mempolicy are filtered to just
@@ -131,8 +131,6 @@ Cpusets extends these two mechanisms as follows:
- A cpuset may be marked exclusive, which ensures that no other
cpuset (except direct ancestors and descendents) may contain
any overlapping CPUs or Memory Nodes.
- Also a cpu_exclusive cpuset would be associated with a sched
- domain.
- You can list all the tasks (by pid) attached to any cpuset.
The implementation of cpusets requires a few, simple hooks
@@ -144,23 +142,15 @@ into the rest of the kernel, none in performance critical paths:
allowed in that tasks cpuset.
- in sched.c migrate_all_tasks(), to keep migrating tasks within
the CPUs allowed by their cpuset, if possible.
- - in sched.c, a new API partition_sched_domains for handling
- sched domain changes associated with cpu_exclusive cpusets
- and related changes in both sched.c and arch/ia64/kernel/domain.c
- in the mbind and set_mempolicy system calls, to mask the requested
Memory Nodes by what's allowed in that tasks cpuset.
- in page_alloc.c, to restrict memory to allowed nodes.
- in vmscan.c, to restrict page recovery to the current cpuset.
-In addition a new file system, of type "cpuset" may be mounted,
-typically at /dev/cpuset, to enable browsing and modifying the cpusets
-presently known to the kernel. No new system calls are added for
-cpusets - all support for querying and modifying cpusets is via
-this cpuset file system.
-
-Each task under /proc has an added file named 'cpuset', displaying
-the cpuset name, as the path relative to the root of the cpuset file
-system.
+You should mount the "cgroup" filesystem type in order to enable
+browsing and modifying the cpusets presently known to the kernel. No
+new system calls are added for cpusets - all support for querying and
+modifying cpusets is via this cpuset file system.
The /proc/<pid>/status file for each task has two added lines,
displaying the tasks cpus_allowed (on which CPUs it may be scheduled)
@@ -170,16 +160,15 @@ in the format seen in the following example:
Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff
Mems_allowed: ffffffff,ffffffff
-Each cpuset is represented by a directory in the cpuset file system
-containing the following files describing that cpuset:
+Each cpuset is represented by a directory in the cgroup file system
+containing (on top of the standard cgroup files) the following
+files describing that cpuset:
- cpus: list of CPUs in that cpuset
- mems: list of Memory Nodes in that cpuset
- memory_migrate flag: if set, move pages to cpusets nodes
- cpu_exclusive flag: is cpu placement exclusive?
- mem_exclusive flag: is memory placement exclusive?
- - tasks: list of tasks (by pid) attached to that cpuset
- - notify_on_release flag: run /sbin/cpuset_release_agent on exit?
- memory_pressure: measure of how much paging pressure in cpuset
In addition, the root cpuset only has the following file:
@@ -220,8 +209,8 @@ and name space for cpusets, with a minimum of additional kernel code.
The cpus and mems files in the root (top_cpuset) cpuset are
read-only. The cpus file automatically tracks the value of
cpu_online_map using a CPU hotplug notifier, and the mems file
-automatically tracks the value of node_online_map using the
-cpuset_track_online_nodes() hook.
+automatically tracks the value of node_states[N_MEMORY]--i.e.,
+nodes with memory--using the cpuset_track_online_nodes() hook.
1.4 What are exclusive cpusets ?
@@ -231,15 +220,6 @@ If a cpuset is cpu or mem exclusive, no other cpuset, other than
a direct ancestor or descendent, may share any of the same CPUs or
Memory Nodes.
-A cpuset that is cpu_exclusive has a scheduler (sched) domain
-associated with it. The sched domain consists of all CPUs in the
-current cpuset that are not part of any exclusive child cpusets.
-This ensures that the scheduler load balancing code only balances
-against the CPUs that are in the sched domain as defined above and
-not all of the CPUs in the system. This removes any overhead due to
-load balancing code trying to pull tasks outside of the cpu_exclusive
-cpuset only to be prevented by the tasks' cpus_allowed mask.
-
A cpuset that is mem_exclusive restricts kernel allocations for
page, buffer and other data commonly shared by the kernel across
multiple users. All cpusets, whether mem_exclusive or not, restrict
@@ -253,21 +233,7 @@ such as requests from interrupt handlers, is allowed to be taken
outside even a mem_exclusive cpuset.
-1.5 What does notify_on_release do ?
-------------------------------------
-
-If the notify_on_release flag is enabled (1) in a cpuset, then whenever
-the last task in the cpuset leaves (exits or attaches to some other
-cpuset) and the last child cpuset of that cpuset is removed, then
-the kernel runs the command /sbin/cpuset_release_agent, supplying the
-pathname (relative to the mount point of the cpuset file system) of the
-abandoned cpuset. This enables automatic removal of abandoned cpusets.
-The default value of notify_on_release in the root cpuset at system
-boot is disabled (0). The default value of other cpusets at creation
-is the current value of their parents notify_on_release setting.
-
-
-1.6 What is memory_pressure ?
+1.5 What is memory_pressure ?
-----------------------------
The memory_pressure of a cpuset provides a simple per-cpuset metric
of the rate that the tasks in a cpuset are attempting to free up in
@@ -324,7 +290,7 @@ the tasks in the cpuset, in units of reclaims attempted per second,
times 1000.
-1.7 What is memory spread ?
+1.6 What is memory spread ?
---------------------------
There are two boolean flag files per cpuset that control where the
kernel allocates pages for the file system buffers and related in
@@ -394,6 +360,142 @@ policy, especially for jobs that might have one thread reading in the
data set, the memory allocation across the nodes in the jobs cpuset
can become very uneven.
+1.7 What is sched_load_balance ?
+--------------------------------
+
+The kernel scheduler (kernel/sched.c) automatically load balances
+tasks. If one CPU is underutilized, kernel code running on that
+CPU will look for tasks on other more overloaded CPUs and move those
+tasks to itself, within the constraints of such placement mechanisms
+as cpusets and sched_setaffinity.
+
+The algorithmic cost of load balancing and its impact on key shared
+kernel data structures such as the task list increases more than
+linearly with the number of CPUs being balanced. So the scheduler
+has support to partition the systems CPUs into a number of sched
+domains such that it only load balances within each sched domain.
+Each sched domain covers some subset of the CPUs in the system;
+no two sched domains overlap; some CPUs might not be in any sched
+domain and hence won't be load balanced.
+
+Put simply, it costs less to balance between two smaller sched domains
+than one big one, but doing so means that overloads in one of the
+two domains won't be load balanced to the other one.
+
+By default, there is one sched domain covering all CPUs, except those
+marked isolated using the kernel boot time "isolcpus=" argument.
+
+This default load balancing across all CPUs is not well suited for
+the following two situations:
+ 1) On large systems, load balancing across many CPUs is expensive.
+ If the system is managed using cpusets to place independent jobs
+ on separate sets of CPUs, full load balancing is unnecessary.
+ 2) Systems supporting realtime on some CPUs need to minimize
+ system overhead on those CPUs, including avoiding task load
+ balancing if that is not needed.
+
+When the per-cpuset flag "sched_load_balance" is enabled (the default
+setting), it requests that all the CPUs in that cpusets allowed 'cpus'
+be contained in a single sched domain, ensuring that load balancing
+can move a task (not otherwised pinned, as by sched_setaffinity)
+from any CPU in that cpuset to any other.
+
+When the per-cpuset flag "sched_load_balance" is disabled, then the
+scheduler will avoid load balancing across the CPUs in that cpuset,
+--except-- in so far as is necessary because some overlapping cpuset
+has "sched_load_balance" enabled.
+
+So, for example, if the top cpuset has the flag "sched_load_balance"
+enabled, then the scheduler will have one sched domain covering all
+CPUs, and the setting of the "sched_load_balance" flag in any other
+cpusets won't matter, as we're already fully load balancing.
+
+Therefore in the above two situations, the top cpuset flag
+"sched_load_balance" should be disabled, and only some of the smaller,
+child cpusets have this flag enabled.
+
+When doing this, you don't usually want to leave any unpinned tasks in
+the top cpuset that might use non-trivial amounts of CPU, as such tasks
+may be artificially constrained to some subset of CPUs, depending on
+the particulars of this flag setting in descendent cpusets. Even if
+such a task could use spare CPU cycles in some other CPUs, the kernel
+scheduler might not consider the possibility of load balancing that
+task to that underused CPU.
+
+Of course, tasks pinned to a particular CPU can be left in a cpuset
+that disables "sched_load_balance" as those tasks aren't going anywhere
+else anyway.
+
+There is an impedance mismatch here, between cpusets and sched domains.
+Cpusets are hierarchical and nest. Sched domains are flat; they don't
+overlap and each CPU is in at most one sched domain.
+
+It is necessary for sched domains to be flat because load balancing
+across partially overlapping sets of CPUs would risk unstable dynamics
+that would be beyond our understanding. So if each of two partially
+overlapping cpusets enables the flag 'sched_load_balance', then we
+form a single sched domain that is a superset of both. We won't move
+a task to a CPU outside it cpuset, but the scheduler load balancing
+code might waste some compute cycles considering that possibility.
+
+This mismatch is why there is not a simple one-to-one relation
+between which cpusets have the flag "sched_load_balance" enabled,
+and the sched domain configuration. If a cpuset enables the flag, it
+will get balancing across all its CPUs, but if it disables the flag,
+it will only be assured of no load balancing if no other overlapping
+cpuset enables the flag.
+
+If two cpusets have partially overlapping 'cpus' allowed, and only
+one of them has this flag enabled, then the other may find its
+tasks only partially load balanced, just on the overlapping CPUs.
+This is just the general case of the top_cpuset example given a few
+paragraphs above. In the general case, as in the top cpuset case,
+don't leave tasks that might use non-trivial amounts of CPU in
+such partially load balanced cpusets, as they may be artificially
+constrained to some subset of the CPUs allowed to them, for lack of
+load balancing to the other CPUs.
+
+1.7.1 sched_load_balance implementation details.
+------------------------------------------------
+
+The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary
+to most cpuset flags.) When enabled for a cpuset, the kernel will
+ensure that it can load balance across all the CPUs in that cpuset
+(makes sure that all the CPUs in the cpus_allowed of that cpuset are
+in the same sched domain.)
+
+If two overlapping cpusets both have 'sched_load_balance' enabled,
+then they will be (must be) both in the same sched domain.
+
+If, as is the default, the top cpuset has 'sched_load_balance' enabled,
+then by the above that means there is a single sched domain covering
+the whole system, regardless of any other cpuset settings.
+
+The kernel commits to user space that it will avoid load balancing
+where it can. It will pick as fine a granularity partition of sched
+domains as it can while still providing load balancing for any set
+of CPUs allowed to a cpuset having 'sched_load_balance' enabled.
+
+The internal kernel cpuset to scheduler interface passes from the
+cpuset code to the scheduler code a partition of the load balanced
+CPUs in the system. This partition is a set of subsets (represented
+as an array of cpumask_t) of CPUs, pairwise disjoint, that cover all
+the CPUs that must be load balanced.
+
+Whenever the 'sched_load_balance' flag changes, or CPUs come or go
+from a cpuset with this flag enabled, or a cpuset with this flag
+enabled is removed, the cpuset code builds a new such partition and
+passes it to the scheduler sched domain setup code, to have the sched
+domains rebuilt as necessary.
+
+This partition exactly defines what sched domains the scheduler should
+setup - one sched domain for each element (cpumask_t) in the partition.
+
+The scheduler remembers the currently active sched domain partitions.
+When the scheduler routine partition_sched_domains() is invoked from
+the cpuset code to update these sched domains, it compares the new
+partition requested with the current, and updates its sched domains,
+removing the old and adding the new, for each change.
1.8 How do I use cpusets ?
--------------------------
@@ -485,7 +587,7 @@ than stress the kernel.
To start a new job that is to be contained within a cpuset, the steps are:
1) mkdir /dev/cpuset
- 2) mount -t cpuset none /dev/cpuset
+ 2) mount -t cgroup -ocpuset cpuset /dev/cpuset
3) Create the new cpuset by doing mkdir's and write's (or echo's) in
the /dev/cpuset virtual file system.
4) Start a task that will be the "founding father" of the new job.
@@ -497,7 +599,7 @@ For example, the following sequence of commands will setup a cpuset
named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
and then start a subshell 'sh' in that cpuset:
- mount -t cpuset none /dev/cpuset
+ mount -t cgroup -ocpuset cpuset /dev/cpuset
cd /dev/cpuset
mkdir Charlie
cd Charlie
@@ -529,7 +631,7 @@ Creating, modifying, using the cpusets can be done through the cpuset
virtual filesystem.
To mount it, type:
-# mount -t cpuset none /dev/cpuset
+# mount -t cgroup -o cpuset cpuset /dev/cpuset
Then under /dev/cpuset you can find a tree that corresponds to the
tree of the cpusets in the system. For instance, /dev/cpuset
@@ -572,6 +674,18 @@ To remove a cpuset, just use rmdir:
This will fail if the cpuset is in use (has cpusets inside, or has
processes attached).
+Note that for legacy reasons, the "cpuset" filesystem exists as a
+wrapper around the cgroup filesystem.
+
+The command
+
+mount -t cpuset X /dev/cpuset
+
+is equivalent to
+
+mount -t cgroup -ocpuset X /dev/cpuset
+echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent
+
2.2 Adding/removing cpus
------------------------
diff --git a/Documentation/device-mapper/dm-uevent.txt b/Documentation/device-mapper/dm-uevent.txt
new file mode 100644
index 000000000000..07edbd85c714
--- /dev/null
+++ b/Documentation/device-mapper/dm-uevent.txt
@@ -0,0 +1,97 @@
+The device-mapper uevent code adds the capability to device-mapper to create
+and send kobject uevents (uevents). Previously device-mapper events were only
+available through the ioctl interface. The advantage of the uevents interface
+is the event contains environment attributes providing increased context for
+the event avoiding the need to query the state of the device-mapper device after
+the event is received.
+
+There are two functions currently for device-mapper events. The first function
+listed creates the event and the second function sends the event(s).
+
+void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti,
+ const char *path, unsigned nr_valid_paths)
+
+void dm_send_uevents(struct list_head *events, struct kobject *kobj)
+
+
+The variables added to the uevent environment are:
+
+Variable Name: DM_TARGET
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description:
+Value: Name of device-mapper target that generated the event.
+
+Variable Name: DM_ACTION
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description:
+Value: Device-mapper specific action that caused the uevent action.
+ PATH_FAILED - A path has failed.
+ PATH_REINSTATED - A path has been reinstated.
+
+Variable Name: DM_SEQNUM
+Uevent Action(s): KOBJ_CHANGE
+Type: unsigned integer
+Description: A sequence number for this specific device-mapper device.
+Value: Valid unsigned integer range.
+
+Variable Name: DM_PATH
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description: Major and minor number of the path device pertaining to this
+event.
+Value: Path name in the form of "Major:Minor"
+
+Variable Name: DM_NR_VALID_PATHS
+Uevent Action(s): KOBJ_CHANGE
+Type: unsigned integer
+Description:
+Value: Valid unsigned integer range.
+
+Variable Name: DM_NAME
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description: Name of the device-mapper device.
+Value: Name
+
+Variable Name: DM_UUID
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description: UUID of the device-mapper device.
+Value: UUID. (Empty string if there isn't one.)
+
+An example of the uevents generated as captured by udevmonitor is shown
+below.
+
+1.) Path failure.
+UEVENT[1192521009.711215] change@/block/dm-3
+ACTION=change
+DEVPATH=/block/dm-3
+SUBSYSTEM=block
+DM_TARGET=multipath
+DM_ACTION=PATH_FAILED
+DM_SEQNUM=1
+DM_PATH=8:32
+DM_NR_VALID_PATHS=0
+DM_NAME=mpath2
+DM_UUID=mpath-35333333000002328
+MINOR=3
+MAJOR=253
+SEQNUM=1130
+
+2.) Path reinstate.
+UEVENT[1192521132.989927] change@/block/dm-3
+ACTION=change
+DEVPATH=/block/dm-3
+SUBSYSTEM=block
+DM_TARGET=multipath
+DM_ACTION=PATH_REINSTATED
+DM_SEQNUM=2
+DM_PATH=8:32
+DM_NR_VALID_PATHS=1
+DM_NAME=mpath2
+DM_UUID=mpath-35333333000002328
+MINOR=3
+MAJOR=253
+SEQNUM=1131
diff --git a/Documentation/devices.txt b/Documentation/devices.txt
index 6c46730c631a..e6244cde26e9 100644
--- a/Documentation/devices.txt
+++ b/Documentation/devices.txt
@@ -2188,7 +2188,7 @@ Your cooperation is appreciated.
136-143 char Unix98 PTY slaves
0 = /dev/pts/0 First Unix98 pseudo-TTY
- 1 = /dev/pts/1 Second Unix98 pesudo-TTY
+ 1 = /dev/pts/1 Second Unix98 pseudo-TTY
...
These device nodes are automatically generated with
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 7b9551fc6fe3..f2d658a6a942 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -42,6 +42,9 @@
*.9.gz
.*
.cscope
+.gitignore
+.mailmap
+.mm
53c700_d.h
53c7xx_d.h
53c7xx_u.h
@@ -121,7 +124,6 @@ kxgettext
lkc_defs.h
lex.c*
lex.*.c
-lk201-map.c
logo_*.c
logo_*_clut224.c
logo_*_mono.c
@@ -176,11 +178,13 @@ times.h*
tkparse
trix_boot.h
utsrelease.h*
+vdso.lds
version.h*
vmlinux
vmlinux-*
vmlinux.aout
-vmlinux.lds
+vmlinux*.lds*
+vmlinux*.scr
vsyscall.lds
wanxlfw.inc
uImage
diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
index 8569072fa387..387b8a720f4a 100644
--- a/Documentation/driver-model/devres.txt
+++ b/Documentation/driver-model/devres.txt
@@ -32,7 +32,7 @@ braindamaged document, if it's finally working, well, it's working.
For one reason or another, low level drivers don't receive as much
attention or testing as core code, and bugs on driver detach or
-initilaization failure doesn't happen often enough to be noticeable.
+initialization failure don't happen often enough to be noticeable.
Init failure path is worse because it's much less travelled while
needs to handle multiple entry points.
@@ -160,7 +160,7 @@ resources on failure. For example,
devres_release_group(dev, NULL);
return err_code;
-As resource acquision failure usually means probe failure, constructs
+As resource acquisition failure usually means probe failure, constructs
like above are usually useful in midlayer driver (e.g. libata core
layer) where interface function shouldn't have side effect on failure.
For LLDs, just returning error code suffices in most cases.
diff --git a/Documentation/dvb/faq.txt b/Documentation/dvb/faq.txt
index dbcedf5833ee..2511a335abd6 100644
--- a/Documentation/dvb/faq.txt
+++ b/Documentation/dvb/faq.txt
@@ -150,7 +150,7 @@ Some very frequently asked questions about linuxtv-dvb
- saa7146_vv: SAA7146 video and vbi functions. These are only needed
for full-featured cards.
- - video-buf: capture helper module for the saa7146_vv driver. This
+ - videobuf-dma-sg: capture helper module for the saa7146_vv driver. This
one is responsible to handle capture buffers.
- dvb-ttpci: The main driver for AV7110 based, full-featured
diff --git a/Documentation/early-userspace/README b/Documentation/early-userspace/README
index cddbac456c29..766d320c8eb6 100644
--- a/Documentation/early-userspace/README
+++ b/Documentation/early-userspace/README
@@ -19,7 +19,7 @@ It consists of several major infrastructure components:
- klibc, a userspace C library, currently packaged separately, that is
optimized for correctness and small size.
-The cpio file format used by initramfs is the "newc" (aka "cpio -c")
+The cpio file format used by initramfs is the "newc" (aka "cpio -H newc")
format, and is documented in the file "buffer-format.txt". There are
two ways to add an early userspace image: specify an existing cpio
archive to be used as the image or have the kernel build process build
@@ -44,7 +44,7 @@ The image is specified as one or more sources in
CONFIG_INITRAMFS_SOURCE. Sources can be either directories or files -
cpio archives are *not* allowed when building from sources.
-A source directory will have it and all of it's contents packaged. The
+A source directory will have it and all of its contents packaged. The
specified directory name will be mapped to '/'. When packaging a
directory, limited user and group ID translation can be performed.
INITRAMFS_ROOT_UID can be set to a user ID that needs to be mapped to
@@ -144,7 +144,7 @@ c) using initramfs. The call to prepare_namespace() must be skipped.
initrd format, an cpio archive. It must be called "/init". This binary
is responsible to do all the things prepare_namespace() would do.
- To remain backwards compatibility, the /init binary will only run if it
+ To maintain backwards compatibility, the /init binary will only run if it
comes via an initramfs cpio archive. If this is not the case,
init/main.c:init() will run prepare_namespace() to mount the final root
and exec one of the predefined init binaries.
diff --git a/Documentation/email-clients.txt b/Documentation/email-clients.txt
new file mode 100644
index 000000000000..113165b48305
--- /dev/null
+++ b/Documentation/email-clients.txt
@@ -0,0 +1,217 @@
+Email clients info for Linux
+======================================================================
+
+General Preferences
+----------------------------------------------------------------------
+Patches for the Linux kernel are submitted via email, preferably as
+inline text in the body of the email. Some maintainers accept
+attachments, but then the attachments should have content-type
+"text/plain". However, attachments are generally frowned upon because
+it makes quoting portions of the patch more difficult in the patch
+review process.
+
+Email clients that are used for Linux kernel patches should send the
+patch text untouched. For example, they should not modify or delete tabs
+or spaces, even at the beginning or end of lines.
+
+Don't send patches with "format=flowed". This can cause unexpected
+and unwanted line breaks.
+
+Don't let your email client do automatic word wrapping for you.
+This can also corrupt your patch.
+
+Email clients should not modify the character set encoding of the text.
+Emailed patches should be in ASCII or UTF-8 encoding only.
+If you configure your email client to send emails with UTF-8 encoding,
+you avoid some possible charset problems.
+
+Email clients should generate and maintain References: or In-Reply-To:
+headers so that mail threading is not broken.
+
+Copy-and-paste (or cut-and-paste) usually does not work for patches
+because tabs are converted to spaces. Using xclipboard, xclip, and/or
+xcutsel may work, but it's best to test this for yourself or just avoid
+copy-and-paste.
+
+Don't use PGP/GPG signatures in mail that contains patches.
+This breaks many scripts that read and apply the patches.
+(This should be fixable.)
+
+It's a good idea to send a patch to yourself, save the received message,
+and successfully apply it with 'patch' before sending patches to Linux
+mailing lists.
+
+
+Some email client (MUA) hints
+----------------------------------------------------------------------
+Here are some specific MUA configuration hints for editing and sending
+patches for the Linux kernel. These are not meant to be complete
+software package configuration summaries.
+
+Legend:
+TUI = text-based user interface
+GUI = graphical user interface
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Alpine (TUI)
+
+Config options:
+In the "Sending Preferences" section:
+
+- "Do Not Send Flowed Text" must be enabled
+- "Strip Whitespace Before Sending" must be disabled
+
+When composing the message, the cursor should be placed where the patch
+should appear, and then pressing CTRL-R let you specify the patch file
+to insert into the message.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Evolution (GUI)
+
+Some people use this successfully for patches.
+
+When composing mail select: Preformat
+ from Format->Heading->Preformatted (Ctrl-7)
+ or the toolbar
+
+Then use:
+ Insert->Text File... (Alt-n x)
+to insert the patch.
+
+You can also "diff -Nru old.c new.c | xclip", select Preformat, then
+paste with the middle button.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Kmail (GUI)
+
+Some people use Kmail successfully for patches.
+
+The default setting of not composing in HTML is appropriate; do not
+enable it.
+
+When composing an email, under options, uncheck "word wrap". The only
+disadvantage is any text you type in the email will not be word-wrapped
+so you will have to manually word wrap text before the patch. The easiest
+way around this is to compose your email with word wrap enabled, then save
+it as a draft. Once you pull it up again from your drafts it is now hard
+word-wrapped and you can uncheck "word wrap" without losing the existing
+wrapping.
+
+At the bottom of your email, put the commonly-used patch delimiter before
+inserting your patch: three hyphens (---).
+
+Then from the "Message" menu item, select insert file and choose your patch.
+As an added bonus you can customise the message creation toolbar menu
+and put the "insert file" icon there.
+
+You can safely GPG sign attachments, but inlined text is preferred for
+patches so do not GPG sign them. Signing patches that have been inserted
+as inlined text will make them tricky to extract from their 7-bit encoding.
+
+If you absolutely must send patches as attachments instead of inlining
+them as text, right click on the attachment and select properties, and
+highlight "Suggest automatic display" to make the attachment inlined to
+make it more viewable.
+
+When saving patches that are sent as inlined text, select the email that
+contains the patch from the message list pane, right click and select
+"save as". You can use the whole email unmodified as a patch if it was
+properly composed. There is no option currently to save the email when you
+are actually viewing it in its own window -- there has been a request filed
+at kmail's bugzilla and hopefully this will be addressed. Emails are saved
+as read-write for user only so you will have to chmod them to make them
+group and world readable if you copy them elsewhere.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Lotus Notes (GUI)
+
+Run away from it.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Mutt (TUI)
+
+Plenty of Linux developers use mutt, so it must work pretty well.
+
+Mutt doesn't come with an editor, so whatever editor you use should be
+used in a way that there are no automatic linebreaks. Most editors have
+an "insert file" option that inserts the contents of a file unaltered.
+
+To use 'vim' with mutt:
+ set editor="vi"
+
+ If using xclip, type the command
+ :set paste
+ before middle button or shift-insert or use
+ :r filename
+
+if you want to include the patch inline.
+(a)ttach works fine without "set paste".
+
+Config options:
+It should work with default settings.
+However, it's a good idea to set the "send_charset" to:
+ set send_charset="us-ascii:utf-8"
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Pine (TUI)
+
+Pine has had some whitespace truncation issues in the past, but these
+should all be fixed now.
+
+Use alpine (pine's successor) if you can.
+
+Config options:
+- quell-flowed-text is needed for recent versions
+- the "no-strip-whitespace-before-send" option is needed
+
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Sylpheed (GUI)
+
+- Works well for inlining text (or using attachments).
+- Allows use of an external editor.
+- Not good for IMAP.
+- Is slow on large folders.
+- Won't do TLS SMTP auth over a non-SSL connection.
+- Has a helpful ruler bar in the compose window.
+- Adding addresses to address book doesn't understand the display name
+ properly.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Thunderbird (GUI)
+
+By default, thunderbird likes to mangle text, but there are ways to
+coerce it into being nice.
+
+- Under account settings, composition and addressing, uncheck "Compose
+ messages in HTML format".
+
+- Edit your Thunderbird config settings to tell it not to wrap lines:
+ user_pref("mailnews.wraplength", 0);
+
+- Edit your Thunderbird config settings so that it won't use format=flowed:
+ user_pref("mailnews.send_plaintext_flowed", false);
+
+- You need to get Thunderbird into preformat mode:
+. If you compose HTML messages by default, it's not too hard. Just select
+ "Preformat" from the drop-down box just under the subject line.
+. If you compose in text by default, you have to tell it to compose a new
+ message in HTML (just as a one-off), and then force it from there back to
+ text, else it will wrap lines. To do this, use shift-click on the Write
+ icon to compose to get HTML compose mode, then select "Preformat" from
+ the drop-down box just under the subject line.
+
+- Allows use of an external editor:
+ The easiest thing to do with Thunderbird and patches is to use an
+ "external editor" extension and then just use your favorite $EDITOR
+ for reading/merging patches into the body text. To do this, download
+ and install the extension, then add a button for it using
+ View->Toolbars->Customize... and finally just click on it when in the
+ Compose dialog.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+TkRat (GUI)
+
+Works. Use "Insert file..." or external editor.
+
+ ###
diff --git a/Documentation/fb/00-INDEX b/Documentation/fb/00-INDEX
index 92e89aeef52e..caabbd395e61 100644
--- a/Documentation/fb/00-INDEX
+++ b/Documentation/fb/00-INDEX
@@ -5,21 +5,49 @@ please mail me.
00-INDEX
- this file
+arkfb.txt
+ - info on the fbdev driver for ARK Logic chips.
+aty128fb.txt
+ - info on the ATI Rage128 frame buffer driver.
+cirrusfb.txt
+ - info on the driver for Cirrus Logic chipsets.
+cyblafb/
+ - directory with documentation files related to the cyblafb driver.
+deferred_io.txt
+ - an introduction to deferred IO.
+fbcon.txt
+ - intro to and usage guide for the framebuffer console (fbcon).
framebuffer.txt
- - introduction to frame buffer devices
+ - introduction to frame buffer devices.
+imacfb.txt
+ - info on the generic EFI platform driver for Intel based Macs.
+intel810.txt
+ - documentation for the Intel 810/815 framebuffer driver.
+intelfb.txt
+ - docs for Intel 830M/845G/852GM/855GM/865G/915G/945G fb driver.
internals.txt
- - quick overview of frame buffer device internals
+ - quick overview of frame buffer device internals.
+matroxfb.txt
+ - info on the Matrox framebuffer driver for Alpha, Intel and PPC.
modedb.txt
- - info on the video mode database
-aty128fb.txt
- - info on the ATI Rage128 frame buffer driver
-clgenfb.txt
- - info on the Cirrus Logic frame buffer driver
+ - info on the video mode database.
matroxfb.txt
- - info on the Matrox frame buffer driver
+ - info on the Matrox frame buffer driver.
pvr2fb.txt
- - info on the PowerVR 2 frame buffer driver
+ - info on the PowerVR 2 frame buffer driver.
+pxafb.txt
+ - info on the driver for the PXA25x LCD controller.
+s3fb.txt
+ - info on the fbdev driver for S3 Trio/Virge chips.
+sa1100fb.txt
+ - information about the driver for the SA-1100 LCD controller.
+sisfb.txt
+ - info on the framebuffer device driver for various SiS chips.
+sstfb.txt
+ - info on the frame buffer driver for 3dfx' Voodoo Graphics boards.
tgafb.txt
- info on the TGA (DECChip 21030) frame buffer driver
vesafb.txt
- info on the VESA frame buffer device
+vt8623fb.txt
+ - info on the fb driver for the graphics core in VIA VT8623 chipsets.
diff --git a/Documentation/fb/deferred_io.txt b/Documentation/fb/deferred_io.txt
index 73cf9fb7cf60..63883a892120 100644
--- a/Documentation/fb/deferred_io.txt
+++ b/Documentation/fb/deferred_io.txt
@@ -3,7 +3,7 @@ Deferred IO
Deferred IO is a way to delay and repurpose IO. It uses host memory as a
buffer and the MMU pagefault as a pretrigger for when to perform the device
-IO. The following example may be a useful explaination of how one such setup
+IO. The following example may be a useful explanation of how one such setup
works:
- userspace app like Xfbdev mmaps framebuffer
@@ -28,7 +28,7 @@ a relatively more expensive operation.
For some types of nonvolatile high latency displays, the desired image is
the final image rather than the intermediate stages which is why it's okay
-to not update for each write that is occuring.
+to not update for each write that is occurring.
It may be the case that this is useful in other scenarios as well. Paul Mundt
has mentioned a case where it is beneficial to use the page count to decide
diff --git a/Documentation/fb/uvesafb.txt b/Documentation/fb/uvesafb.txt
new file mode 100644
index 000000000000..bcfc233a0080
--- /dev/null
+++ b/Documentation/fb/uvesafb.txt
@@ -0,0 +1,188 @@
+
+uvesafb - A Generic Driver for VBE2+ compliant video cards
+==========================================================
+
+1. Requirements
+---------------
+
+uvesafb should work with any video card that has a Video BIOS compliant
+with the VBE 2.0 standard.
+
+Unlike other drivers, uvesafb makes use of a userspace helper called
+v86d. v86d is used to run the x86 Video BIOS code in a simulated and
+controlled environment. This allows uvesafb to function on arches other
+than x86. Check the v86d documentation for a list of currently supported
+arches.
+
+v86d source code can be downloaded from the following website:
+ http://dev.gentoo.org/~spock/projects/uvesafb
+
+Please refer to the v86d documentation for detailed configuration and
+installation instructions.
+
+Note that the v86d userspace helper has to be available at all times in
+order for uvesafb to work properly. If you want to use uvesafb during
+early boot, you will have to include v86d into an initramfs image, and
+either compile it into the kernel or use it as an initrd.
+
+2. Caveats and limitations
+--------------------------
+
+uvesafb is a _generic_ driver which supports a wide variety of video
+cards, but which is ultimately limited by the Video BIOS interface.
+The most important limitations are:
+
+- Lack of any type of acceleration.
+- A strict and limited set of supported video modes. Often the native
+ or most optimal resolution/refresh rate for your setup will not work
+ with uvesafb, simply because the Video BIOS doesn't support the
+ video mode you want to use. This can be especially painful with
+ widescreen panels, where native video modes don't have the 4:3 aspect
+ ratio, which is what most BIOS-es are limited to.
+- Adjusting the refresh rate is only possible with a VBE 3.0 compliant
+ Video BIOS. Note that many nVidia Video BIOS-es claim to be VBE 3.0
+ compliant, while they simply ignore any refresh rate settings.
+
+3. Configuration
+----------------
+
+uvesafb can be compiled either as a module, or directly into the kernel.
+In both cases it supports the same set of configuration options, which
+are either given on the kernel command line or as module parameters, e.g.:
+
+ video=uvesafb:1024x768-32,mtrr:3,ywrap (compiled into the kernel)
+
+ # modprobe uvesafb mode=1024x768-32 mtrr=3 scroll=ywrap (module)
+
+Accepted options:
+
+ypan Enable display panning using the VESA protected mode
+ interface. The visible screen is just a window of the
+ video memory, console scrolling is done by changing the
+ start of the window. Available on x86 only.
+
+ywrap Same as ypan, but assumes your gfx board can wrap-around
+ the video memory (i.e. starts reading from top if it
+ reaches the end of video memory). Faster than ypan.
+ Available on x86 only.
+
+redraw Scroll by redrawing the affected part of the screen, this
+ is the safe (and slow) default.
+
+(If you're using uvesafb as a module, the above three options are
+ used a parameter of the scroll option, e.g. scroll=ypan.)
+
+vgapal Use the standard VGA registers for palette changes.
+
+pmipal Use the protected mode interface for palette changes.
+ This is the default if the protected mode interface is
+ available. Available on x86 only.
+
+mtrr:n Setup memory type range registers for the framebuffer
+ where n:
+ 0 - disabled (equivalent to nomtrr) (default)
+ 1 - uncachable
+ 2 - write-back
+ 3 - write-combining
+ 4 - write-through
+
+ If you see the following in dmesg, choose the type that matches
+ the old one. In this example, use "mtrr:2".
+...
+mtrr: type mismatch for e0000000,8000000 old: write-back new: write-combining
+...
+
+nomtrr Do not use memory type range registers.
+
+vremap:n
+ Remap 'n' MiB of video RAM. If 0 or not specified, remap memory
+ according to video mode.
+
+vtotal:n
+ If the video BIOS of your card incorrectly determines the total
+ amount of video RAM, use this option to override the BIOS (in MiB).
+
+<mode> The mode you want to set, in the standard modedb format. Refer to
+ modedb.txt for a detailed description. When uvesafb is compiled as
+ a module, the mode string should be provided as a value of the
+ 'mode' option.
+
+vbemode:x
+ Force the use of VBE mode x. The mode will only be set if it's
+ found in the VBE-provided list of supported modes.
+ NOTE: The mode number 'x' should be specified in VESA mode number
+ notation, not the Linux kernel one (eg. 257 instead of 769).
+ HINT: If you use this option because normal <mode> parameter does
+ not work for you and you use a X server, you'll probably want to
+ set the 'nocrtc' option to ensure that the video mode is properly
+ restored after console <-> X switches.
+
+nocrtc Do not use CRTC timings while setting the video mode. This option
+ has any effect only if the Video BIOS is VBE 3.0 compliant. Use it
+ if you have problems with modes set the standard way. Note that
+ using this option implies that any refresh rate adjustments will
+ be ignored and the refresh rate will stay at your BIOS default (60 Hz).
+
+noedid Do not try to fetch and use EDID-provided modes.
+
+noblank Disable hardware blanking.
+
+v86d:path
+ Set path to the v86d executable. This option is only available as
+ a module parameter, and not as a part of the video= string. If you
+ need to use it and have uvesafb built into the kernel, use
+ uvesafb.v86d="path".
+
+Additionally, the following parameters may be provided. They all override the
+EDID-provided values and BIOS defaults. Refer to your monitor's specs to get
+the correct values for maxhf, maxvf and maxclk for your hardware.
+
+maxhf:n Maximum horizontal frequency (in kHz).
+maxvf:n Maximum vertical frequency (in Hz).
+maxclk:n Maximum pixel clock (in MHz).
+
+4. The sysfs interface
+----------------------
+
+uvesafb provides several sysfs nodes for configurable parameters and
+additional information.
+
+Driver attributes:
+
+/sys/bus/platform/drivers/uvesafb
+ - v86d (default: /sbin/v86d)
+ Path to the v86d executable. v86d is started by uvesafb
+ if an instance of the daemon isn't already running.
+
+Device attributes:
+
+/sys/bus/platform/drivers/uvesafb/uvesafb.0
+ - nocrtc
+ Use the default refresh rate (60 Hz) if set to 1.
+
+ - oem_product_name
+ - oem_product_rev
+ - oem_string
+ - oem_vendor
+ Information about the card and its maker.
+
+ - vbe_modes
+ A list of video modes supported by the Video BIOS along with their
+ VBE mode numbers in hex.
+
+ - vbe_version
+ A BCD value indicating the implemented VBE standard.
+
+5. Miscellaneous
+----------------
+
+Uvesafb will set a video mode with the default refresh rate and timings
+from the Video BIOS if you set pixclock to 0 in fb_var_screeninfo.
+
+
+--
+ Michal Januszewski <spock@gentoo.org>
+ Last updated: 2007-06-16
+
+ Documentation of the uvesafb options is loosely based on vesafb.txt.
+
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 00928d2ecfb2..6bb9be54ab76 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -14,18 +14,6 @@ Who: Jiri Slaby <jirislaby@gmail.com>
---------------------------
-What: V4L2 VIDIOC_G_MPEGCOMP and VIDIOC_S_MPEGCOMP
-When: October 2007
-Why: Broken attempt to set MPEG compression parameters. These ioctls are
- not able to implement the wide variety of parameters that can be set
- by hardware MPEG encoders. A new MPEG control mechanism was created
- in kernel 2.6.18 that replaces these ioctls. See the V4L2 specification
- (section 1.9: Extended controls) for more information on this topic.
-Who: Hans Verkuil <hverkuil@xs4all.nl> and
- Mauro Carvalho Chehab <mchehab@infradead.org>
-
----------------------------
-
What: dev->power.power_state
When: July 2007
Why: Broken design for runtime control over driver power states, confusing
@@ -49,10 +37,10 @@ Who: David Miller <davem@davemloft.net>
---------------------------
What: Video4Linux API 1 ioctls and video_decoder.h from Video devices.
-When: December 2006
-Files: include/linux/video_decoder.h
-Check: include/linux/video_decoder.h
-Why: V4L1 AP1 was replaced by V4L2 API. during migration from 2.4 to 2.6
+When: December 2008
+Files: include/linux/video_decoder.h include/linux/videodev.h
+Check: include/linux/video_decoder.h include/linux/videodev.h
+Why: V4L1 AP1 was replaced by V4L2 API during migration from 2.4 to 2.6
series. The old API have lots of drawbacks and don't provide enough
means to work with all video and audio standards. The newer API is
already available on the main drivers and should be used instead.
@@ -61,7 +49,9 @@ Why: V4L1 AP1 was replaced by V4L2 API. during migration from 2.4 to 2.6
Decoder iocts are using internally to allow video drivers to
communicate with video decoders. This should also be improved to allow
V4L2 calls being translated into compatible internal ioctls.
-Who: Mauro Carvalho Chehab <mchehab@brturbo.com.br>
+ Compatibility ioctls will be provided, for a while, via
+ v4l1-compat module.
+Who: Mauro Carvalho Chehab <mchehab@infradead.org>
---------------------------
@@ -82,6 +72,52 @@ Who: Dominik Brodowski <linux@brodo.de>
---------------------------
+What: sys_sysctl
+When: September 2010
+Option: CONFIG_SYSCTL_SYSCALL
+Why: The same information is available in a more convenient from
+ /proc/sys, and none of the sysctl variables appear to be
+ important performance wise.
+
+ Binary sysctls are a long standing source of subtle kernel
+ bugs and security issues.
+
+ When I looked several months ago all I could find after
+ searching several distributions were 5 user space programs and
+ glibc (which falls back to /proc/sys) using this syscall.
+
+ The man page for sysctl(2) documents it as unusable for user
+ space programs.
+
+ sysctl(2) is not generally ABI compatible to a 32bit user
+ space application on a 64bit and a 32bit kernel.
+
+ For the last several months the policy has been no new binary
+ sysctls and no one has put forward an argument to use them.
+
+ Binary sysctls issues seem to keep happening appearing so
+ properly deprecating them (with a warning to user space) and a
+ 2 year grace warning period will mean eventually we can kill
+ them and end the pain.
+
+ In the mean time individual binary sysctls can be dealt with
+ in a piecewise fashion.
+
+Who: Eric Biederman <ebiederm@xmission.com>
+
+---------------------------
+
+What: a.out interpreter support for ELF executables
+When: 2.6.25
+Files: fs/binfmt_elf.c
+Why: Using a.out interpreters for ELF executables was a feature for
+ transition from a.out to ELF. But now it is unlikely to be still
+ needed anymore and removing it would simplify the hairy ELF
+ loader code.
+Who: Andi Kleen <ak@suse.de>
+
+---------------------------
+
What: remove EXPORT_SYMBOL(kernel_thread)
When: August 2006
Files: arch/*/kernel/*_ksyms.c
@@ -173,13 +209,6 @@ Who: Jean Delvare <khali@linux-fr.org>,
---------------------------
-What: drivers depending on OBSOLETE_OSS
-When: options in 2.6.22, code in 2.6.24
-Why: OSS drivers with ALSA replacements
-Who: Adrian Bunk <bunk@stusta.de>
-
----------------------------
-
What: ACPI procfs interface
When: July 2008
Why: ACPI sysfs conversion should be finished by January 2008.
@@ -205,20 +234,6 @@ Who: Len Brown <len.brown@intel.com>
---------------------------
-What: Compaq touchscreen device emulation
-When: Oct 2007
-Files: drivers/input/tsdev.c
-Why: The code says it was obsolete when it was written in 2001.
- tslib is a userspace library which does anything tsdev can do and
- much more besides in userspace where this code belongs. There is no
- longer any need for tsdev and applications should have converted to
- use tslib by now.
- The name "tsdev" is also extremely confusing and lots of people have
- it loaded when they don't need/use it.
-Who: Richard Purdie <rpurdie@rpsys.net>
-
----------------------------
-
What: i2c-ixp2000, i2c-ixp4xx and scx200_i2c drivers
When: September 2007
Why: Obsolete. The new i2c-gpio driver replaces all hardware-specific
@@ -306,3 +321,24 @@ Why: In kernel tree version of driver is unmaintained. Sk98lin driver
Who: Stephen Hemminger <shemminger@linux-foundation.org>
---------------------------
+
+What: i386/x86_64 bzImage symlinks
+When: April 2008
+
+Why: The i386/x86_64 merge provides a symlink to the old bzImage
+ location so not yet updated user space tools, e.g. package
+ scripts, do not break.
+Who: Thomas Gleixner <tglx@linutronix.de>
+
+---------------------------
+
+What: shaper network driver
+When: January 2008
+Files: drivers/net/shaper.c, include/linux/if_shaper.h
+Why: This driver has been marked obsolete for many years.
+ It was only designed to work on lower speed links and has design
+ flaws that lead to machine crashes. The qdisc infrastructure in
+ 2.4 or later kernels, provides richer features and is more robust.
+Who: Stephen Hemminger <shemminger@linux-foundation.org>
+
+---------------------------
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 59db1bca7027..1de155e2dc36 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -44,14 +44,24 @@ files.txt
- info on file management in the Linux kernel.
fuse.txt
- info on the Filesystem in User SpacE including mount options.
+gfs2.txt
+ - info on the Global File System 2.
hfs.txt
- info on the Macintosh HFS Filesystem for Linux.
+hfsplus.txt
+ - info on the Macintosh HFSPlus Filesystem for Linux.
hpfs.txt
- info and mount options for the OS/2 HPFS.
+inotify.txt
+ - info on the powerful yet simple file change notification system.
isofs.txt
- info and mount options for the ISO 9660 (CDROM) filesystem.
jfs.txt
- info and mount options for the JFS filesystem.
+locks.txt
+ - info on file locking implementations, flock() vs. fcntl(), etc.
+mandatory-locking.txt
+ - info on the Linux implementation of Sys V mandatory file locking.
ncpfs.txt
- info on Novell Netware(tm) filesystem using NCP protocol.
ntfs.txt
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt
index cda6905cbe49..bf8080640eba 100644
--- a/Documentation/filesystems/9p.txt
+++ b/Documentation/filesystems/9p.txt
@@ -35,17 +35,19 @@ For remote file server:
For Plan 9 From User Space applications (http://swtch.com/plan9)
- mount -t 9p `namespace`/acme /mnt/9 -o proto=unix,uname=$USER
+ mount -t 9p `namespace`/acme /mnt/9 -o trans=unix,uname=$USER
OPTIONS
=======
- proto=name select an alternative transport. Valid options are
+ trans=name select an alternative transport. Valid options are
currently:
- unix - specifying a named pipe mount point
- tcp - specifying a normal TCP/IP connection
- fd - used passed file descriptors for connection
+ unix - specifying a named pipe mount point
+ tcp - specifying a normal TCP/IP connection
+ fd - used passed file descriptors for connection
(see rfdno and wfdno)
+ virtio - connect to the next virtio channel available
+ (from lguest or KVM with trans_virtio module)
uname=name user name to attempt mount as on the remote server. The
server may override or ignore this value. Certain user
@@ -54,7 +56,7 @@ OPTIONS
aname=name aname specifies the file tree to access when the server is
offering several exported file systems.
- cache=mode specifies a cacheing policy. By default, no caches are used.
+ cache=mode specifies a caching policy. By default, no caches are used.
loose = no attempts are made at consistency,
intended for exclusive, read-only mounts
@@ -68,9 +70,9 @@ OPTIONS
0x40 = display transport debug
0x80 = display allocation debug
- rfdno=n the file descriptor for reading with proto=fd
+ rfdno=n the file descriptor for reading with trans=fd
- wfdno=n the file descriptor for writing with proto=fd
+ wfdno=n the file descriptor for writing with trans=fd
maxdata=n the number of bytes to use for 9p packet payload (msize)
@@ -78,9 +80,9 @@ OPTIONS
noextend force legacy mode (no 9p2000.u semantics)
- uid attempt to mount as a particular uid
+ dfltuid attempt to mount as a particular uid
- gid attempt to mount with a particular gid
+ dfltgid attempt to mount with a particular gid
afid security channel - used by Plan 9 authentication protocols
@@ -88,6 +90,16 @@ OPTIONS
This can be used to share devices/named pipes/sockets between
hosts. This functionality will be expanded in later versions.
+ access there are three access modes.
+ user = if a user tries to access a file on v9fs
+ filesystem for the first time, v9fs sends an
+ attach command (Tattach) for that user.
+ This is the default mode.
+ <uid> = allows only user with uid=<uid> to access
+ the files on the mounted filesystem
+ any = v9fs does single attach and performs all
+ operations as one user
+
RESOURCES
=========
diff --git a/Documentation/filesystems/Exporting b/Documentation/filesystems/Exporting
index 31047e0fe14b..87019d2b5981 100644
--- a/Documentation/filesystems/Exporting
+++ b/Documentation/filesystems/Exporting
@@ -2,9 +2,12 @@
Making Filesystems Exportable
=============================
-Most filesystem operations require a dentry (or two) as a starting
+Overview
+--------
+
+All filesystem operations require a dentry (or two) as a starting
point. Local applications have a reference-counted hold on suitable
-dentrys via open file descriptors or cwd/root. However remote
+dentries via open file descriptors or cwd/root. However remote
applications that access a filesystem via a remote filesystem protocol
such as NFS may not be able to hold such a reference, and so need a
different way to refer to a particular dentry. As the alternative
@@ -13,14 +16,14 @@ server-reboot (among other things, though these tend to be the most
problematic), there is no simple answer like 'filename'.
The mechanism discussed here allows each filesystem implementation to
-specify how to generate an opaque (out side of the filesystem) byte
+specify how to generate an opaque (outside of the filesystem) byte
string for any dentry, and how to find an appropriate dentry for any
given opaque byte string.
This byte string will be called a "filehandle fragment" as it
corresponds to part of an NFS filehandle.
A filesystem which supports the mapping between filehandle fragments
-and dentrys will be termed "exportable".
+and dentries will be termed "exportable".
@@ -89,11 +92,9 @@ For a filesystem to be exportable it must:
1/ provide the filehandle fragment routines described below.
2/ make sure that d_splice_alias is used rather than d_add
when ->lookup finds an inode for a given parent and name.
- Typically the ->lookup routine will end:
- if (inode)
- return d_splice(inode, dentry);
- d_add(dentry, inode);
- return NULL;
+ Typically the ->lookup routine will end with a:
+
+ return d_splice_alias(inode, dentry);
}
@@ -101,67 +102,39 @@ For a filesystem to be exportable it must:
A file system implementation declares that instances of the filesystem
are exportable by setting the s_export_op field in the struct
super_block. This field must point to a "struct export_operations"
-struct which could potentially be full of NULLs, though normally at
-least get_parent will be set.
-
- The primary operations are decode_fh and encode_fh.
-decode_fh takes a filehandle fragment and tries to find or create a
-dentry for the object referred to by the filehandle.
-encode_fh takes a dentry and creates a filehandle fragment which can
-later be used to find/create a dentry for the same object.
-
-decode_fh will probably make use of "find_exported_dentry".
-This function lives in the "exportfs" module which a filesystem does
-not need unless it is being exported. So rather that calling
-find_exported_dentry directly, each filesystem should call it through
-the find_exported_dentry pointer in it's export_operations table.
-This field is set correctly by the exporting agent (e.g. nfsd) when a
-filesystem is exported, and before any export operations are called.
-
-find_exported_dentry needs three support functions from the
-filesystem:
- get_name. When given a parent dentry and a child dentry, this
- should find a name in the directory identified by the parent
- dentry, which leads to the object identified by the child dentry.
- If no get_name function is supplied, a default implementation is
- provided which uses vfs_readdir to find potential names, and
- matches inode numbers to find the correct match.
-
- get_parent. When given a dentry for a directory, this should return
- a dentry for the parent. Quite possibly the parent dentry will
- have been allocated by d_alloc_anon.
- The default get_parent function just returns an error so any
- filehandle lookup that requires finding a parent will fail.
- ->lookup("..") is *not* used as a default as it can leave ".."
- entries in the dcache which are too messy to work with.
-
- get_dentry. When given an opaque datum, this should find the
- implied object and create a dentry for it (possibly with
- d_alloc_anon).
- The opaque datum is whatever is passed down by the decode_fh
- function, and is often simply a fragment of the filehandle
- fragment.
- decode_fh passes two datums through find_exported_dentry. One that
- should be used to identify the target object, and one that can be
- used to identify the object's parent, should that be necessary.
- The default get_dentry function assumes that the datum contains an
- inode number and a generation number, and it attempts to get the
- inode using "iget" and check it's validity by matching the
- generation number. A filesystem should only depend on the default
- if iget can safely be used this way.
-
-If decode_fh and/or encode_fh are left as NULL, then default
-implementations are used. These defaults are suitable for ext2 and
-extremely similar filesystems (like ext3).
-
-The default encode_fh creates a filehandle fragment from the inode
-number and generation number of the target together with the inode
-number and generation number of the parent (if the parent is
-required).
-
-The default decode_fh extract the target and parent datums from the
-filehandle assuming the format used by the default encode_fh and
-passed them to find_exported_dentry.
+struct which has the following members:
+
+ encode_fh (optional)
+ Takes a dentry and creates a filehandle fragment which can later be used
+ to find or create a dentry for the same object. The default
+ implementation creates a filehandle fragment that encodes a 32bit inode
+ and generation number for the inode encoded, and if necessary the
+ same information for the parent.
+
+ fh_to_dentry (mandatory)
+ Given a filehandle fragment, this should find the implied object and
+ create a dentry for it (possibly with d_alloc_anon).
+
+ fh_to_parent (optional but strongly recommended)
+ Given a filehandle fragment, this should find the parent of the
+ implied object and create a dentry for it (possibly with d_alloc_anon).
+ May fail if the filehandle fragment is too small.
+
+ get_parent (optional but strongly recommended)
+ When given a dentry for a directory, this should return a dentry for
+ the parent. Quite possibly the parent dentry will have been allocated
+ by d_alloc_anon. The default get_parent function just returns an error
+ so any filehandle lookup that requires finding a parent will fail.
+ ->lookup("..") is *not* used as a default as it can leave ".." entries
+ in the dcache which are too messy to work with.
+
+ get_name (optional)
+ When given a parent dentry and a child dentry, this should find a name
+ in the directory identified by the parent dentry, which leads to the
+ object identified by the child dentry. If no get_name function is
+ supplied, a default implementation is provided which uses vfs_readdir
+ to find potential names, and matches inode numbers to find the correct
+ match.
A filehandle fragment consists of an array of 1 or more 4byte words,
@@ -172,5 +145,3 @@ generated by encode_fh, in which case it will have been padded with
nuls. Rather, the encode_fh routine should choose a "type" which
indicates the decode_fh how much of the filehandle is valid, and how
it should be interpreted.
-
-
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index f0f825808ca4..37c10cba7177 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -178,15 +178,18 @@ prototypes:
locking rules:
All except set_page_dirty may block
- BKL PageLocked(page)
+ BKL PageLocked(page) i_sem
writepage: no yes, unlocks (see below)
readpage: no yes, unlocks
sync_page: no maybe
writepages: no
set_page_dirty no no
readpages: no
-prepare_write: no yes
-commit_write: no yes
+prepare_write: no yes yes
+commit_write: no yes yes
+write_begin: no locks the page yes
+write_end: no yes, unlocks yes
+perform_write: no n/a yes
bmap: yes
invalidatepage: no yes
releasepage: no yes
@@ -221,7 +224,7 @@ against the page the filesystem should redirty the page with
redirty_page_for_writepage(), then unlock the page and return zero.
This may also be done to avoid internal deadlocks, but rarely.
-If the filesytem is called for sync then it must wait on any
+If the filesystem is called for sync then it must wait on any
in-progress I/O and then start new I/O.
The filesystem should unlock the page synchronously, before returning to the
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index 4aecc9bdb273..b45f3c1b8b43 100644
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -130,12 +130,12 @@ Device layer.
Journaling Block Device layer
-----------------------------
-The Journaling Block Device layer (JBD) isn't ext3 specific. It was design to
-add journaling capabilities on a block device. The ext3 filesystem code will
-inform the JBD of modifications it is performing (called a transaction). The
-journal supports the transactions start and stop, and in case of crash, the
-journal can replayed the transactions to put the partition back in a
-consistent state fast.
+The Journaling Block Device layer (JBD) isn't ext3 specific. It was designed
+to add journaling capabilities to a block device. The ext3 filesystem code
+will inform the JBD of modifications it is performing (called a transaction).
+The journal supports the transactions start and stop, and in case of a crash,
+the journal can replay the transactions to quickly put the partition back into
+a consistent state.
Handles represent a single atomic update to a filesystem. JBD can handle an
external journal on a block device.
@@ -164,7 +164,7 @@ written to the journal first, and then to its final location.
In the event of a crash, the journal can be replayed, bringing both data and
metadata into a consistent state. This mode is the slowest except when data
needs to be read from and written to disk at the same time where it
-outperforms all others modes.
+outperforms all other modes.
Compatibility
-------------
diff --git a/Documentation/filesystems/files.txt b/Documentation/filesystems/files.txt
index 133e213ebb72..bb0142f61084 100644
--- a/Documentation/filesystems/files.txt
+++ b/Documentation/filesystems/files.txt
@@ -76,13 +76,13 @@ the fdtable structure -
5. Handling of the file structures is special. Since the look-up
of the fd (fget()/fget_light()) are lock-free, it is possible
that look-up may race with the last put() operation on the
- file structure. This is avoided using the rcuref APIs
+ file structure. This is avoided using atomic_inc_not_zero()
on ->f_count :
rcu_read_lock();
file = fcheck_files(files, fd);
if (file) {
- if (rcuref_inc_lf(&file->f_count))
+ if (atomic_inc_not_zero(&file->f_count))
*fput_needed = 1;
else
/* Didn't get the reference, someone's freed */
@@ -92,7 +92,7 @@ the fdtable structure -
....
return file;
- rcuref_inc_lf() detects if refcounts is already zero or
+ atomic_inc_not_zero() detects if refcounts is already zero or
goes to zero during increment. If it does, we fail
fget()/fget_light().
diff --git a/Documentation/locks.txt b/Documentation/filesystems/locks.txt
index e3b402ef33bd..fab857accbd6 100644
--- a/Documentation/locks.txt
+++ b/Documentation/filesystems/locks.txt
@@ -53,11 +53,11 @@ fcntl(), with all the problems that implies.
1.3 Mandatory Locking As A Mount Option
---------------------------------------
-Mandatory locking, as described in 'Documentation/mandatory.txt' was prior
-to this release a general configuration option that was valid for all
-mounted filesystems. This had a number of inherent dangers, not the least
-of which was the ability to freeze an NFS server by asking it to read a
-file for which a mandatory lock existed.
+Mandatory locking, as described in 'Documentation/filesystems/mandatory.txt'
+was prior to this release a general configuration option that was valid for
+all mounted filesystems. This had a number of inherent dangers, not the
+least of which was the ability to freeze an NFS server by asking it to read
+a file for which a mandatory lock existed.
From this release of the kernel, mandatory locking can be turned on and off
on a per-filesystem basis, using the mount options 'mand' and 'nomand'.
diff --git a/Documentation/mandatory.txt b/Documentation/filesystems/mandatory-locking.txt
index bc449d49eee5..0979d1d2ca8b 100644
--- a/Documentation/mandatory.txt
+++ b/Documentation/filesystems/mandatory-locking.txt
@@ -3,7 +3,26 @@
Andy Walker <andy@lysaker.kvaerner.no>
15 April 1996
-
+ (Updated September 2007)
+
+0. Why you should avoid mandatory locking
+-----------------------------------------
+
+The Linux implementation is prey to a number of difficult-to-fix race
+conditions which in practice make it not dependable:
+
+ - The write system call checks for a mandatory lock only once
+ at its start. It is therefore possible for a lock request to
+ be granted after this check but before the data is modified.
+ A process may then see file data change even while a mandatory
+ lock was held.
+ - Similarly, an exclusive lock may be granted on a file after
+ the kernel has decided to proceed with a read, but before the
+ read has actually completed, and the reading process may see
+ the file data in a state which should not have been visible
+ to it.
+ - Similar races make the claimed mutual exclusion between lock
+ and mmap similarly unreliable.
1. What is mandatory locking?
------------------------------
diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt
index 8ee10ec88293..e79ee2db183a 100644
--- a/Documentation/filesystems/ntfs.txt
+++ b/Documentation/filesystems/ntfs.txt
@@ -407,7 +407,7 @@ raiddev /dev/md0
device /dev/hda5
raid-disk 0
device /dev/hdb1
- raid-disl 1
+ raid-disk 1
For linear raid, just change the raid-level above to "raid-level linear", for
mirrors, change it to "raid-level 1", and for stripe sets with parity, change
@@ -457,6 +457,8 @@ ChangeLog
Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog.
+2.1.29:
+ - Fix a deadlock when mounting read-write.
2.1.28:
- Fix a deadlock.
2.1.27:
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 4a37e25e694c..dec99455321f 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -347,7 +347,35 @@ connects the CPUs in a SMP system. This means that an error has been detected,
the IO-APIC automatically retry the transmission, so it should not be a big
problem, but you should read the SMP-FAQ.
-In this context it could be interesting to note the new irq directory in 2.4.
+In 2.6.2* /proc/interrupts was expanded again. This time the goal was for
+/proc/interrupts to display every IRQ vector in use by the system, not
+just those considered 'most important'. The new vectors are:
+
+ THR -- interrupt raised when a machine check threshold counter
+ (typically counting ECC corrected errors of memory or cache) exceeds
+ a configurable threshold. Only available on some systems.
+
+ TRM -- a thermal event interrupt occurs when a temperature threshold
+ has been exceeded for the CPU. This interrupt may also be generated
+ when the temperature drops back to normal.
+
+ SPU -- a spurious interrupt is some interrupt that was raised then lowered
+ by some IO device before it could be fully processed by the APIC. Hence
+ the APIC sees the interrupt but does not know what device it came from.
+ For this case the APIC will generate the interrupt with a IRQ vector
+ of 0xff. This might also be generated by chipset bugs.
+
+ RES, CAL, TLB -- rescheduling, call and TLB flush interrupts are
+ sent from one CPU to another per the needs of the OS. Typically,
+ their statistics are used by kernel developers and interested users to
+ determine the occurance of interrupt of the given type.
+
+The above IRQ vectors are displayed only when relevent. For example,
+the threshold vector does not exist on x86_64 platforms. Others are
+suppressed when the system is a uniprocessor. As of this writing, only
+i386 and x86_64 platforms support the new IRQ vector displays.
+
+Of some interest is the introduction of the /proc/irq directory to 2.4.
It could be used to set IRQ to CPU affinity, this means that you can "hook" an
IRQ to only one CPU, or to exclude a CPU of handling IRQs. The contents of the
irq subdir is one subdir for each IRQ, and one file; prof_cpu_mask
@@ -785,9 +813,9 @@ Various pieces of information about kernel activity are available in the
since the system first booted. For a quick look, simply cat the file:
> cat /proc/stat
- cpu 2255 34 2290 22625563 6290 127 456
- cpu0 1132 34 1441 11311718 3675 127 438
- cpu1 1123 0 849 11313845 2614 0 18
+ cpu 2255 34 2290 22625563 6290 127 456 0
+ cpu0 1132 34 1441 11311718 3675 127 438 0
+ cpu1 1123 0 849 11313845 2614 0 18 0
intr 114930548 113199788 3 0 5 263 0 4 [... lots more numbers ...]
ctxt 1990473
btime 1062191376
@@ -807,6 +835,7 @@ second). The meanings of the columns are as follows, from left to right:
- iowait: waiting for I/O to complete
- irq: servicing interrupts
- softirq: servicing softirqs
+- steal: involuntary wait
The "intr" line gives counts of interrupts serviced since boot time, for each
of the possible system interrupts. The first column is the total of all
diff --git a/Documentation/filesystems/quota.txt b/Documentation/filesystems/quota.txt
new file mode 100644
index 000000000000..a590c4093eff
--- /dev/null
+++ b/Documentation/filesystems/quota.txt
@@ -0,0 +1,59 @@
+
+Quota subsystem
+===============
+
+Quota subsystem allows system administrator to set limits on used space and
+number of used inodes (inode is a filesystem structure which is associated
+with each file or directory) for users and/or groups. For both used space and
+number of used inodes there are actually two limits. The first one is called
+softlimit and the second one hardlimit. An user can never exceed a hardlimit
+for any resource. User is allowed to exceed softlimit but only for limited
+period of time. This period is called "grace period" or "grace time". When
+grace time is over, user is not able to allocate more space/inodes until he
+frees enough of them to get below softlimit.
+
+Quota limits (and amount of grace time) are set independently for each
+filesystem.
+
+For more details about quota design, see the documentation in quota-tools package
+(http://sourceforge.net/projects/linuxquota).
+
+Quota netlink interface
+=======================
+When user exceeds a softlimit, runs out of grace time or reaches hardlimit,
+quota subsystem traditionally printed a message to the controlling terminal of
+the process which caused the excess. This method has the disadvantage that
+when user is using a graphical desktop he usually cannot see the message.
+Thus quota netlink interface has been designed to pass information about
+the above events to userspace. There they can be captured by an application
+and processed accordingly.
+
+The interface uses generic netlink framework (see
+http://lwn.net/Articles/208755/ and http://people.suug.ch/~tgr/libnl/ for more
+details about this layer). The name of the quota generic netlink interface
+is "VFS_DQUOT". Definitions of constants below are in <linux/quota.h>.
+ Currently, the interface supports only one message type QUOTA_NL_C_WARNING.
+This command is used to send a notification about any of the above mentioned
+events. Each message has six attributes. These are (type of the argument is
+in parentheses):
+ QUOTA_NL_A_QTYPE (u32)
+ - type of quota being exceeded (one of USRQUOTA, GRPQUOTA)
+ QUOTA_NL_A_EXCESS_ID (u64)
+ - UID/GID (depends on quota type) of user / group whose limit
+ is being exceeded.
+ QUOTA_NL_A_CAUSED_ID (u64)
+ - UID of a user who caused the event
+ QUOTA_NL_A_WARNING (u32)
+ - what kind of limit is exceeded:
+ QUOTA_NL_IHARDWARN - inode hardlimit
+ QUOTA_NL_ISOFTLONGWARN - inode softlimit is exceeded longer
+ than given grace period
+ QUOTA_NL_ISOFTWARN - inode softlimit
+ QUOTA_NL_BHARDWARN - space (block) hardlimit
+ QUOTA_NL_BSOFTLONGWARN - space (block) softlimit is exceeded
+ longer than given grace period.
+ QUOTA_NL_BSOFTWARN - space (block) softlimit
+ QUOTA_NL_A_DEV_MAJOR (u32)
+ - major number of a device with the affected filesystem
+ QUOTA_NL_A_DEV_MINOR (u32)
+ - minor number of a device with the affected filesystem
diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
index 25981e2e51be..339c6a4f220e 100644
--- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt
+++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
@@ -8,7 +8,7 @@ What is ramfs?
Ramfs is a very simple filesystem that exports Linux's disk caching
mechanisms (the page cache and dentry cache) as a dynamically resizable
-ram-based filesystem.
+RAM-based filesystem.
Normally all files are cached in memory by Linux. Pages of data read from
backing store (usually the block device the filesystem is mounted on) are kept
@@ -34,7 +34,7 @@ ramfs and ramdisk:
------------------
The older "ram disk" mechanism created a synthetic block device out of
-an area of ram and used it as backing store for a filesystem. This block
+an area of RAM and used it as backing store for a filesystem. This block
device was of fixed size, so the filesystem mounted on it was of fixed
size. Using a ram disk also required unnecessarily copying memory from the
fake block device into the page cache (and copying changes back out), as well
@@ -46,8 +46,8 @@ unnecessary work for the CPU, and pollutes the CPU caches. (There are tricks
to avoid this copying by playing with the page tables, but they're unpleasantly
complicated and turn out to be about as expensive as the copying anyway.)
More to the point, all the work ramfs is doing has to happen _anyway_,
-since all file access goes through the page and dentry caches. The ram
-disk is simply unnecessary, ramfs is internally much simpler.
+since all file access goes through the page and dentry caches. The RAM
+disk is simply unnecessary; ramfs is internally much simpler.
Another reason ramdisks are semi-obsolete is that the introduction of
loopback devices offered a more flexible and convenient way to create
@@ -103,7 +103,7 @@ All this differs from the old initrd in several ways:
initramfs archive is a gzipped cpio archive (like tar only simpler,
see cpio(1) and Documentation/early-userspace/buffer-format.txt). The
kernel's cpio extraction code is not only extremely small, it's also
- __init data that can be discarded during the boot process.
+ __init text and data that can be discarded during the boot process.
- The program run by the old initrd (which was called /initrd, not /init) did
some setup and then returned to the kernel, while the init program from
@@ -220,7 +220,7 @@ device) but the separate packaging of initrd (which is nice if you have
non-GPL code you'd like to run from initramfs, without conflating it with
the GPL licensed Linux kernel binary).
-It can also be used to supplement the kernel's built-in initamfs image. The
+It can also be used to supplement the kernel's built-in initramfs image. The
files in the external archive will overwrite any conflicting files in
the built-in initramfs archive. Some distributors also prefer to customize
a single kernel image with task-specific initramfs images, without recompiling.
@@ -339,7 +339,7 @@ smooth transition and allowing early boot functionality to gradually move to
The move to early userspace is necessary because finding and mounting the real
root device is complex. Root partitions can span multiple devices (raid or
separate journal). They can be out on the network (requiring dhcp, setting a
-specific mac address, logging into a server, etc). They can live on removable
+specific MAC address, logging into a server, etc). They can live on removable
media, with dynamically allocated major/minor numbers and persistent naming
issues requiring a full udev implementation to sort out. They can be
compressed, encrypted, copy-on-write, loopback mounted, strangely partitioned,
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index 4b5ca26e5048..4598ef7b622b 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -51,7 +51,7 @@ for the attributes, providing a means to read and write kernel
attributes.
Attributes should be ASCII text files, preferably with only one value
-per file. It is noted that it may not be efficient to contain only
+per file. It is noted that it may not be efficient to contain only one
value per file, so it is socially acceptable to express an array of
values of the same type.
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 045f3e055a28..9d019d35728f 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -537,6 +537,12 @@ struct address_space_operations {
struct list_head *pages, unsigned nr_pages);
int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+ int (*write_begin)(struct file *, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata);
+ int (*write_end)(struct file *, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata);
sector_t (*bmap)(struct address_space *, sector_t);
int (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, int);
@@ -615,11 +621,7 @@ struct address_space_operations {
any basic-blocks on storage, then those blocks should be
pre-read (if they haven't been read already) so that the
updated blocks can be written out properly.
- The page will be locked. If prepare_write wants to unlock the
- page it, like readpage, may do so and return
- AOP_TRUNCATED_PAGE.
- In this case the prepare_write will be retried one the lock is
- regained.
+ The page will be locked.
Note: the page _must not_ be marked uptodate in this function
(or anywhere else) unless it actually is uptodate right now. As
@@ -633,6 +635,45 @@ struct address_space_operations {
operations. It should avoid returning an error if possible -
errors should have been handled by prepare_write.
+ write_begin: This is intended as a replacement for prepare_write. The
+ key differences being that:
+ - it returns a locked page (in *pagep) rather than being
+ given a pre locked page;
+ - it must be able to cope with short writes (where the
+ length passed to write_begin is greater than the number
+ of bytes copied into the page).
+
+ Called by the generic buffered write code to ask the filesystem to
+ prepare to write len bytes at the given offset in the file. The
+ address_space should check that the write will be able to complete,
+ by allocating space if necessary and doing any other internal
+ housekeeping. If the write will update parts of any basic-blocks on
+ storage, then those blocks should be pre-read (if they haven't been
+ read already) so that the updated blocks can be written out properly.
+
+ The filesystem must return the locked pagecache page for the specified
+ offset, in *pagep, for the caller to write into.
+
+ flags is a field for AOP_FLAG_xxx flags, described in
+ include/linux/fs.h.
+
+ A void * may be returned in fsdata, which then gets passed into
+ write_end.
+
+ Returns 0 on success; < 0 on failure (which is the error code), in
+ which case write_end is not called.
+
+ write_end: After a successful write_begin, and data copy, write_end must
+ be called. len is the original len passed to write_begin, and copied
+ is the amount that was able to be copied (copied == len is always true
+ if write_begin was called with the AOP_FLAG_UNINTERRUPTIBLE flag).
+
+ The filesystem must take care of unlocking the page and releasing it
+ refcount, and updating i_size.
+
+ Returns < 0 on failure, otherwise the number of bytes (<= 'copied')
+ that were able to be copied into pagecache.
+
bmap: called by the VFS to map a logical block offset within object to
physical block number. This method is used by the FIBMAP
ioctl and for working with swap-files. To be able to swap to
@@ -665,7 +706,7 @@ struct address_space_operations {
wants to make it a free page. If ->releasepage succeeds, the
page will be removed from the address_space and become free.
- The second case if when a request has been made to invalidate
+ The second case is when a request has been made to invalidate
some or all pages in an address_space. This can happen
through the fadvice(POSIX_FADV_DONTNEED) system call or by the
filesystem explicitly requesting it as nfs and 9fs do (when
diff --git a/Documentation/firmware_class/firmware_sample_firmware_class.c b/Documentation/firmware_class/firmware_sample_firmware_class.c
index fba943aacf93..2de62854f0e5 100644
--- a/Documentation/firmware_class/firmware_sample_firmware_class.c
+++ b/Documentation/firmware_class/firmware_sample_firmware_class.c
@@ -109,15 +109,15 @@ static int fw_setup_class_device(struct class_device *class_dev,
const char *fw_name,
struct device *device)
{
- int retval = 0;
- struct firmware_priv *fw_priv = kmalloc(sizeof(struct firmware_priv),
- GFP_KERNEL);
+ int retval;
+ struct firmware_priv *fw_priv;
- if(!fw_priv){
+ fw_priv = kzalloc(sizeof(struct firmware_priv), GFP_KERNEL);
+ if (!fw_priv) {
retval = -ENOMEM;
goto out;
}
- memset(fw_priv, 0, sizeof(*fw_priv));
+
memset(class_dev, 0, sizeof(*class_dev));
strncpy(fw_priv->fw_id, fw_name, FIRMWARE_NAME_MAX);
diff --git a/Documentation/hwmon/coretemp b/Documentation/hwmon/coretemp
index 870cda9416e9..170bf862437b 100644
--- a/Documentation/hwmon/coretemp
+++ b/Documentation/hwmon/coretemp
@@ -4,7 +4,7 @@ Kernel driver coretemp
Supported chips:
* All Intel Core family
Prefix: 'coretemp'
- CPUID: family 0x6, models 0xe, 0xf
+ CPUID: family 0x6, models 0xe, 0xf, 0x16
Datasheet: Intel 64 and IA-32 Architectures Software Developer's Manual
Volume 3A: System Programming Guide
diff --git a/Documentation/hwmon/dme1737 b/Documentation/hwmon/dme1737
index 1a0f3d64ab80..8f446070e64a 100644
--- a/Documentation/hwmon/dme1737
+++ b/Documentation/hwmon/dme1737
@@ -6,6 +6,10 @@ Supported chips:
Prefix: 'dme1737'
Addresses scanned: I2C 0x2c, 0x2d, 0x2e
Datasheet: Provided by SMSC upon request and under NDA
+ * SMSC SCH3112, SCH3114, SCH3116
+ Prefix: 'sch311x'
+ Addresses scanned: none, address read from Super-I/O config space
+ Datasheet: http://www.nuhorizons.com/FeaturedProducts/Volume1/SMSC/311x.pdf
Authors:
Juerg Haefliger <juergh@gmail.com>
@@ -27,16 +31,25 @@ Description
-----------
This driver implements support for the hardware monitoring capabilities of the
-SMSC DME1737 and Asus A8000 (which are the same) Super-I/O chips. This chip
-features monitoring of 3 temp sensors temp[1-3] (2 remote diodes and 1
-internal), 7 voltages in[0-6] (6 external and 1 internal) and 6 fan speeds
-fan[1-6]. Additionally, the chip implements 5 PWM outputs pwm[1-3,5-6] for
-controlling fan speeds both manually and automatically.
-
-Fan[3-6] and pwm[3,5-6] are optional features and their availability is
-dependent on the configuration of the chip. The driver will detect which
-features are present during initialization and create the sysfs attributes
-accordingly.
+SMSC DME1737 and Asus A8000 (which are the same) and SMSC SCH311x Super-I/O
+chips. These chips feature monitoring of 3 temp sensors temp[1-3] (2 remote
+diodes and 1 internal), 7 voltages in[0-6] (6 external and 1 internal) and up
+to 6 fan speeds fan[1-6]. Additionally, the chips implement up to 5 PWM
+outputs pwm[1-3,5-6] for controlling fan speeds both manually and
+automatically.
+
+For the DME1737 and A8000, fan[1-2] and pwm[1-2] are always present. Fan[3-6]
+and pwm[3,5-6] are optional features and their availability depends on the
+configuration of the chip. The driver will detect which features are present
+during initialization and create the sysfs attributes accordingly.
+
+For the SCH311x, fan[1-3] and pwm[1-3] are always present and fan[4-6] and
+pwm[5-6] don't exist.
+
+The hardware monitoring features of the DME1737 and A8000 are only accessible
+via SMBus, while the SCH311x only provides access via the ISA bus. The driver
+will therefore register itself as an I2C client driver if it detects a DME1737
+or A8000 and as a platform driver if it detects a SCH311x chip.
Voltage Monitoring
diff --git a/Documentation/hwmon/f71805f b/Documentation/hwmon/f71805f
index 94e0d2cbd3d2..f0d55976740a 100644
--- a/Documentation/hwmon/f71805f
+++ b/Documentation/hwmon/f71805f
@@ -6,6 +6,10 @@ Supported chips:
Prefix: 'f71805f'
Addresses scanned: none, address read from Super I/O config space
Datasheet: Available from the Fintek website
+ * Fintek F71806F/FG
+ Prefix: 'f71872f'
+ Addresses scanned: none, address read from Super I/O config space
+ Datasheet: Available from the Fintek website
* Fintek F71872F/FG
Prefix: 'f71872f'
Addresses scanned: none, address read from Super I/O config space
@@ -38,6 +42,9 @@ The Fintek F71872F/FG Super I/O chip is almost the same, with two
additional internal voltages monitored (VSB and battery). It also features
6 VID inputs. The VID inputs are not yet supported by this driver.
+The Fintek F71806F/FG Super-I/O chip is essentially the same as the
+F71872F/FG, and is undistinguishable therefrom.
+
The driver assumes that no more than one chip is present, which seems
reasonable.
diff --git a/Documentation/hwmon/it87 b/Documentation/hwmon/it87
index 81ecc7e41c50..5b704a40256b 100644
--- a/Documentation/hwmon/it87
+++ b/Documentation/hwmon/it87
@@ -90,7 +90,8 @@ upper VID bits share their pins with voltage inputs (in5 and in6) so you
can't have both on a given board.
The IT8716F, IT8718F and later IT8712F revisions have support for
-2 additional fans. They are not yet supported by the driver.
+2 additional fans. They are supported by the driver for the IT8716F and
+IT8718F but not for the IT8712F
The IT8716F and IT8718F, and late IT8712F and IT8705F also have optional
16-bit tachometer counters for fans 1 to 3. This is better (no more fan
diff --git a/Documentation/hwmon/lm78 b/Documentation/hwmon/lm78
index fd5dc7a19f0e..dfc318a60fd4 100644
--- a/Documentation/hwmon/lm78
+++ b/Documentation/hwmon/lm78
@@ -56,16 +56,6 @@ should work with. This is hardcoded by the mainboard and/or processor itself.
It is a value in volts. When it is unconnected, you will often find the
value 3.50 V here.
-In addition to the alarms described above, there are a couple of additional
-ones. There is a BTI alarm, which gets triggered when an external chip has
-crossed its limits. Usually, this is connected to all LM75 chips; if at
-least one crosses its limits, this bit gets set. The CHAS alarm triggers
-if your computer case is open. The FIFO alarms should never trigger; it
-indicates an internal error. The SMI_IN alarm indicates some other chip
-has triggered an SMI interrupt. As we do not use SMI interrupts at all,
-this condition usually indicates there is a problem with some other
-device.
-
If an alarm triggers, it will remain triggered until the hardware register
is read at least once. This means that the cause for the alarm may
already have disappeared! Note that in the current implementation, all
diff --git a/Documentation/hwmon/lm93 b/Documentation/hwmon/lm93
index 4e4a1dc1d2da..ac711f357faf 100644
--- a/Documentation/hwmon/lm93
+++ b/Documentation/hwmon/lm93
@@ -7,7 +7,7 @@ Supported chips:
Addresses scanned: I2C 0x2c-0x2e
Datasheet: http://www.national.com/ds.cgi/LM/LM93.pdf
-Author:
+Authors:
Mark M. Hoffman <mhoffman@lightlink.com>
Ported to 2.6 by Eric J. Bowersox <ericb@aspsys.com>
Adapted to 2.6.20 by Carsten Emde <ce@osadl.org>
@@ -16,7 +16,6 @@ Author:
Module Parameters
-----------------
-(specific to LM93)
* init: integer
Set to non-zero to force some initializations (default is 0).
* disable_block: integer
@@ -37,30 +36,13 @@ Module Parameters
I.e. this parameter controls the VID pin input thresholds; if your VID
inputs are not working, try changing this. The default value is "0".
-(common among sensor drivers)
-* force: short array (min = 1, max = 48)
- List of adapter,address pairs to assume to be present. Autodetection
- of the target device will still be attempted. Use one of the more
- specific force directives below if this doesn't detect the device.
-* force_lm93: short array (min = 1, max = 48)
- List of adapter,address pairs which are unquestionably assumed to contain
- a 'lm93' chip
-* ignore: short array (min = 1, max = 48)
- List of adapter,address pairs not to scan
-* ignore_range: short array (min = 1, max = 48)
- List of adapter,start-addr,end-addr triples not to scan
-* probe: short array (min = 1, max = 48)
- List of adapter,address pairs to scan additionally
-* probe_range: short array (min = 1, max = 48)
- List of adapter,start-addr,end-addr triples to scan additionally
-
Hardware Description
--------------------
(from the datasheet)
-The LM93, hardware monitor, has a two wire digital interface compatible with
+The LM93 hardware monitor has a two wire digital interface compatible with
SMBus 2.0. Using an 8-bit ADC, the LM93 measures the temperature of two remote
diode connected transistors as well as its own die and 16 power supply
voltages. To set fan speed, the LM93 has two PWM outputs that are each
@@ -69,18 +51,12 @@ table based. The LM93 includes a digital filter that can be invoked to smooth
temperature readings for better control of fan speed. The LM93 has four
tachometer inputs to measure fan speed. Limit and status registers for all
measured values are included. The LM93 builds upon the functionality of
-previous motherboard management ASICs and uses some of the LM85 s features
+previous motherboard management ASICs and uses some of the LM85's features
(i.e. smart tachometer mode). It also adds measurement and control support
for dynamic Vccp monitoring and PROCHOT. It is designed to monitor a dual
processor Xeon class motherboard with a minimum of external components.
-Driver Description
-------------------
-
-This driver implements support for the National Semiconductor LM93.
-
-
User Interface
--------------
@@ -101,7 +77,7 @@ These intervals can be found in the sysfs files prochot1_interval and
prochot2_interval. The values in these files specify the intervals for
#P1_PROCHOT and #P2_PROCHOT, respectively. Selecting a value not in this
list will cause the driver to use the next largest interval. The available
-intervals are:
+intervals are (in seconds):
#PROCHOT intervals: 0.73, 1.46, 2.9, 5.8, 11.7, 23.3, 46.6, 93.2, 186, 372
@@ -111,12 +87,12 @@ assert #P2_PROCHOT, and vice-versa. This mode is enabled by writing a
non-zero integer to the sysfs file prochot_short.
The LM93 can also override the #PROCHOT pins by driving a PWM signal onto
-one or both of them. When overridden, the signal has a period of 3.56 mS,
+one or both of them. When overridden, the signal has a period of 3.56 ms,
a minimum pulse width of 5 clocks (at 22.5kHz => 6.25% duty cycle), and
a maximum pulse width of 80 clocks (at 22.5kHz => 99.88% duty cycle).
The sysfs files prochot1_override and prochot2_override contain boolean
-intgers which enable or disable the override function for #P1_PROCHOT and
+integers which enable or disable the override function for #P1_PROCHOT and
#P2_PROCHOT, respectively. The sysfs file prochot_override_duty_cycle
contains a value controlling the duty cycle for the PWM signal used when
the override function is enabled. This value ranges from 0 to 15, with 0
@@ -166,7 +142,7 @@ frequency values are constrained by the hardware. Selecting a value which is
not available will cause the driver to use the next largest value. Also note
that this parameter has implications for the Smart Tach Mode (see above).
-PWM Output Frequencies: 12, 36, 48, 60, 72, 84, 96, 22500 (h/w default)
+PWM Output Frequencies (in Hz): 12, 36, 48, 60, 72, 84, 96, 22500 (default)
Automatic PWM:
@@ -178,7 +154,7 @@ individual control sources to which the PWM output is bound.
The eight control sources are: temp1-temp4 (aka "zones" in the datasheet),
#PROCHOT 1 & 2, and #VRDHOT 1 & 2. The bindings are expressed as a bitmask
in the sysfs files pwm<n>_auto_channels, where a "1" enables the binding, and
- a "0" disables it. The h/w default is 0x0f (all temperatures bound).
+a "0" disables it. The h/w default is 0x0f (all temperatures bound).
0x01 - Temp 1
0x02 - Temp 2
@@ -324,89 +300,3 @@ LM93 Unique sysfs Files
gpio input state of 8 GPIO pins; read-only
-
-Sample Configuration File
--------------------------
-
-Here is a sample LM93 chip config for sensors.conf:
-
----------- cut here ----------
-chip "lm93-*"
-
-# VOLTAGE INPUTS
-
- # labels and scaling based on datasheet recommendations
- label in1 "+12V1"
- compute in1 @ * 12.945, @ / 12.945
- set in1_min 12 * 0.90
- set in1_max 12 * 1.10
-
- label in2 "+12V2"
- compute in2 @ * 12.945, @ / 12.945
- set in2_min 12 * 0.90
- set in2_max 12 * 1.10
-
- label in3 "+12V3"
- compute in3 @ * 12.945, @ / 12.945
- set in3_min 12 * 0.90
- set in3_max 12 * 1.10
-
- label in4 "FSB_Vtt"
-
- label in5 "3GIO"
-
- label in6 "ICH_Core"
-
- label in7 "Vccp1"
-
- label in8 "Vccp2"
-
- label in9 "+3.3V"
- set in9_min 3.3 * 0.90
- set in9_max 3.3 * 1.10
-
- label in10 "+5V"
- set in10_min 5.0 * 0.90
- set in10_max 5.0 * 1.10
-
- label in11 "SCSI_Core"
-
- label in12 "Mem_Core"
-
- label in13 "Mem_Vtt"
-
- label in14 "Gbit_Core"
-
- # Assuming R1/R2 = 4.1143, and 3.3V reference
- # -12V = (4.1143 + 1) * (@ - 3.3) + 3.3
- label in15 "-12V"
- compute in15 @ * 5.1143 - 13.57719, (@ + 13.57719) / 5.1143
- set in15_min -12 * 0.90
- set in15_max -12 * 1.10
-
- label in16 "+3.3VSB"
- set in16_min 3.3 * 0.90
- set in16_max 3.3 * 1.10
-
-# TEMPERATURE INPUTS
-
- label temp1 "CPU1"
- label temp2 "CPU2"
- label temp3 "LM93"
-
-# TACHOMETER INPUTS
-
- label fan1 "Fan1"
- set fan1_min 3000
- label fan2 "Fan2"
- set fan2_min 3000
- label fan3 "Fan3"
- set fan3_min 3000
- label fan4 "Fan4"
- set fan4_min 3000
-
-# PWM OUTPUTS
-
- label pwm1 "CPU1"
- label pwm2 "CPU2"
-
diff --git a/Documentation/hwmon/sysfs-interface b/Documentation/hwmon/sysfs-interface
index b3a9e1b9dbda..a17b692d2679 100644
--- a/Documentation/hwmon/sysfs-interface
+++ b/Documentation/hwmon/sysfs-interface
@@ -67,6 +67,10 @@ between readings to be caught and alarmed. The exact definition of an
alarm (for example, whether a threshold must be met or must be exceeded
to cause an alarm) is chip-dependent.
+When setting values of hwmon sysfs attributes, the string representation of
+the desired value must be written, note that strings which are not a number
+are interpreted as 0! For more on how written strings are interpreted see the
+"sysfs attribute writes interpretation" section at the end of this file.
-------------------------------------------------------------------------
@@ -78,8 +82,21 @@ RW read/write value
Read/write values may be read-only for some chips, depending on the
hardware implementation.
-All entries are optional, and should only be created in a given driver
-if the chip has the feature.
+All entries (except name) are optional, and should only be created in a
+given driver if the chip has the feature.
+
+
+********
+* Name *
+********
+
+name The chip name.
+ This should be a short, lowercase string, not containing
+ spaces nor dashes, representing the chip name. This is
+ the only mandatory attribute.
+ I2C devices get this attribute created automatically.
+ RO
+
************
* Voltages *
@@ -104,18 +121,17 @@ in[0-*]_input Voltage input value.
by the chip driver, and must be done by the application.
However, some drivers (notably lm87 and via686a)
do scale, because of internal resistors built into a chip.
- These drivers will output the actual voltage.
-
- Typical usage:
- in0_* CPU #1 voltage (not scaled)
- in1_* CPU #2 voltage (not scaled)
- in2_* 3.3V nominal (not scaled)
- in3_* 5.0V nominal (scaled)
- in4_* 12.0V nominal (scaled)
- in5_* -12.0V nominal (scaled)
- in6_* -5.0V nominal (scaled)
- in7_* varies
- in8_* varies
+ These drivers will output the actual voltage. Rule of
+ thumb: drivers should report the voltage values at the
+ "pins" of the chip.
+
+in[0-*]_label Suggested voltage channel label.
+ Text string
+ Should only be created if the driver has hints about what
+ this voltage channel is being used for, and user-space
+ doesn't. In all other cases, the label is provided by
+ user-space.
+ RO
cpu[0-*]_vid CPU core reference voltage.
Unit: millivolt
@@ -159,6 +175,13 @@ fan[1-*]_target
Only makes sense if the chip supports closed-loop fan speed
control based on the measured fan speed.
+fan[1-*]_label Suggested fan channel label.
+ Text string
+ Should only be created if the driver has hints about what
+ this fan channel is being used for, and user-space doesn't.
+ In all other cases, the label is provided by user-space.
+ RO
+
Also see the Alarms section for status flags associated with fans.
@@ -219,12 +242,12 @@ temp[1-*]_auto_point[1-*]_temp_hyst
****************
temp[1-*]_type Sensor type selection.
- Integers 1 to 6 or thermistor Beta value (typically 3435)
+ Integers 1 to 6
RW
1: PII/Celeron Diode
2: 3904 transistor
3: thermal diode
- 4: thermistor (default/unknown Beta)
+ 4: thermistor
5: AMD AMDSI
6: Intel PECI
Not all types are supported by all chips
@@ -260,18 +283,19 @@ temp[1-*]_crit_hyst
from the critical value.
RW
-temp[1-4]_offset
+temp[1-*]_offset
Temperature offset which is added to the temperature reading
by the chip.
Unit: millidegree Celsius
Read/Write value.
- If there are multiple temperature sensors, temp1_* is
- generally the sensor inside the chip itself,
- reported as "motherboard temperature". temp2_* to
- temp4_* are generally sensors external to the chip
- itself, for example the thermal diode inside the CPU or
- a thermistor nearby.
+temp[1-*]_label Suggested temperature channel label.
+ Text string
+ Should only be created if the driver has hints about what
+ this temperature channel is being used for, and user-space
+ doesn't. In all other cases, the label is provided by
+ user-space.
+ RO
Some chips measure temperature using external thermistors and an ADC, and
report the temperature measurement as a voltage. Converting this voltage
@@ -393,14 +417,53 @@ beep_mask Bitmask for beep.
RW
-*********
-* Other *
-*********
-
-eeprom Raw EEPROM data in binary form.
- RO
-
-pec Enable or disable PEC (SMBus only)
- 0: disable
- 1: enable
- RW
+sysfs attribute writes interpretation
+-------------------------------------
+
+hwmon sysfs attributes always contain numbers, so the first thing to do is to
+convert the input to a number, there are 2 ways todo this depending whether
+the number can be negative or not:
+unsigned long u = simple_strtoul(buf, NULL, 10);
+long s = simple_strtol(buf, NULL, 10);
+
+With buf being the buffer with the user input being passed by the kernel.
+Notice that we do not use the second argument of strto[u]l, and thus cannot
+tell when 0 is returned, if this was really 0 or is caused by invalid input.
+This is done deliberately as checking this everywhere would add a lot of
+code to the kernel.
+
+Notice that it is important to always store the converted value in an
+unsigned long or long, so that no wrap around can happen before any further
+checking.
+
+After the input string is converted to an (unsigned) long, the value should be
+checked if its acceptable. Be careful with further conversions on the value
+before checking it for validity, as these conversions could still cause a wrap
+around before the check. For example do not multiply the result, and only
+add/subtract if it has been divided before the add/subtract.
+
+What to do if a value is found to be invalid, depends on the type of the
+sysfs attribute that is being set. If it is a continuous setting like a
+tempX_max or inX_max attribute, then the value should be clamped to its
+limits using SENSORS_LIMIT(value, min_limit, max_limit). If it is not
+continuous like for example a tempX_type, then when an invalid value is
+written, -EINVAL should be returned.
+
+Example1, temp1_max, register is a signed 8 bit value (-128 - 127 degrees):
+
+ long v = simple_strtol(buf, NULL, 10) / 1000;
+ v = SENSORS_LIMIT(v, -128, 127);
+ /* write v to register */
+
+Example2, fan divider setting, valid values 2, 4 and 8:
+
+ unsigned long v = simple_strtoul(buf, NULL, 10);
+
+ switch (v) {
+ case 2: v = 1; break;
+ case 4: v = 2; break;
+ case 8: v = 3; break;
+ default:
+ return -EINVAL;
+ }
+ /* write v to register */
diff --git a/Documentation/hwmon/w83791d b/Documentation/hwmon/w83791d
index db9881df88a5..f153b2f6d62c 100644
--- a/Documentation/hwmon/w83791d
+++ b/Documentation/hwmon/w83791d
@@ -75,46 +75,64 @@ Voltage sensors (also known as IN sensors) report their values in millivolts.
An alarm is triggered if the voltage has crossed a programmable minimum
or maximum limit.
-The bit ordering for the alarm "realtime status register" and the
-"beep enable registers" are different.
-
-in0 (VCORE) : alarms: 0x000001 beep_enable: 0x000001
-in1 (VINR0) : alarms: 0x000002 beep_enable: 0x002000 <== mismatch
-in2 (+3.3VIN): alarms: 0x000004 beep_enable: 0x000004
-in3 (5VDD) : alarms: 0x000008 beep_enable: 0x000008
-in4 (+12VIN) : alarms: 0x000100 beep_enable: 0x000100
-in5 (-12VIN) : alarms: 0x000200 beep_enable: 0x000200
-in6 (-5VIN) : alarms: 0x000400 beep_enable: 0x000400
-in7 (VSB) : alarms: 0x080000 beep_enable: 0x010000 <== mismatch
-in8 (VBAT) : alarms: 0x100000 beep_enable: 0x020000 <== mismatch
-in9 (VINR1) : alarms: 0x004000 beep_enable: 0x004000
-temp1 : alarms: 0x000010 beep_enable: 0x000010
-temp2 : alarms: 0x000020 beep_enable: 0x000020
-temp3 : alarms: 0x002000 beep_enable: 0x000002 <== mismatch
-fan1 : alarms: 0x000040 beep_enable: 0x000040
-fan2 : alarms: 0x000080 beep_enable: 0x000080
-fan3 : alarms: 0x000800 beep_enable: 0x000800
-fan4 : alarms: 0x200000 beep_enable: 0x200000
-fan5 : alarms: 0x400000 beep_enable: 0x400000
-tart1 : alarms: 0x010000 beep_enable: 0x040000 <== mismatch
-tart2 : alarms: 0x020000 beep_enable: 0x080000 <== mismatch
-tart3 : alarms: 0x040000 beep_enable: 0x100000 <== mismatch
-case_open : alarms: 0x001000 beep_enable: 0x001000
-user_enable : alarms: -------- beep_enable: 0x800000
-
-*** NOTE: It is the responsibility of user-space code to handle the fact
-that the beep enable and alarm bits are in different positions when using that
-feature of the chip.
-
-When an alarm goes off, you can be warned by a beeping signal through your
-computer speaker. It is possible to enable all beeping globally, or only
-the beeping for some alarms.
-
-The driver only reads the chip values each 3 seconds; reading them more
-often will do no harm, but will return 'old' values.
+The w83791d has a global bit used to enable beeping from the speaker when an
+alarm is triggered as well as a bitmask to enable or disable the beep for
+specific alarms. You need both the global beep enable bit and the
+corresponding beep bit to be on for a triggered alarm to sound a beep.
+
+The sysfs interface to the gloabal enable is via the sysfs beep_enable file.
+This file is used for both legacy and new code.
+
+The sysfs interface to the beep bitmask has migrated from the original legacy
+method of a single sysfs beep_mask file to a newer method using multiple
+*_beep files as described in .../Documentation/hwmon/sysfs-interface.
+
+A similar change has occured for the bitmap corresponding to the alarms. The
+original legacy method used a single sysfs alarms file containing a bitmap
+of triggered alarms. The newer method uses multiple sysfs *_alarm files
+(again following the pattern described in sysfs-interface).
+
+Since both methods read and write the underlying hardware, they can be used
+interchangeably and changes in one will automatically be reflected by
+the other. If you use the legacy bitmask method, your user-space code is
+responsible for handling the fact that the alarms and beep_mask bitmaps
+are not the same (see the table below).
+
+NOTE: All new code should be written to use the newer sysfs-interface
+specification as that avoids bitmap problems and is the preferred interface
+going forward.
+
+The driver reads the hardware chip values at most once every three seconds.
+User mode code requesting values more often will receive cached values.
+
+Alarms bitmap vs. beep_mask bitmask
+------------------------------------
+For legacy code using the alarms and beep_mask files:
+
+in0 (VCORE) : alarms: 0x000001 beep_mask: 0x000001
+in1 (VINR0) : alarms: 0x000002 beep_mask: 0x002000 <== mismatch
+in2 (+3.3VIN): alarms: 0x000004 beep_mask: 0x000004
+in3 (5VDD) : alarms: 0x000008 beep_mask: 0x000008
+in4 (+12VIN) : alarms: 0x000100 beep_mask: 0x000100
+in5 (-12VIN) : alarms: 0x000200 beep_mask: 0x000200
+in6 (-5VIN) : alarms: 0x000400 beep_mask: 0x000400
+in7 (VSB) : alarms: 0x080000 beep_mask: 0x010000 <== mismatch
+in8 (VBAT) : alarms: 0x100000 beep_mask: 0x020000 <== mismatch
+in9 (VINR1) : alarms: 0x004000 beep_mask: 0x004000
+temp1 : alarms: 0x000010 beep_mask: 0x000010
+temp2 : alarms: 0x000020 beep_mask: 0x000020
+temp3 : alarms: 0x002000 beep_mask: 0x000002 <== mismatch
+fan1 : alarms: 0x000040 beep_mask: 0x000040
+fan2 : alarms: 0x000080 beep_mask: 0x000080
+fan3 : alarms: 0x000800 beep_mask: 0x000800
+fan4 : alarms: 0x200000 beep_mask: 0x200000
+fan5 : alarms: 0x400000 beep_mask: 0x400000
+tart1 : alarms: 0x010000 beep_mask: 0x040000 <== mismatch
+tart2 : alarms: 0x020000 beep_mask: 0x080000 <== mismatch
+tart3 : alarms: 0x040000 beep_mask: 0x100000 <== mismatch
+case_open : alarms: 0x001000 beep_mask: 0x001000
+global_enable: alarms: -------- beep_mask: 0x800000 (modified via beep_enable)
W83791D TODO:
---------------
-Provide a patch for per-file alarms and beep enables as defined in the hwmon
- documentation (Documentation/hwmon/sysfs-interface)
Provide a patch for smart-fan control (still need appropriate motherboard/fans)
diff --git a/Documentation/i2c/busses/i2c-i801 b/Documentation/i2c/busses/i2c-i801
index fe6406f2f9a6..fde4420e3f75 100644
--- a/Documentation/i2c/busses/i2c-i801
+++ b/Documentation/i2c/busses/i2c-i801
@@ -13,7 +13,8 @@ Supported adapters:
* Intel 631xESB/632xESB (ESB2)
* Intel 82801H (ICH8)
* Intel ICH9
- Datasheets: Publicly available at the Intel website
+ * Intel Tolapai
+ Datasheets: Publicly available at the Intel website
Authors:
Frodo Looijaard <frodol@dds.nl>,
diff --git a/Documentation/i2c/chips/pcf8574 b/Documentation/i2c/chips/pcf8574
index 2752c8ce3167..5c1ad1376b62 100644
--- a/Documentation/i2c/chips/pcf8574
+++ b/Documentation/i2c/chips/pcf8574
@@ -62,8 +62,6 @@ if the corresponding output is set as 1, otherwise the current output
value, that is to say 0.
The write file is read/write. Writing a value outputs it on the I/O
-port. Reading returns the last written value.
-
-On module initialization the chip is configured as eight inputs (all
-outputs to 1), so you can connect any circuit to the PCF8574(A) without
-being afraid of short-circuit.
+port. Reading returns the last written value. As it is not possible
+to read this value from the chip, you need to write at least once to
+this file before you can read back from it.
diff --git a/Documentation/i2c/dev-interface b/Documentation/i2c/dev-interface
index b849ad636583..9dd79123ddd9 100644
--- a/Documentation/i2c/dev-interface
+++ b/Documentation/i2c/dev-interface
@@ -90,12 +90,15 @@ ioctl(file,I2C_SLAVE,long addr)
ioctl(file,I2C_TENBIT,long select)
Selects ten bit addresses if select not equals 0, selects normal 7 bit
- addresses if select equals 0. Default 0.
+ addresses if select equals 0. Default 0. This request is only valid
+ if the adapter has I2C_FUNC_10BIT_ADDR.
ioctl(file,I2C_PEC,long select)
Selects SMBus PEC (packet error checking) generation and verification
if select not equals 0, disables if select equals 0. Default 0.
- Used only for SMBus transactions.
+ Used only for SMBus transactions. This request only has an effect if the
+ the adapter has I2C_FUNC_SMBUS_PEC; it is still safe if not, it just
+ doesn't have any effect.
ioctl(file,I2C_FUNCS,unsigned long *funcs)
Gets the adapter functionality and puts it in *funcs.
@@ -103,8 +106,10 @@ ioctl(file,I2C_FUNCS,unsigned long *funcs)
ioctl(file,I2C_RDWR,struct i2c_rdwr_ioctl_data *msgset)
Do combined read/write transaction without stop in between.
- The argument is a pointer to a struct i2c_rdwr_ioctl_data {
+ Only valid if the adapter has I2C_FUNC_I2C. The argument is
+ a pointer to a
+ struct i2c_rdwr_ioctl_data {
struct i2c_msg *msgs; /* ptr to array of simple messages */
int nmsgs; /* number of messages to exchange */
}
diff --git a/Documentation/i2c/i2c-protocol b/Documentation/i2c/i2c-protocol
index 579b92d5f3a3..10518dd58814 100644
--- a/Documentation/i2c/i2c-protocol
+++ b/Documentation/i2c/i2c-protocol
@@ -68,7 +68,7 @@ We have found some I2C devices that needs the following modifications:
Flags I2C_M_IGNORE_NAK
Normally message is interrupted immediately if there is [NA] from the
- client. Setting this flag treats any [NA] as [A], and all of
+ client. Setting this flag treats any [NA] as [A], and all of
message is sent.
These messages may still fail to SCL lo->hi timeout.
diff --git a/Documentation/i2c/i2c-stub b/Documentation/i2c/i2c-stub
index 9cc081e69764..89e69ad3436c 100644
--- a/Documentation/i2c/i2c-stub
+++ b/Documentation/i2c/i2c-stub
@@ -6,13 +6,14 @@ This module is a very simple fake I2C/SMBus driver. It implements four
types of SMBus commands: write quick, (r/w) byte, (r/w) byte data, and
(r/w) word data.
-You need to provide a chip address as a module parameter when loading
-this driver, which will then only react to SMBus commands to this address.
+You need to provide chip addresses as a module parameter when loading this
+driver, which will then only react to SMBus commands to these addresses.
No hardware is needed nor associated with this module. It will accept write
-quick commands to one address; it will respond to the other commands (also
-to one address) by reading from or writing to an array in memory. It will
-also spam the kernel logs for every command it handles.
+quick commands to the specified addresses; it will respond to the other
+commands (also to the specified addresses) by reading from or writing to
+arrays in memory. It will also spam the kernel logs for every command it
+handles.
A pointer register with auto-increment is implemented for all byte
operations. This allows for continuous byte reads like those supported by
@@ -26,8 +27,8 @@ The typical use-case is like this:
PARAMETERS:
-int chip_addr:
- The SMBus address to emulate a chip at.
+int chip_addr[10]:
+ The SMBus addresses to emulate chips at.
CAVEATS:
@@ -41,9 +42,6 @@ If the hardware for your driver has banked registers (e.g. Winbond sensors
chips) this module will not work well - although it could be extended to
support that pretty easily.
-Only one chip address is supported - although this module could be
-extended to support more.
-
If you spam it hard enough, printk can be lossy. This module really wants
something like relayfs.
diff --git a/Documentation/i386/boot.txt b/Documentation/i386/boot.txt
index 35985b34d5a6..2f75e750e4f5 100644
--- a/Documentation/i386/boot.txt
+++ b/Documentation/i386/boot.txt
@@ -168,6 +168,8 @@ Offset Proto Name Meaning
0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not
0235/3 N/A pad2 Unused
0238/4 2.06+ cmdline_size Maximum size of the kernel command line
+023C/4 2.07+ hardware_subarch Hardware subarchitecture
+0240/8 2.07+ hardware_subarch_data Subarchitecture-specific data
(1) For backwards compatibility, if the setup_sects field contains 0, the
real value is 4.
@@ -204,7 +206,7 @@ boot loaders can ignore those fields.
The byte order of all fields is littleendian (this is x86, after all.)
-Field name: setup_secs
+Field name: setup_sects
Type: read
Offset/size: 0x1f1/1
Protocol: ALL
@@ -356,6 +358,13 @@ Protocol: 2.00+
- If 0, the protected-mode code is loaded at 0x10000.
- If 1, the protected-mode code is loaded at 0x100000.
+ Bit 6 (write): KEEP_SEGMENTS
+ Protocol: 2.07+
+ - if 0, reload the segment registers in the 32bit entry point.
+ - if 1, do not reload the segment registers in the 32bit entry point.
+ Assume that %cs %ds %ss %es are all set to flat segments with
+ a base of 0 (or the equivalent for their environment).
+
Bit 7 (write): CAN_USE_HEAP
Set this bit to 1 to indicate that the value entered in the
heap_end_ptr is valid. If this field is clear, some setup code
@@ -480,6 +489,29 @@ Protocol: 2.06+
cmdline_size characters. With protocol version 2.05 and earlier, the
maximum size was 255.
+Field name: hardware_subarch
+Type: write
+Offset/size: 0x23c/4
+Protocol: 2.07+
+
+ In a paravirtualized environment the hardware low level architectural
+ pieces such as interrupt handling, page table handling, and
+ accessing process control registers needs to be done differently.
+
+ This field allows the bootloader to inform the kernel we are in one
+ one of those environments.
+
+ 0x00000000 The default x86/PC environment
+ 0x00000001 lguest
+ 0x00000002 Xen
+
+Field name: hardware_subarch_data
+Type: write
+Offset/size: 0x240/8
+Protocol: 2.07+
+
+ A pointer to data that is specific to hardware subarch
+
**** THE KERNEL COMMAND LINE
diff --git a/Documentation/ia64/err_inject.txt b/Documentation/ia64/err_inject.txt
index 6449a7090dbb..223e4f0582d0 100644
--- a/Documentation/ia64/err_inject.txt
+++ b/Documentation/ia64/err_inject.txt
@@ -21,10 +21,10 @@ software test suits to do stressful testing on IPF.
Below is a sample application as part of the whole tool. The sample
can be used as a working test tool. Or it can be expanded to include
-more features. It also can be a integrated into a libary or other user
+more features. It also can be a integrated into a library or other user
application to have more thorough test.
-The sample application takes err.conf as error configuation input. Gcc
+The sample application takes err.conf as error configuration input. GCC
compiles the code. After you install err_inject driver, you can run
this sample application to inject errors.
@@ -809,7 +809,7 @@ int err_inj()
}
/* Create semaphore: If one_lock, one semaphore for all processors.
- Otherwise, one sempaphore for each processor. */
+ Otherwise, one semaphore for each processor. */
if (one_lock) {
if (create_sem(0)) {
printf("Can not create semaphore...exit\n");
diff --git a/Documentation/ide.txt b/Documentation/ide.txt
index 3bb9f9c98611..1d50f23a5cab 100644
--- a/Documentation/ide.txt
+++ b/Documentation/ide.txt
@@ -242,6 +242,8 @@ Summary of ide driver parameters for kernel command line
and quite likely to cause trouble with
older/odd IDE drives.
+ "hdx=nodma" : disallow DMA
+
"hdx=swapdata" : when the drive is a disk, byte swap all data
"hdx=bswap" : same as above..........
@@ -278,8 +280,6 @@ Summary of ide driver parameters for kernel command line
"idex=four" : four drives on idex and ide(x^1) share same ports
"idex=reset" : reset interface after probe
-
- "idex=dma" : automatically configure/use DMA if possible.
"idex=ata66" : informs the interface that it has an 80c cable
for chipsets that are ATA-66 capable, but the
@@ -288,8 +288,6 @@ Summary of ide driver parameters for kernel command line
"ide=reverse" : formerly called to pci sub-system, but now local.
- "ide=nodma" : disable DMA globally for the IDE subsystem.
-
The following are valid ONLY on ide0, which usually corresponds
to the first ATA interface found on the particular host, and the defaults for
the base,ctl ports must not be altered.
diff --git a/Documentation/infiniband/user_mad.txt b/Documentation/infiniband/user_mad.txt
index 8ec54b974b67..744687dd195b 100644
--- a/Documentation/infiniband/user_mad.txt
+++ b/Documentation/infiniband/user_mad.txt
@@ -99,6 +99,20 @@ Transaction IDs
request/response pairs. The upper 32 bits are reserved for use by
the kernel and will be overwritten before a MAD is sent.
+P_Key Index Handling
+
+ The old ib_umad interface did not allow setting the P_Key index for
+ MADs that are sent and did not provide a way for obtaining the P_Key
+ index of received MADs. A new layout for struct ib_user_mad_hdr
+ with a pkey_index member has been defined; however, to preserve
+ binary compatibility with older applications, this new layout will
+ not be used unless the IB_USER_MAD_ENABLE_PKEY ioctl is called
+ before a file descriptor is used for anything else.
+
+ In September 2008, the IB_USER_MAD_ABI_VERSION will be incremented
+ to 6, the new layout of struct ib_user_mad_hdr will be used by
+ default, and the IB_USER_MAD_ENABLE_PKEY ioctl will be removed.
+
Setting IsSM Capability Bit
To set the IsSM capability bit for a port, simply open the
diff --git a/Documentation/initrd.txt b/Documentation/initrd.txt
index d3dc505104da..74f68b35f7c1 100644
--- a/Documentation/initrd.txt
+++ b/Documentation/initrd.txt
@@ -80,8 +80,8 @@ Compressed cpio images
----------------------
Recent kernels have support for populating a ramdisk from a compressed cpio
-archive, on such systems, the creation of a ramdisk image doesn't need to
-involve special block devices or loopbacks, you merely create a directory on
+archive. On such systems, the creation of a ramdisk image doesn't need to
+involve special block devices or loopbacks; you merely create a directory on
disk with the desired initrd content, cd to that directory, and run (as an
example):
@@ -293,7 +293,7 @@ information as small as possible. In this case, a common initrd could be
generated with all the necessary modules. Then, only /sbin/init or a file
read by it would have to be different.
-A third scenario are more convenient recovery disks, because information
+A third scenario is more convenient recovery disks, because information
like the location of the root FS partition doesn't have to be provided at
boot time, but the system loaded from initrd can invoke a user-friendly
dialog and it can also perform some sanity checks (or even some form of
@@ -339,8 +339,8 @@ the new, supported mechanism is called "pivot_root".
Mixed change_root and pivot_root mechanism
------------------------------------------
-In case you did not want to use root=/dev/ram0 to trig the pivot_root mechanism,
-you may create both /linuxrc and /sbin/init in your initrd image.
+In case you did not want to use root=/dev/ram0 to trigger the pivot_root
+mechanism, you may create both /linuxrc and /sbin/init in your initrd image.
/linuxrc would contain only the following:
@@ -350,7 +350,7 @@ echo 0x0100 >/proc/sys/kernel/real-root-dev
umount -n /proc
Once linuxrc exited, the kernel would mount again your initrd as root,
-this time executing /sbin/init. Again, it would be duty of this init
+this time executing /sbin/init. Again, it would be the duty of this init
to build the right environment (maybe using the root= device passed on
the cmdline) before the final execution of the real /sbin/init.
diff --git a/Documentation/input/atarikbd.txt b/Documentation/input/atarikbd.txt
index ab050621e20f..f3a3ba8847ba 100644
--- a/Documentation/input/atarikbd.txt
+++ b/Documentation/input/atarikbd.txt
@@ -170,7 +170,7 @@ major controller faults (ROM checksum and RAM test) and such things as stuck
keys. Any keys down at power-up are presumed to be stuck, and their BREAK
(sic) code is returned (which without the preceding MAKE code is a flag for a
keyboard error). If the controller self-test completes without error, the code
-0xF0 is returned. (This code will be used to indicate the version/rlease of
+0xF0 is returned. (This code will be used to indicate the version/release of
the ikbd controller. The first release of the ikbd is version 0xF0, should
there be a second release it will be 0xF1, and so on.)
The ikbd defaults to a mouse position reporting with threshold of 1 unit in
@@ -413,7 +413,7 @@ INTERROGATION MODE.
%nnnnmmmm ; where m is JOYSTICK1 state
; and n is JOYSTICK0 state
-Sets the ikbd to do nothing but monitor the serial command lne, maintain the
+Sets the ikbd to do nothing but monitor the serial command line, maintain the
time-of-day clock, and monitor the joystick. The rate sets the interval
between joystick samples.
N.B. The user should not set the rate higher than the serial communications
@@ -446,10 +446,10 @@ The sample interval should be as constant as possible.
; until vertical cursor key is generated before RY
; has elapsed
VX ; length (in tenths of seconds) of joystick closure
- ; until horizontal cursor keystokes are generated
+ ; until horizontal cursor keystrokes are generated
; after RX has elapsed
VY ; length (in tenths of seconds) of joystick closure
- ; until vertical cursor keystokes are generated
+ ; until vertical cursor keystrokes are generated
; after RY has elapsed
In this mode, joystick 0 is scanned in a way that simulates cursor keystrokes.
diff --git a/Documentation/input/ff.txt b/Documentation/input/ff.txt
index 085eb15b45b7..ded4d5f53109 100644
--- a/Documentation/input/ff.txt
+++ b/Documentation/input/ff.txt
@@ -1,5 +1,5 @@
Force feedback for Linux.
-By Johann Deneux <deneux@ifrance.com> on 2001/04/22.
+By Johann Deneux <johann.deneux@gmail.com> on 2001/04/22.
Updated by Anssi Hannula <anssi.hannula@gmail.com> on 2006/04/09.
You may redistribute this file. Please remember to include shape.fig and
interactive.fig as well.
diff --git a/Documentation/input/iforce-protocol.txt b/Documentation/input/iforce-protocol.txt
index 8777d2d321e3..3ac92413c874 100644
--- a/Documentation/input/iforce-protocol.txt
+++ b/Documentation/input/iforce-protocol.txt
@@ -4,10 +4,10 @@ specify force effects to I-Force 2.0 devices. None of this information comes
from Immerse. That's why you should not trust what is written in this
document. This document is intended to help understanding the protocol.
This is not a reference. Comments and corrections are welcome. To contact me,
-send an email to: deneux@ifrance.com
+send an email to: johann.deneux@gmail.com
** WARNING **
-I may not be held responsible for any dammage or harm caused if you try to
+I shall not be held responsible for any damage or harm caused if you try to
send data to your I-Force device based on what you read in this document.
** Preliminary Notes:
@@ -151,13 +151,13 @@ OP= ff
Query command. Length varies according to the query type.
The general format of this packet is:
ff 01 QUERY [INDEX] CHECKSUM
-reponses are of the same form:
+responses are of the same form:
FF LEN QUERY VALUE_QUERIED CHECKSUM2
where LEN = 1 + length(VALUE_QUERIED)
**** Query ram size ****
QUERY = 42 ('B'uffer size)
-The device should reply with the same packet plus two additionnal bytes
+The device should reply with the same packet plus two additional bytes
containing the size of the memory:
ff 03 42 03 e8 CS would mean that the device has 1000 bytes of ram available.
@@ -234,19 +234,23 @@ is the amount of memory apparently needed for every set of parameters:
** Appendix: How to study the protocol ? **
-1. Generate effects using the force editor provided with the DirectX SDK, or use Immersion Studio (freely available at their web site in the developer section: www.immersion.com)
-2. Start a soft spying RS232 or USB (depending on where you connected your joystick/wheel). I used ComPortSpy from fCoder (alpha version!)
+1. Generate effects using the force editor provided with the DirectX SDK, or
+use Immersion Studio (freely available at their web site in the developer section:
+www.immersion.com)
+2. Start a soft spying RS232 or USB (depending on where you connected your
+joystick/wheel). I used ComPortSpy from fCoder (alpha version!)
3. Play the effect, and watch what happens on the spy screen.
A few words about ComPortSpy:
-At first glance, this soft seems, hum, well... buggy. In fact, data appear with a few seconds latency. Personnaly, I restart it every time I play an effect.
+At first glance, this software seems, hum, well... buggy. In fact, data appear with a
+few seconds latency. Personally, I restart it every time I play an effect.
Remember it's free (as in free beer) and alpha!
** URLS **
Check www.immerse.com for Immersion Studio, and www.fcoder.com for ComPortSpy.
** Author of this document **
-Johann Deneux <deneux@ifrance.com>
+Johann Deneux <johann.deneux@gmail.com>
Home page at http://www.esil.univ-mrs.fr/~jdeneux/projects/ff/
Additions by Vojtech Pavlik.
diff --git a/Documentation/input/input-programming.txt b/Documentation/input/input-programming.txt
index d9d523099bb7..47fc86830cd7 100644
--- a/Documentation/input/input-programming.txt
+++ b/Documentation/input/input-programming.txt
@@ -42,8 +42,8 @@ static int __init button_init(void)
goto err_free_irq;
}
- button_dev->evbit[0] = BIT(EV_KEY);
- button_dev->keybit[LONG(BTN_0)] = BIT(BTN_0);
+ button_dev->evbit[0] = BIT_MASK(EV_KEY);
+ button_dev->keybit[BIT_WORD(BTN_0)] = BIT_MASK(BTN_0);
error = input_register_device(button_dev);
if (error) {
@@ -79,7 +79,7 @@ In the _init function, which is called either upon module load or when
booting the kernel, it grabs the required resources (it should also check
for the presence of the device).
-Then it allocates a new input device structure with input_aloocate_device()
+Then it allocates a new input device structure with input_allocate_device()
and sets up input bitfields. This way the device driver tells the other
parts of the input systems what it is - what events can be generated or
accepted by this input device. Our example device can only generate EV_KEY
@@ -217,14 +217,15 @@ If you don't need absfuzz and absflat, you can set them to zero, which mean
that the thing is precise and always returns to exactly the center position
(if it has any).
-1.4 NBITS(), LONG(), BIT()
+1.4 BITS_TO_LONGS(), BIT_WORD(), BIT_MASK()
~~~~~~~~~~~~~~~~~~~~~~~~~~
-These three macros from input.h help some bitfield computations:
+These three macros from bitops.h help some bitfield computations:
- NBITS(x) - returns the length of a bitfield array in longs for x bits
- LONG(x) - returns the index in the array in longs for bit x
- BIT(x) - returns the index in a long for bit x
+ BITS_TO_LONGS(x) - returns the length of a bitfield array in longs for
+ x bits
+ BIT_WORD(x) - returns the index in the array in longs for bit x
+ BIT_MASK(x) - returns the index in a long for bit x
1.5 The id* and name fields
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/Documentation/isdn/CREDITS b/Documentation/isdn/CREDITS
index 7c17c837064f..8cac6c2f23ee 100644
--- a/Documentation/isdn/CREDITS
+++ b/Documentation/isdn/CREDITS
@@ -40,7 +40,7 @@ Andreas Kool (akool@Kool.f.EUnet.de)
Pedro Roque Marques (roque@di.fc.ul.pt)
For lot of new ideas and the pcbit driver.
-Eberhard Moenkeberg (emoenke@gwdg.de)
+Eberhard Mönkeberg (emoenke@gwdg.de)
For testing and help to get into kernel.
Thomas Neumann (tn@ruhr.de)
diff --git a/Documentation/isdn/README.concap b/Documentation/isdn/README.concap
index 2f114babe4b6..a76d74845a4c 100644
--- a/Documentation/isdn/README.concap
+++ b/Documentation/isdn/README.concap
@@ -111,7 +111,7 @@ struct concap_proto_ops{
struct concap_proto * (*proto_new) (void);
/* delete encapsulation protocol instance and free all its resources.
- cprot may no loger be referenced after calling this */
+ cprot may no longer be referenced after calling this */
void (*proto_del)(struct concap_proto *cprot);
/* initialize the protocol's data. To be called at interface startup
diff --git a/Documentation/ja_JP/HOWTO b/Documentation/ja_JP/HOWTO
index 9f08dab1e75b..d9d832c010ef 100644
--- a/Documentation/ja_JP/HOWTO
+++ b/Documentation/ja_JP/HOWTO
@@ -1,4 +1,4 @@
-NOTE:
+NOTE:
This is a version of Documentation/HOWTO translated into Japanese.
This document is maintained by Tsugikazu Shibata <tshibata@ab.jp.nec.com>
and the JF Project team <www.linux.or.jp/JF>.
@@ -11,14 +11,14 @@ for non English (read: Japanese) speakers and is not intended as a
fork. So if you have any comments or updates for this file, please try
to update the original English file first.
-Last Updated: 2007/07/18
+Last Updated: 2007/09/23
==================================
ã“ã‚Œã¯ã€
-linux-2.6.22/Documentation/HOWTO
+linux-2.6.23/Documentation/HOWTO
ã®å’Œè¨³ã§ã™ã€‚
翻訳団体: JF プロジェクト < http://www.linux.or.jp/JF/ >
-翻訳日: 2007/07/16
+翻訳日: 2007/09/19
翻訳者: Tsugikazu Shibata <tshibata at ab dot jp dot nec dot com>
校正者: æ¾å€‰ã•ã‚“ <nbh--mats at nifty dot com>
å°æž— é›…å…¸ã•ã‚“ (Masanori Kobayasi) <zap03216 at nifty dot ne dot jp>
@@ -27,6 +27,7 @@ linux-2.6.22/Documentation/HOWTO
野å£ã•ã‚“ (Kenji Noguchi) <tokyo246 at gmail dot com>
河内ã•ã‚“ (Takayoshi Kochi) <t-kochi at bq dot jp dot nec dot com>
岩本ã•ã‚“ (iwamoto) <iwamoto.kn at ncos dot nec dot co dot jp>
+ 内田ã•ã‚“ (Satoshi Uchida) <s-uchida at ap dot jp dot nec dot com>
==================================
Linux カーãƒãƒ«é–‹ç™ºã®ã‚„ã‚Šæ–¹
@@ -40,7 +41,7 @@ Linux カーãƒãƒ«é–‹ç™ºã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¨å…±ã«æ´»å‹•ã™ã‚‹ã‚„り方を学ã
手助ã‘ã«ãªã‚Šã¾ã™ã€‚
ã‚‚ã—ã€ã“ã®ãƒ‰ã‚­ãƒ¥ãƒ¡ãƒ³ãƒˆã®ã©ã“ã‹ãŒå¤ããªã£ã¦ã„ãŸå ´åˆã«ã¯ã€ã“ã®ãƒ‰ã‚­ãƒ¥ãƒ¡ãƒ³
-トã®æœ€å¾Œã«ãƒªã‚¹ãƒˆã—ãŸãƒ¡ãƒ³ãƒ†ãƒŠãƒ¼ã«ãƒ‘ッãƒã‚’é€ã£ã¦ãã ã•ã„。
+トã®æœ€å¾Œã«ãƒªã‚¹ãƒˆã—ãŸãƒ¡ãƒ³ãƒ†ãƒŠã«ãƒ‘ッãƒã‚’é€ã£ã¦ãã ã•ã„。
ã¯ã˜ã‚ã«
---------
@@ -59,7 +60,7 @@ Linux カーãƒãƒ«é–‹ç™ºã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¨å…±ã«æ´»å‹•ã™ã‚‹ã‚„り方を学ã
ãƒãƒ«é–‹ç™ºè€…ã«ã¯å¿…è¦ã§ã™ã€‚アーキテクãƒãƒ£å‘ã‘ã®ä½Žãƒ¬ãƒ™ãƒ«éƒ¨åˆ†ã®é–‹ç™ºã‚’ã™ã‚‹ã®
ã§ãªã‘ã‚Œã°ã€(ã©ã‚“ãªã‚¢ãƒ¼ã‚­ãƒ†ã‚¯ãƒãƒ£ã§ã‚‚)アセンブリ(訳注: 言語)ã¯å¿…è¦ã‚ã‚Š
ã¾ã›ã‚“。以下ã®æœ¬ã¯ã€C 言語ã®å分ãªçŸ¥è­˜ã‚„何年もã®çµŒé¨“ã«å–ã£ã¦ä»£ã‚ã‚‹ã‚‚ã®
-ã§ã¯ã‚ã‚Šã¾ã›ã‚“ãŒã€å°‘ãªãã¨ã‚‚リファレンスã¨ã—ã¦ã¯ã„ã„本ã§ã™ã€‚
+ã§ã¯ã‚ã‚Šã¾ã›ã‚“ãŒã€å°‘ãªãã¨ã‚‚リファレンスã¨ã—ã¦ã¯è‰¯ã„本ã§ã™ã€‚
- "The C Programming Language" by Kernighan and Ritchie [Prentice Hall]
-『プログラミング言語C第2版ã€(B.W. カーニãƒãƒ³/D.M. リッãƒãƒ¼è‘— 石田晴久訳) [共立出版]
- "Practical C Programming" by Steve Oualline [O'Reilly]
@@ -76,7 +77,7 @@ Linux カーãƒãƒ«é–‹ç™ºã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¨å…±ã«æ´»å‹•ã™ã‚‹ã‚„り方を学ã
ã¨ãã©ãã€ã‚«ãƒ¼ãƒãƒ«ãŒãƒ„ールãƒã‚§ã‚¤ãƒ³ã‚„ C 言語拡張ã«ç½®ã„ã¦ã„ã‚‹å‰æãŒã©ã†
ãªã£ã¦ã„ã‚‹ã®ã‹ã‚ã‹ã‚Šã«ãã„ã“ã¨ãŒã‚ã‚Šã€ã¾ãŸã€æ®‹å¿µãªã“ã¨ã«æ±ºå®šçš„ãªãƒªãƒ•ã‚¡
レンスã¯å­˜åœ¨ã—ã¾ã›ã‚“。情報を得るã«ã¯ã€gcc ã® info ページ( info gcc )ã‚’
-ã¿ã¦ãã ã•ã„。
+見ã¦ãã ã•ã„。
ã‚ãªãŸã¯æ—¢å­˜ã®é–‹ç™ºã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¨ä¸€ç·’ã«ä½œæ¥­ã™ã‚‹æ–¹æ³•ã‚’å­¦ã¼ã†ã¨ã—ã¦ã„ã‚‹ã“
ã¨ã«ç•™æ„ã—ã¦ãã ã•ã„。ãã®ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¯ã€ã‚³ãƒ¼ãƒ‡ã‚£ãƒ³ã‚°ã€ã‚¹ã‚¿ã‚¤ãƒ«ã€
@@ -92,7 +93,7 @@ Linux カーãƒãƒ«é–‹ç™ºã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¨å…±ã«æ´»å‹•ã™ã‚‹ã‚„り方を学ã
Linux カーãƒãƒ«ã®ã‚½ãƒ¼ã‚¹ã‚³ãƒ¼ãƒ‰ã¯ GPL ライセンスã®ä¸‹ã§ãƒªãƒªãƒ¼ã‚¹ã•ã‚Œã¦ã„ã¾
ã™ã€‚ライセンスã®è©³ç´°ã«ã¤ã„ã¦ã¯ã€ã‚½ãƒ¼ã‚¹ãƒ„リーã®ãƒ¡ã‚¤ãƒ³ãƒ‡ã‚£ãƒ¬ã‚¯ãƒˆãƒªã«å­˜åœ¨
-ã™ã‚‹ã€COPYING ã®ãƒ•ã‚¡ã‚¤ãƒ«ã‚’ã¿ã¦ãã ã•ã„。もã—ライセンスã«ã¤ã„ã¦ã•ã‚‰ã«è³ª
+ã™ã‚‹ã€COPYING ã®ãƒ•ã‚¡ã‚¤ãƒ«ã‚’見ã¦ãã ã•ã„。もã—ライセンスã«ã¤ã„ã¦ã•ã‚‰ã«è³ª
å•ãŒã‚ã‚Œã°ã€Linux Kernel メーリングリストã«è³ªå•ã™ã‚‹ã®ã§ã¯ãªãã€ã©ã†ãž
法律家ã«ç›¸è«‡ã—ã¦ãã ã•ã„。メーリングリストã®äººé”ã¯æ³•å¾‹å®¶ã§ã¯ãªãã€æ³•çš„
å•é¡Œã«ã¤ã„ã¦ã¯å½¼ã‚‰ã®å£°æ˜Žã¯ã‚ã¦ã«ã™ã‚‹ã¹ãã§ã¯ã‚ã‚Šã¾ã›ã‚“。
@@ -109,7 +110,8 @@ Linux カーãƒãƒ«ã‚½ãƒ¼ã‚¹ãƒ„リーã¯å¹…広ã„範囲ã®ãƒ‰ã‚­ãƒ¥ãƒ¡ãƒ³ãƒˆã‚’å
æ–°ã—ã„ドキュメントファイルも追加ã™ã‚‹ã“ã¨ã‚’勧ã‚ã¾ã™ã€‚
カーãƒãƒ«ã®å¤‰æ›´ãŒã€ã‚«ãƒ¼ãƒãƒ«ãŒãƒ¦ãƒ¼ã‚¶ç©ºé–“ã«å…¬é–‹ã—ã¦ã„るインターフェイスã®
変更を引ãèµ·ã“ã™å ´åˆã€ãã®å¤‰æ›´ã‚’説明ã™ã‚‹ãƒžãƒ‹ãƒ¥ã‚¢ãƒ«ãƒšãƒ¼ã‚¸ã®ãƒ‘ッãƒã‚„情報
-をマニュアルページã®ãƒ¡ãƒ³ãƒ†ãƒŠ mtk-manpages@gmx.net ã«é€ã‚‹ã“ã¨ã‚’勧ã‚ã¾ã™ã€‚
+をマニュアルページã®ãƒ¡ãƒ³ãƒ†ãƒŠ mtk-manpages@gmx.net ã«é€ã‚‹ã“ã¨ã‚’勧ã‚ã¾
+ã™ã€‚
以下ã¯ã‚«ãƒ¼ãƒãƒ«ã‚½ãƒ¼ã‚¹ãƒ„リーã«å«ã¾ã‚Œã¦ã„る読んã§ãŠãã¹ãファイルã®ä¸€è¦§ã§
ã™-
@@ -117,7 +119,7 @@ Linux カーãƒãƒ«ã‚½ãƒ¼ã‚¹ãƒ„リーã¯å¹…広ã„範囲ã®ãƒ‰ã‚­ãƒ¥ãƒ¡ãƒ³ãƒˆã‚’å
README
ã“ã®ãƒ•ã‚¡ã‚¤ãƒ«ã¯ Linuxカーãƒãƒ«ã®ç°¡å˜ãªèƒŒæ™¯ã¨ã‚«ãƒ¼ãƒãƒ«ã‚’設定(訳注
configure )ã—ã€ç”Ÿæˆ(訳注 build )ã™ã‚‹ãŸã‚ã«å¿…è¦ãªã“ã¨ã¯ä½•ã‹ãŒæ›¸ã‹ã‚Œ
- ã¦ã„ã¾ã™ã€‚カーãƒãƒ«ã«é–¢ã—ã¦åˆã‚ã¦ã®äººã¯ã“ã“ã‹ã‚‰ã‚¹ã‚¿ãƒ¼ãƒˆã™ã‚‹ã¨ã‚ˆã„ã§
+ ã¦ã„ã¾ã™ã€‚カーãƒãƒ«ã«é–¢ã—ã¦åˆã‚ã¦ã®äººã¯ã“ã“ã‹ã‚‰ã‚¹ã‚¿ãƒ¼ãƒˆã™ã‚‹ã¨è‰¯ã„ã§
ã—ょã†ã€‚
Documentation/Changes
@@ -128,7 +130,7 @@ Linux カーãƒãƒ«ã‚½ãƒ¼ã‚¹ãƒ„リーã¯å¹…広ã„範囲ã®ãƒ‰ã‚­ãƒ¥ãƒ¡ãƒ³ãƒˆã‚’å
Documentation/CodingStyle
ã“れ㯠Linux カーãƒãƒ«ã®ã‚³ãƒ¼ãƒ‡ã‚£ãƒ³ã‚°ã‚¹ã‚¿ã‚¤ãƒ«ã¨èƒŒæ™¯ã«ã‚ã‚‹ç†ç”±ã‚’記述
ã—ã¦ã„ã¾ã™ã€‚å…¨ã¦ã®æ–°ã—ã„コードã¯ã“ã®ãƒ‰ã‚­ãƒ¥ãƒ¡ãƒ³ãƒˆã«ã‚るガイドライン
- ã«å¾“ã£ã¦ã„ã‚‹ã“ã¨ã‚’期待ã•ã‚Œã¦ã„ã¾ã™ã€‚大部分ã®ãƒ¡ãƒ³ãƒ†ãƒŠãƒ¼ã¯ã“れらã®ãƒ«ãƒ¼
+ ã«å¾“ã£ã¦ã„ã‚‹ã“ã¨ã‚’期待ã•ã‚Œã¦ã„ã¾ã™ã€‚大部分ã®ãƒ¡ãƒ³ãƒ†ãƒŠã¯ã“れらã®ãƒ«ãƒ¼
ルã«å¾“ã£ã¦ã„ã‚‹ã‚‚ã®ã ã‘ã‚’å—ã‘付ã‘ã€å¤šãã®äººã¯æ­£ã—ã„スタイルã®ã‚³ãƒ¼ãƒ‰
ã ã‘をレビューã—ã¾ã™ã€‚
@@ -168,16 +170,16 @@ Linux カーãƒãƒ«ã‚½ãƒ¼ã‚¹ãƒ„リーã¯å¹…広ã„範囲ã®ãƒ‰ã‚­ãƒ¥ãƒ¡ãƒ³ãƒˆã‚’å
支æ´ã—ã¦ãã ã•ã„。
Documentation/ManagementStyle
- ã“ã®ãƒ‰ã‚­ãƒ¥ãƒ¡ãƒ³ãƒˆã¯ Linux カーãƒãƒ«ã®ãƒ¡ãƒ³ãƒ†ãƒŠãƒ¼é”ãŒã©ã†è¡Œå‹•ã™ã‚‹ã‹ã€
+ ã“ã®ãƒ‰ã‚­ãƒ¥ãƒ¡ãƒ³ãƒˆã¯ Linux カーãƒãƒ«ã®ãƒ¡ãƒ³ãƒ†ãƒŠé”ãŒã©ã†è¡Œå‹•ã™ã‚‹ã‹ã€
彼らã®æ‰‹æ³•ã®èƒŒæ™¯ã«ã‚る共有ã•ã‚Œã¦ã„る精神ã«ã¤ã„ã¦è¨˜è¿°ã—ã¦ã„ã¾ã™ã€‚ã“
ã‚Œã¯ã‚«ãƒ¼ãƒãƒ«é–‹ç™ºã®åˆå¿ƒè€…ãªã‚‰ï¼ˆã‚‚ã—ãã¯ã€å˜ã«èˆˆå‘³ãŒã‚ã‚‹ã ã‘ã®äººã§ã‚‚)
- é‡è¦ã§ã™ã€‚ãªãœãªã‚‰ã“ã®ãƒ‰ã‚­ãƒ¥ãƒ¡ãƒ³ãƒˆã¯ã€ã‚«ãƒ¼ãƒãƒ«ãƒ¡ãƒ³ãƒ†ãƒŠãƒ¼é”ã®ç‹¬ç‰¹ãª
+ é‡è¦ã§ã™ã€‚ãªãœãªã‚‰ã“ã®ãƒ‰ã‚­ãƒ¥ãƒ¡ãƒ³ãƒˆã¯ã€ã‚«ãƒ¼ãƒãƒ«ãƒ¡ãƒ³ãƒ†ãƒŠé”ã®ç‹¬ç‰¹ãª
行動ã«ã¤ã„ã¦ã®å¤šãã®èª¤è§£ã‚„混乱を解消ã™ã‚‹ã‹ã‚‰ã§ã™ã€‚
Documentation/stable_kernel_rules.txt
ã“ã®ãƒ•ã‚¡ã‚¤ãƒ«ã¯ã©ã®ã‚ˆã†ã« stable カーãƒãƒ«ã®ãƒªãƒªãƒ¼ã‚¹ãŒè¡Œã‚れるã‹ã®ãƒ«ãƒ¼
ルãŒè¨˜è¿°ã•ã‚Œã¦ã„ã¾ã™ã€‚ãã—ã¦ã“れらã®ãƒªãƒªãƒ¼ã‚¹ã®ä¸­ã®ã©ã“ã‹ã§å¤‰æ›´ã‚’å–
- り入れã¦ã‚‚らã„ãŸã„å ´åˆã«ä½•ã‚’ã™ã‚Œã°ã„ã„ã‹ãŒç¤ºã•ã‚Œã¦ã„ã¾ã™ã€‚
+ り入れã¦ã‚‚らã„ãŸã„å ´åˆã«ä½•ã‚’ã™ã‚Œã°è‰¯ã„ã‹ãŒç¤ºã•ã‚Œã¦ã„ã¾ã™ã€‚
Documentation/kernel-docs.txt
  カーãƒãƒ«é–‹ç™ºã«ä»˜éšã™ã‚‹å¤–部ドキュメントã®ãƒªã‚¹ãƒˆã§ã™ã€‚ã‚‚ã—ã‚ãªãŸãŒ
@@ -218,9 +220,9 @@ web サイトã«ã¯ã€ã‚³ãƒ¼ãƒ‰ã®æ§‹æˆã€ã‚µãƒ–システムã€ç¾åœ¨å­˜åœ¨ã™ã
ã“ã“ã«ã¯ã€ã¾ãŸã€ã‚«ãƒ¼ãƒãƒ«ã®ã‚³ãƒ³ãƒ‘イルã®ã‚„り方やパッãƒã®å½“ã¦æ–¹ãªã©ã®é–“接
çš„ãªåŸºæœ¬æƒ…報も記述ã•ã‚Œã¦ã„ã¾ã™ã€‚
-ã‚ãªãŸãŒã©ã“ã‹ã‚‰ã‚¹ã‚¿ãƒ¼ãƒˆã—ã¦ã‚ˆã„ã‹ã‚ã‹ã‚‰ãªã„ãŒã€Linux カーãƒãƒ«é–‹ç™ºã‚³ãƒŸãƒ¥
+ã‚ãªãŸãŒã©ã“ã‹ã‚‰ã‚¹ã‚¿ãƒ¼ãƒˆã—ã¦è‰¯ã„ã‹ã‚ã‹ã‚‰ãªã„ãŒã€Linux カーãƒãƒ«é–‹ç™ºã‚³ãƒŸãƒ¥
ニティã«å‚加ã—ã¦ä½•ã‹ã™ã‚‹ã“ã¨ã‚’ã•ãŒã—ã¦ã„ã‚‹å ´åˆã«ã¯ã€Linux kernel
-Janitor's プロジェクトã«ã„ã‘ã°ã‚ˆã„ã§ã—ょㆠ-
+Janitor's プロジェクトã«ã„ã‘ã°è‰¯ã„ã§ã—ょㆠ-
http://janitor.kernelnewbies.org/
ã“ã“ã¯ãã®ã‚ˆã†ãªã‚¹ã‚¿ãƒ¼ãƒˆã‚’ã™ã‚‹ã®ã«ã†ã£ã¦ã¤ã‘ã®å ´æ‰€ã§ã™ã€‚ã“ã“ã«ã¯ã€
Linux カーãƒãƒ«ã‚½ãƒ¼ã‚¹ãƒ„リーã®ä¸­ã«å«ã¾ã‚Œã‚‹ã€ãã‚Œã„ã«ã—ã€ä¿®æ­£ã—ãªã‘ã‚Œã°ãª
@@ -243,7 +245,7 @@ Linux カーãƒãƒ«ã‚½ãƒ¼ã‚¹ãƒ„リーã®ä¸­ã«å«ã¾ã‚Œã‚‹ã€ãã‚Œã„ã«ã—ã€ä¿
自己å‚照方å¼ã§ã€ç´¢å¼•ãŒã¤ã„㟠web å½¢å¼ã§ã€ã‚½ãƒ¼ã‚¹ã‚³ãƒ¼ãƒ‰ã‚’å‚ç…§ã™ã‚‹ã“ã¨ãŒ
ã§ãã¾ã™ã€‚ã“ã®æœ€æ–°ã®ç´ æ™´ã—ã„カーãƒãƒ«ã‚³ãƒ¼ãƒ‰ã®ãƒªãƒã‚¸ãƒˆãƒªã¯ä»¥ä¸‹ã§è¦‹ã¤ã‹ã‚Š
ã¾ã™-
- http://sosdg.org/~coywolf/lxr/
+ http://sosdg.org/~qiyong/lxr/
開発プロセス
-----------------------
@@ -265,9 +267,9 @@ Linux カーãƒãƒ«ã®é–‹ç™ºãƒ—ロセスã¯ç¾åœ¨å¹¾ã¤ã‹ã®ç•°ãªã‚‹ãƒ¡ã‚¤ãƒ³ã‚
以下ã®ã¨ãŠã‚Š-
- æ–°ã—ã„カーãƒãƒ«ãŒãƒªãƒªãƒ¼ã‚¹ã•ã‚ŒãŸç›´å¾Œã«ã€2週間ã®ç‰¹åˆ¥æœŸé–“ãŒè¨­ã‘られã€
- ã“ã®æœŸé–“中ã«ã€ãƒ¡ãƒ³ãƒ†ãƒŠãƒ¼é”㯠Linus ã«å¤§ããªå·®åˆ†ã‚’é€ã‚‹ã“ã¨ãŒã§ãã¾
- ã™ã€‚ã“ã®ã‚ˆã†ãªå·®åˆ†ã¯é€šå¸¸ -mm カーãƒãƒ«ã«æ•°é€±é–“å«ã¾ã‚Œã¦ããŸãƒ‘ッãƒã§
- ã™ã€‚ 大ããªå¤‰æ›´ã¯ git(カーãƒãƒ«ã®ã‚½ãƒ¼ã‚¹ç®¡ç†ãƒ„ールã€è©³ç´°ã¯
+ ã“ã®æœŸé–“中ã«ã€ãƒ¡ãƒ³ãƒ†ãƒŠé”㯠Linus ã«å¤§ããªå·®åˆ†ã‚’é€ã‚‹ã“ã¨ãŒã§ãã¾ã™ã€‚
+ ã“ã®ã‚ˆã†ãªå·®åˆ†ã¯é€šå¸¸ -mm カーãƒãƒ«ã«æ•°é€±é–“å«ã¾ã‚Œã¦ããŸãƒ‘ッãƒã§ã™ã€‚
+ 大ããªå¤‰æ›´ã¯ git(カーãƒãƒ«ã®ã‚½ãƒ¼ã‚¹ç®¡ç†ãƒ„ールã€è©³ç´°ã¯
http://git.or.cz/ å‚ç…§) を使ã£ã¦é€ã‚‹ã®ãŒå¥½ã¾ã—ã„ã‚„ã‚Šæ–¹ã§ã™ãŒã€ãƒ‘ッ
ãƒãƒ•ã‚¡ã‚¤ãƒ«ã®å½¢å¼ã®ã¾ã¾é€ã‚‹ã®ã§ã‚‚å分ã§ã™ã€‚
@@ -285,6 +287,10 @@ Linux カーãƒãƒ«ã®é–‹ç™ºãƒ—ロセスã¯ç¾åœ¨å¹¾ã¤ã‹ã®ç•°ãªã‚‹ãƒ¡ã‚¤ãƒ³ã‚
ã«å®‰å®šã—ãŸçŠ¶æ…‹ã«ã‚ã‚‹ã¨åˆ¤æ–­ã—ãŸã¨ãã«ãƒªãƒªãƒ¼ã‚¹ã•ã‚Œã¾ã™ã€‚目標ã¯æ¯Žé€±æ–°
ã—ã„ -rc カーãƒãƒ«ã‚’リリースã™ã‚‹ã“ã¨ã§ã™ã€‚
+ - 以下㮠URL ã§å„ -rc リリースã«å­˜åœ¨ã™ã‚‹æ—¢çŸ¥ã®å¾Œæˆ»ã‚Šå•é¡Œã®ãƒªã‚¹ãƒˆ
+ ãŒè¿½è·¡ã•ã‚Œã¾ã™-
+ http://kernelnewbies.org/known_regressions
+
- ã“ã®ãƒ—ロセスã¯ã‚«ãƒ¼ãƒãƒ«ãŒ 「準備ãŒã§ããŸã€ã¨è€ƒãˆã‚‰ã‚Œã‚‹ã¾ã§ç¶™ç¶šã—ã¾
ã™ã€‚ã“ã®ãƒ—ロセスã¯ã ã„ãŸã„ 6週間継続ã—ã¾ã™ã€‚
@@ -331,8 +337,8 @@ Andrew ã¯å€‹åˆ¥ã®ã‚µãƒ–システムカーãƒãƒ«ãƒ„リーã¨ãƒ‘ッãƒã‚’å…¨ã¦é
linux-kernel メーリングリストã§åŽé›†ã•ã‚ŒãŸå¤šæ•°ã®ãƒ‘ッãƒã¨åŒæ™‚ã«ä¸€ã¤ã«ã¾
ã¨ã‚ã¾ã™ã€‚
ã“ã®ãƒ„リーã¯æ–°æ©Ÿèƒ½ã¨ãƒ‘ッãƒãŒæ¤œè¨¼ã•ã‚Œã‚‹å ´ã¨ãªã‚Šã¾ã™ã€‚ã‚る期間ã®é–“パッãƒ
-㌠-mm ã«å…¥ã£ã¦ä¾¡å€¤ã‚’証明ã•ã‚ŒãŸã‚‰ã€Andrew やサブシステムメンテナãŒã€ãƒ¡
-インラインã¸å…¥ã‚Œã‚‹ã‚ˆã†ã« Linus ã«ãƒ—ッシュã—ã¾ã™ã€‚
+㌠-mm ã«å…¥ã£ã¦ä¾¡å€¤ã‚’証明ã•ã‚ŒãŸã‚‰ã€Andrew やサブシステムメンテナãŒã€
+メインラインã¸å…¥ã‚Œã‚‹ã‚ˆã†ã« Linus ã«ãƒ—ッシュã—ã¾ã™ã€‚
メインカーãƒãƒ«ãƒ„リーã«å«ã‚ã‚‹ãŸã‚ã« Linus ã«é€ã‚‹å‰ã«ã€ã™ã¹ã¦ã®æ–°ã—ã„パッ
ãƒãŒ -mm ツリーã§ãƒ†ã‚¹ãƒˆã•ã‚Œã‚‹ã“ã¨ãŒå¼·ã推奨ã•ã‚Œã¾ã™ã€‚
@@ -460,7 +466,7 @@ MAINTAINERS ファイルã«ãƒªã‚¹ãƒˆãŒã‚ã‚Šã¾ã™ã®ã§å‚ç…§ã—ã¦ãã ã•ã
ã›ã‚“-
彼らã¯ã‚ãªãŸã®ãƒ‘ッãƒã®è¡Œæ¯Žã«ã‚³ãƒ¡ãƒ³ãƒˆã‚’入れãŸã„ã®ã§ã€ãã®ãŸã‚ã«ã¯ãã†ã™
ã‚‹ã—ã‹ã‚ã‚Šã¾ã›ã‚“。ã‚ãªãŸã®ãƒ¡ãƒ¼ãƒ«ãƒ—ログラムãŒç©ºç™½ã‚„タブを圧縮ã—ãªã„よã†
-ã«ç¢ºèªã—ãŸæ–¹ãŒã„ã„ã§ã™ã€‚最åˆã®è‰¯ã„テストã¨ã—ã¦ã¯ã€è‡ªåˆ†ã«ãƒ¡ãƒ¼ãƒ«ã‚’é€ã£ã¦
+ã«ç¢ºèªã—ãŸæ–¹ãŒè‰¯ã„ã§ã™ã€‚最åˆã®è‰¯ã„テストã¨ã—ã¦ã¯ã€è‡ªåˆ†ã«ãƒ¡ãƒ¼ãƒ«ã‚’é€ã£ã¦
ã¿ã¦ã€ãã®ãƒ‘ッãƒã‚’自分ã§å½“ã¦ã¦ã¿ã‚‹ã“ã¨ã§ã™ã€‚ã‚‚ã—ãã‚ŒãŒã†ã¾ãè¡Œã‹ãªã„ãª
らã€ã‚ãªãŸã®ãƒ¡ãƒ¼ãƒ«ãƒ—ログラムを直ã—ã¦ã‚‚らã†ã‹ã€æ­£ã—ãå‹•ãよã†ã«å¤‰ãˆã‚‹ã¹
ãã§ã™ã€‚
@@ -507,14 +513,14 @@ MAINTAINERS ファイルã«ãƒªã‚¹ãƒˆãŒã‚ã‚Šã¾ã™ã®ã§å‚ç…§ã—ã¦ãã ã•ã
ã¨ã‚‚普通ã®ã“ã¨ã§ã™ã€‚ã“ã‚Œã¯ã‚ãªãŸã®ãƒ‘ッãƒãŒå—ã‘入れられãªã„ã¨ã„ã†ã“ã¨ã§
㯠*ã‚ã‚Šã¾ã›ã‚“*ã€ãã—ã¦ã‚ãªãŸè‡ªèº«ã«å対ã™ã‚‹ã“ã¨ã‚’æ„味ã™ã‚‹ã®ã§ã‚‚ *ã‚ã‚Šã¾
ã›ã‚“*。å˜ã«è‡ªåˆ†ã®ãƒ‘ッãƒã«å¯¾ã—ã¦æŒ‡æ‘˜ã•ã‚ŒãŸå•é¡Œã‚’å…¨ã¦ä¿®æ­£ã—ã¦å†é€ã™ã‚Œã°
-ã„ã„ã®ã§ã™ã€‚
+良ã„ã®ã§ã™ã€‚
カーãƒãƒ«ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¨ä¼æ¥­çµ„ç¹”ã®ã¡ãŒã„
-----------------------------------------------------------------
カーãƒãƒ«ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¯å¤§éƒ¨åˆ†ã®ä¼çµ±çš„ãªä¼šç¤¾ã®é–‹ç™ºç’°å¢ƒã¨ã¯ç•°ã£ãŸã‚„ã‚Šæ–¹ã§
-å‹•ã„ã¦ã„ã¾ã™ã€‚以下ã¯å•é¡Œã‚’é¿ã‘ã‚‹ãŸã‚ã«ã§ãã‚‹ã¨ã‚ˆã„ã“ã¨ã®ã®ãƒªã‚¹ãƒˆã§ã™-
+å‹•ã„ã¦ã„ã¾ã™ã€‚以下ã¯å•é¡Œã‚’é¿ã‘ã‚‹ãŸã‚ã«ã§ãã‚‹ã¨è‰¯ã„ã“ã¨ã®ãƒªã‚¹ãƒˆã§ã™-
ã‚ãªãŸã®æ案ã™ã‚‹å¤‰æ›´ã«ã¤ã„ã¦è¨€ã†ã¨ãã®ã†ã¾ã„言ã„方:
@@ -525,7 +531,7 @@ MAINTAINERS ファイルã«ãƒªã‚¹ãƒˆãŒã‚ã‚Šã¾ã™ã®ã§å‚ç…§ã—ã¦ãã ã•ã
- "以下ã¯ä¸€é€£ã®å°ã•ãªãƒ‘ッãƒç¾¤ã§ã™ãŒ..."
- "ã“ã‚Œã¯å…¸åž‹çš„ãªãƒžã‚·ãƒ³ã§ã®æ€§èƒ½ã‚’å‘上ã•ã›ã¾ã™.."
- ã‚„ã‚ãŸæ–¹ãŒã„ã„悪ã„言ã„方:
+ ã‚„ã‚ãŸæ–¹ãŒè‰¯ã„悪ã„言ã„方:
- ã“ã®ã‚„り方㧠AIX/ptx/Solaris ã§ã¯ã§ããŸã®ã§ã€ã§ãã‚‹ã¯ãšã 
- ç§ã¯ã“れを20å¹´ã‚‚ã®é–“ã‚„ã£ã¦ããŸã€ã ã‹ã‚‰
@@ -575,10 +581,10 @@ Linux カーãƒãƒ«ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¯ã€ä¸€åº¦ã«å¤§é‡ã®ã‚³ãƒ¼ãƒ‰ã®å¡Šã‚’å–
1) å°ã•ã„パッãƒã¯ã‚ãªãŸã®ãƒ‘ッãƒãŒé©ç”¨ã•ã‚Œã‚‹è¦‹è¾¼ã¿ã‚’大ããã—ã¾ã™ã€ã‚«ãƒ¼
ãƒãƒ«ã®äººé”ã¯ãƒ‘ッãƒãŒæ­£ã—ã„ã‹ã©ã†ã‹ã‚’確èªã™ã‚‹æ™‚間や労力をã‹ã‘ãªã„ã‹
- らã§ã™ã€‚5è¡Œã®ãƒ‘ッãƒã¯ãƒ¡ãƒ³ãƒ†ãƒŠãŒãŸã£ãŸ1秒見るã ã‘ã§é©ç”¨ã§ãã¾ã™ã€‚ã—
- ã‹ã—ã€500è¡Œã®ãƒ‘ッãƒã¯ã€æ­£ã—ã„ã“ã¨ã‚’レビューã™ã‚‹ã®ã«æ•°æ™‚é–“ã‹ã‹ã‚‹ã‹ã‚‚
- ã—ã‚Œã¾ã›ã‚“(時間ã¯ãƒ‘ッãƒã®ã‚µã‚¤ã‚ºãªã©ã«ã‚ˆã‚ŠæŒ‡æ•°é–¢æ•°ã«æ¯”例ã—ã¦ã‹ã‹ã‚Šã¾
- ã™)
+ らã§ã™ã€‚5è¡Œã®ãƒ‘ッãƒã¯ãƒ¡ãƒ³ãƒ†ãƒŠãŒãŸã£ãŸ1秒見るã ã‘ã§é©ç”¨ã§ãã¾ã™ã€‚
+ ã—ã‹ã—ã€500è¡Œã®ãƒ‘ッãƒã¯ã€æ­£ã—ã„ã“ã¨ã‚’レビューã™ã‚‹ã®ã«æ•°æ™‚é–“ã‹ã‹ã‚‹ã‹
+ ã‚‚ã—ã‚Œã¾ã›ã‚“(時間ã¯ãƒ‘ッãƒã®ã‚µã‚¤ã‚ºãªã©ã«ã‚ˆã‚ŠæŒ‡æ•°é–¢æ•°ã«æ¯”例ã—ã¦ã‹ã‹ã‚Š
+ ã¾ã™)
å°ã•ã„パッãƒã¯ä½•ã‹ã‚ã£ãŸã¨ãã«ãƒ‡ãƒãƒƒã‚°ã‚‚ã¨ã¦ã‚‚ç°¡å˜ã«ãªã‚Šã¾ã™ã€‚パッ
ãƒã‚’1個1個å–り除ãã®ã¯ã€ã¨ã¦ã‚‚大ããªãƒ‘ッãƒã‚’当ã¦ãŸå¾Œã«(ã‹ã¤ã€ä½•ã‹ãŠ
@@ -587,23 +593,23 @@ Linux カーãƒãƒ«ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¯ã€ä¸€åº¦ã«å¤§é‡ã®ã‚³ãƒ¼ãƒ‰ã®å¡Šã‚’å–
2) å°ã•ã„パッãƒã‚’é€ã‚‹ã ã‘ã§ãªãã€é€ã‚‹ã¾ãˆã«ã€æ›¸ãç›´ã—ã¦ã€ã‚·ãƒ³ãƒ—ルã«ã™
ã‚‹(ã‚‚ã—ãã¯ã€å˜ã«é †ç•ªã‚’変ãˆã‚‹ã ã‘ã§ã‚‚)ã“ã¨ã‚‚ã€ã¨ã¦ã‚‚é‡è¦ã§ã™ã€‚
-以下ã¯ã‚«ãƒ¼ãƒãƒ«é–‹ç™ºè€…ã® Al Viro ã®ãŸã¨ãˆè©±ã—ã§ã™ï¼š
+以下ã¯ã‚«ãƒ¼ãƒãƒ«é–‹ç™ºè€…ã® Al Viro ã®ãŸã¨ãˆè©±ã§ã™ï¼š
"生徒ã®æ•°å­¦ã®å®¿é¡Œã‚’採点ã™ã‚‹å…ˆç”Ÿã®ã“ã¨ã‚’考ãˆã¦ã¿ã¦ãã ã•ã„ã€å…ˆ
- 生ã¯ç”Ÿå¾’ãŒè§£ã«åˆ°é”ã™ã‚‹ã¾ã§ã®è©¦è¡ŒéŒ¯èª¤ã‚’ã¿ãŸã„ã¨ã¯æ€ã‚ãªã„ã§ã—ょ
- ã†ã€‚先生ã¯ç°¡æ½”ãªæœ€é«˜ã®è§£ã‚’ã¿ãŸã„ã®ã§ã™ã€‚良ã„生徒ã¯ã“れを知ã£ã¦
+ 生ã¯ç”Ÿå¾’ãŒè§£ã«åˆ°é”ã™ã‚‹ã¾ã§ã®è©¦è¡ŒéŒ¯èª¤ã‚’見ãŸã„ã¨ã¯æ€ã‚ãªã„ã§ã—ょ
+ ã†ã€‚先生ã¯ç°¡æ½”ãªæœ€é«˜ã®è§£ã‚’見ãŸã„ã®ã§ã™ã€‚良ã„生徒ã¯ã“れを知ã£ã¦
ãŠã‚Šã€ãã—ã¦æœ€çµ‚解ã®å‰ã®ä¸­é–“作業をæ出ã™ã‚‹ã“ã¨ã¯æ±ºã—ã¦ãªã„ã®ã§
ã™"
- カーãƒãƒ«é–‹ç™ºã§ã‚‚ã“ã‚Œã¯åŒã˜ã§ã™ã€‚メンテナーé”ã¨ãƒ¬ãƒ“ューアé”ã¯ã€
- å•é¡Œã‚’解決ã™ã‚‹è§£ã®èƒŒå¾Œã«ãªã‚‹æ€è€ƒãƒ—ロセスをã¿ãŸã„ã¨ã¯æ€ã„ã¾ã›ã‚“。
- 彼らã¯å˜ç´”ã§ã‚ã–ã‚„ã‹ãªè§£æ±ºæ–¹æ³•ã‚’ã¿ãŸã„ã®ã§ã™ã€‚
+ カーãƒãƒ«é–‹ç™ºã§ã‚‚ã“ã‚Œã¯åŒã˜ã§ã™ã€‚メンテナé”ã¨ãƒ¬ãƒ“ューアé”ã¯ã€
+ å•é¡Œã‚’解決ã™ã‚‹è§£ã®èƒŒå¾Œã«ãªã‚‹æ€è€ƒãƒ—ロセスを見ãŸã„ã¨ã¯æ€ã„ã¾ã›ã‚“。
+ 彼らã¯å˜ç´”ã§ã‚ã–ã‚„ã‹ãªè§£æ±ºæ–¹æ³•ã‚’見ãŸã„ã®ã§ã™ã€‚
ã‚ã–ã‚„ã‹ãªè§£ã‚’説明ã™ã‚‹ã®ã¨ã€ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¨å…±ã«ä»•äº‹ã‚’ã—ã€æœªè§£æ±ºã®ä»•äº‹ã‚’
è­°è«–ã™ã‚‹ã“ã¨ã®ãƒãƒ©ãƒ³ã‚¹ã‚’キープã™ã‚‹ã®ã¯é›£ã—ã„ã‹ã‚‚ã—ã‚Œã¾ã›ã‚“。
ã§ã™ã‹ã‚‰ã€é–‹ç™ºãƒ—ロセスã®æ—©æœŸæ®µéšŽã§æ”¹å–„ã®ãŸã‚ã®ãƒ•ã‚£ãƒ¼ãƒ‰ãƒãƒƒã‚¯ã‚’もらã†ã‚ˆ
-ã†ã«ã™ã‚‹ã®ã‚‚ã„ã„ã§ã™ãŒã€å¤‰æ›´ç‚¹ã‚’å°ã•ã„部分ã«åˆ†å‰²ã—ã¦å…¨ä½“ã§ã¯ã¾ã å®Œæˆã—
-ã¦ã„ãªã„仕事を(部分的ã«)å–り込んã§ã‚‚らãˆã‚‹ã‚ˆã†ã«ã™ã‚‹ã“ã¨ã‚‚ã„ã„ã“ã¨ã§ã™ã€‚
+ã†ã«ã™ã‚‹ã®ã‚‚良ã„ã§ã™ãŒã€å¤‰æ›´ç‚¹ã‚’å°ã•ã„部分ã«åˆ†å‰²ã—ã¦å…¨ä½“ã§ã¯ã¾ã å®Œæˆã—
+ã¦ã„ãªã„仕事を(部分的ã«)å–り込んã§ã‚‚らãˆã‚‹ã‚ˆã†ã«ã™ã‚‹ã“ã¨ã‚‚良ã„ã“ã¨ã§ã™ã€‚
ã¾ãŸã€ã§ã上ãŒã£ã¦ã„ãªã„ã‚‚ã®ã‚„ã€"å°†æ¥ç›´ã™" よã†ãªãƒ‘ッãƒã‚’ã€æœ¬æµã«å«ã‚
ã¦ã‚‚らã†ã‚ˆã†ã«é€ã£ã¦ã‚‚ã€ãã‚Œã¯å—ã‘付ã‘られãªã„ã“ã¨ã‚’ç†è§£ã—ã¦ãã ã•ã„。
@@ -629,7 +635,7 @@ Linux カーãƒãƒ«ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¯ã€ä¸€åº¦ã«å¤§é‡ã®ã‚³ãƒ¼ãƒ‰ã®å¡Šã‚’å–
- テストçµæžœ
ã“ã‚Œã«ã¤ã„ã¦å…¨ã¦ãŒã©ã®ã‚ˆã†ã«ã‚ã‚‹ã¹ãã‹ã«ã¤ã„ã¦ã®è©³ç´°ã¯ã€ä»¥ä¸‹ã®ãƒ‰ã‚­ãƒ¥ãƒ¡
-ント㮠ChangeLog セクションをã¿ã¦ãã ã•ã„-
+ント㮠ChangeLog セクションを見ã¦ãã ã•ã„-
"The Perfect Patch"
http://www.zip.com.au/~akpm/linux/patches/stuff/tpp.txt
diff --git a/Documentation/java.txt b/Documentation/java.txt
index 3cce3fbb6644..e6a723281547 100644
--- a/Documentation/java.txt
+++ b/Documentation/java.txt
@@ -37,7 +37,7 @@ other program after you have done the following:
or the following, if you want to be more selective:
':Applet:M::<!--applet::/usr/bin/appletviewer:'
- Of cause you have to fix the path names. Given path/file names in this
+ Of course you have to fix the path names. The path/file names given in this
document match the Debian 2.1 system. (i.e. jdk installed in /usr,
custom wrappers from this document in /usr/local)
diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt
index fe8b0c4892cf..616043a6da99 100644
--- a/Documentation/kbuild/kconfig-language.txt
+++ b/Documentation/kbuild/kconfig-language.txt
@@ -77,7 +77,12 @@ applicable everywhere (see syntax).
Optionally, dependencies only for this default value can be added with
"if".
-- dependencies: "depends on"/"requires" <expr>
+- type definition + default value:
+ "def_bool"/"def_tristate" <expr> ["if" <expr>]
+ This is a shorthand notation for a type definition plus a value.
+ Optionally dependencies for this default value can be added with "if".
+
+- dependencies: "depends on" <expr>
This defines a dependency for this menu entry. If multiple
dependencies are defined, they are connected with '&&'. Dependencies
are applied to all other options within this menu entry (which also
@@ -289,3 +294,10 @@ source:
"source" <prompt>
This reads the specified configuration file. This file is always parsed.
+
+mainmenu:
+
+ "mainmenu" <prompt>
+
+This sets the config program's title bar if the config program chooses
+to use it.
diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt
index e08ef8759a07..7a7753321a26 100644
--- a/Documentation/kbuild/makefiles.txt
+++ b/Documentation/kbuild/makefiles.txt
@@ -276,41 +276,39 @@ more details, with real examples.
--- 3.7 Compilation flags
- EXTRA_CFLAGS, EXTRA_AFLAGS, EXTRA_LDFLAGS, EXTRA_ARFLAGS
+ ccflags-y, asflags-y and ldflags-y
+ The three flags listed above applies only to the kbuild makefile
+ where they are assigned. They are used for all the normal
+ cc, as and ld invocation happenign during a recursive build.
+ Note: Flags with the same behaviour were previously named:
+ EXTRA_CFLAGS, EXTRA_AFLAGS and EXTRA_LDFLAGS.
+ They are yet supported but their use are deprecated.
- All the EXTRA_ variables apply only to the kbuild makefile
- where they are assigned. The EXTRA_ variables apply to all
- commands executed in the kbuild makefile.
-
- $(EXTRA_CFLAGS) specifies options for compiling C files with
- $(CC).
+ ccflags-y specifies options for compiling C files with $(CC).
Example:
# drivers/sound/emu10k1/Makefile
- EXTRA_CFLAGS += -I$(obj)
- ifdef DEBUG
- EXTRA_CFLAGS += -DEMU10K1_DEBUG
- endif
+ ccflags-y += -I$(obj)
+ ccflags-$(DEBUG) += -DEMU10K1_DEBUG
This variable is necessary because the top Makefile owns the
- variable $(CFLAGS) and uses it for compilation flags for the
+ variable $(KBUILD_CFLAGS) and uses it for compilation flags for the
entire tree.
- $(EXTRA_AFLAGS) is a similar string for per-directory options
+ asflags-y is a similar string for per-directory options
when compiling assembly language source.
Example:
#arch/x86_64/kernel/Makefile
- EXTRA_AFLAGS := -traditional
+ asflags-y := -traditional
- $(EXTRA_LDFLAGS) and $(EXTRA_ARFLAGS) are similar strings for
- per-directory options to $(LD) and $(AR).
+ ldflags-y is a string for per-directory options to $(LD).
Example:
#arch/m68k/fpsp040/Makefile
- EXTRA_LDFLAGS := -x
+ ldflags-y := -x
CFLAGS_$@, AFLAGS_$@
@@ -425,6 +423,7 @@ more details, with real examples.
as-instr checks if the assembler reports a specific instruction
and then outputs either option1 or option2
C escapes are supported in the test instruction
+ Note: as-instr-option uses KBUILD_AFLAGS for $(AS) options
cc-option
cc-option is used to check if $(CC) supports a given option, and not
@@ -438,6 +437,7 @@ more details, with real examples.
-march=pentium-mmx if supported by $(CC), otherwise -march=i586.
The second argument to cc-option is optional, and if omitted,
cflags-y will be assigned no value if first option is not supported.
+ Note: cc-option uses KBUILD_CFLAGS for $(CC) options
cc-option-yn
cc-option-yn is used to check if gcc supports a given option
@@ -453,6 +453,7 @@ more details, with real examples.
option. When $(biarch) equals 'y', the expanded variables $(aflags-y)
and $(cflags-y) will be assigned the values -a32 and -m32,
respectively.
+ Note: cc-option-yn uses KBUILD_CFLAGS for $(CC) options
cc-option-align
gcc versions >= 3.0 changed the type of options used to specify
@@ -464,10 +465,11 @@ more details, with real examples.
cc-option-align = -falign
Example:
- CFLAGS += $(cc-option-align)-functions=4
+ KBUILD_CFLAGS += $(cc-option-align)-functions=4
In the above example, the option -falign-functions=4 is used for
gcc >= 3.00. For gcc < 3.00, -malign-functions=4 is used.
+ Note: cc-option-align uses KBUILD_CFLAGS for $(CC) options
cc-version
cc-version returns a numerical version of the $(CC) compiler version.
@@ -492,9 +494,9 @@ more details, with real examples.
Example:
#fs/reiserfs/Makefile
- EXTRA_CFLAGS := $(call cc-ifversion, -lt, 0402, -O1)
+ ccflags-y := $(call cc-ifversion, -lt, 0402, -O1)
- In this example, EXTRA_CFLAGS will be assigned the value -O1 if the
+ In this example, ccflags-y will be assigned the value -O1 if the
$(CC) version is less than 4.2.
cc-ifversion takes all the shell operators:
-eq, -ne, -lt, -le, -gt, and -ge
@@ -516,6 +518,28 @@ more details, with real examples.
In this example for a specific GCC version the build will error out explaining
to the user why it stops.
+ cc-cross-prefix
+ cc-cross-prefix is used to check if there exists a $(CC) in path with
+ one of the listed prefixes. The first prefix where there exist a
+ prefix$(CC) in the PATH is returned - and if no prefix$(CC) is found
+ then nothing is returned.
+ Additional prefixes are separated by a single space in the
+ call of cc-cross-prefix.
+ This functionality is useful for architecture Makefiles that try
+ to set CROSS_COMPILE to well-known values but may have several
+ values to select between.
+ It is recommended only to try to set CROSS_COMPILE if it is a cross
+ build (host arch is different from target arch). And if CROSS_COMPILE
+ is already set then leave it with the old value.
+
+ Example:
+ #arch/m68k/Makefile
+ ifneq ($(SUBARCH),$(ARCH))
+ ifeq ($(CROSS_COMPILE),)
+ CROSS_COMPILE := $(call cc-cross-prefix, m68k-linux-gnu-)
+ endif
+ endif
+
=== 4 Host Program support
Kbuild supports building executables on the host for use during the
@@ -780,8 +804,8 @@ When kbuild executes, the following steps are followed (roughly):
Example:
#arch/s390/Makefile
LDFLAGS := -m elf_s390
- Note: EXTRA_LDFLAGS and LDFLAGS_$@ can be used to further customise
- the flags used. See chapter 7.
+ Note: ldflags-y can be used to further customise
+ the flags used. See chapter 3.7.
LDFLAGS_MODULE Options for $(LD) when linking modules
@@ -817,26 +841,26 @@ When kbuild executes, the following steps are followed (roughly):
In this example, the binary $(obj)/image is a binary version of
vmlinux. The usage of $(call if_changed,xxx) will be described later.
- AFLAGS $(AS) assembler flags
+ KBUILD_AFLAGS $(AS) assembler flags
Default value - see top level Makefile
Append or modify as required per architecture.
Example:
#arch/sparc64/Makefile
- AFLAGS += -m64 -mcpu=ultrasparc
+ KBUILD_AFLAGS += -m64 -mcpu=ultrasparc
- CFLAGS $(CC) compiler flags
+ KBUILD_CFLAGS $(CC) compiler flags
Default value - see top level Makefile
Append or modify as required per architecture.
- Often, the CFLAGS variable depends on the configuration.
+ Often, the KBUILD_CFLAGS variable depends on the configuration.
Example:
#arch/i386/Makefile
cflags-$(CONFIG_M386) += -march=i386
- CFLAGS += $(cflags-y)
+ KBUILD_CFLAGS += $(cflags-y)
Many arch Makefiles dynamically run the target C compiler to
probe supported options:
@@ -848,7 +872,7 @@ When kbuild executes, the following steps are followed (roughly):
-march=pentium2,-march=i686)
...
# Disable unit-at-a-time mode ...
- CFLAGS += $(call cc-option,-fno-unit-at-a-time)
+ KBUILD_CFLAGS += $(call cc-option,-fno-unit-at-a-time)
...
@@ -1096,8 +1120,8 @@ When kbuild executes, the following steps are followed (roughly):
specified options when building the target vmlinux.lds.
When building the *.lds target, kbuild uses the variables:
- CPPFLAGS : Set in top-level Makefile
- EXTRA_CPPFLAGS : May be set in the kbuild makefile
+ KBUILD_CPPFLAGS : Set in top-level Makefile
+ cppflags-y : May be set in the kbuild makefile
CPPFLAGS_$(@F) : Target specific flags.
Note that the full filename is used in this
assignment.
diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt
index 2fedc081b4c8..d0ac72cc19ff 100644
--- a/Documentation/kdump/kdump.txt
+++ b/Documentation/kdump/kdump.txt
@@ -13,7 +13,7 @@ dump of the system kernel's memory needs to be taken (for example, when
the system panics). The system kernel's memory image is preserved across
the reboot and is accessible to the dump-capture kernel.
-You can use common Linux commands, such as cp and scp, to copy the
+You can use common commands, such as cp and scp, to copy the
memory image to a dump file on the local disk, or across the network to
a remote system.
@@ -69,7 +69,7 @@ http://www.kernel.org/pub/linux/kernel/people/horms/kexec-tools/kexec-tools-test
This is a symlink to the latest version, which at the time of writing is
20061214, the only release of kexec-tools-testing so far. As other versions
-are made released, the older onese will remain available at
+are released, the older ones will remain available at
http://www.kernel.org/pub/linux/kernel/people/horms/kexec-tools/
Note: Latest kexec-tools-testing git tree is available at
@@ -159,16 +159,17 @@ Dump-capture kernel config options (Arch Independent)
CONFIG_PROC_VMCORE=y
(CONFIG_PROC_VMCORE is set by default when CONFIG_CRASH_DUMP is selected.)
-Dump-capture kernel config options (Arch Dependent, i386)
---------------------------------------------------------
-1) On x86, enable high memory support under "Processor type and
+Dump-capture kernel config options (Arch Dependent, i386 and x86_64)
+--------------------------------------------------------------------
+
+1) On i386, enable high memory support under "Processor type and
features":
CONFIG_HIGHMEM64G=y
or
CONFIG_HIGHMEM4G
-2) On x86 and x86_64, disable symmetric multi-processing support
+2) On i386 and x86_64, disable symmetric multi-processing support
under "Processor type and features":
CONFIG_SMP=n
@@ -203,28 +204,6 @@ Dump-capture kernel config options (Arch Dependent, i386)
5) Make and install the kernel and its modules. DO NOT add this kernel
to the boot loader configuration files.
-Dump-capture kernel config options (Arch Dependent, x86_64)
-----------------------------------------------------------
-1) On x86 and x86_64, disable symmetric multi-processing support
- under "Processor type and features":
-
- CONFIG_SMP=n
-
- (If CONFIG_SMP=y, then specify maxcpus=1 on the kernel command line
- when loading the dump-capture kernel, see section "Load the Dump-capture
- Kernel".)
-
-2) Use a suitable value for "Physical address where the kernel is
- loaded" (under "Processor type and features"). This only appears when
- "kernel crash dumps" is enabled. By default this value is 0x1000000
- (16MB). It should be the same as X in the "crashkernel=Y@X" boot
- parameter.
-
- For x86_64, normally "CONFIG_PHYSICAL_START=0x1000000".
-
-3) Make and install the kernel and its modules. DO NOT add this kernel
- to the boot loader configuration files.
-
Dump-capture kernel config options (Arch Dependent, ppc64)
----------------------------------------------------------
@@ -252,6 +231,32 @@ Dump-capture kernel config options (Arch Dependent, ia64)
any space below the alignment point will be wasted.
+Extended crashkernel syntax
+===========================
+
+While the "crashkernel=size[@offset]" syntax is sufficient for most
+configurations, sometimes it's handy to have the reserved memory dependent
+on the value of System RAM -- that's mostly for distributors that pre-setup
+the kernel command line to avoid a unbootable system after some memory has
+been removed from the machine.
+
+The syntax is:
+
+ crashkernel=<range1>:<size1>[,<range2>:<size2>,...][@offset]
+ range=start-[end]
+
+For example:
+
+ crashkernel=512M-2G:64M,2G-:128M
+
+This would mean:
+
+ 1) if the RAM is smaller than 512M, then don't reserve anything
+ (this is the "rescue" case)
+ 2) if the RAM size is between 512M and 2G, then reserve 64M
+ 3) if the RAM size is larger than 2G, then reserve 128M
+
+
Boot into System Kernel
=======================
@@ -282,11 +287,9 @@ Based on the architecture and type of image (relocatable or not), one
can choose to load the uncompressed vmlinux or compressed bzImage/vmlinuz
of dump-capture kernel. Following is the summary.
-For i386:
+For i386 and x86_64:
- Use vmlinux if kernel is not relocatable.
- Use bzImage/vmlinuz if kernel is relocatable.
-For x86_64:
- - Use vmlinux
For ppc64:
- Use vmlinux
For ia64:
@@ -315,20 +318,22 @@ Following are the arch specific command line options to be used while
loading dump-capture kernel.
For i386, x86_64 and ia64:
- "1 irqpoll maxcpus=1"
+ "1 irqpoll maxcpus=1 reset_devices"
For ppc64:
- "1 maxcpus=1 noirqdistrib"
+ "1 maxcpus=1 noirqdistrib reset_devices"
Notes on loading the dump-capture kernel:
* By default, the ELF headers are stored in ELF64 format to support
- systems with more than 4GB memory. The --elf32-core-headers option can
- be used to force the generation of ELF32 headers. This is necessary
- because GDB currently cannot open vmcore files with ELF64 headers on
- 32-bit systems. ELF32 headers can be used on non-PAE systems (that is,
- less than 4GB of memory).
+ systems with more than 4GB memory. On i386, kexec automatically checks if
+ the physical RAM size exceeds the 4 GB limit and if not, uses ELF32.
+ So, on non-PAE systems, ELF32 is always used.
+
+ The --elf32-core-headers option can be used to force the generation of ELF32
+ headers. This is necessary because GDB currently cannot open vmcore files
+ with ELF64 headers on 32-bit systems.
* The "irqpoll" boot parameter reduces driver initialization failures
due to shared interrupts in the dump-capture kernel.
@@ -360,7 +365,7 @@ If die() is called, and it happens to be a thread with pid 0 or 1, or die()
is called inside interrupt context or die() is called and panic_on_oops is set,
the system will boot into the dump-capture kernel.
-On powererpc systems when a soft-reset is generated, die() is called by all cpus
+On powerpc systems when a soft-reset is generated, die() is called by all cpus
and the system will boot into the dump-capture kernel.
For testing purposes, you can trigger a crash by using "ALT-SysRq-c",
@@ -426,9 +431,3 @@ Contact
Vivek Goyal (vgoyal@in.ibm.com)
Maneesh Soni (maneesh@in.ibm.com)
-
-Trademark
-=========
-
-Linux is a trademark of Linus Torvalds in the United States, other
-countries, or both.
diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt
index d9e3b199929b..5a4ef48224ae 100644
--- a/Documentation/kernel-docs.txt
+++ b/Documentation/kernel-docs.txt
@@ -76,9 +76,9 @@
* Title: "Conceptual Architecture of the Linux Kernel"
Author: Ivan T. Bowman.
URL: http://plg.uwaterloo.ca/~itbowman/papers/CS746G-a1.html
- Keywords: conceptual software arquitecture, extracted design,
+ Keywords: conceptual software architecture, extracted design,
reverse engineering, system structure.
- Description: Conceptual software arquitecture of the Linux kernel,
+ Description: Conceptual software architecture of the Linux kernel,
automatically extracted from the source code. Very detailed. Good
figures. Gives good overall kernel understanding.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4d175c751246..33121d6c827c 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -35,6 +35,7 @@ parameter is applicable:
APIC APIC support is enabled.
APM Advanced Power Management support is enabled.
AX25 Appropriate AX.25 support is enabled.
+ BLACKFIN Blackfin architecture is enabled.
DRM Direct Rendering Management support is enabled.
EDD BIOS Enhanced Disk Drive Services (EDD) is enabled
EFI EFI Partitioning (GPT) is enabled
@@ -67,16 +68,19 @@ parameter is applicable:
PARIDE The ParIDE (parallel port IDE) subsystem is enabled.
PARISC The PA-RISC architecture is enabled.
PCI PCI bus support is enabled.
+ PCIE PCI Express support is enabled.
PCMCIA The PCMCIA subsystem is enabled.
PNP Plug & Play support is enabled.
PPC PowerPC architecture is enabled.
PPT Parallel port support is enabled.
PS2 Appropriate PS/2 support is enabled.
RAM RAM disk support is enabled.
+ ROOTPLUG The example Root Plug LSM is enabled.
S390 S390 architecture is enabled.
SCSI Appropriate SCSI support is enabled.
A lot of drivers has their options described inside of
Documentation/scsi/.
+ SECURITY Different security models are enabled.
SELINUX SELinux support is enabled.
SERIAL Serial support is enabled.
SH SuperH architecture is enabled.
@@ -218,9 +222,6 @@ and is between 256 and 4096 characters. It is defined in the file
Warning: Many of these options can produce a lot of
output and make your system unusable. Be very careful.
-
- acpi_fake_ecdt [HW,ACPI] Workaround failure due to BIOS lacking ECDT
-
acpi_pm_good [X86-32,X86-64]
Override the pmtimer bug detection: force the kernel
to assume that this machine's pmtimer latches its value
@@ -293,9 +294,6 @@ and is between 256 and 4096 characters. It is defined in the file
apm= [APM] Advanced Power Management
See header of arch/i386/kernel/apm.c.
- applicom= [HW]
- Format: <mem>,<irq>
-
arcrimi= [HW,NET] ARCnet - "RIM I" (entirely mem-mapped) cards
Format: <io>,<irq>,<nodeID>
@@ -341,11 +339,10 @@ and is between 256 and 4096 characters. It is defined in the file
Format: <io>,<irq>,<mode>
See header of drivers/net/hamradio/baycom_ser_hdx.c.
- blkmtd_device= [HW,MTD]
- blkmtd_erasesz=
- blkmtd_ro=
- blkmtd_bs=
- blkmtd_count=
+ boot_delay= Milliseconds to delay each printk during boot.
+ Values larger than 10 seconds (10000) are changed to
+ no delay (0).
+ Format: integer
bttv.card= [HW,V4L] bttv (bt848 + bt878 based grabber cards)
bttv.radio= Most important insmod options are available as
@@ -366,6 +363,12 @@ and is between 256 and 4096 characters. It is defined in the file
possible to determine what the correct size should be.
This option provides an override for these situations.
+ capability.disable=
+ [SECURITY] Disable capabilities. This would normally
+ be used only if an alternative security model is to be
+ configured. Potentially dangerous and should only be
+ used if you are entirely sure of the consequences.
+
chandev= [HW,NET] Generic channel device initialisation
checkreqprot [SELINUX] Set initial checkreqprot flag value.
@@ -416,8 +419,11 @@ and is between 256 and 4096 characters. It is defined in the file
over the 8254 in addition to over the IO-APIC. The
kernel tries to set a sensible default.
- hpet= [X86-32,HPET] option to disable HPET and use PIT.
- Format: disable
+ hpet= [X86-32,HPET] option to control HPET usage
+ Format: { enable (default) | disable | force }
+ disable: disable HPET and use PIT instead
+ force: allow force enabled of undocumented chips (ICH4,
+ VIA, nVidia)
com20020= [HW,NET] ARCnet - COM20020 chipset
Format:
@@ -464,6 +470,16 @@ and is between 256 and 4096 characters. It is defined in the file
UART at the specified I/O port or MMIO address.
The options are the same as for ttyS, above.
+ no_console_suspend
+ [HW] Never suspend the console
+ Disable suspending of consoles during suspend and
+ hibernate operations. Once disabled, debugging
+ messages can reach various consoles while the rest
+ of the system is being put to sleep (ie, while
+ debugging driver suspend/resume hooks). This may
+ not work reliably with all consoles, but is known
+ to work with serial and VGA consoles.
+
cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver
Format:
<first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
@@ -472,6 +488,13 @@ and is between 256 and 4096 characters. It is defined in the file
[KNL] Reserve a chunk of physical memory to
hold a kernel to switch to with kexec on panic.
+ crashkernel=range1:size1[,range2:size2,...][@offset]
+ [KNL] Same as above, but depends on the memory
+ in the running system. The syntax of range is
+ start-[end] where start and end are both
+ a memory unit (amount[KMG]). See also
+ Documentation/kdump/kdump.txt for a example.
+
cs4232= [HW,OSS]
Format: <io>,<irq>,<dma>,<dma2>,<mpuio>,<mpuirq>
@@ -481,8 +504,6 @@ and is between 256 and 4096 characters. It is defined in the file
cs89x0_media= [HW,NET]
Format: { rj45 | aui | bnc }
- cyclades= [HW,SERIAL] Cyclades multi-serial port adapter.
-
dasd= [HW,NET]
See header of drivers/s390/block/dasd_devmap.c.
@@ -540,17 +561,13 @@ and is between 256 and 4096 characters. It is defined in the file
See drivers/char/README.epca and
Documentation/digiepca.txt.
- dmascc= [HW,AX25,SERIAL] AX.25 Z80SCC driver with DMA
- support available.
- Format: <io_dev0>[,<io_dev1>[,..<io_dev32>]]
-
dmasound= [HW,OSS] Sound subsystem buffers
dscc4.setup= [NET]
dtc3181e= [HW,SCSI]
- earlyprintk= [X86-32,X86-64,SH]
+ earlyprintk= [X86-32,X86-64,SH,BLACKFIN]
earlyprintk=vga
earlyprintk=serial[,ttySn[,baudrate]]
@@ -569,22 +586,10 @@ and is between 256 and 4096 characters. It is defined in the file
eata= [HW,SCSI]
- ec_intr= [HW,ACPI] ACPI Embedded Controller interrupt mode
- Format: <int>
- 0: polling mode
- non-0: interrupt mode (default)
-
- eda= [HW,PS2]
-
- edb= [HW,PS2]
-
edd= [EDD]
Format: {"of[f]" | "sk[ipmbr]"}
See comment in arch/i386/boot/edd.S
- eicon= [HW,ISDN]
- Format: <id>,<membase>,<irq>
-
eisa_irq_edge= [PARISC,HW]
See header of drivers/parisc/eisa.c.
@@ -763,6 +768,23 @@ and is between 256 and 4096 characters. It is defined in the file
inttest= [IA64]
+ intel_iommu= [DMAR] Intel IOMMU driver (DMAR) option
+ off
+ Disable intel iommu driver.
+ igfx_off [Default Off]
+ By default, gfx is mapped as normal device. If a gfx
+ device has a dedicated DMAR unit, the DMAR unit is
+ bypassed by not enabling DMAR with this option. In
+ this case, gfx device will use physical address for
+ DMA.
+ forcedac [x86_64]
+ With this option iommu will not optimize to look
+ for io virtual address below 32 bit forcing dual
+ address cycle on pci bus for cards supporting greater
+ than 32 bit addressing. The default is to look
+ for translation below 32 bit and if not available
+ then look in the higher range.
+
io7= [HW] IO7 for Marvel based alpha systems
See comment before marvel_specify_io7 in
arch/alpha/kernel/core_marvel.c.
@@ -860,8 +882,9 @@ and is between 256 and 4096 characters. It is defined in the file
lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer in
C2 power state.
- lasi= [HW,SCSI] PARISC LASI driver for the 53c700 chip
- Format: addr:<io>,irq:<irq>
+ libata.noacpi [LIBATA] Disables use of ACPI in libata suspend/resume
+ when set.
+ Format: <int>
load_ramdisk= [RAM] List of ramdisks to load from floppy
See Documentation/ramdisk.txt.
@@ -900,6 +923,11 @@ and is between 256 and 4096 characters. It is defined in the file
n must be a power of two. The default size
is set in the kernel config file.
+ logo.nologo [FB] Disables display of the built-in Linux logo.
+ This may be used to provide more screen space for
+ kernel log messages and is useful when debugging
+ kernel boot problems.
+
lp=0 [LP] Specify parallel ports to use, e.g,
lp=port[,port...] lp=none,parport0 (lp0 not configured, lp1 uses
lp=reset first parallel port). 'lp=0' disables the
@@ -970,6 +998,8 @@ and is between 256 and 4096 characters. It is defined in the file
mce [X86-32] Machine Check Exception
+ mce=option [X86-64] See Documentation/x86_64/boot-options.txt
+
md= [HW] RAID subsystems devices and level
See Documentation/md.txt.
@@ -1008,6 +1038,10 @@ and is between 256 and 4096 characters. It is defined in the file
meye.*= [HW] Set MotionEye Camera parameters
See Documentation/video4linux/meye.txt.
+ mfgpt_irq= [IA-32] Specify the IRQ to use for the
+ Multi-Function General Purpose Timers on AMD Geode
+ platforms.
+
mga= [HW,DRM]
mousedev.tap_time=
@@ -1073,16 +1107,19 @@ and is between 256 and 4096 characters. It is defined in the file
[NFS] set the maximum lifetime for idmapper cache
entries.
+ nfs.enable_ino64=
+ [NFS] enable 64-bit inode numbers.
+ If zero, the NFS client will fake up a 32-bit inode
+ number for the readdir() and stat() syscalls instead
+ of returning the full 64-bit number.
+ The default is to return 64-bit inode numbers.
+
nmi_watchdog= [KNL,BUGS=X86-32] Debugging features for SMP kernels
no387 [BUGS=X86-32] Tells the kernel to use the 387 maths
emulation library even if a 387 maths coprocessor
is present.
- noacpi [LIBATA] Disables use of ACPI in libata suspend/resume
- when set.
- Format: <int>
-
noaliencache [MM, NUMA, SLAB] Disables the allocation of alien
caches in the slab allocator. Saves per-node memory,
but will impact performance.
@@ -1092,9 +1129,6 @@ and is between 256 and 4096 characters. It is defined in the file
noapic [SMP,APIC] Tells the kernel to not make use of any
IOAPICs that may be present in the system.
- noasync [HW,M68K] Disables async and sync negotiation for
- all devices.
-
nobats [PPC] Do not use BATs for mapping kernel lowmem
on "Classic" PPC cores.
@@ -1159,6 +1193,9 @@ and is between 256 and 4096 characters. It is defined in the file
nomce [X86-32] Machine Check Exception
+ nomfgpt [X86-32] Disable Multi-Function General Purpose
+ Timer usage (for AMD Geode machines).
+
noreplace-paravirt [X86-32,PV_OPS] Don't patch paravirt_ops
noreplace-smp [X86-32,SMP] Don't replace SMP instructions
@@ -1269,6 +1306,11 @@ and is between 256 and 4096 characters. It is defined in the file
Mechanism 1.
conf2 [X86-32] Force use of PCI Configuration
Mechanism 2.
+ noaer [PCIE] If the PCIEAER kernel config parameter is
+ enabled, this kernel boot option can be used to
+ disable the use of PCIE advanced error reporting.
+ nodomains [PCI] Disable support for multiple PCI
+ root domains (aka PCI segments, in ACPI-speak).
nommconf [X86-32,X86_64] Disable use of MMCONFIG for PCI
Configuration
nomsi [MSI] If the PCI_MSI kernel config parameter is
@@ -1313,6 +1355,8 @@ and is between 256 and 4096 characters. It is defined in the file
IRQ routing is enabled.
noacpi [X86-32] Do not use ACPI for IRQ routing
or for PCI scanning.
+ use_crs [X86-32] Use _CRS for PCI resource
+ allocation.
routeirq Do IRQ routing for all PCI devices.
This is normally done in pci_enable_device(),
so this option is a temporary workaround
@@ -1395,7 +1439,9 @@ and is between 256 and 4096 characters. It is defined in the file
Param: "schedule" - profile schedule points.
Param: <number> - step/bucket size as a power of 2 for
statistical time based profiling.
- Param: "sleep" - profile D-state sleeping (millisecs)
+ Param: "sleep" - profile D-state sleeping (millisecs).
+ Requires CONFIG_SCHEDSTATS
+ Param: "kvm" - profile VM exits.
processor.max_cstate= [HW,ACPI]
Limit processor to maximum C-state
@@ -1429,6 +1475,10 @@ and is between 256 and 4096 characters. It is defined in the file
pt. [PARIDE]
See Documentation/paride.txt.
+ pty.legacy_count=
+ [KNL] Number of legacy pty's. Overwrites compiled-in
+ default number.
+
quiet [KNL] Disable most log messages
r128= [HW,DRM]
@@ -1436,14 +1486,10 @@ and is between 256 and 4096 characters. It is defined in the file
raid= [HW,RAID]
See Documentation/md.txt.
- ramdisk= [RAM] Sizes of RAM disks in kilobytes [deprecated]
- See Documentation/ramdisk.txt.
-
ramdisk_blocksize= [RAM]
See Documentation/ramdisk.txt.
ramdisk_size= [RAM] Sizes of RAM disks in kilobytes
- New name for the ramdisk parameter.
See Documentation/ramdisk.txt.
rcu.blimit= [KNL,BOOT] Set maximum number of finished
@@ -1506,6 +1552,15 @@ and is between 256 and 4096 characters. It is defined in the file
Useful for devices that are detected asynchronously
(e.g. USB and MMC devices).
+ root_plug.vendor_id=
+ [ROOTPLUG] Override the default vendor ID
+
+ root_plug.product_id=
+ [ROOTPLUG] Override the default product ID
+
+ root_plug.debug=
+ [ROOTPLUG] Enable debugging output
+
rw [KNL] Mount root device read-write on boot
S [KNL] Run init in single mode
@@ -1513,9 +1568,6 @@ and is between 256 and 4096 characters. It is defined in the file
sa1100ir [NET]
See drivers/net/irda/sa1100_ir.c.
- sb= [HW,OSS]
- Format: <io>,<irq>,<dma>,<dma2>
-
sbni= [NET] Granch SBNI12 leased line adapter
sc1200wdt= [HW,WDT] SC1200 WDT (watchdog) driver
@@ -1559,8 +1611,6 @@ and is between 256 and 4096 characters. It is defined in the file
serialnumber [BUGS=X86-32]
- sg_def_reserved_size= [SCSI]
-
shapers= [NET]
Maximal number of shapers.
@@ -1863,9 +1913,6 @@ and is between 256 and 4096 characters. It is defined in the file
Format:
<io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq>
- tsdev.xres= [TS] Horizontal screen resolution.
- tsdev.yres= [TS] Vertical screen resolution.
-
turbografx.map[2|3]= [HW,JOY]
TurboGraFX parallel port interface
Format:
@@ -1954,10 +2001,6 @@ and is between 256 and 4096 characters. It is defined in the file
norandmaps Don't use address space randomization
Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space
- unwind_debug=N N > 0 will enable dwarf2 unwinder debugging
- This is useful to get more information why
- you got a "dwarf2 unwinder stuck"
-
______________________________________________________________________
TODO:
diff --git a/Documentation/keys-request-key.txt b/Documentation/keys-request-key.txt
index c1f64fdf84cb..266955d23ee6 100644
--- a/Documentation/keys-request-key.txt
+++ b/Documentation/keys-request-key.txt
@@ -20,6 +20,19 @@ or:
const char *callout_string,
void *aux);
+or:
+
+ struct key *request_key_async(const struct key_type *type,
+ const char *description,
+ const char *callout_string);
+
+or:
+
+ struct key *request_key_async_with_auxdata(const struct key_type *type,
+ const char *description,
+ const char *callout_string,
+ void *aux);
+
Or by userspace invoking the request_key system call:
key_serial_t request_key(const char *type,
@@ -32,10 +45,14 @@ does not need to link the key to a keyring to prevent it from being immediately
destroyed. The kernel interface returns a pointer directly to the key, and
it's up to the caller to destroy the key.
-The request_key_with_auxdata() call is like the in-kernel request_key() call,
-except that it permits auxiliary data to be passed to the upcaller (the default
-is NULL). This is only useful for those key types that define their own upcall
-mechanism rather than using /sbin/request-key.
+The request_key*_with_auxdata() calls are like the in-kernel request_key*()
+calls, except that they permit auxiliary data to be passed to the upcaller (the
+default is NULL). This is only useful for those key types that define their
+own upcall mechanism rather than using /sbin/request-key.
+
+The two async in-kernel calls may return keys that are still in the process of
+being constructed. The two non-async ones will wait for construction to
+complete first.
The userspace interface links the key to a keyring associated with the process
to prevent the key from going away, and returns the serial number of the key to
diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 947d57d53453..51652d39e61c 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -4,7 +4,7 @@
This service allows cryptographic keys, authentication tokens, cross-domain
user mappings, and similar to be cached in the kernel for the use of
-filesystems other kernel services.
+filesystems and other kernel services.
Keyrings are permitted; these are a special type of key that can hold links to
other keys. Processes each have three standard keyring subscriptions that a
@@ -726,6 +726,15 @@ call, and the key released upon close. How to deal with conflicting keys due to
two different users opening the same file is left to the filesystem author to
solve.
+To access the key manager, the following header must be #included:
+
+ <linux/key.h>
+
+Specific key types should have a header file under include/keys/ that should be
+used to access that type. For keys of type "user", for example, that would be:
+
+ <keys/user-type.h>
+
Note that there are two different types of pointers to keys that may be
encountered:
@@ -791,6 +800,36 @@ payload contents" for more information.
passed to the key_type->request_key() op if it exists.
+(*) A key can be requested asynchronously by calling one of:
+
+ struct key *request_key_async(const struct key_type *type,
+ const char *description,
+ const char *callout_string);
+
+ or:
+
+ struct key *request_key_async_with_auxdata(const struct key_type *type,
+ const char *description,
+ const char *callout_string,
+ void *aux);
+
+ which are asynchronous equivalents of request_key() and
+ request_key_with_auxdata() respectively.
+
+ These two functions return with the key potentially still under
+ construction. To wait for contruction completion, the following should be
+ called:
+
+ int wait_for_key_construction(struct key *key, bool intr);
+
+ The function will wait for the key to finish being constructed and then
+ invokes key_validate() to return an appropriate value to indicate the state
+ of the key (0 indicates the key is usable).
+
+ If intr is true, then the wait can be interrupted by a signal, in which
+ case error ERESTARTSYS will be returned.
+
+
(*) When it is no longer required, the key should be released using:
void key_put(struct key *key);
@@ -924,7 +963,11 @@ DEFINING A KEY TYPE
A kernel service may want to define its own key type. For instance, an AFS
filesystem might want to define a Kerberos 5 ticket key type. To do this, it
-author fills in a struct key_type and registers it with the system.
+author fills in a key_type struct and registers it with the system.
+
+Source files that implement key types should include the following header file:
+
+ <linux/key-type.h>
The structure has a number of fields, some of which are mandatory:
@@ -1053,22 +1096,44 @@ The structure has a number of fields, some of which are mandatory:
as might happen when the userspace buffer is accessed.
- (*) int (*request_key)(struct key *key, struct key *authkey, const char *op,
+ (*) int (*request_key)(struct key_construction *cons, const char *op,
void *aux);
- This method is optional. If provided, request_key() and
- request_key_with_auxdata() will invoke this function rather than
- upcalling to /sbin/request-key to operate upon a key of this type.
+ This method is optional. If provided, request_key() and friends will
+ invoke this function rather than upcalling to /sbin/request-key to operate
+ upon a key of this type.
+
+ The aux parameter is as passed to request_key_async_with_auxdata() and
+ similar or is NULL otherwise. Also passed are the construction record for
+ the key to be operated upon and the operation type (currently only
+ "create").
+
+ This method is permitted to return before the upcall is complete, but the
+ following function must be called under all circumstances to complete the
+ instantiation process, whether or not it succeeds, whether or not there's
+ an error:
+
+ void complete_request_key(struct key_construction *cons, int error);
+
+ The error parameter should be 0 on success, -ve on error. The
+ construction record is destroyed by this action and the authorisation key
+ will be revoked. If an error is indicated, the key under construction
+ will be negatively instantiated if it wasn't already instantiated.
+
+ If this method returns an error, that error will be returned to the
+ caller of request_key*(). complete_request_key() must be called prior to
+ returning.
+
+ The key under construction and the authorisation key can be found in the
+ key_construction struct pointed to by cons:
+
+ (*) struct key *key;
+
+ The key under construction.
- The aux parameter is as passed to request_key_with_auxdata() or is NULL
- otherwise. Also passed are the key to be operated upon, the
- authorisation key for this operation and the operation type (currently
- only "create").
+ (*) struct key *authkey;
- This function should return only when the upcall is complete. Upon return
- the authorisation key will be revoked, and the target key will be
- negatively instantiated if it is still uninstantiated. The error will be
- returned to the caller of request_key*().
+ The authorisation key.
============================
diff --git a/Documentation/kobject.txt b/Documentation/kobject.txt
index 8ee49ee7c963..ca86a885ad8f 100644
--- a/Documentation/kobject.txt
+++ b/Documentation/kobject.txt
@@ -54,7 +54,6 @@ embedded in larger data structures and replace fields they duplicate.
struct kobject {
const char * k_name;
- char name[KOBJ_NAME_LEN];
struct kref kref;
struct list_head entry;
struct kobject * parent;
@@ -223,18 +222,15 @@ decl_subsys(devices, &ktype_device, &device_uevent_ops);
is equivalent to doing:
struct kset devices_subsys = {
- .kobj = {
- .name = "devices",
- },
.ktype = &ktype_devices,
.uevent_ops = &device_uevent_ops,
};
-
+kobject_set_name(&devices_subsys, name);
The objects that are registered with a subsystem that use the
subsystem's default list must have their kset ptr set properly. These
objects may have embedded kobjects or ksets. The
-following helpers make setting the kset easier:
+following helper makes setting the kset easier:
kobj_set_kset_s(obj,subsys)
@@ -242,22 +238,8 @@ kobj_set_kset_s(obj,subsys)
- Assumes that obj->kobj exists, and is a struct kobject.
- Sets the kset of that kobject to the kset <subsys>.
-
-kset_set_kset_s(obj,subsys)
-
-- Assumes that obj->kset exists, and is a struct kset.
-- Sets the kset of the embedded kobject to the kset <subsys>.
-
-subsys_set_kset(obj,subsys)
-
-- Assumes obj->subsys exists, and is a struct subsystem.
-- Sets obj->subsys.kset.kobj.kset to the subsystem's embedded kset.
-
-void subsystem_init(struct kset *s);
int subsystem_register(struct kset *s);
void subsystem_unregister(struct kset *s);
-struct kset *subsys_get(struct kset *s);
-void kset_put(struct kset *s);
These are just wrappers around the respective kset_* functions.
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile
index c0b7a4556390..bac037eb1cda 100644
--- a/Documentation/lguest/Makefile
+++ b/Documentation/lguest/Makefile
@@ -1,28 +1,8 @@
# This creates the demonstration utility "lguest" which runs a Linux guest.
-
-# For those people that have a separate object dir, look there for .config
-KBUILD_OUTPUT := ../..
-ifdef O
- ifeq ("$(origin O)", "command line")
- KBUILD_OUTPUT := $(O)
- endif
-endif
-# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
-include $(KBUILD_OUTPUT)/.config
-LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
-
-CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds
+CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include
LDLIBS:=-lz
-# Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and
-# not others (eg. FC7).
-LDFLAGS+=-static
-all: lguest.lds lguest
-# The linker script on x86 is so complex the only way of creating one
-# which will link our binary in the right place is to mangle the
-# default one.
-lguest.lds:
- $(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@
+all: lguest
clean:
- rm -f lguest.lds lguest
+ rm -f lguest
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 73c5f1f3d5d2..f2668390e8f7 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1,10 +1,7 @@
/*P:100 This is the Launcher code, a simple program which lays out the
* "physical" memory for the new Guest by mapping the kernel image and the
* virtual devices, then reads repeatedly from /dev/lguest to run the Guest.
- *
- * The only trick: the Makefile links it at a high address so it will be clear
- * of the guest memory region. It means that each Guest cannot have more than
- * about 2.5G of memory on a normally configured Host. :*/
+:*/
#define _LARGEFILE64_SOURCE
#define _GNU_SOURCE
#include <stdio.h>
@@ -15,6 +12,7 @@
#include <stdlib.h>
#include <elf.h>
#include <sys/mman.h>
+#include <sys/param.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
@@ -34,19 +32,26 @@
#include <termios.h>
#include <getopt.h>
#include <zlib.h>
-/*L:110 We can ignore the 28 include files we need for this program, but I do
+#include <assert.h>
+#include <sched.h>
+#include "linux/lguest_launcher.h"
+#include "linux/virtio_config.h"
+#include "linux/virtio_net.h"
+#include "linux/virtio_blk.h"
+#include "linux/virtio_console.h"
+#include "linux/virtio_ring.h"
+#include "asm-x86/bootparam.h"
+/*L:110 We can ignore the 38 include files we need for this program, but I do
* want to draw attention to the use of kernel-style types.
*
* As Linus said, "C is a Spartan language, and so should your naming be." I
- * like these abbreviations and the header we need uses them, so we define them
- * here.
- */
+ * like these abbreviations, so we define them here. Note that u64 is always
+ * unsigned long long, which works on all Linux systems: this means that we can
+ * use %llu in printf for any u64. */
typedef unsigned long long u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
-#include "../../include/linux/lguest_launcher.h"
-#include "../../include/asm-i386/e820.h"
/*:*/
#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
@@ -55,6 +60,10 @@ typedef uint8_t u8;
#ifndef SIOCBRADDIF
#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
#endif
+/* We can have up to 256 pages for devices. */
+#define DEVICE_PAGES 256
+/* This fits nicely in a single 4096-byte page. */
+#define VIRTQUEUE_NUM 127
/*L:120 verbose is both a global flag and a macro. The C preprocessor allows
* this, and although I wouldn't recommend it, it works quite nicely here. */
@@ -65,8 +74,10 @@ static bool verbose;
/* The pipe to send commands to the waker process */
static int waker_fd;
-/* The top of guest physical memory. */
-static u32 top;
+/* The pointer to the start of guest memory. */
+static void *guest_base;
+/* The maximum guest physical address allowed, and maximum possible. */
+static unsigned long guest_limit, guest_max;
/* This is our list of devices. */
struct device_list
@@ -76,8 +87,17 @@ struct device_list
fd_set infds;
int max_infd;
+ /* Counter to assign interrupt numbers. */
+ unsigned int next_irq;
+
+ /* Counter to print out convenient device numbers. */
+ unsigned int device_num;
+
/* The descriptor page for the devices. */
- struct lguest_device_desc *descs;
+ u8 *descpage;
+
+ /* The tail of the last descriptor. */
+ unsigned int desc_used;
/* A single linked list of devices. */
struct device *dev;
@@ -85,31 +105,111 @@ struct device_list
struct device **lastdev;
};
+/* The list of Guest devices, based on command line arguments. */
+static struct device_list devices;
+
/* The device structure describes a single device. */
struct device
{
/* The linked-list pointer. */
struct device *next;
- /* The descriptor for this device, as mapped into the Guest. */
+
+ /* The this device's descriptor, as mapped into the Guest. */
struct lguest_device_desc *desc;
- /* The memory page(s) of this device, if any. Also mapped in Guest. */
- void *mem;
+
+ /* The name of this device, for --verbose. */
+ const char *name;
/* If handle_input is set, it wants to be called when this file
* descriptor is ready. */
int fd;
bool (*handle_input)(int fd, struct device *me);
- /* If handle_output is set, it wants to be called when the Guest sends
- * DMA to this key. */
- unsigned long watch_key;
- u32 (*handle_output)(int fd, const struct iovec *iov,
- unsigned int num, struct device *me);
+ /* Any queues attached to this device */
+ struct virtqueue *vq;
/* Device-specific data. */
void *priv;
};
+/* The virtqueue structure describes a queue attached to a device. */
+struct virtqueue
+{
+ struct virtqueue *next;
+
+ /* Which device owns me. */
+ struct device *dev;
+
+ /* The configuration for this queue. */
+ struct lguest_vqconfig config;
+
+ /* The actual ring of buffers. */
+ struct vring vring;
+
+ /* Last available index we saw. */
+ u16 last_avail_idx;
+
+ /* The routine to call when the Guest pings us. */
+ void (*handle_output)(int fd, struct virtqueue *me);
+};
+
+/* Since guest is UP and we don't run at the same time, we don't need barriers.
+ * But I include them in the code in case others copy it. */
+#define wmb()
+
+/* Convert an iovec element to the given type.
+ *
+ * This is a fairly ugly trick: we need to know the size of the type and
+ * alignment requirement to check the pointer is kosher. It's also nice to
+ * have the name of the type in case we report failure.
+ *
+ * Typing those three things all the time is cumbersome and error prone, so we
+ * have a macro which sets them all up and passes to the real function. */
+#define convert(iov, type) \
+ ((type *)_convert((iov), sizeof(type), __alignof__(type), #type))
+
+static void *_convert(struct iovec *iov, size_t size, size_t align,
+ const char *name)
+{
+ if (iov->iov_len != size)
+ errx(1, "Bad iovec size %zu for %s", iov->iov_len, name);
+ if ((unsigned long)iov->iov_base % align != 0)
+ errx(1, "Bad alignment %p for %s", iov->iov_base, name);
+ return iov->iov_base;
+}
+
+/* The virtio configuration space is defined to be little-endian. x86 is
+ * little-endian too, but it's nice to be explicit so we have these helpers. */
+#define cpu_to_le16(v16) (v16)
+#define cpu_to_le32(v32) (v32)
+#define cpu_to_le64(v64) (v64)
+#define le16_to_cpu(v16) (v16)
+#define le32_to_cpu(v32) (v32)
+#define le64_to_cpu(v32) (v64)
+
+/*L:100 The Launcher code itself takes us out into userspace, that scary place
+ * where pointers run wild and free! Unfortunately, like most userspace
+ * programs, it's quite boring (which is why everyone likes to hack on the
+ * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it
+ * will get you through this section. Or, maybe not.
+ *
+ * The Launcher sets up a big chunk of memory to be the Guest's "physical"
+ * memory and stores it in "guest_base". In other words, Guest physical ==
+ * Launcher virtual with an offset.
+ *
+ * This can be tough to get your head around, but usually it just means that we
+ * use these trivial conversion functions when the Guest gives us it's
+ * "physical" addresses: */
+static void *from_guest_phys(unsigned long addr)
+{
+ return guest_base + addr;
+}
+
+static unsigned long to_guest_phys(const void *addr)
+{
+ return (addr - guest_base);
+}
+
/*L:130
* Loading the Kernel.
*
@@ -123,43 +223,55 @@ static int open_or_die(const char *name, int flags)
return fd;
}
-/* map_zeroed_pages() takes a (page-aligned) address and a number of pages. */
-static void *map_zeroed_pages(unsigned long addr, unsigned int num)
+/* map_zeroed_pages() takes a number of pages. */
+static void *map_zeroed_pages(unsigned int num)
{
- /* We cache the /dev/zero file-descriptor so we only open it once. */
- static int fd = -1;
-
- if (fd == -1)
- fd = open_or_die("/dev/zero", O_RDONLY);
+ int fd = open_or_die("/dev/zero", O_RDONLY);
+ void *addr;
/* We use a private mapping (ie. if we write to the page, it will be
- * copied), and obviously we insist that it be mapped where we ask. */
- if (mmap((void *)addr, getpagesize() * num,
- PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0)
- != (void *)addr)
- err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr);
-
- /* Returning the address is just a courtesy: can simplify callers. */
- return (void *)addr;
+ * copied). */
+ addr = mmap(NULL, getpagesize() * num,
+ PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
+ if (addr == MAP_FAILED)
+ err(1, "Mmaping %u pages of /dev/zero", num);
+
+ return addr;
}
-/* To find out where to start we look for the magic Guest string, which marks
- * the code we see in lguest_asm.S. This is a hack which we are currently
- * plotting to replace with the normal Linux entry point. */
-static unsigned long entry_point(void *start, void *end,
- unsigned long page_offset)
+/* Get some more pages for a device. */
+static void *get_pages(unsigned int num)
{
- void *p;
+ void *addr = from_guest_phys(guest_limit);
- /* The scan gives us the physical starting address. We want the
- * virtual address in this case, and fortunately, we already figured
- * out the physical-virtual difference and passed it here in
- * "page_offset". */
- for (p = start; p < end; p++)
- if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0)
- return (long)p + strlen("GenuineLguest") + page_offset;
+ guest_limit += num * getpagesize();
+ if (guest_limit > guest_max)
+ errx(1, "Not enough memory for devices");
+ return addr;
+}
- err(1, "Is this image a genuine lguest?");
+/* This routine is used to load the kernel or initrd. It tries mmap, but if
+ * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
+ * it falls back to reading the memory in. */
+static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
+{
+ ssize_t r;
+
+ /* We map writable even though for some segments are marked read-only.
+ * The kernel really wants to be writable: it patches its own
+ * instructions.
+ *
+ * MAP_PRIVATE means that the page won't be copied until a write is
+ * done to it. This allows us to share untouched memory between
+ * Guests. */
+ if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC,
+ MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
+ return;
+
+ /* pread does a seek and a read in one shot: saves a few lines. */
+ r = pread(fd, addr, len, offset);
+ if (r != len)
+ err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
}
/* This routine takes an open vmlinux image, which is in ELF, and maps it into
@@ -167,19 +279,14 @@ static unsigned long entry_point(void *start, void *end,
* by all modern binaries on Linux including the kernel.
*
* The ELF headers give *two* addresses: a physical address, and a virtual
- * address. The Guest kernel expects to be placed in memory at the physical
- * address, and the page tables set up so it will correspond to that virtual
- * address. We return the difference between the virtual and physical
- * addresses in the "page_offset" pointer.
+ * address. We use the physical address; the Guest will map itself to the
+ * virtual address.
*
* We return the starting address. */
-static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
- unsigned long *page_offset)
+static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
{
- void *addr;
Elf32_Phdr phdr[ehdr->e_phnum];
unsigned int i;
- unsigned long start = -1UL, end = 0;
/* Sanity checks on the main ELF header: an x86 executable with a
* reasonable number of correctly-sized program headers. */
@@ -199,9 +306,6 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
err(1, "Reading program headers");
- /* We don't know page_offset yet. */
- *page_offset = 0;
-
/* Try all the headers: there are usually only three. A read-only one,
* a read-write one, and a "note" section which isn't loadable. */
for (i = 0; i < ehdr->e_phnum; i++) {
@@ -212,158 +316,53 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
verbose("Section %i: size %i addr %p\n",
i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
- /* We expect a simple linear address space: every segment must
- * have the same difference between virtual (p_vaddr) and
- * physical (p_paddr) address. */
- if (!*page_offset)
- *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
- else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
- errx(1, "Page offset of section %i different", i);
-
- /* We track the first and last address we mapped, so we can
- * tell entry_point() where to scan. */
- if (phdr[i].p_paddr < start)
- start = phdr[i].p_paddr;
- if (phdr[i].p_paddr + phdr[i].p_filesz > end)
- end = phdr[i].p_paddr + phdr[i].p_filesz;
-
- /* We map this section of the file at its physical address. We
- * map it read & write even if the header says this segment is
- * read-only. The kernel really wants to be writable: it
- * patches its own instructions which would normally be
- * read-only.
- *
- * MAP_PRIVATE means that the page won't be copied until a
- * write is done to it. This allows us to share much of the
- * kernel memory between Guests. */
- addr = mmap((void *)phdr[i].p_paddr,
- phdr[i].p_filesz,
- PROT_READ|PROT_WRITE|PROT_EXEC,
- MAP_FIXED|MAP_PRIVATE,
- elf_fd, phdr[i].p_offset);
- if (addr != (void *)phdr[i].p_paddr)
- err(1, "Mmaping vmlinux seg %i gave %p not %p",
- i, addr, (void *)phdr[i].p_paddr);
+ /* We map this section of the file at its physical address. */
+ map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
+ phdr[i].p_offset, phdr[i].p_filesz);
}
- return entry_point((void *)start, (void *)end, *page_offset);
+ /* The entry point is given in the ELF header. */
+ return ehdr->e_entry;
}
-/*L:170 Prepare to be SHOCKED and AMAZED. And possibly a trifle nauseated.
- *
- * We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects
- * to be. We don't know what that option was, but we can figure it out
- * approximately by looking at the addresses in the code. I chose the common
- * case of reading a memory location into the %eax register:
- *
- * movl <some-address>, %eax
- *
- * This gets encoded as five bytes: "0xA1 <4-byte-address>". For example,
- * "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax.
- *
- * In this example can guess that the kernel was compiled with
- * CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number). If the
- * kernel were larger than 16MB, we might see 0xC1 addresses show up, but our
- * kernel isn't that bloated yet.
- *
- * Unfortunately, x86 has variable-length instructions, so finding this
- * particular instruction properly involves writing a disassembler. Instead,
- * we rely on statistics. We look for "0xA1" and tally the different bytes
- * which occur 4 bytes later (the "0xC0" in our example above). When one of
- * those bytes appears three times, we can be reasonably confident that it
- * forms the start of CONFIG_PAGE_OFFSET.
+/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're
+ * supposed to jump into it and it will unpack itself. We used to have to
+ * perform some hairy magic because the unpacking code scared me.
*
- * This is amazingly reliable. */
-static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
+ * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
+ * a small patch to jump over the tricky bits in the Guest, so now we just read
+ * the funky header so we know where in the file to load, and away we go! */
+static unsigned long load_bzimage(int fd)
{
- unsigned int i, possibilities[256] = { 0 };
+ struct boot_params boot;
+ int r;
+ /* Modern bzImages get loaded at 1M. */
+ void *p = from_guest_phys(0x100000);
- for (i = 0; i + 4 < len; i++) {
- /* mov 0xXXXXXXXX,%eax */
- if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3)
- return (unsigned long)img[i+4] << 24;
- }
- errx(1, "could not determine page offset");
-}
+ /* Go back to the start of the file and read the header. It should be
+ * a Linux boot header (see Documentation/i386/boot.txt) */
+ lseek(fd, 0, SEEK_SET);
+ read(fd, &boot, sizeof(boot));
-/*L:160 Unfortunately the entire ELF image isn't compressed: the segments
- * which need loading are extracted and compressed raw. This denies us the
- * information we need to make a fully-general loader. */
-static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
-{
- gzFile f;
- int ret, len = 0;
- /* A bzImage always gets loaded at physical address 1M. This is
- * actually configurable as CONFIG_PHYSICAL_START, but as the comment
- * there says, "Don't change this unless you know what you are doing".
- * Indeed. */
- void *img = (void *)0x100000;
-
- /* gzdopen takes our file descriptor (carefully placed at the start of
- * the GZIP header we found) and returns a gzFile. */
- f = gzdopen(fd, "rb");
- /* We read it into memory in 64k chunks until we hit the end. */
- while ((ret = gzread(f, img + len, 65536)) > 0)
- len += ret;
- if (ret < 0)
- err(1, "reading image from bzImage");
-
- verbose("Unpacked size %i addr %p\n", len, img);
-
- /* Without the ELF header, we can't tell virtual-physical gap. This is
- * CONFIG_PAGE_OFFSET, and people do actually change it. Fortunately,
- * I have a clever way of figuring it out from the code itself. */
- *page_offset = intuit_page_offset(img, len);
-
- return entry_point(img, img + len, *page_offset);
-}
+ /* Inside the setup_hdr, we expect the magic "HdrS" */
+ if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
+ errx(1, "This doesn't look like a bzImage to me");
-/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're
- * supposed to jump into it and it will unpack itself. We can't do that
- * because the Guest can't run the unpacking code, and adding features to
- * lguest kills puppies, so we don't want to.
- *
- * The bzImage is formed by putting the decompressing code in front of the
- * compressed kernel code. So we can simple scan through it looking for the
- * first "gzip" header, and start decompressing from there. */
-static unsigned long load_bzimage(int fd, unsigned long *page_offset)
-{
- unsigned char c;
- int state = 0;
-
- /* GZIP header is 0x1F 0x8B <method> <flags>... <compressed-by>. */
- while (read(fd, &c, 1) == 1) {
- switch (state) {
- case 0:
- if (c == 0x1F)
- state++;
- break;
- case 1:
- if (c == 0x8B)
- state++;
- else
- state = 0;
- break;
- case 2 ... 8:
- state++;
- break;
- case 9:
- /* Seek back to the start of the gzip header. */
- lseek(fd, -10, SEEK_CUR);
- /* One final check: "compressed under UNIX". */
- if (c != 0x03)
- state = -1;
- else
- return unpack_bzimage(fd, page_offset);
- }
- }
- errx(1, "Could not find kernel in bzImage");
+ /* Skip over the extra sectors of the header. */
+ lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
+
+ /* Now read everything into memory. in nice big chunks. */
+ while ((r = read(fd, p, 65536)) > 0)
+ p += r;
+
+ /* Finally, code32_start tells us where to enter the kernel. */
+ return boot.hdr.code32_start;
}
/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
- * come wrapped up in the self-decompressing "bzImage" format. With some funky
- * coding, we can load those, too. */
-static unsigned long load_kernel(int fd, unsigned long *page_offset)
+ * come wrapped up in the self-decompressing "bzImage" format. With a little
+ * work, we can load those, too. */
+static unsigned long load_kernel(int fd)
{
Elf32_Ehdr hdr;
@@ -373,10 +372,10 @@ static unsigned long load_kernel(int fd, unsigned long *page_offset)
/* If it's an ELF file, it starts with "\177ELF" */
if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
- return map_elf(fd, &hdr, page_offset);
+ return map_elf(fd, &hdr);
/* Otherwise we assume it's a bzImage, and try to unpack it */
- return load_bzimage(fd, page_offset);
+ return load_bzimage(fd);
}
/* This is a trivial little helper to align pages. Andi Kleen hated it because
@@ -402,59 +401,45 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
int ifd;
struct stat st;
unsigned long len;
- void *iaddr;
ifd = open_or_die(name, O_RDONLY);
/* fstat() is needed to get the file size. */
if (fstat(ifd, &st) < 0)
err(1, "fstat() on initrd '%s'", name);
- /* The length needs to be rounded up to a page size: mmap needs the
- * address to be page aligned. */
+ /* We map the initrd at the top of memory, but mmap wants it to be
+ * page-aligned, so we round the size up for that. */
len = page_align(st.st_size);
- /* We map the initrd at the top of memory. */
- iaddr = mmap((void *)mem - len, st.st_size,
- PROT_READ|PROT_EXEC|PROT_WRITE,
- MAP_FIXED|MAP_PRIVATE, ifd, 0);
- if (iaddr != (void *)mem - len)
- err(1, "Mmaping initrd '%s' returned %p not %p",
- name, iaddr, (void *)mem - len);
+ map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
/* Once a file is mapped, you can close the file descriptor. It's a
* little odd, but quite useful. */
close(ifd);
- verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr);
+ verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);
/* We return the initrd size. */
return len;
}
-/* Once we know how much memory we have, and the address the Guest kernel
- * expects, we can construct simple linear page tables which will get the Guest
- * far enough into the boot to create its own.
+/* Once we know how much memory we have, we can construct simple linear page
+ * tables which set virtual == physical which will get the Guest far enough
+ * into the boot to create its own.
*
* We lay them out of the way, just below the initrd (which is why we need to
* know its size). */
static unsigned long setup_pagetables(unsigned long mem,
- unsigned long initrd_size,
- unsigned long page_offset)
+ unsigned long initrd_size)
{
- u32 *pgdir, *linear;
+ unsigned long *pgdir, *linear;
unsigned int mapped_pages, i, linear_pages;
- unsigned int ptes_per_page = getpagesize()/sizeof(u32);
+ unsigned int ptes_per_page = getpagesize()/sizeof(void *);
- /* Ideally we map all physical memory starting at page_offset.
- * However, if page_offset is 0xC0000000 we can only map 1G of physical
- * (0xC0000000 + 1G overflows). */
- if (mem <= -page_offset)
- mapped_pages = mem/getpagesize();
- else
- mapped_pages = -page_offset/getpagesize();
+ mapped_pages = mem/getpagesize();
/* Each PTE page can map ptes_per_page pages: how many do we need? */
linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
/* We put the toplevel page directory page at the top of memory. */
- pgdir = (void *)mem - initrd_size - getpagesize();
+ pgdir = from_guest_phys(mem) - initrd_size - getpagesize();
/* Now we use the next linear_pages pages as pte pages */
linear = (void *)pgdir - linear_pages*getpagesize();
@@ -465,21 +450,21 @@ static unsigned long setup_pagetables(unsigned long mem,
for (i = 0; i < mapped_pages; i++)
linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
- /* The top level points to the linear page table pages above. The
- * entry representing page_offset points to the first one, and they
- * continue from there. */
+ /* The top level points to the linear page table pages above. */
for (i = 0; i < mapped_pages; i += ptes_per_page) {
- pgdir[(i + page_offset/getpagesize())/ptes_per_page]
- = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT);
+ pgdir[i/ptes_per_page]
+ = ((to_guest_phys(linear) + i*sizeof(void *))
+ | PAGE_PRESENT);
}
- verbose("Linear mapping of %u pages in %u pte pages at %p\n",
- mapped_pages, linear_pages, linear);
+ verbose("Linear mapping of %u pages in %u pte pages at %#lx\n",
+ mapped_pages, linear_pages, to_guest_phys(linear));
/* We return the top level (guest-physical) address: the kernel needs
* to know where it is. */
- return (unsigned long)pgdir;
+ return to_guest_phys(pgdir);
}
+/*:*/
/* Simple routine to roll all the commandline arguments together with spaces
* between them. */
@@ -496,16 +481,19 @@ static void concat(char *dst, char *args[])
dst[len] = '\0';
}
-/* This is where we actually tell the kernel to initialize the Guest. We saw
- * the arguments it expects when we looked at initialize() in lguest_user.c:
- * the top physical page to allow, the top level pagetable, the entry point and
- * the page_offset constant for the Guest. */
-static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
+/*L:185 This is where we actually tell the kernel to initialize the Guest. We
+ * saw the arguments it expects when we looked at initialize() in lguest_user.c:
+ * the base of Guest "physical" memory, the top physical page to allow, the
+ * top level pagetable and the entry point for the Guest. */
+static int tell_kernel(unsigned long pgdir, unsigned long start)
{
- u32 args[] = { LHREQ_INITIALIZE,
- top/getpagesize(), pgdir, start, page_offset };
+ unsigned long args[] = { LHREQ_INITIALIZE,
+ (unsigned long)guest_base,
+ guest_limit / getpagesize(), pgdir, start };
int fd;
+ verbose("Guest: %p - %p (%#lx)\n",
+ guest_base, guest_base + guest_limit, guest_limit);
fd = open_or_die("/dev/lguest", O_RDWR);
if (write(fd, args, sizeof(args)) < 0)
err(1, "Writing to /dev/lguest");
@@ -515,62 +503,67 @@ static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
}
/*:*/
-static void set_fd(int fd, struct device_list *devices)
+static void add_device_fd(int fd)
{
- FD_SET(fd, &devices->infds);
- if (fd > devices->max_infd)
- devices->max_infd = fd;
+ FD_SET(fd, &devices.infds);
+ if (fd > devices.max_infd)
+ devices.max_infd = fd;
}
/*L:200
* The Waker.
*
- * With a console and network devices, we can have lots of input which we need
- * to process. We could try to tell the kernel what file descriptors to watch,
- * but handing a file descriptor mask through to the kernel is fairly icky.
+ * With console, block and network devices, we can have lots of input which we
+ * need to process. We could try to tell the kernel what file descriptors to
+ * watch, but handing a file descriptor mask through to the kernel is fairly
+ * icky.
*
* Instead, we fork off a process which watches the file descriptors and writes
- * the LHREQ_BREAK command to the /dev/lguest filedescriptor to tell the Host
- * loop to stop running the Guest. This causes it to return from the
+ * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host
+ * stop running the Guest. This causes the Launcher to return from the
* /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset
* the LHREQ_BREAK and wake us up again.
*
* This, of course, is merely a different *kind* of icky.
*/
-static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices)
+static void wake_parent(int pipefd, int lguest_fd)
{
/* Add the pipe from the Launcher to the fdset in the device_list, so
* we watch it, too. */
- set_fd(pipefd, devices);
+ add_device_fd(pipefd);
for (;;) {
- fd_set rfds = devices->infds;
- u32 args[] = { LHREQ_BREAK, 1 };
+ fd_set rfds = devices.infds;
+ unsigned long args[] = { LHREQ_BREAK, 1 };
/* Wait until input is ready from one of the devices. */
- select(devices->max_infd+1, &rfds, NULL, NULL, NULL);
+ select(devices.max_infd+1, &rfds, NULL, NULL, NULL);
/* Is it a message from the Launcher? */
if (FD_ISSET(pipefd, &rfds)) {
- int ignorefd;
+ int fd;
/* If read() returns 0, it means the Launcher has
* exited. We silently follow. */
- if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0)
+ if (read(pipefd, &fd, sizeof(fd)) == 0)
exit(0);
- /* Otherwise it's telling us there's a problem with one
- * of the devices, and we should ignore that file
- * descriptor from now on. */
- FD_CLR(ignorefd, &devices->infds);
+ /* Otherwise it's telling us to change what file
+ * descriptors we're to listen to. Positive means
+ * listen to a new one, negative means stop
+ * listening. */
+ if (fd >= 0)
+ FD_SET(fd, &devices.infds);
+ else
+ FD_CLR(-fd - 1, &devices.infds);
} else /* Send LHREQ_BREAK command. */
write(lguest_fd, args, sizeof(args));
}
}
/* This routine just sets up a pipe to the Waker process. */
-static int setup_waker(int lguest_fd, struct device_list *device_list)
+static int setup_waker(int lguest_fd)
{
int pipefd[2], child;
- /* We create a pipe to talk to the waker, and also so it knows when the
+ /* We create a pipe to talk to the Waker, and also so it knows when the
* Launcher dies (and closes pipe). */
pipe(pipefd);
child = fork();
@@ -578,9 +571,10 @@ static int setup_waker(int lguest_fd, struct device_list *device_list)
err(1, "forking");
if (child == 0) {
- /* Close the "writing" end of our copy of the pipe */
+ /* We are the Waker: close the "writing" end of our copy of the
+ * pipe and start waiting for input. */
close(pipefd[1]);
- wake_parent(pipefd[0], lguest_fd, device_list);
+ wake_parent(pipefd[0], lguest_fd);
}
/* Close the reading end of our copy of the pipe. */
close(pipefd[0]);
@@ -589,12 +583,12 @@ static int setup_waker(int lguest_fd, struct device_list *device_list)
return pipefd[1];
}
-/*L:210
+/*
* Device Handling.
*
- * When the Guest sends DMA to us, it sends us an array of addresses and sizes.
+ * When the Guest gives us a buffer, it sends an array of addresses and sizes.
* We need to make sure it's not trying to reach into the Launcher itself, so
- * we have a convenient routine which check it and exits with an error message
+ * we have a convenient routine which checks it and exits with an error message
* if something funny is going on:
*/
static void *_check_pointer(unsigned long addr, unsigned int size,
@@ -602,87 +596,139 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
{
/* We have to separately check addr and addr+size, because size could
* be huge and addr + size might wrap around. */
- if (addr >= top || addr + size >= top)
- errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr);
+ if (addr >= guest_limit || addr + size >= guest_limit)
+ errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr);
/* We return a pointer for the caller's convenience, now we know it's
* safe to use. */
- return (void *)addr;
+ return from_guest_phys(addr);
}
/* A macro which transparently hands the line number to the real function. */
#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
-/* The Guest has given us the address of a "struct lguest_dma". We check it's
- * OK and convert it to an iovec (which is a simple array of ptr/size
- * pairs). */
-static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
+/* Each buffer in the virtqueues is actually a chain of descriptors. This
+ * function returns the next descriptor in the chain, or vq->vring.num if we're
+ * at the end. */
+static unsigned next_desc(struct virtqueue *vq, unsigned int i)
{
- unsigned int i;
- struct lguest_dma *udma;
-
- /* First we make sure that the array memory itself is valid. */
- udma = check_pointer(dma, sizeof(*udma));
- /* Now we check each element */
- for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
- /* A zero length ends the array. */
- if (!udma->len[i])
- break;
+ unsigned int next;
- iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]);
- iov[i].iov_len = udma->len[i];
- }
- *num = i;
+ /* If this descriptor says it doesn't chain, we're done. */
+ if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT))
+ return vq->vring.num;
- /* We return the pointer to where the caller should write the amount of
- * the buffer used. */
- return &udma->used_len;
+ /* Check they're not leading us off end of descriptors. */
+ next = vq->vring.desc[i].next;
+ /* Make sure compiler knows to grab that: we don't want it changing! */
+ wmb();
+
+ if (next >= vq->vring.num)
+ errx(1, "Desc next is %u", next);
+
+ return next;
}
-/* This routine gets a DMA buffer from the Guest for a given key, and converts
- * it to an iovec array. It returns the interrupt the Guest wants when we're
- * finished, and a pointer to the "used_len" field to fill in. */
-static u32 *get_dma_buffer(int fd, void *key,
- struct iovec iov[], unsigned int *num, u32 *irq)
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access. Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->vring.num (which
+ * is never a valid descriptor number) if none was found. */
+static unsigned get_vq_desc(struct virtqueue *vq,
+ struct iovec iov[],
+ unsigned int *out_num, unsigned int *in_num)
{
- u32 buf[] = { LHREQ_GETDMA, (u32)key };
- unsigned long udma;
- u32 *res;
-
- /* Ask the kernel for a DMA buffer corresponding to this key. */
- udma = write(fd, buf, sizeof(buf));
- /* They haven't registered any, or they're all used? */
- if (udma == (unsigned long)-1)
- return NULL;
-
- /* Convert it into our iovec array */
- res = dma2iov(udma, iov, num);
- /* The kernel stashes irq in ->used_len to get it out to us. */
- *irq = *res;
- /* Return a pointer to ((struct lguest_dma *)udma)->used_len. */
- return res;
+ unsigned int i, head;
+
+ /* Check it isn't doing very strange things with descriptor numbers. */
+ if ((u16)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num)
+ errx(1, "Guest moved used index from %u to %u",
+ vq->last_avail_idx, vq->vring.avail->idx);
+
+ /* If there's nothing new since last we looked, return invalid. */
+ if (vq->vring.avail->idx == vq->last_avail_idx)
+ return vq->vring.num;
+
+ /* Grab the next descriptor number they're advertising, and increment
+ * the index we've seen. */
+ head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num];
+
+ /* If their number is silly, that's a fatal mistake. */
+ if (head >= vq->vring.num)
+ errx(1, "Guest says index %u is available", head);
+
+ /* When we start there are none of either input nor output. */
+ *out_num = *in_num = 0;
+
+ i = head;
+ do {
+ /* Grab the first descriptor, and check it's OK. */
+ iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len;
+ iov[*out_num + *in_num].iov_base
+ = check_pointer(vq->vring.desc[i].addr,
+ vq->vring.desc[i].len);
+ /* If this is an input descriptor, increment that count. */
+ if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE)
+ (*in_num)++;
+ else {
+ /* If it's an output descriptor, they're all supposed
+ * to come before any input descriptors. */
+ if (*in_num)
+ errx(1, "Descriptor has out after in");
+ (*out_num)++;
+ }
+
+ /* If we've got too many, that implies a descriptor loop. */
+ if (*out_num + *in_num > vq->vring.num)
+ errx(1, "Looped descriptor");
+ } while ((i = next_desc(vq, i)) != vq->vring.num);
+
+ return head;
}
-/* This is a convenient routine to send the Guest an interrupt. */
-static void trigger_irq(int fd, u32 irq)
+/* After we've used one of their buffers, we tell them about it. We'll then
+ * want to send them an interrupt, using trigger_irq(). */
+static void add_used(struct virtqueue *vq, unsigned int head, int len)
{
- u32 buf[] = { LHREQ_IRQ, irq };
+ struct vring_used_elem *used;
+
+ /* The virtqueue contains a ring of used buffers. Get a pointer to the
+ * next entry in that used ring. */
+ used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
+ used->id = head;
+ used->len = len;
+ /* Make sure buffer is written before we update index. */
+ wmb();
+ vq->vring.used->idx++;
+}
+
+/* This actually sends the interrupt for this virtqueue */
+static void trigger_irq(int fd, struct virtqueue *vq)
+{
+ unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
+
+ /* If they don't want an interrupt, don't send one. */
+ if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+ return;
+
+ /* Send the Guest an interrupt tell them we used something up. */
if (write(fd, buf, sizeof(buf)) != 0)
- err(1, "Triggering irq %i", irq);
+ err(1, "Triggering irq %i", vq->config.irq);
}
-/* This simply sets up an iovec array where we can put data to be discarded.
- * This happens when the Guest doesn't want or can't handle the input: we have
- * to get rid of it somewhere, and if we bury it in the ceiling space it will
- * start to smell after a week. */
-static void discard_iovec(struct iovec *iov, unsigned int *num)
+/* And here's the combo meal deal. Supersize me! */
+static void add_used_and_trigger(int fd, struct virtqueue *vq,
+ unsigned int head, int len)
{
- static char discard_buf[1024];
- *num = 1;
- iov->iov_base = discard_buf;
- iov->iov_len = sizeof(discard_buf);
+ add_used(vq, head, len);
+ trigger_irq(fd, vq);
}
-/* Here is the input terminal setting we save, and the routine to restore them
- * on exit so the user can see what they type next. */
+/*
+ * The Console
+ *
+ * Here is the input terminal setting we save, and the routine to restore them
+ * on exit so the user gets their terminal back. */
static struct termios orig_term;
static void restore_term(void)
{
@@ -701,38 +747,39 @@ struct console_abort
/* This is the routine which handles console input (ie. stdin). */
static bool handle_console_input(int fd, struct device *dev)
{
- u32 irq = 0, *lenp;
int len;
- unsigned int num;
- struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+ unsigned int head, in_num, out_num;
+ struct iovec iov[dev->vq->vring.num];
struct console_abort *abort = dev->priv;
- /* First we get the console buffer from the Guest. The key is dev->mem
- * which was set to 0 in setup_console(). */
- lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq);
- if (!lenp) {
- /* If it's not ready for input, warn and set up to discard. */
- warn("console: no dma buffer!");
- discard_iovec(iov, &num);
- }
+ /* First we need a console buffer from the Guests's input virtqueue. */
+ head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
+
+ /* If they're not ready for input, stop listening to this file
+ * descriptor. We'll start again once they add an input buffer. */
+ if (head == dev->vq->vring.num)
+ return false;
+
+ if (out_num)
+ errx(1, "Output buffers in console in queue?");
/* This is why we convert to iovecs: the readv() call uses them, and so
* it reads straight into the Guest's buffer. */
- len = readv(dev->fd, iov, num);
+ len = readv(dev->fd, iov, in_num);
if (len <= 0) {
/* This implies that the console is closed, is /dev/null, or
- * something went terribly wrong. We still go through the rest
- * of the logic, though, especially the exit handling below. */
+ * something went terribly wrong. */
warnx("Failed to get console input, ignoring console.");
- len = 0;
+ /* Put the input terminal back. */
+ restore_term();
+ /* Remove callback from input vq, so it doesn't restart us. */
+ dev->vq->handle_output = NULL;
+ /* Stop listening to this fd: don't call us again. */
+ return false;
}
- /* If we read the data into the Guest, fill in the length and send the
- * interrupt. */
- if (lenp) {
- *lenp = len;
- trigger_irq(fd, irq);
- }
+ /* Tell the Guest about the new input. */
+ add_used_and_trigger(fd, dev->vq, head, len);
/* Three ^C within one second? Exit.
*
@@ -746,7 +793,7 @@ static bool handle_console_input(int fd, struct device *dev)
struct timeval now;
gettimeofday(&now, NULL);
if (now.tv_sec <= abort->start.tv_sec+1) {
- u32 args[] = { LHREQ_BREAK, 0 };
+ unsigned long args[] = { LHREQ_BREAK, 0 };
/* Close the fd so Waker will know it has to
* exit. */
close(waker_fd);
@@ -761,214 +808,168 @@ static bool handle_console_input(int fd, struct device *dev)
/* Any other key resets the abort counter. */
abort->count = 0;
- /* Now, if we didn't read anything, put the input terminal back and
- * return failure (meaning, don't call us again). */
- if (!len) {
- restore_term();
- return false;
- }
/* Everything went OK! */
return true;
}
-/* Handling console output is much simpler than input. */
-static u32 handle_console_output(int fd, const struct iovec *iov,
- unsigned num, struct device*dev)
+/* Handling output for console is simple: we just get all the output buffers
+ * and write them to stdout. */
+static void handle_console_output(int fd, struct virtqueue *vq)
{
- /* Whatever the Guest sends, write it to standard output. Return the
- * number of bytes written. */
- return writev(STDOUT_FILENO, iov, num);
-}
-
-/* Guest->Host network output is also pretty easy. */
-static u32 handle_tun_output(int fd, const struct iovec *iov,
- unsigned num, struct device *dev)
-{
- /* We put a flag in the "priv" pointer of the network device, and set
- * it as soon as we see output. We'll see why in handle_tun_input() */
- *(bool *)dev->priv = true;
- /* Whatever packet the Guest sent us, write it out to the tun
- * device. */
- return writev(dev->fd, iov, num);
+ unsigned int head, out, in;
+ int len;
+ struct iovec iov[vq->vring.num];
+
+ /* Keep getting output buffers from the Guest until we run out. */
+ while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
+ if (in)
+ errx(1, "Input buffers in output queue?");
+ len = writev(STDOUT_FILENO, iov, out);
+ add_used_and_trigger(fd, vq, head, len);
+ }
}
-/* This matches the peer_key() in lguest_net.c. The key for any given slot
- * is the address of the network device's page plus 4 * the slot number. */
-static unsigned long peer_offset(unsigned int peernum)
+/*
+ * The Network
+ *
+ * Handling output for network is also simple: we get all the output buffers
+ * and write them (ignoring the first element) to this device's file descriptor
+ * (stdout). */
+static void handle_net_output(int fd, struct virtqueue *vq)
{
- return 4 * peernum;
+ unsigned int head, out, in;
+ int len;
+ struct iovec iov[vq->vring.num];
+
+ /* Keep getting output buffers from the Guest until we run out. */
+ while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
+ if (in)
+ errx(1, "Input buffers in output queue?");
+ /* Check header, but otherwise ignore it (we told the Guest we
+ * supported no features, so it shouldn't have anything
+ * interesting). */
+ (void)convert(&iov[0], struct virtio_net_hdr);
+ len = writev(vq->dev->fd, iov+1, out-1);
+ add_used_and_trigger(fd, vq, head, len);
+ }
}
-/* This is where we handle a packet coming in from the tun device */
+/* This is where we handle a packet coming in from the tun device to our
+ * Guest. */
static bool handle_tun_input(int fd, struct device *dev)
{
- u32 irq = 0, *lenp;
+ unsigned int head, in_num, out_num;
int len;
- unsigned num;
- struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+ struct iovec iov[dev->vq->vring.num];
+ struct virtio_net_hdr *hdr;
- /* First we get a buffer the Guest has bound to its key. */
- lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num,
- &irq);
- if (!lenp) {
+ /* First we need a network buffer from the Guests's recv virtqueue. */
+ head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
+ if (head == dev->vq->vring.num) {
/* Now, it's expected that if we try to send a packet too
- * early, the Guest won't be ready yet. This is why we set a
- * flag when the Guest sends its first packet. If it's sent a
- * packet we assume it should be ready to receive them.
- *
- * Actually, this is what the status bits in the descriptor are
- * for: we should *use* them. FIXME! */
- if (*(bool *)dev->priv)
+ * early, the Guest won't be ready yet. Wait until the device
+ * status says it's ready. */
+ /* FIXME: Actually want DRIVER_ACTIVE here. */
+ if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK)
warn("network: no dma buffer!");
- discard_iovec(iov, &num);
- }
+ /* We'll turn this back on if input buffers are registered. */
+ return false;
+ } else if (out_num)
+ errx(1, "Output buffers in network recv queue?");
+
+ /* First element is the header: we set it to 0 (no features). */
+ hdr = convert(&iov[0], struct virtio_net_hdr);
+ hdr->flags = 0;
+ hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
/* Read the packet from the device directly into the Guest's buffer. */
- len = readv(dev->fd, iov, num);
+ len = readv(dev->fd, iov+1, in_num-1);
if (len <= 0)
err(1, "reading network");
- /* Write the used_len, and trigger the interrupt for the Guest */
- if (lenp) {
- *lenp = len;
- trigger_irq(fd, irq);
- }
+ /* Tell the Guest about the new packet. */
+ add_used_and_trigger(fd, dev->vq, head, sizeof(*hdr) + len);
+
verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
- ((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1],
- lenp ? "sent" : "discarded");
+ ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1],
+ head != dev->vq->vring.num ? "sent" : "discarded");
+
/* All good. */
return true;
}
-/* The last device handling routine is block output: the Guest has sent a DMA
- * to the block device. It will have placed the command it wants in the
- * "struct lguest_block_page". */
-static u32 handle_block_output(int fd, const struct iovec *iov,
- unsigned num, struct device *dev)
+/*L:215 This is the callback attached to the network and console input
+ * virtqueues: it ensures we try again, in case we stopped console or net
+ * delivery because Guest didn't have any buffers. */
+static void enable_fd(int fd, struct virtqueue *vq)
{
- struct lguest_block_page *p = dev->mem;
- u32 irq, *lenp;
- unsigned int len, reply_num;
- struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
- off64_t device_len, off = (off64_t)p->sector * 512;
-
- /* First we extract the device length from the dev->priv pointer. */
- device_len = *(off64_t *)dev->priv;
-
- /* We first check that the read or write is within the length of the
- * block file. */
- if (off >= device_len)
- err(1, "Bad offset %llu vs %llu", off, device_len);
- /* Move to the right location in the block file. This shouldn't fail,
- * but best to check. */
- if (lseek64(dev->fd, off, SEEK_SET) != off)
- err(1, "Bad seek to sector %i", p->sector);
-
- verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
-
- /* They were supposed to bind a reply buffer at key equal to the start
- * of the block device memory. We need this to tell them when the
- * request is finished. */
- lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
- if (!lenp)
- err(1, "Block request didn't give us a dma buffer");
-
- if (p->type) {
- /* A write request. The DMA they sent contained the data, so
- * write it out. */
- len = writev(dev->fd, iov, num);
- /* Grr... Now we know how long the "struct lguest_dma" they
- * sent was, we make sure they didn't try to write over the end
- * of the block file (possibly extending it). */
- if (off + len > device_len) {
- /* Trim it back to the correct length */
- ftruncate64(dev->fd, device_len);
- /* Die, bad Guest, die. */
- errx(1, "Write past end %llu+%u", off, len);
- }
- /* The reply length is 0: we just send back an empty DMA to
- * interrupt them and tell them the write is finished. */
- *lenp = 0;
- } else {
- /* A read request. They sent an empty DMA to start the
- * request, and we put the read contents into the reply
- * buffer. */
- len = readv(dev->fd, reply, reply_num);
- *lenp = len;
- }
-
- /* The result is 1 (done), 2 if there was an error (short read or
- * write). */
- p->result = 1 + (p->bytes != len);
- /* Now tell them we've used their reply buffer. */
- trigger_irq(fd, irq);
-
- /* We're supposed to return the number of bytes of the output buffer we
- * used. But the block device uses the "result" field instead, so we
- * don't bother. */
- return 0;
+ add_device_fd(vq->dev->fd);
+ /* Tell waker to listen to it again */
+ write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd));
}
-/* This is the generic routine we call when the Guest sends some DMA out. */
-static void handle_output(int fd, unsigned long dma, unsigned long key,
- struct device_list *devices)
+/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
+static void handle_output(int fd, unsigned long addr)
{
struct device *i;
- u32 *lenp;
- struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
- unsigned num = 0;
-
- /* Convert the "struct lguest_dma" they're sending to a "struct
- * iovec". */
- lenp = dma2iov(dma, iov, &num);
-
- /* Check each device: if they expect output to this key, tell them to
- * handle it. */
- for (i = devices->dev; i; i = i->next) {
- if (i->handle_output && key == i->watch_key) {
- /* We write the result straight into the used_len field
- * for them. */
- *lenp = i->handle_output(fd, iov, num, i);
- return;
+ struct virtqueue *vq;
+
+ /* Check each virtqueue. */
+ for (i = devices.dev; i; i = i->next) {
+ for (vq = i->vq; vq; vq = vq->next) {
+ if (vq->config.pfn == addr/getpagesize()
+ && vq->handle_output) {
+ verbose("Output to %s\n", vq->dev->name);
+ vq->handle_output(fd, vq);
+ return;
+ }
}
}
- /* This can happen: the kernel sends any SEND_DMA which doesn't match
- * another Guest to us. It could be that another Guest just left a
- * network, for example. But it's unusual. */
- warnx("Pending dma %p, key %p", (void *)dma, (void *)key);
+ /* Early console write is done using notify on a nul-terminated string
+ * in Guest memory. */
+ if (addr >= guest_limit)
+ errx(1, "Bad NOTIFY %#lx", addr);
+
+ write(STDOUT_FILENO, from_guest_phys(addr),
+ strnlen(from_guest_phys(addr), guest_limit - addr));
}
-/* This is called when the waker wakes us up: check for incoming file
+/* This is called when the Waker wakes us up: check for incoming file
* descriptors. */
-static void handle_input(int fd, struct device_list *devices)
+static void handle_input(int fd)
{
/* select() wants a zeroed timeval to mean "don't wait". */
struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
for (;;) {
struct device *i;
- fd_set fds = devices->infds;
+ fd_set fds = devices.infds;
/* If nothing is ready, we're done. */
- if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0)
+ if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0)
break;
/* Otherwise, call the device(s) which have readable
* file descriptors and a method of handling them. */
- for (i = devices->dev; i; i = i->next) {
+ for (i = devices.dev; i; i = i->next) {
if (i->handle_input && FD_ISSET(i->fd, &fds)) {
+ int dev_fd;
+ if (i->handle_input(fd, i))
+ continue;
+
/* If handle_input() returns false, it means we
- * should no longer service it.
- * handle_console_input() does this. */
- if (!i->handle_input(fd, i)) {
- /* Clear it from the set of input file
- * descriptors kept at the head of the
- * device list. */
- FD_CLR(i->fd, &devices->infds);
- /* Tell waker to ignore it too... */
- write(waker_fd, &i->fd, sizeof(i->fd));
- }
+ * should no longer service it. Networking and
+ * console do this when there's no input
+ * buffers to deliver into. Console also uses
+ * it when it discovers that stdin is
+ * closed. */
+ FD_CLR(i->fd, &devices.infds);
+ /* Tell waker to ignore it too, by sending a
+ * negative fd number (-1, since 0 is a valid
+ * FD number). */
+ dev_fd = -i->fd - 1;
+ write(waker_fd, &dev_fd, sizeof(dev_fd));
}
}
}
@@ -982,71 +983,121 @@ static void handle_input(int fd, struct device_list *devices)
* routines to allocate them.
*
* This routine allocates a new "struct lguest_device_desc" from descriptor
- * table in the devices array just above the Guest's normal memory. */
-static struct lguest_device_desc *
-new_dev_desc(struct lguest_device_desc *descs,
- u16 type, u16 features, u16 num_pages)
+ * table just above the Guest's normal memory. It returns a pointer to that
+ * descriptor. */
+static struct lguest_device_desc *new_dev_desc(u16 type)
{
- unsigned int i;
+ struct lguest_device_desc *d;
- for (i = 0; i < LGUEST_MAX_DEVICES; i++) {
- if (!descs[i].type) {
- descs[i].type = type;
- descs[i].features = features;
- descs[i].num_pages = num_pages;
- /* If they said the device needs memory, we allocate
- * that now, bumping up the top of Guest memory. */
- if (num_pages) {
- map_zeroed_pages(top, num_pages);
- descs[i].pfn = top/getpagesize();
- top += num_pages*getpagesize();
- }
- return &descs[i];
- }
- }
- errx(1, "too many devices");
+ /* We only have one page for all the descriptors. */
+ if (devices.desc_used + sizeof(*d) > getpagesize())
+ errx(1, "Too many devices");
+
+ /* We don't need to set config_len or status: page is 0 already. */
+ d = (void *)devices.descpage + devices.desc_used;
+ d->type = type;
+ devices.desc_used += sizeof(*d);
+
+ return d;
+}
+
+/* Each device descriptor is followed by some configuration information.
+ * Each configuration field looks like: u8 type, u8 len, [... len bytes...].
+ *
+ * This routine adds a new field to an existing device's descriptor. It only
+ * works for the last device, but that's OK because that's how we use it. */
+static void add_desc_field(struct device *dev, u8 type, u8 len, const void *c)
+{
+ /* This is the last descriptor, right? */
+ assert(devices.descpage + devices.desc_used
+ == (u8 *)(dev->desc + 1) + dev->desc->config_len);
+
+ /* We only have one page of device descriptions. */
+ if (devices.desc_used + 2 + len > getpagesize())
+ errx(1, "Too many devices");
+
+ /* Copy in the new config header: type then length. */
+ devices.descpage[devices.desc_used++] = type;
+ devices.descpage[devices.desc_used++] = len;
+ memcpy(devices.descpage + devices.desc_used, c, len);
+ devices.desc_used += len;
+
+ /* Update the device descriptor length: two byte head then data. */
+ dev->desc->config_len += 2 + len;
}
-/* This monster routine does all the creation and setup of a new device,
- * including caling new_dev_desc() to allocate the descriptor and device
- * memory. */
-static struct device *new_device(struct device_list *devices,
- u16 type, u16 num_pages, u16 features,
- int fd,
- bool (*handle_input)(int, struct device *),
- unsigned long watch_off,
- u32 (*handle_output)(int,
- const struct iovec *,
- unsigned,
- struct device *))
+/* This routine adds a virtqueue to a device. We specify how many descriptors
+ * the virtqueue is to have. */
+static void add_virtqueue(struct device *dev, unsigned int num_descs,
+ void (*handle_output)(int fd, struct virtqueue *me))
+{
+ unsigned int pages;
+ struct virtqueue **i, *vq = malloc(sizeof(*vq));
+ void *p;
+
+ /* First we need some pages for this virtqueue. */
+ pages = (vring_size(num_descs) + getpagesize() - 1) / getpagesize();
+ p = get_pages(pages);
+
+ /* Initialize the configuration. */
+ vq->config.num = num_descs;
+ vq->config.irq = devices.next_irq++;
+ vq->config.pfn = to_guest_phys(p) / getpagesize();
+
+ /* Initialize the vring. */
+ vring_init(&vq->vring, num_descs, p);
+
+ /* Add the configuration information to this device's descriptor. */
+ add_desc_field(dev, VIRTIO_CONFIG_F_VIRTQUEUE,
+ sizeof(vq->config), &vq->config);
+
+ /* Add to tail of list, so dev->vq is first vq, dev->vq->next is
+ * second. */
+ for (i = &dev->vq; *i; i = &(*i)->next);
+ *i = vq;
+
+ /* Link virtqueue back to device. */
+ vq->dev = dev;
+
+ /* Set the routine to call when the Guest does something to this
+ * virtqueue. */
+ vq->handle_output = handle_output;
+
+ /* Set the "Don't Notify Me" flag if we don't have a handler */
+ if (!handle_output)
+ vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
+}
+
+/* This routine does all the creation and setup of a new device, including
+ * calling new_dev_desc() to allocate the descriptor and device memory. */
+static struct device *new_device(const char *name, u16 type, int fd,
+ bool (*handle_input)(int, struct device *))
{
struct device *dev = malloc(sizeof(*dev));
/* Append to device list. Prepending to a single-linked list is
* easier, but the user expects the devices to be arranged on the bus
* in command-line order. The first network device on the command line
- * is eth0, the first block device /dev/lgba, etc. */
- *devices->lastdev = dev;
+ * is eth0, the first block device /dev/vda, etc. */
+ *devices.lastdev = dev;
dev->next = NULL;
- devices->lastdev = &dev->next;
+ devices.lastdev = &dev->next;
/* Now we populate the fields one at a time. */
dev->fd = fd;
/* If we have an input handler for this file descriptor, then we add it
* to the device_list's fdset and maxfd. */
if (handle_input)
- set_fd(dev->fd, devices);
- dev->desc = new_dev_desc(devices->descs, type, features, num_pages);
- dev->mem = (void *)(dev->desc->pfn * getpagesize());
+ add_device_fd(dev->fd);
+ dev->desc = new_dev_desc(type);
dev->handle_input = handle_input;
- dev->watch_key = (unsigned long)dev->mem + watch_off;
- dev->handle_output = handle_output;
+ dev->name = name;
return dev;
}
/* Our first setup routine is the console. It's a fairly simple device, but
* UNIX tty handling makes it uglier than it could be. */
-static void setup_console(struct device_list *devices)
+static void setup_console(void)
{
struct device *dev;
@@ -1062,127 +1113,38 @@ static void setup_console(struct device_list *devices)
atexit(restore_term);
}
- /* We don't currently require any memory for the console, so we ask for
- * 0 pages. */
- dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0,
- STDIN_FILENO, handle_console_input,
- LGUEST_CONSOLE_DMA_KEY, handle_console_output);
+ dev = new_device("console", VIRTIO_ID_CONSOLE,
+ STDIN_FILENO, handle_console_input);
/* We store the console state in dev->priv, and initialize it. */
dev->priv = malloc(sizeof(struct console_abort));
((struct console_abort *)dev->priv)->count = 0;
- verbose("device %p: console\n",
- (void *)(dev->desc->pfn * getpagesize()));
-}
-/* Setting up a block file is also fairly straightforward. */
-static void setup_block_file(const char *filename, struct device_list *devices)
-{
- int fd;
- struct device *dev;
- off64_t *device_len;
- struct lguest_block_page *p;
-
- /* We open with O_LARGEFILE because otherwise we get stuck at 2G. We
- * open with O_DIRECT because otherwise our benchmarks go much too
- * fast. */
- fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT);
-
- /* We want one page, and have no input handler (the block file never
- * has anything interesting to say to us). Our timing will be quite
- * random, so it should be a reasonable randomness source. */
- dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1,
- LGUEST_DEVICE_F_RANDOMNESS,
- fd, NULL, 0, handle_block_output);
-
- /* We store the device size in the private area */
- device_len = dev->priv = malloc(sizeof(*device_len));
- /* This is the safe way of establishing the size of our device: it
- * might be a normal file or an actual block device like /dev/hdb. */
- *device_len = lseek64(fd, 0, SEEK_END);
-
- /* The device memory is a "struct lguest_block_page". It's zeroed
- * already, we just need to put in the device size. Block devices
- * think in sectors (ie. 512 byte chunks), so we translate here. */
- p = dev->mem;
- p->num_sectors = *device_len/512;
- verbose("device %p: block %i sectors\n",
- (void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
+ /* The console needs two virtqueues: the input then the output. When
+ * they put something the input queue, we make sure we're listening to
+ * stdin. When they put something in the output queue, we write it to
+ * stdout. */
+ add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
+ add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output);
+
+ verbose("device %u: console\n", devices.device_num++);
}
+/*:*/
-/*
- * Network Devices.
+/*M:010 Inter-guest networking is an interesting area. Simplest is to have a
+ * --sharenet=<name> option which opens or creates a named pipe. This can be
+ * used to send packets to another guest in a 1:1 manner.
*
- * Setting up network devices is quite a pain, because we have three types.
- * First, we have the inter-Guest network. This is a file which is mapped into
- * the address space of the Guests who are on the network. Because it is a
- * shared mapping, the same page underlies all the devices, and they can send
- * DMA to each other.
+ * More sopisticated is to use one of the tools developed for project like UML
+ * to do networking.
*
- * Remember from our network driver, the Guest is told what slot in the page it
- * is to use. We use exclusive fnctl locks to reserve a slot. If another
- * Guest is using a slot, the lock will fail and we try another. Because fnctl
- * locks are cleaned up automatically when we die, this cleverly means that our
- * reservation on the slot will vanish if we crash. */
-static unsigned int find_slot(int netfd, const char *filename)
-{
- struct flock fl;
-
- fl.l_type = F_WRLCK;
- fl.l_whence = SEEK_SET;
- fl.l_len = 1;
- /* Try a 1 byte lock in each possible position number */
- for (fl.l_start = 0;
- fl.l_start < getpagesize()/sizeof(struct lguest_net);
- fl.l_start++) {
- /* If we succeed, return the slot number. */
- if (fcntl(netfd, F_SETLK, &fl) == 0)
- return fl.l_start;
- }
- errx(1, "No free slots in network file %s", filename);
-}
-
-/* This function sets up the network file */
-static void setup_net_file(const char *filename,
- struct device_list *devices)
-{
- int netfd;
- struct device *dev;
-
- /* We don't use open_or_die() here: for friendliness we create the file
- * if it doesn't already exist. */
- netfd = open(filename, O_RDWR, 0);
- if (netfd < 0) {
- if (errno == ENOENT) {
- netfd = open(filename, O_RDWR|O_CREAT, 0600);
- if (netfd >= 0) {
- /* If we succeeded, initialize the file with a
- * blank page. */
- char page[getpagesize()];
- memset(page, 0, sizeof(page));
- write(netfd, page, sizeof(page));
- }
- }
- if (netfd < 0)
- err(1, "cannot open net file '%s'", filename);
- }
-
- /* We need 1 page, and the features indicate the slot to use and that
- * no checksum is needed. We never touch this device again; it's
- * between the Guests on the network, so we don't register input or
- * output handlers. */
- dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
- find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM,
- -1, NULL, 0, NULL);
-
- /* Map the shared file. */
- if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE,
- MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem)
- err(1, "could not mmap '%s'", filename);
- verbose("device %p: shared net %s, peer %i\n",
- (void *)(dev->desc->pfn * getpagesize()), filename,
- dev->desc->features & ~LGUEST_NET_F_NOCSUM);
-}
-/*:*/
+ * Faster is to do virtio bonding in kernel. Doing this 1:1 would be
+ * completely generic ("here's my vring, attach to your vring") and would work
+ * for any traffic. Of course, namespace and permissions issues need to be
+ * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide
+ * multiple inter-guest channels behind one interface, although it would
+ * require some manner of hotplugging new virtio channels.
+ *
+ * Finally, we could implement a virtio network switch in the kernel. :*/
static u32 str2ip(const char *ipaddr)
{
@@ -1217,7 +1179,7 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name)
/* This sets up the Host end of the network device with an IP address, brings
* it up so packets will flow, the copies the MAC address into the hwaddr
- * pointer (in practice, the Host's slot in the network device's memory). */
+ * pointer. */
static void configure_device(int fd, const char *devname, u32 ipaddr,
unsigned char hwaddr[6])
{
@@ -1243,18 +1205,18 @@ static void configure_device(int fd, const char *devname, u32 ipaddr,
memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
}
-/*L:195 The other kind of network is a Host<->Guest network. This can either
- * use briding or routing, but the principle is the same: it uses the "tun"
- * device to inject packets into the Host as if they came in from a normal
- * network card. We just shunt packets between the Guest and the tun
- * device. */
-static void setup_tun_net(const char *arg, struct device_list *devices)
+/*L:195 Our network is a Host<->Guest network. This can either use bridging or
+ * routing, but the principle is the same: it uses the "tun" device to inject
+ * packets into the Host as if they came in from a normal network card. We
+ * just shunt packets between the Guest and the tun device. */
+static void setup_tun_net(const char *arg)
{
struct device *dev;
struct ifreq ifr;
int netfd, ipfd;
u32 ip;
const char *br_name = NULL;
+ u8 hwaddr[6];
/* We open the /dev/net/tun device and tell it we want a tap device. A
* tap device is like a tun device, only somehow different. To tell
@@ -1270,21 +1232,13 @@ static void setup_tun_net(const char *arg, struct device_list *devices)
* device: trust us! */
ioctl(netfd, TUNSETNOCSUM, 1);
- /* We create the net device with 1 page, using the features field of
- * the descriptor to tell the Guest it is in slot 1 (NET_PEERNUM), and
- * that the device has fairly random timing. We do *not* specify
- * LGUEST_NET_F_NOCSUM: these packets can reach the real world.
- *
- * We will put our MAC address is slot 0 for the Guest to see, so
- * it will send packets to us using the key "peer_offset(0)": */
- dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
- NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd,
- handle_tun_input, peer_offset(0), handle_tun_output);
+ /* First we create a new network device. */
+ dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
- /* We keep a flag which says whether we've seen packets come out from
- * this network device. */
- dev->priv = malloc(sizeof(bool));
- *(bool *)dev->priv = false;
+ /* Network devices need a receive and a send queue, just like
+ * console. */
+ add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
+ add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
/* We need a socket to perform the magic network ioctls to bring up the
* tap interface, connect to the bridge etc. Any socket will do! */
@@ -1300,72 +1254,293 @@ static void setup_tun_net(const char *arg, struct device_list *devices)
} else /* It is an IP address to set up the device with */
ip = str2ip(arg);
- /* We are peer 0, ie. first slot, so we hand dev->mem to this routine
- * to write the MAC address at the start of the device memory. */
- configure_device(ipfd, ifr.ifr_name, ip, dev->mem);
+ /* Set up the tun device, and get the mac address for the interface. */
+ configure_device(ipfd, ifr.ifr_name, ip, hwaddr);
- /* Set "promisc" bit: we want every single packet if we're going to
- * bridge to other machines (and otherwise it doesn't matter). */
- *((u8 *)dev->mem) |= 0x1;
+ /* Tell Guest what MAC address to use. */
+ add_desc_field(dev, VIRTIO_CONFIG_NET_MAC_F, sizeof(hwaddr), hwaddr);
+ /* We don't seed the socket any more; setup is done. */
close(ipfd);
- verbose("device %p: tun net %u.%u.%u.%u\n",
- (void *)(dev->desc->pfn * getpagesize()),
- (u8)(ip>>24), (u8)(ip>>16), (u8)(ip>>8), (u8)ip);
+ verbose("device %u: tun net %u.%u.%u.%u\n",
+ devices.device_num++,
+ (u8)(ip>>24),(u8)(ip>>16),(u8)(ip>>8),(u8)ip);
if (br_name)
verbose("attached to bridge: %s\n", br_name);
}
+
+/* Our block (disk) device should be really simple: the Guest asks for a block
+ * number and we read or write that position in the file. Unfortunately, that
+ * was amazingly slow: the Guest waits until the read is finished before
+ * running anything else, even if it could have been doing useful work.
+ *
+ * We could use async I/O, except it's reputed to suck so hard that characters
+ * actually go missing from your code when you try to use it.
+ *
+ * So we farm the I/O out to thread, and communicate with it via a pipe. */
+
+/* This hangs off device->priv. */
+struct vblk_info
+{
+ /* The size of the file. */
+ off64_t len;
+
+ /* The file descriptor for the file. */
+ int fd;
+
+ /* IO thread listens on this file descriptor [0]. */
+ int workpipe[2];
+
+ /* IO thread writes to this file descriptor to mark it done, then
+ * Launcher triggers interrupt to Guest. */
+ int done_fd;
+};
+/*:*/
+
+/*L:210
+ * The Disk
+ *
+ * Remember that the block device is handled by a separate I/O thread. We head
+ * straight into the core of that thread here:
+ */
+static bool service_io(struct device *dev)
+{
+ struct vblk_info *vblk = dev->priv;
+ unsigned int head, out_num, in_num, wlen;
+ int ret;
+ struct virtio_blk_inhdr *in;
+ struct virtio_blk_outhdr *out;
+ struct iovec iov[dev->vq->vring.num];
+ off64_t off;
+
+ /* See if there's a request waiting. If not, nothing to do. */
+ head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
+ if (head == dev->vq->vring.num)
+ return false;
+
+ /* Every block request should contain at least one output buffer
+ * (detailing the location on disk and the type of request) and one
+ * input buffer (to hold the result). */
+ if (out_num == 0 || in_num == 0)
+ errx(1, "Bad virtblk cmd %u out=%u in=%u",
+ head, out_num, in_num);
+
+ out = convert(&iov[0], struct virtio_blk_outhdr);
+ in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr);
+ off = out->sector * 512;
+
+ /* The block device implements "barriers", where the Guest indicates
+ * that it wants all previous writes to occur before this write. We
+ * don't have a way of asking our kernel to do a barrier, so we just
+ * synchronize all the data in the file. Pretty poor, no? */
+ if (out->type & VIRTIO_BLK_T_BARRIER)
+ fdatasync(vblk->fd);
+
+ /* In general the virtio block driver is allowed to try SCSI commands.
+ * It'd be nice if we supported eject, for example, but we don't. */
+ if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
+ fprintf(stderr, "Scsi commands unsupported\n");
+ in->status = VIRTIO_BLK_S_UNSUPP;
+ wlen = sizeof(in);
+ } else if (out->type & VIRTIO_BLK_T_OUT) {
+ /* Write */
+
+ /* Move to the right location in the block file. This can fail
+ * if they try to write past end. */
+ if (lseek64(vblk->fd, off, SEEK_SET) != off)
+ err(1, "Bad seek to sector %llu", out->sector);
+
+ ret = writev(vblk->fd, iov+1, out_num-1);
+ verbose("WRITE to sector %llu: %i\n", out->sector, ret);
+
+ /* Grr... Now we know how long the descriptor they sent was, we
+ * make sure they didn't try to write over the end of the block
+ * file (possibly extending it). */
+ if (ret > 0 && off + ret > vblk->len) {
+ /* Trim it back to the correct length */
+ ftruncate64(vblk->fd, vblk->len);
+ /* Die, bad Guest, die. */
+ errx(1, "Write past end %llu+%u", off, ret);
+ }
+ wlen = sizeof(in);
+ in->status = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
+ } else {
+ /* Read */
+
+ /* Move to the right location in the block file. This can fail
+ * if they try to read past end. */
+ if (lseek64(vblk->fd, off, SEEK_SET) != off)
+ err(1, "Bad seek to sector %llu", out->sector);
+
+ ret = readv(vblk->fd, iov+1, in_num-1);
+ verbose("READ from sector %llu: %i\n", out->sector, ret);
+ if (ret >= 0) {
+ wlen = sizeof(in) + ret;
+ in->status = VIRTIO_BLK_S_OK;
+ } else {
+ wlen = sizeof(in);
+ in->status = VIRTIO_BLK_S_IOERR;
+ }
+ }
+
+ /* We can't trigger an IRQ, because we're not the Launcher. It does
+ * that when we tell it we're done. */
+ add_used(dev->vq, head, wlen);
+ return true;
+}
+
+/* This is the thread which actually services the I/O. */
+static int io_thread(void *_dev)
+{
+ struct device *dev = _dev;
+ struct vblk_info *vblk = dev->priv;
+ char c;
+
+ /* Close other side of workpipe so we get 0 read when main dies. */
+ close(vblk->workpipe[1]);
+ /* Close the other side of the done_fd pipe. */
+ close(dev->fd);
+
+ /* When this read fails, it means Launcher died, so we follow. */
+ while (read(vblk->workpipe[0], &c, 1) == 1) {
+ /* We acknowledge each request immediately to reduce latency,
+ * rather than waiting until we've done them all. I haven't
+ * measured to see if it makes any difference. */
+ while (service_io(dev))
+ write(vblk->done_fd, &c, 1);
+ }
+ return 0;
+}
+
+/* Now we've seen the I/O thread, we return to the Launcher to see what happens
+ * when the thread tells us it's completed some I/O. */
+static bool handle_io_finish(int fd, struct device *dev)
+{
+ char c;
+
+ /* If the I/O thread died, presumably it printed the error, so we
+ * simply exit. */
+ if (read(dev->fd, &c, 1) != 1)
+ exit(1);
+
+ /* It did some work, so trigger the irq. */
+ trigger_irq(fd, dev->vq);
+ return true;
+}
+
+/* When the Guest submits some I/O, we just need to wake the I/O thread. */
+static void handle_virtblk_output(int fd, struct virtqueue *vq)
+{
+ struct vblk_info *vblk = vq->dev->priv;
+ char c = 0;
+
+ /* Wake up I/O thread and tell it to go to work! */
+ if (write(vblk->workpipe[1], &c, 1) != 1)
+ /* Presumably it indicated why it died. */
+ exit(1);
+}
+
+/*L:198 This actually sets up a virtual block device. */
+static void setup_block_file(const char *filename)
+{
+ int p[2];
+ struct device *dev;
+ struct vblk_info *vblk;
+ void *stack;
+ u64 cap;
+ unsigned int val;
+
+ /* This is the pipe the I/O thread will use to tell us I/O is done. */
+ pipe(p);
+
+ /* The device responds to return from I/O thread. */
+ dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish);
+
+ /* The device has one virtqueue, where the Guest places requests. */
+ add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output);
+
+ /* Allocate the room for our own bookkeeping */
+ vblk = dev->priv = malloc(sizeof(*vblk));
+
+ /* First we open the file and store the length. */
+ vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
+ vblk->len = lseek64(vblk->fd, 0, SEEK_END);
+
+ /* Tell Guest how many sectors this device has. */
+ cap = cpu_to_le64(vblk->len / 512);
+ add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap);
+
+ /* Tell Guest not to put in too many descriptors at once: two are used
+ * for the in and out elements. */
+ val = cpu_to_le32(VIRTQUEUE_NUM - 2);
+ add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val);
+
+ /* The I/O thread writes to this end of the pipe when done. */
+ vblk->done_fd = p[1];
+
+ /* This is the second pipe, which is how we tell the I/O thread about
+ * more work. */
+ pipe(vblk->workpipe);
+
+ /* Create stack for thread and run it */
+ stack = malloc(32768);
+ if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1)
+ err(1, "Creating clone");
+
+ /* We don't need to keep the I/O thread's end of the pipes open. */
+ close(vblk->done_fd);
+ close(vblk->workpipe[0]);
+
+ verbose("device %u: virtblock %llu sectors\n",
+ devices.device_num, cap);
+}
/* That's the end of device setup. */
/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves
* its input and output, and finally, lays it to rest. */
-static void __attribute__((noreturn))
-run_guest(int lguest_fd, struct device_list *device_list)
+static void __attribute__((noreturn)) run_guest(int lguest_fd)
{
for (;;) {
- u32 args[] = { LHREQ_BREAK, 0 };
- unsigned long arr[2];
+ unsigned long args[] = { LHREQ_BREAK, 0 };
+ unsigned long notify_addr;
int readval;
/* We read from the /dev/lguest device to run the Guest. */
- readval = read(lguest_fd, arr, sizeof(arr));
+ readval = read(lguest_fd, &notify_addr, sizeof(notify_addr));
- /* The read can only really return sizeof(arr) (the Guest did a
- * SEND_DMA to us), or an error. */
-
- /* For a successful read, arr[0] is the address of the "struct
- * lguest_dma", and arr[1] is the key the Guest sent to. */
- if (readval == sizeof(arr)) {
- handle_output(lguest_fd, arr[0], arr[1], device_list);
+ /* One unsigned long means the Guest did HCALL_NOTIFY */
+ if (readval == sizeof(notify_addr)) {
+ verbose("Notify on address %#lx\n", notify_addr);
+ handle_output(lguest_fd, notify_addr);
continue;
/* ENOENT means the Guest died. Reading tells us why. */
} else if (errno == ENOENT) {
char reason[1024] = { 0 };
read(lguest_fd, reason, sizeof(reason)-1);
errx(1, "%s", reason);
- /* EAGAIN means the waker wanted us to look at some input.
+ /* EAGAIN means the Waker wanted us to look at some input.
* Anything else means a bug or incompatible change. */
} else if (errno != EAGAIN)
err(1, "Running guest failed");
- /* Service input, then unset the BREAK which releases
- * the Waker. */
- handle_input(lguest_fd, device_list);
+ /* Service input, then unset the BREAK to release the Waker. */
+ handle_input(lguest_fd);
if (write(lguest_fd, args, sizeof(args)) < 0)
err(1, "Resetting break");
}
}
/*
- * This is the end of the Launcher.
+ * This is the end of the Launcher. The good news: we are over halfway
+ * through! The bad news: the most fiendish part of the code still lies ahead
+ * of us.
*
- * But wait! We've seen I/O from the Launcher, and we've seen I/O from the
- * Drivers. If we were to see the Host kernel I/O code, our understanding
- * would be complete... :*/
+ * Are you ready? Take a deep breath and join me in the core of the Host, in
+ * "make Host".
+ :*/
static struct option opts[] = {
{ "verbose", 0, NULL, 'v' },
- { "sharenet", 1, NULL, 's' },
{ "tunnet", 1, NULL, 't' },
{ "block", 1, NULL, 'b' },
{ "initrd", 1, NULL, 'i' },
@@ -1374,37 +1549,21 @@ static struct option opts[] = {
static void usage(void)
{
errx(1, "Usage: lguest [--verbose] "
- "[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
+ "[--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
"|--block=<filename>|--initrd=<filename>]...\n"
"<mem-in-mb> vmlinux [args...]");
}
-/*L:100 The Launcher code itself takes us out into userspace, that scary place
- * where pointers run wild and free! Unfortunately, like most userspace
- * programs, it's quite boring (which is why everyone like to hack on the
- * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it
- * will get you through this section. Or, maybe not.
- *
- * The Launcher binary sits up high, usually starting at address 0xB8000000.
- * Everything below this is the "physical" memory for the Guest. For example,
- * if the Guest were to write a "1" at physical address 0, we would see a "1"
- * in the Launcher at "(int *)0". Guest physical == Launcher virtual.
- *
- * This can be tough to get your head around, but usually it just means that we
- * don't need to do any conversion when the Guest gives us it's "physical"
- * addresses.
- */
+/*L:105 The main routine is where the real work begins: */
int main(int argc, char *argv[])
{
- /* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size
- * of the (optional) initrd. */
- unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0;
- /* A temporary and the /dev/lguest file descriptor. */
+ /* Memory, top-level pagetable, code startpoint and size of the
+ * (optional) initrd. */
+ unsigned long mem = 0, pgdir, start, initrd_size = 0;
+ /* Two temporaries and the /dev/lguest file descriptor. */
int i, c, lguest_fd;
- /* The list of Guest devices, based on command line arguments. */
- struct device_list device_list;
- /* The boot information for the Guest: at guest-physical address 0. */
- void *boot = (void *)0;
+ /* The boot information for the Guest. */
+ struct boot_params *boot;
/* If they specify an initrd file to load. */
const char *initrd_name = NULL;
@@ -1412,11 +1571,12 @@ int main(int argc, char *argv[])
* device receive input from a file descriptor, we keep an fdset
* (infds) and the maximum fd number (max_infd) with the head of the
* list. We also keep a pointer to the last device, for easy appending
- * to the list. */
- device_list.max_infd = -1;
- device_list.dev = NULL;
- device_list.lastdev = &device_list.dev;
- FD_ZERO(&device_list.infds);
+ * to the list. Finally, we keep the next interrupt number to hand out
+ * (1: remember that 0 is used by the timer). */
+ FD_ZERO(&devices.infds);
+ devices.max_infd = -1;
+ devices.lastdev = &devices.dev;
+ devices.next_irq = 1;
/* We need to know how much memory so we can set up the device
* descriptor and memory pages for the devices as we parse the command
@@ -1424,9 +1584,16 @@ int main(int argc, char *argv[])
* of memory now. */
for (i = 1; i < argc; i++) {
if (argv[i][0] != '-') {
- mem = top = atoi(argv[i]) * 1024 * 1024;
- device_list.descs = map_zeroed_pages(top, 1);
- top += getpagesize();
+ mem = atoi(argv[i]) * 1024 * 1024;
+ /* We start by mapping anonymous pages over all of
+ * guest-physical memory range. This fills it with 0,
+ * and ensures that the Guest won't be killed when it
+ * tries to access it. */
+ guest_base = map_zeroed_pages(mem / getpagesize()
+ + DEVICE_PAGES);
+ guest_limit = mem;
+ guest_max = mem + DEVICE_PAGES*getpagesize();
+ devices.descpage = get_pages(1);
break;
}
}
@@ -1437,14 +1604,11 @@ int main(int argc, char *argv[])
case 'v':
verbose = true;
break;
- case 's':
- setup_net_file(optarg, &device_list);
- break;
case 't':
- setup_tun_net(optarg, &device_list);
+ setup_tun_net(optarg);
break;
case 'b':
- setup_block_file(optarg, &device_list);
+ setup_block_file(optarg);
break;
case 'i':
initrd_name = optarg;
@@ -1459,56 +1623,61 @@ int main(int argc, char *argv[])
if (optind + 2 > argc)
usage();
- /* We always have a console device */
- setup_console(&device_list);
+ verbose("Guest base is at %p\n", guest_base);
- /* We start by mapping anonymous pages over all of guest-physical
- * memory range. This fills it with 0, and ensures that the Guest
- * won't be killed when it tries to access it. */
- map_zeroed_pages(0, mem / getpagesize());
+ /* We always have a console device */
+ setup_console();
/* Now we load the kernel */
- start = load_kernel(open_or_die(argv[optind+1], O_RDONLY),
- &page_offset);
+ start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
+
+ /* Boot information is stashed at physical address 0 */
+ boot = from_guest_phys(0);
/* Map the initrd image if requested (at top of physical memory) */
if (initrd_name) {
initrd_size = load_initrd(initrd_name, mem);
/* These are the location in the Linux boot header where the
* start and size of the initrd are expected to be found. */
- *(unsigned long *)(boot+0x218) = mem - initrd_size;
- *(unsigned long *)(boot+0x21c) = initrd_size;
+ boot->hdr.ramdisk_image = mem - initrd_size;
+ boot->hdr.ramdisk_size = initrd_size;
/* The bootloader type 0xFF means "unknown"; that's OK. */
- *(unsigned char *)(boot+0x210) = 0xFF;
+ boot->hdr.type_of_loader = 0xFF;
}
/* Set up the initial linear pagetables, starting below the initrd. */
- pgdir = setup_pagetables(mem, initrd_size, page_offset);
+ pgdir = setup_pagetables(mem, initrd_size);
/* The Linux boot header contains an "E820" memory map: ours is a
* simple, single region. */
- *(char*)(boot+E820NR) = 1;
- *((struct e820entry *)(boot+E820MAP))
- = ((struct e820entry) { 0, mem, E820_RAM });
+ boot->e820_entries = 1;
+ boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });
/* The boot header contains a command line pointer: we put the command
- * line after the boot header (at address 4096) */
- *(void **)(boot + 0x228) = boot + 4096;
- concat(boot + 4096, argv+optind+2);
+ * line after the boot header. */
+ boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
+ /* We use a simple helper to copy the arguments separated by spaces. */
+ concat((char *)(boot + 1), argv+optind+2);
+
+ /* Boot protocol version: 2.07 supports the fields for lguest. */
+ boot->hdr.version = 0x207;
+
+ /* The hardware_subarch value of "1" tells the Guest it's an lguest. */
+ boot->hdr.hardware_subarch = 1;
- /* The guest type value of "1" tells the Guest it's under lguest. */
- *(int *)(boot + 0x23c) = 1;
+ /* Tell the entry path not to try to reload segment registers. */
+ boot->hdr.loadflags |= KEEP_SEGMENTS;
/* We tell the kernel to initialize the Guest: this returns the open
* /dev/lguest file descriptor. */
- lguest_fd = tell_kernel(pgdir, start, page_offset);
+ lguest_fd = tell_kernel(pgdir, start);
/* We fork off a child process, which wakes the Launcher whenever one
* of the input file descriptors needs attention. Otherwise we would
* run the Guest until it tries to output something. */
- waker_fd = setup_waker(lguest_fd, &device_list);
+ waker_fd = setup_waker(lguest_fd);
/* Finally, run the Guest. This doesn't return. */
- run_guest(lguest_fd, &device_list);
+ run_guest(lguest_fd);
}
/*:*/
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt
index 821617bd6c04..7885ab2d5f53 100644
--- a/Documentation/lguest/lguest.txt
+++ b/Documentation/lguest/lguest.txt
@@ -6,7 +6,7 @@ Lguest is designed to be a minimal hypervisor for the Linux kernel, for
Linux developers and users to experiment with virtualization with the
minimum of complexity. Nonetheless, it should have sufficient
features to make it useful for specific tasks, and, of course, you are
-encouraged to fork and enhance it.
+encouraged to fork and enhance it (see drivers/lguest/README).
Features:
@@ -23,19 +23,30 @@ Developer features:
Running Lguest:
-- Lguest runs the same kernel as guest and host. You can configure
- them differently, but usually it's easiest not to.
+- The easiest way to run lguest is to use same kernel as guest and host.
+ You can configure them differently, but usually it's easiest not to.
You will need to configure your kernel with the following options:
- CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1]
- CONFIG_TUN=y/m ("Universal TUN/TAP device driver support")
- CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers")
- CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)")
- CONFIG_LGUEST=y/m ("Linux hypervisor example code")
-
- and I recommend:
- CONFIG_HZ=100 ("Timer frequency")[2]
+ "General setup":
+ "Prompt for development and/or incomplete code/drivers" = Y
+ (CONFIG_EXPERIMENTAL=y)
+
+ "Processor type and features":
+ "Paravirtualized guest support" = Y
+ "Lguest guest support" = Y
+ "High Memory Support" = off/4GB
+ "Alignment value to which kernel should be aligned" = 0x100000
+ (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
+ CONFIG_PHYSICAL_ALIGN=0x100000)
+
+ "Device Drivers":
+ "Network device support"
+ "Universal TUN/TAP device driver support" = M/Y
+ (CONFIG_TUN=m)
+ "Virtualization"
+ "Linux hypervisor example code" = M/Y
+ (CONFIG_LGUEST=m)
- A tool called "lguest" is available in this directory: type "make"
to build it. If you didn't build your kernel in-tree, use "make
@@ -51,14 +62,17 @@ Running Lguest:
dd if=/dev/zero of=rootfile bs=1M count=2048
qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
+ Make sure that you install a getty on /dev/hvc0 if you want to log in on the
+ console!
+
- "modprobe lg" if you built it as a module.
- Run an lguest as root:
- Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba
+ Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda
Explanation:
- 64m: the amount of memory to use.
+ 64: the amount of memory to use, in MB.
vmlinux: the kernel image found in the top of your build directory. You
can also use a standard bzImage.
@@ -66,10 +80,10 @@ Running Lguest:
--tunnet=192.168.19.1: configures a "tap" device for networking with this
IP address.
- --block=rootfile: a file or block device which becomes /dev/lgba
+ --block=rootfile: a file or block device which becomes /dev/vda
inside the guest.
- root=/dev/lgba: this (and anything else on the command line) are
+ root=/dev/vda: this (and anything else on the command line) are
kernel boot parameters.
- Configuring networking. I usually have the host masquerade, using
@@ -99,31 +113,7 @@ Running Lguest:
"--sharenet=<filename>": any two guests using the same file are on
the same network. This file is created if it does not exist.
-Lguest I/O model:
-
-Lguest uses a simplified DMA model plus shared memory for I/O. Guests
-can communicate with each other if they share underlying memory
-(usually by the lguest program mmaping the same file), but they can
-use any non-shared memory to communicate with the lguest process.
-
-Guests can register DMA buffers at any key (must be a valid physical
-address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq)
-hypercall. "dmabufs" is the physical address of an array of "num"
-"struct lguest_dma": each contains a used_len, and an array of
-physical addresses and lengths. When a transfer occurs, the
-"used_len" field of one of the buffers which has used_len 0 will be
-set to the length transferred and the irq will fire.
+There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
-Using an irq value of 0 unbinds the dma buffers.
-
-To send DMA, the LHCALL_SEND_DMA(key, dma_physaddr) hypercall is used,
-and the bytes used is written to the used_len field. This can be 0 if
-noone else has bound a DMA buffer to that key or some other error.
-DMA buffers bound by the same guest are ignored.
-
-Cheers!
+Good luck!
Rusty Russell rusty@rustcorp.com.au.
-
-[1] These are on various places on the TODO list, waiting for you to
- get annoyed enough at the limitation to fix it.
-[2] Lguest is not yet tickless when idle. See [1].
diff --git a/Documentation/local_ops.txt b/Documentation/local_ops.txt
index b0aca0705d1e..4269a1105b37 100644
--- a/Documentation/local_ops.txt
+++ b/Documentation/local_ops.txt
@@ -27,7 +27,7 @@ CPU which owns the data. Therefore, care must taken to make sure that only one
CPU writes to the local_t data. This is done by using per cpu data and making
sure that we modify it from within a preemption safe context. It is however
permitted to read local_t data from any CPU : it will then appear to be written
-out of order wrt other memory writes on the owner CPU.
+out of order wrt other memory writes by the owner CPU.
* Implementation for a given architecture
@@ -45,6 +45,29 @@ long fails. The definition looks like :
typedef struct { atomic_long_t a; } local_t;
+* Rules to follow when using local atomic operations
+
+- Variables touched by local ops must be per cpu variables.
+- _Only_ the CPU owner of these variables must write to them.
+- This CPU can use local ops from any context (process, irq, softirq, nmi, ...)
+ to update its local_t variables.
+- Preemption (or interrupts) must be disabled when using local ops in
+ process context to make sure the process won't be migrated to a
+ different CPU between getting the per-cpu variable and doing the
+ actual local op.
+- When using local ops in interrupt context, no special care must be
+ taken on a mainline kernel, since they will run on the local CPU with
+ preemption already disabled. I suggest, however, to explicitly
+ disable preemption anyway to make sure it will still work correctly on
+ -rt kernels.
+- Reading the local cpu variable will provide the current copy of the
+ variable.
+- Reads of these variables can be done from any CPU, because updates to
+ "long", aligned, variables are always atomic. Since no memory
+ synchronization is done by the writer CPU, an outdated copy of the
+ variable can be read when reading some _other_ cpu's variables.
+
+
* How to use local atomic operations
#include <linux/percpu.h>
diff --git a/Documentation/m68k/kernel-options.txt b/Documentation/m68k/kernel-options.txt
index 59108cebe163..248589e8bcf5 100644
--- a/Documentation/m68k/kernel-options.txt
+++ b/Documentation/m68k/kernel-options.txt
@@ -192,10 +192,10 @@ Devices possible for Atari:
seconds.
-2.6) ramdisk=
+2.6) ramdisk_size=
-------------
-Syntax: ramdisk=<size>
+Syntax: ramdisk_size=<size>
This option instructs the kernel to set up a ramdisk of the given
size in KBytes. Do not use this option if the ramdisk contents are
@@ -890,10 +890,7 @@ Syntax: nosync:0
5.5.2) noasync
--------------
-Syntax: noasync:0
-
- Disables async and sync negotiation for all devices. Any value
- after the colon is acceptable (and has the same effect).
+[OBSOLETE, REMOVED]
5.5.3) nodisconnect
-------------------
diff --git a/Documentation/make/headers_install.txt b/Documentation/make/headers_install.txt
new file mode 100644
index 000000000000..f2481cabffcb
--- /dev/null
+++ b/Documentation/make/headers_install.txt
@@ -0,0 +1,46 @@
+Exporting kernel headers for use by userspace
+=============================================
+
+The "make headers_install" command exports the kernel's header files in a
+form suitable for use by userspace programs.
+
+The linux kernel's exported header files describe the API for user space
+programs attempting to use kernel services. These kernel header files are
+used by the system's C library (such as glibc or uClibc) to define available
+system calls, as well as constants and structures to be used with these
+system calls. The C library's header files include the kernel header files
+from the "linux" subdirectory. The system's libc headers are usually
+installed at the default location /usr/include and the kernel headers in
+subdirectories under that (most notably /usr/include/linux and
+/usr/include/asm).
+
+Kernel headers are backwards compatible, but not forwards compatible. This
+means that a program built against a C library using older kernel headers
+should run on a newer kernel (although it may not have access to new
+features), but a program built against newer kernel headers may not work on an
+older kernel.
+
+The "make headers_install" command can be run in the top level directory of the
+kernel source code (or using a standard out-of-tree build). It takes two
+optional arguments:
+
+ make headers_install ARCH=i386 INSTALL_HDR_PATH=/usr/include
+
+ARCH indicates which architecture to produce headers for, and defaults to the
+current architecture. The linux/asm directory of the exported kernel headers
+is platform-specific, to see a complete list of supported architectures use
+the command:
+
+ ls -d include/asm-* | sed 's/.*-//'
+
+INSTALL_HDR_PATH indicates where to install the headers. It defaults to
+"./usr/include".
+
+The command "make headers_install_all" exports headers for all architectures
+simultaneously. (This is mostly of interest to distribution maintainers,
+who create an architecture-independent tarball from the resulting include
+directory.) Remember to provide the appropriate linux/asm directory via "mv"
+or "ln -s" before building a C library with headers exported this way.
+
+The kernel header export infrastructure is maintained by David Woodhouse
+<dwmw2@infradead.org>.
diff --git a/Documentation/markers.txt b/Documentation/markers.txt
new file mode 100644
index 000000000000..295a71bc301e
--- /dev/null
+++ b/Documentation/markers.txt
@@ -0,0 +1,81 @@
+ Using the Linux Kernel Markers
+
+ Mathieu Desnoyers
+
+
+This document introduces Linux Kernel Markers and their use. It provides
+examples of how to insert markers in the kernel and connect probe functions to
+them and provides some examples of probe functions.
+
+
+* Purpose of markers
+
+A marker placed in code provides a hook to call a function (probe) that you can
+provide at runtime. A marker can be "on" (a probe is connected to it) or "off"
+(no probe is attached). When a marker is "off" it has no effect, except for
+adding a tiny time penalty (checking a condition for a branch) and space
+penalty (adding a few bytes for the function call at the end of the
+instrumented function and adds a data structure in a separate section). When a
+marker is "on", the function you provide is called each time the marker is
+executed, in the execution context of the caller. When the function provided
+ends its execution, it returns to the caller (continuing from the marker site).
+
+You can put markers at important locations in the code. Markers are
+lightweight hooks that can pass an arbitrary number of parameters,
+described in a printk-like format string, to the attached probe function.
+
+They can be used for tracing and performance accounting.
+
+
+* Usage
+
+In order to use the macro trace_mark, you should include linux/marker.h.
+
+#include <linux/marker.h>
+
+And,
+
+trace_mark(subsystem_event, "%d %s", someint, somestring);
+Where :
+- subsystem_event is an identifier unique to your event
+ - subsystem is the name of your subsystem.
+ - event is the name of the event to mark.
+- "%d %s" is the formatted string for the serializer.
+- someint is an integer.
+- somestring is a char pointer.
+
+Connecting a function (probe) to a marker is done by providing a probe (function
+to call) for the specific marker through marker_probe_register() and can be
+activated by calling marker_arm(). Marker deactivation can be done by calling
+marker_disarm() as many times as marker_arm() has been called. Removing a probe
+is done through marker_probe_unregister(); it will disarm the probe and make
+sure there is no caller left using the probe when it returns. Probe removal is
+preempt-safe because preemption is disabled around the probe call. See the
+"Probe example" section below for a sample probe module.
+
+The marker mechanism supports inserting multiple instances of the same marker.
+Markers can be put in inline functions, inlined static functions, and
+unrolled loops as well as regular functions.
+
+The naming scheme "subsystem_event" is suggested here as a convention intended
+to limit collisions. Marker names are global to the kernel: they are considered
+as being the same whether they are in the core kernel image or in modules.
+Conflicting format strings for markers with the same name will cause the markers
+to be detected to have a different format string not to be armed and will output
+a printk warning which identifies the inconsistency:
+
+"Format mismatch for probe probe_name (format), marker (format)"
+
+
+* Probe / marker example
+
+See the example provided in samples/markers/src
+
+Compile them with your kernel.
+
+Run, as root :
+modprobe marker-example (insmod order is not important)
+modprobe probe-example
+cat /proc/marker-example (returns an expected error)
+rmmod marker-example probe-example
+dmesg
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 650657c54733..4e17beba2379 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1479,7 +1479,8 @@ kernel.
Any atomic operation that modifies some state in memory and returns information
about the state (old or new) implies an SMP-conditional general memory barrier
-(smp_mb()) on each side of the actual operation. These include:
+(smp_mb()) on each side of the actual operation (with the exception of
+explicit lock operations, described later). These include:
xchg();
cmpxchg();
@@ -1536,10 +1537,19 @@ If they're used for constructing a lock of some description, then they probably
do need memory barriers as a lock primitive generally has to do things in a
specific order.
-
Basically, each usage case has to be carefully considered as to whether memory
barriers are needed or not.
+The following operations are special locking primitives:
+
+ test_and_set_bit_lock();
+ clear_bit_unlock();
+ __clear_bit_unlock();
+
+These implement LOCK-class and UNLOCK-class operations. These should be used in
+preference to other operations when implementing locking primitives, because
+their implementations can be optimised on many architectures.
+
[!] Note that special memory barrier primitives are available for these
situations because on some CPUs the atomic instructions used imply full memory
barriers, and so barrier instructions are superfluous in conjunction with them,
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 5fbcc22c98e9..168117bd6ee8 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -2,7 +2,8 @@
Memory Hotplug
==============
-Last Updated: Jul 28 2007
+Created: Jul 28 2007
+Add description of notifier of memory hotplug Oct 11 2007
This document is about memory hotplug including how-to-use and current status.
Because Memory Hotplug is still under development, contents of this text will
@@ -24,7 +25,8 @@ be changed often.
6.1 Memory offline and ZONE_MOVABLE
6.2. How to offline memory
7. Physical memory remove
-8. Future Work List
+8. Memory hotplug event notifier
+9. Future Work List
Note(1): x86_64's has special implementation for memory hotplug.
This text does not describe it.
@@ -307,8 +309,58 @@ Need more implementation yet....
- Notification completion of remove works by OS to firmware.
- Guard from remove if not yet.
+--------------------------------
+8. Memory hotplug event notifier
+--------------------------------
+Memory hotplug has event notifer. There are 6 types of notification.
+
+MEMORY_GOING_ONLINE
+ Generated before new memory becomes available in order to be able to
+ prepare subsystems to handle memory. The page allocator is still unable
+ to allocate from the new memory.
+
+MEMORY_CANCEL_ONLINE
+ Generated if MEMORY_GOING_ONLINE fails.
+
+MEMORY_ONLINE
+ Generated when memory has succesfully brought online. The callback may
+ allocate pages from the new memory.
+
+MEMORY_GOING_OFFLINE
+ Generated to begin the process of offlining memory. Allocations are no
+ longer possible from the memory but some of the memory to be offlined
+ is still in use. The callback can be used to free memory known to a
+ subsystem from the indicated memory section.
+
+MEMORY_CANCEL_OFFLINE
+ Generated if MEMORY_GOING_OFFLINE fails. Memory is available again from
+ the section that we attempted to offline.
+
+MEMORY_OFFLINE
+ Generated after offlining memory is complete.
+
+A callback routine can be registered by
+ hotplug_memory_notifier(callback_func, priority)
+
+The second argument of callback function (action) is event types of above.
+The third argument is passed by pointer of struct memory_notify.
+
+struct memory_notify {
+ unsigned long start_pfn;
+ unsigned long nr_pages;
+ int status_cahnge_nid;
+}
+
+start_pfn is start_pfn of online/offline memory.
+nr_pages is # of pages of online/offline memory.
+status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be)
+set/clear. It means a new(memoryless) node gets new memory by online and a
+node loses all memory. If this is -1, then nodemask status is not changed.
+If status_changed_nid >= 0, callback should create/discard structures for the
+node if necessary.
+
--------------
-8. Future Work
+9. Future Work
--------------
- allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like
sysctl or new control file.
diff --git a/Documentation/mips/00-INDEX b/Documentation/mips/00-INDEX
new file mode 100644
index 000000000000..3f13bf8043d2
--- /dev/null
+++ b/Documentation/mips/00-INDEX
@@ -0,0 +1,6 @@
+00-INDEX
+ - this file.
+AU1xxx_IDE.README
+ - README for MIPS AU1XXX IDE driver.
+GT64120.README
+ - README for dir with info on MIPS boards using GT-64120 or GT-64120A.
diff --git a/Documentation/mips/AU1xxx_IDE.README b/Documentation/mips/AU1xxx_IDE.README
index afb31c141d9d..5c8334123f4f 100644
--- a/Documentation/mips/AU1xxx_IDE.README
+++ b/Documentation/mips/AU1xxx_IDE.README
@@ -59,7 +59,7 @@ Four configs variables are introduced:
CONFIG_BLK_DEV_IDE_AU1XXX_PIO_DBDMA - enable the PIO+DBDMA mode
CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA - enable the MWDMA mode
CONFIG_BLK_DEV_IDE_AU1XXX_BURSTABLE_ON - set Burstable FIFO in DBDMA
- controler
+ controller
CONFIG_BLK_DEV_IDE_AU1XXX_SEQTS_PER_RQ - maximum transfer size
per descriptor
diff --git a/Documentation/mips/time.README b/Documentation/mips/time.README
deleted file mode 100644
index a4ce603ed3b3..000000000000
--- a/Documentation/mips/time.README
+++ /dev/null
@@ -1,173 +0,0 @@
-README for MIPS time services
-
-Jun Sun
-jsun@mvista.com or jsun@junsun.net
-
-
-ABOUT
------
-This file describes the new arch/mips/kernel/time.c, related files and the
-services they provide.
-
-If you are short in patience and just want to know how to use time.c for a
-new board or convert an existing board, go to the last section.
-
-
-FILES, COMPATABILITY AND CONFIGS
----------------------------------
-
-The old arch/mips/kernel/time.c is renamed to old-time.c.
-
-A new time.c is put there, together with include/asm-mips/time.h.
-
-Two configs variables are introduced, CONFIG_OLD_TIME_C and CONFIG_NEW_TIME_C.
-So we allow boards using
-
- 1) old time.c (CONFIG_OLD_TIME_C)
- 2) new time.c (CONFIG_NEW_TIME_C)
- 3) neither (their own private time.c)
-
-However, it is expected every board will move to the new time.c in the near
-future.
-
-
-WHAT THE NEW CODE PROVIDES?
----------------------------
-
-The new time code provide the following services:
-
- a) Implements functions required by Linux common code:
- time_init
-
- b) provides an abstraction of RTC and null RTC implementation as default.
- extern unsigned long (*rtc_get_time)(void);
- extern int (*rtc_set_time)(unsigned long);
-
- c) high-level and low-level timer interrupt routines where the timer
- interrupt source may or may not be the CPU timer. The high-level
- routine is dispatched through do_IRQ() while the low-level is
- dispatched in assemably code (usually int-handler.S)
-
-
-WHAT THE NEW CODE REQUIRES?
----------------------------
-
-For the new code to work properly, each board implementation needs to supply
-the following functions or values:
-
- a) board_time_init - a function pointer. Invoked at the beginnig of
- time_init(). It is optional.
- 1. (optional) set up RTC routines
- 2. (optional) calibrate and set the mips_hpt_frequency
-
- b) plat_timer_setup - a function pointer. Invoked at the end of time_init()
- 1. (optional) over-ride any decisions made in time_init()
- 2. set up the irqaction for timer interrupt.
- 3. enable the timer interrupt
-
- c) (optional) board-specific RTC routines.
-
- d) (optional) mips_hpt_frequency - It must be definied if the board
- is using CPU counter for timer interrupt.
-
-
-PORTING GUIDE
--------------
-
-Step 1: decide how you like to implement the time services.
-
- a) does this board have a RTC? If yes, implement the two RTC funcs.
-
- b) does the CPU have counter/compare registers?
-
- If the answer is no, you need a timer to provide the timer interrupt
- at 100 HZ speed.
-
- c) The following sub steps assume your CPU has counter register.
- Do you plan to use the CPU counter register as the timer interrupt
- or use an exnternal timer?
-
- In order to use CPU counter register as the timer interrupt source, you
- must know the counter speed (mips_hpt_frequency). It is usually the
- same as the CPU speed or an integral divisor of it.
-
- d) decide on whether you want to use high-level or low-level timer
- interrupt routines. The low-level one is presumably faster, but should
- not make too mcuh difference.
-
-
-Step 2: the machine setup() function
-
- If you supply board_time_init(), set the function poointer.
-
-
-Step 3: implement rtc routines, board_time_init() and plat_timer_setup()
- if needed.
-
- board_time_init() -
- a) (optional) set up RTC routines,
- b) (optional) calibrate and set the mips_hpt_frequency
- (only needed if you intended to use cpu counter as timer interrupt
- source)
-
- plat_timer_setup() -
- a) (optional) over-write any choices made above by time_init().
- b) machine specific code should setup the timer irqaction.
- c) enable the timer interrupt
-
-
- If the RTC chip is a common chip, I suggest the routines are put under
- arch/mips/libs. For example, for DS1386 chip, one would create
- rtc-ds1386.c under arch/mips/lib directory. Add the following line to
- the arch/mips/lib/Makefile:
-
- obj-$(CONFIG_DDB5476) += rtc-ds1386.o
-
-Step 4: if you are using low-level timer interrupt, change your interrupt
- dispathcing code to check for timer interrupt and jump to
- ll_timer_interrupt() directly if one is detected.
-
-Step 5: Modify arch/mips/config.in and add CONFIG_NEW_TIME_C to your machine.
- Modify the appropriate defconfig if applicable.
-
-Final notes:
-
-For some tricky cases, you may need to add your own wrapper functions
-for some of the functions in time.c.
-
-For example, you may define your own timer interrupt routine, which does
-some of its own processing and then calls timer_interrupt().
-
-You can also over-ride any of the built-in functions (RTC routines
-and/or timer interrupt routine).
-
-
-PORTING NOTES FOR SMP
-----------------------
-
-If you have a SMP box, things are slightly more complicated.
-
-The time service running every jiffy is logically divided into two parts:
-
- 1) the one for the whole system (defined in timer_interrupt())
- 2) the one that should run for each CPU (defined in local_timer_interrupt())
-
-You need to decide on your timer interrupt sources.
-
- case 1) - whole system has only one timer interrupt delivered to one CPU
-
- In this case, you set up timer interrupt as in UP systems. In addtion,
- you need to set emulate_local_timer_interrupt to 1 so that other
- CPUs get to call local_timer_interrupt().
-
- THIS IS CURRENTLY NOT IMPLEMNETED. However, it is rather easy to write
- one should such a need arise. You simply make a IPI call.
-
- case 2) - each CPU has a separate timer interrupt
-
- In this case, you need to set up IRQ such that each of them will
- call local_timer_interrupt(). In addition, you need to arrange
- one and only one of them to call timer_interrupt().
-
- You can also do the low-level version of those interrupt routines,
- following similar dispatching routes described above.
diff --git a/Documentation/mutex-design.txt b/Documentation/mutex-design.txt
index cbf79881a41c..aa60d1f627e5 100644
--- a/Documentation/mutex-design.txt
+++ b/Documentation/mutex-design.txt
@@ -90,7 +90,8 @@ of advantages of mutexes:
* - task may not exit with mutex held
* - memory areas where held locks reside must not be freed
* - held mutexes must not be reinitialized
- * - mutexes may not be used in irq contexts
+ * - mutexes may not be used in hardware or software interrupt
+ * contexts such as tasklets and timers
furthermore, there are also convenience features in the debugging
code:
@@ -132,4 +133,6 @@ the APIs of 'struct mutex' have been streamlined:
int mutex_trylock(struct mutex *lock);
void mutex_unlock(struct mutex *lock);
int mutex_is_locked(struct mutex *lock);
-
+ void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
+ int mutex_lock_interruptible_nested(struct mutex *lock,
+ unsigned int subclass);
diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX
index 153d84d281e6..f5a5e6d3d541 100644
--- a/Documentation/networking/00-INDEX
+++ b/Documentation/networking/00-INDEX
@@ -80,8 +80,6 @@ multicast.txt
- Behaviour of cards under Multicast
ncsa-telnet
- notes on how NCSA telnet (DOS) breaks with MTU discovery enabled.
-net-modules.txt
- - info and "insmod" parameters for all network driver modules.
netdevices.txt
- info on network device driver functions exported to the kernel.
olympic.txt
diff --git a/Documentation/networking/NAPI_HOWTO.txt b/Documentation/networking/NAPI_HOWTO.txt
deleted file mode 100644
index 7907435a661c..000000000000
--- a/Documentation/networking/NAPI_HOWTO.txt
+++ /dev/null
@@ -1,766 +0,0 @@
-HISTORY:
-February 16/2002 -- revision 0.2.1:
-COR typo corrected
-February 10/2002 -- revision 0.2:
-some spell checking ;->
-January 12/2002 -- revision 0.1
-This is still work in progress so may change.
-To keep up to date please watch this space.
-
-Introduction to NAPI
-====================
-
-NAPI is a proven (www.cyberus.ca/~hadi/usenix-paper.tgz) technique
-to improve network performance on Linux. For more details please
-read that paper.
-NAPI provides a "inherent mitigation" which is bound by system capacity
-as can be seen from the following data collected by Robert on Gigabit
-ethernet (e1000):
-
- Psize Ipps Tput Rxint Txint Done Ndone
- ---------------------------------------------------------------
- 60 890000 409362 17 27622 7 6823
- 128 758150 464364 21 9301 10 7738
- 256 445632 774646 42 15507 21 12906
- 512 232666 994445 241292 19147 241192 1062
- 1024 119061 1000003 872519 19258 872511 0
- 1440 85193 1000003 946576 19505 946569 0
-
-
-Legend:
-"Ipps" stands for input packets per second.
-"Tput" == packets out of total 1M that made it out.
-"txint" == transmit completion interrupts seen
-"Done" == The number of times that the poll() managed to pull all
-packets out of the rx ring. Note from this that the lower the
-load the more we could clean up the rxring
-"Ndone" == is the converse of "Done". Note again, that the higher
-the load the more times we couldn't clean up the rxring.
-
-Observe that:
-when the NIC receives 890Kpackets/sec only 17 rx interrupts are generated.
-The system cant handle the processing at 1 interrupt/packet at that load level.
-At lower rates on the other hand, rx interrupts go up and therefore the
-interrupt/packet ratio goes up (as observable from that table). So there is
-possibility that under low enough input, you get one poll call for each
-input packet caused by a single interrupt each time. And if the system
-cant handle interrupt per packet ratio of 1, then it will just have to
-chug along ....
-
-
-0) Prerequisites:
-==================
-A driver MAY continue using the old 2.4 technique for interfacing
-to the network stack and not benefit from the NAPI changes.
-NAPI additions to the kernel do not break backward compatibility.
-NAPI, however, requires the following features to be available:
-
-A) DMA ring or enough RAM to store packets in software devices.
-
-B) Ability to turn off interrupts or maybe events that send packets up
-the stack.
-
-NAPI processes packet events in what is known as dev->poll() method.
-Typically, only packet receive events are processed in dev->poll().
-The rest of the events MAY be processed by the regular interrupt handler
-to reduce processing latency (justified also because there are not that
-many of them).
-Note, however, NAPI does not enforce that dev->poll() only processes
-receive events.
-Tests with the tulip driver indicated slightly increased latency if
-all of the interrupt handler is moved to dev->poll(). Also MII handling
-gets a little trickier.
-The example used in this document is to move the receive processing only
-to dev->poll(); this is shown with the patch for the tulip driver.
-For an example of code that moves all the interrupt driver to
-dev->poll() look at the ported e1000 code.
-
-There are caveats that might force you to go with moving everything to
-dev->poll(). Different NICs work differently depending on their status/event
-acknowledgement setup.
-There are two types of event register ACK mechanisms.
- I) what is known as Clear-on-read (COR).
- when you read the status/event register, it clears everything!
- The natsemi and sunbmac NICs are known to do this.
- In this case your only choice is to move all to dev->poll()
-
- II) Clear-on-write (COW)
- i) you clear the status by writing a 1 in the bit-location you want.
- These are the majority of the NICs and work the best with NAPI.
- Put only receive events in dev->poll(); leave the rest in
- the old interrupt handler.
- ii) whatever you write in the status register clears every thing ;->
- Cant seem to find any supported by Linux which do this. If
- someone knows such a chip email us please.
- Move all to dev->poll()
-
-C) Ability to detect new work correctly.
-NAPI works by shutting down event interrupts when there's work and
-turning them on when there's none.
-New packets might show up in the small window while interrupts were being
-re-enabled (refer to appendix 2). A packet might sneak in during the period
-we are enabling interrupts. We only get to know about such a packet when the
-next new packet arrives and generates an interrupt.
-Essentially, there is a small window of opportunity for a race condition
-which for clarity we'll refer to as the "rotting packet".
-
-This is a very important topic and appendix 2 is dedicated for more
-discussion.
-
-Locking rules and environmental guarantees
-==========================================
-
--Guarantee: Only one CPU at any time can call dev->poll(); this is because
-only one CPU can pick the initial interrupt and hence the initial
-netif_rx_schedule(dev);
-- The core layer invokes devices to send packets in a round robin format.
-This implies receive is totally lockless because of the guarantee that only
-one CPU is executing it.
-- contention can only be the result of some other CPU accessing the rx
-ring. This happens only in close() and suspend() (when these methods
-try to clean the rx ring);
-****guarantee: driver authors need not worry about this; synchronization
-is taken care for them by the top net layer.
--local interrupts are enabled (if you dont move all to dev->poll()). For
-example link/MII and txcomplete continue functioning just same old way.
-This improves the latency of processing these events. It is also assumed that
-the receive interrupt is the largest cause of noise. Note this might not
-always be true.
-[according to Manfred Spraul, the winbond insists on sending one
-txmitcomplete interrupt for each packet (although this can be mitigated)].
-For these broken drivers, move all to dev->poll().
-
-For the rest of this text, we'll assume that dev->poll() only
-processes receive events.
-
-new methods introduce by NAPI
-=============================
-
-a) netif_rx_schedule(dev)
-Called by an IRQ handler to schedule a poll for device
-
-b) netif_rx_schedule_prep(dev)
-puts the device in a state which allows for it to be added to the
-CPU polling list if it is up and running. You can look at this as
-the first half of netif_rx_schedule(dev) above; the second half
-being c) below.
-
-c) __netif_rx_schedule(dev)
-Add device to the poll list for this CPU; assuming that _prep above
-has already been called and returned 1.
-
-d) netif_rx_reschedule(dev, undo)
-Called to reschedule polling for device specifically for some
-deficient hardware. Read Appendix 2 for more details.
-
-e) netif_rx_complete(dev)
-
-Remove interface from the CPU poll list: it must be in the poll list
-on current cpu. This primitive is called by dev->poll(), when
-it completes its work. The device cannot be out of poll list at this
-call, if it is then clearly it is a BUG(). You'll know ;->
-
-All of the above methods are used below, so keep reading for clarity.
-
-Device driver changes to be made when porting NAPI
-==================================================
-
-Below we describe what kind of changes are required for NAPI to work.
-
-1) introduction of dev->poll() method
-=====================================
-
-This is the method that is invoked by the network core when it requests
-for new packets from the driver. A driver is allowed to send upto
-dev->quota packets by the current CPU before yielding to the network
-subsystem (so other devices can also get opportunity to send to the stack).
-
-dev->poll() prototype looks as follows:
-int my_poll(struct net_device *dev, int *budget)
-
-budget is the remaining number of packets the network subsystem on the
-current CPU can send up the stack before yielding to other system tasks.
-*Each driver is responsible for decrementing budget by the total number of
-packets sent.
- Total number of packets cannot exceed dev->quota.
-
-dev->poll() method is invoked by the top layer, the driver just sends if it
-can to the stack the packet quantity requested.
-
-more on dev->poll() below after the interrupt changes are explained.
-
-2) registering dev->poll() method
-===================================
-
-dev->poll should be set in the dev->probe() method.
-e.g:
-dev->open = my_open;
-.
-.
-/* two new additions */
-/* first register my poll method */
-dev->poll = my_poll;
-/* next register my weight/quanta; can be overridden in /proc */
-dev->weight = 16;
-.
-.
-dev->stop = my_close;
-
-
-
-3) scheduling dev->poll()
-=============================
-This involves modifying the interrupt handler and the code
-path which takes the packet off the NIC and sends them to the
-stack.
-
-it's important at this point to introduce the classical D Becker
-interrupt processor:
-
-------------------
-static irqreturn_t
-netdevice_interrupt(int irq, void *dev_id, struct pt_regs *regs)
-{
-
- struct net_device *dev = (struct net_device *)dev_instance;
- struct my_private *tp = (struct my_private *)dev->priv;
-
- int work_count = my_work_count;
- status = read_interrupt_status_reg();
- if (status == 0)
- return IRQ_NONE; /* Shared IRQ: not us */
- if (status == 0xffff)
- return IRQ_HANDLED; /* Hot unplug */
- if (status & error)
- do_some_error_handling()
-
- do {
- acknowledge_ints_ASAP();
-
- if (status & link_interrupt) {
- spin_lock(&tp->link_lock);
- do_some_link_stat_stuff();
- spin_lock(&tp->link_lock);
- }
-
- if (status & rx_interrupt) {
- receive_packets(dev);
- }
-
- if (status & rx_nobufs) {
- make_rx_buffs_avail();
- }
-
- if (status & tx_related) {
- spin_lock(&tp->lock);
- tx_ring_free(dev);
- if (tx_died)
- restart_tx();
- spin_unlock(&tp->lock);
- }
-
- status = read_interrupt_status_reg();
-
- } while (!(status & error) || more_work_to_be_done);
- return IRQ_HANDLED;
-}
-
-----------------------------------------------------------------------
-
-We now change this to what is shown below to NAPI-enable it:
-
-----------------------------------------------------------------------
-static irqreturn_t
-netdevice_interrupt(int irq, void *dev_id, struct pt_regs *regs)
-{
- struct net_device *dev = (struct net_device *)dev_instance;
- struct my_private *tp = (struct my_private *)dev->priv;
-
- status = read_interrupt_status_reg();
- if (status == 0)
- return IRQ_NONE; /* Shared IRQ: not us */
- if (status == 0xffff)
- return IRQ_HANDLED; /* Hot unplug */
- if (status & error)
- do_some_error_handling();
-
- do {
-/************************ start note *********************************/
- acknowledge_ints_ASAP(); // dont ack rx and rxnobuff here
-/************************ end note *********************************/
-
- if (status & link_interrupt) {
- spin_lock(&tp->link_lock);
- do_some_link_stat_stuff();
- spin_unlock(&tp->link_lock);
- }
-/************************ start note *********************************/
- if (status & rx_interrupt || (status & rx_nobuffs)) {
- if (netif_rx_schedule_prep(dev)) {
-
- /* disable interrupts caused
- * by arriving packets */
- disable_rx_and_rxnobuff_ints();
- /* tell system we have work to be done. */
- __netif_rx_schedule(dev);
- } else {
- printk("driver bug! interrupt while in poll\n");
- /* FIX by disabling interrupts */
- disable_rx_and_rxnobuff_ints();
- }
- }
-/************************ end note note *********************************/
-
- if (status & tx_related) {
- spin_lock(&tp->lock);
- tx_ring_free(dev);
-
- if (tx_died)
- restart_tx();
- spin_unlock(&tp->lock);
- }
-
- status = read_interrupt_status_reg();
-
-/************************ start note *********************************/
- } while (!(status & error) || more_work_to_be_done(status));
-/************************ end note note *********************************/
- return IRQ_HANDLED;
-}
-
----------------------------------------------------------------------
-
-
-We note several things from above:
-
-I) Any interrupt source which is caused by arriving packets is now
-turned off when it occurs. Depending on the hardware, there could be
-several reasons that arriving packets would cause interrupts; these are the
-interrupt sources we wish to avoid. The two common ones are a) a packet
-arriving (rxint) b) a packet arriving and finding no DMA buffers available
-(rxnobuff) .
-This means also acknowledge_ints_ASAP() will not clear the status
-register for those two items above; clearing is done in the place where
-proper work is done within NAPI; at the poll() and refill_rx_ring()
-discussed further below.
-netif_rx_schedule_prep() returns 1 if device is in running state and
-gets successfully added to the core poll list. If we get a zero value
-we can _almost_ assume are already added to the list (instead of not running.
-Logic based on the fact that you shouldn't get interrupt if not running)
-We rectify this by disabling rx and rxnobuf interrupts.
-
-II) that receive_packets(dev) and make_rx_buffs_avail() may have disappeared.
-These functionalities are still around actually......
-
-infact, receive_packets(dev) is very close to my_poll() and
-make_rx_buffs_avail() is invoked from my_poll()
-
-4) converting receive_packets() to dev->poll()
-===============================================
-
-We need to convert the classical D Becker receive_packets(dev) to my_poll()
-
-First the typical receive_packets() below:
--------------------------------------------------------------------
-
-/* this is called by interrupt handler */
-static void receive_packets (struct net_device *dev)
-{
-
- struct my_private *tp = (struct my_private *)dev->priv;
- rx_ring = tp->rx_ring;
- cur_rx = tp->cur_rx;
- int entry = cur_rx % RX_RING_SIZE;
- int received = 0;
- int rx_work_limit = tp->dirty_rx + RX_RING_SIZE - tp->cur_rx;
-
- while (rx_ring_not_empty) {
- u32 rx_status;
- unsigned int rx_size;
- unsigned int pkt_size;
- struct sk_buff *skb;
- /* read size+status of next frame from DMA ring buffer */
- /* the number 16 and 4 are just examples */
- rx_status = le32_to_cpu (*(u32 *) (rx_ring + ring_offset));
- rx_size = rx_status >> 16;
- pkt_size = rx_size - 4;
-
- /* process errors */
- if ((rx_size > (MAX_ETH_FRAME_SIZE+4)) ||
- (!(rx_status & RxStatusOK))) {
- netdrv_rx_err (rx_status, dev, tp, ioaddr);
- return;
- }
-
- if (--rx_work_limit < 0)
- break;
-
- /* grab a skb */
- skb = dev_alloc_skb (pkt_size + 2);
- if (skb) {
- .
- .
- netif_rx (skb);
- .
- .
- } else { /* OOM */
- /*seems very driver specific ... some just pass
- whatever is on the ring already. */
- }
-
- /* move to the next skb on the ring */
- entry = (++tp->cur_rx) % RX_RING_SIZE;
- received++ ;
-
- }
-
- /* store current ring pointer state */
- tp->cur_rx = cur_rx;
-
- /* Refill the Rx ring buffers if they are needed */
- refill_rx_ring();
- .
- .
-
-}
--------------------------------------------------------------------
-We change it to a new one below; note the additional parameter in
-the call.
-
--------------------------------------------------------------------
-
-/* this is called by the network core */
-static int my_poll (struct net_device *dev, int *budget)
-{
-
- struct my_private *tp = (struct my_private *)dev->priv;
- rx_ring = tp->rx_ring;
- cur_rx = tp->cur_rx;
- int entry = cur_rx % RX_BUF_LEN;
- /* maximum packets to send to the stack */
-/************************ note note *********************************/
- int rx_work_limit = dev->quota;
-
-/************************ end note note *********************************/
- do { // outer beginning loop starts here
-
- clear_rx_status_register_bit();
-
- while (rx_ring_not_empty) {
- u32 rx_status;
- unsigned int rx_size;
- unsigned int pkt_size;
- struct sk_buff *skb;
- /* read size+status of next frame from DMA ring buffer */
- /* the number 16 and 4 are just examples */
- rx_status = le32_to_cpu (*(u32 *) (rx_ring + ring_offset));
- rx_size = rx_status >> 16;
- pkt_size = rx_size - 4;
-
- /* process errors */
- if ((rx_size > (MAX_ETH_FRAME_SIZE+4)) ||
- (!(rx_status & RxStatusOK))) {
- netdrv_rx_err (rx_status, dev, tp, ioaddr);
- return 1;
- }
-
-/************************ note note *********************************/
- if (--rx_work_limit < 0) { /* we got packets, but no quota */
- /* store current ring pointer state */
- tp->cur_rx = cur_rx;
-
- /* Refill the Rx ring buffers if they are needed */
- refill_rx_ring(dev);
- goto not_done;
- }
-/********************** end note **********************************/
-
- /* grab a skb */
- skb = dev_alloc_skb (pkt_size + 2);
- if (skb) {
- .
- .
-/************************ note note *********************************/
- netif_receive_skb (skb);
-/********************** end note **********************************/
- .
- .
- } else { /* OOM */
- /*seems very driver specific ... common is just pass
- whatever is on the ring already. */
- }
-
- /* move to the next skb on the ring */
- entry = (++tp->cur_rx) % RX_RING_SIZE;
- received++ ;
-
- }
-
- /* store current ring pointer state */
- tp->cur_rx = cur_rx;
-
- /* Refill the Rx ring buffers if they are needed */
- refill_rx_ring(dev);
-
- /* no packets on ring; but new ones can arrive since we last
- checked */
- status = read_interrupt_status_reg();
- if (rx status is not set) {
- /* If something arrives in this narrow window,
- an interrupt will be generated */
- goto done;
- }
- /* done! at least that's what it looks like ;->
- if new packets came in after our last check on status bits
- they'll be caught by the while check and we go back and clear them
- since we havent exceeded our quota */
- } while (rx_status_is_set);
-
-done:
-
-/************************ note note *********************************/
- dev->quota -= received;
- *budget -= received;
-
- /* If RX ring is not full we are out of memory. */
- if (tp->rx_buffers[tp->dirty_rx % RX_RING_SIZE].skb == NULL)
- goto oom;
-
- /* we are happy/done, no more packets on ring; put us back
- to where we can start processing interrupts again */
- netif_rx_complete(dev);
- enable_rx_and_rxnobuf_ints();
-
- /* The last op happens after poll completion. Which means the following:
- * 1. it can race with disabling irqs in irq handler (which are done to
- * schedule polls)
- * 2. it can race with dis/enabling irqs in other poll threads
- * 3. if an irq raised after the beginning of the outer beginning
- * loop (marked in the code above), it will be immediately
- * triggered here.
- *
- * Summarizing: the logic may result in some redundant irqs both
- * due to races in masking and due to too late acking of already
- * processed irqs. The good news: no events are ever lost.
- */
-
- return 0; /* done */
-
-not_done:
- if (tp->cur_rx - tp->dirty_rx > RX_RING_SIZE/2 ||
- tp->rx_buffers[tp->dirty_rx % RX_RING_SIZE].skb == NULL)
- refill_rx_ring(dev);
-
- if (!received) {
- printk("received==0\n");
- received = 1;
- }
- dev->quota -= received;
- *budget -= received;
- return 1; /* not_done */
-
-oom:
- /* Start timer, stop polling, but do not enable rx interrupts. */
- start_poll_timer(dev);
- return 0; /* we'll take it from here so tell core "done"*/
-
-/************************ End note note *********************************/
-}
--------------------------------------------------------------------
-
-From above we note that:
-0) rx_work_limit = dev->quota
-1) refill_rx_ring() is in charge of clearing the bit for rxnobuff when
-it does the work.
-2) We have a done and not_done state.
-3) instead of netif_rx() we call netif_receive_skb() to pass the skb.
-4) we have a new way of handling oom condition
-5) A new outer for (;;) loop has been added. This serves the purpose of
-ensuring that if a new packet has come in, after we are all set and done,
-and we have not exceeded our quota that we continue sending packets up.
-
-
------------------------------------------------------------
-Poll timer code will need to do the following:
-
-a)
-
- if (tp->cur_rx - tp->dirty_rx > RX_RING_SIZE/2 ||
- tp->rx_buffers[tp->dirty_rx % RX_RING_SIZE].skb == NULL)
- refill_rx_ring(dev);
-
- /* If RX ring is not full we are still out of memory.
- Restart the timer again. Else we re-add ourselves
- to the master poll list.
- */
-
- if (tp->rx_buffers[tp->dirty_rx % RX_RING_SIZE].skb == NULL)
- restart_timer();
-
- else netif_rx_schedule(dev); /* we are back on the poll list */
-
-5) dev->close() and dev->suspend() issues
-==========================================
-The driver writer needn't worry about this; the top net layer takes
-care of it.
-
-6) Adding new Stats to /proc
-=============================
-In order to debug some of the new features, we introduce new stats
-that need to be collected.
-TODO: Fill this later.
-
-APPENDIX 1: discussion on using ethernet HW FC
-==============================================
-Most chips with FC only send a pause packet when they run out of Rx buffers.
-Since packets are pulled off the DMA ring by a softirq in NAPI,
-if the system is slow in grabbing them and we have a high input
-rate (faster than the system's capacity to remove packets), then theoretically
-there will only be one rx interrupt for all packets during a given packetstorm.
-Under low load, we might have a single interrupt per packet.
-FC should be programmed to apply in the case when the system cant pull out
-packets fast enough i.e send a pause only when you run out of rx buffers.
-Note FC in itself is a good solution but we have found it to not be
-much of a commodity feature (both in NICs and switches) and hence falls
-under the same category as using NIC based mitigation. Also, experiments
-indicate that it's much harder to resolve the resource allocation
-issue (aka lazy receiving that NAPI offers) and hence quantify its usefulness
-proved harder. In any case, FC works even better with NAPI but is not
-necessary.
-
-
-APPENDIX 2: the "rotting packet" race-window avoidance scheme
-=============================================================
-
-There are two types of associations seen here
-
-1) status/int which honors level triggered IRQ
-
-If a status bit for receive or rxnobuff is set and the corresponding
-interrupt-enable bit is not on, then no interrupts will be generated. However,
-as soon as the "interrupt-enable" bit is unmasked, an immediate interrupt is
-generated. [assuming the status bit was not turned off].
-Generally the concept of level triggered IRQs in association with a status and
-interrupt-enable CSR register set is used to avoid the race.
-
-If we take the example of the tulip:
-"pending work" is indicated by the status bit(CSR5 in tulip).
-the corresponding interrupt bit (CSR7 in tulip) might be turned off (but
-the CSR5 will continue to be turned on with new packet arrivals even if
-we clear it the first time)
-Very important is the fact that if we turn on the interrupt bit on when
-status is set that an immediate irq is triggered.
-
-If we cleared the rx ring and proclaimed there was "no more work
-to be done" and then went on to do a few other things; then when we enable
-interrupts, there is a possibility that a new packet might sneak in during
-this phase. It helps to look at the pseudo code for the tulip poll
-routine:
-
---------------------------
- do {
- ACK;
- while (ring_is_not_empty()) {
- work-work-work
- if quota is exceeded: exit, no touching irq status/mask
- }
- /* No packets, but new can arrive while we are doing this*/
- CSR5 := read
- if (CSR5 is not set) {
- /* If something arrives in this narrow window here,
- * where the comments are ;-> irq will be generated */
- unmask irqs;
- exit poll;
- }
- } while (rx_status_is_set);
-------------------------
-
-CSR5 bit of interest is only the rx status.
-If you look at the last if statement:
-you just finished grabbing all the packets from the rx ring .. you check if
-status bit says there are more packets just in ... it says none; you then
-enable rx interrupts again; if a new packet just came in during this check,
-we are counting that CSR5 will be set in that small window of opportunity
-and that by re-enabling interrupts, we would actually trigger an interrupt
-to register the new packet for processing.
-
-[The above description nay be very verbose, if you have better wording
-that will make this more understandable, please suggest it.]
-
-2) non-capable hardware
-
-These do not generally respect level triggered IRQs. Normally,
-irqs may be lost while being masked and the only way to leave poll is to do
-a double check for new input after netif_rx_complete() is invoked
-and re-enable polling (after seeing this new input).
-
-Sample code:
-
----------
- .
- .
-restart_poll:
- while (ring_is_not_empty()) {
- work-work-work
- if quota is exceeded: exit, not touching irq status/mask
- }
- .
- .
- .
- enable_rx_interrupts()
- netif_rx_complete(dev);
- if (ring_has_new_packet() && netif_rx_reschedule(dev, received)) {
- disable_rx_and_rxnobufs()
- goto restart_poll
- } while (rx_status_is_set);
----------
-
-Basically netif_rx_complete() removes us from the poll list, but because a
-new packet which will never be caught due to the possibility of a race
-might come in, we attempt to re-add ourselves to the poll list.
-
-
-
-
-APPENDIX 3: Scheduling issues.
-==============================
-As seen NAPI moves processing to softirq level. Linux uses the ksoftirqd as the
-general solution to schedule softirq's to run before next interrupt and by putting
-them under scheduler control. Also this prevents consecutive softirq's from
-monopolize the CPU. This also have the effect that the priority of ksoftirq needs
-to be considered when running very CPU-intensive applications and networking to
-get the proper balance of softirq/user balance. Increasing ksoftirq priority to 0
-(eventually more) is reported cure problems with low network performance at high
-CPU load.
-
-Most used processes in a GIGE router:
-USER PID %CPU %MEM SIZE RSS TTY STAT START TIME COMMAND
-root 3 0.2 0.0 0 0 ? RWN Aug 15 602:00 (ksoftirqd_CPU0)
-root 232 0.0 7.9 41400 40884 ? S Aug 15 74:12 gated
-
---------------------------------------------------------------------
-
-relevant sites:
-==================
-ftp://robur.slu.se/pub/Linux/net-development/NAPI/
-
-
---------------------------------------------------------------------
-TODO: Write net-skeleton.c driver.
--------------------------------------------------------------
-
-Authors:
-========
-Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
-Jamal Hadi Salim <hadi@cyberus.ca>
-Robert Olsson <Robert.Olsson@data.slu.se>
-
-Acknowledgements:
-================
-People who made this document better:
-
-Lennert Buytenhek <buytenh@gnu.org>
-Andrew Morton <akpm@zip.com.au>
-Manfred Spraul <manfred@colorfullife.com>
-Donald Becker <becker@scyld.com>
-Jeff Garzik <jgarzik@pobox.com>
diff --git a/Documentation/networking/bcm43xx.txt b/Documentation/networking/bcm43xx.txt
index a136721499bf..d602c8d6ff3e 100644
--- a/Documentation/networking/bcm43xx.txt
+++ b/Documentation/networking/bcm43xx.txt
@@ -37,7 +37,7 @@ all, distributions. There is, however, additional software that is
required. The firmware used by the chip is the intellectual property
of Broadcom and they have not given the bcm43xx team redistribution
rights to this firmware. Since we cannot legally redistribute
-the firwmare we cannot include it with the driver. Furthermore, it
+the firmware we cannot include it with the driver. Furthermore, it
cannot be placed in the downloadable archives of any distributing
organization; therefore, the user is responsible for obtaining the
firmware and placing it in the appropriate location so that the driver
diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index 1da566630831..11340625e363 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -281,6 +281,39 @@ downdelay
will be rounded down to the nearest multiple. The default
value is 0.
+fail_over_mac
+
+ Specifies whether active-backup mode should set all slaves to
+ the same MAC address (the traditional behavior), or, when
+ enabled, change the bond's MAC address when changing the
+ active interface (i.e., fail over the MAC address itself).
+
+ Fail over MAC is useful for devices that cannot ever alter
+ their MAC address, or for devices that refuse incoming
+ broadcasts with their own source MAC (which interferes with
+ the ARP monitor).
+
+ The down side of fail over MAC is that every device on the
+ network must be updated via gratuitous ARP, vs. just updating
+ a switch or set of switches (which often takes place for any
+ traffic, not just ARP traffic, if the switch snoops incoming
+ traffic to update its tables) for the traditional method. If
+ the gratuitous ARP is lost, communication may be disrupted.
+
+ When fail over MAC is used in conjuction with the mii monitor,
+ devices which assert link up prior to being able to actually
+ transmit and receive are particularly susecptible to loss of
+ the gratuitous ARP, and an appropriate updelay setting may be
+ required.
+
+ A value of 0 disables fail over MAC, and is the default. A
+ value of 1 enables fail over MAC. This option is enabled
+ automatically if the first slave added cannot change its MAC
+ address. This option may be modified via sysfs only when no
+ slaves are present in the bond.
+
+ This option was added in bonding version 3.2.0.
+
lacp_rate
Option specifying the rate in which we'll ask our link partner
diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt
index 4504cc59e405..afb66f9a8aff 100644
--- a/Documentation/networking/dccp.txt
+++ b/Documentation/networking/dccp.txt
@@ -38,8 +38,13 @@ Socket options
DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of
service codes (RFC 4340, sec. 8.1.2); if this socket option is not set,
the socket will fall back to 0 (which means that no meaningful service code
-is present). Connecting sockets set at most one service option; for
-listening sockets, multiple service codes can be specified.
+is present). On active sockets this is set before connect(); specifying more
+than one code has no effect (all subsequent service codes are ignored). The
+case is different for passive sockets, where multiple service codes (up to 32)
+can be set before calling bind().
+
+DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet
+size (application payload size) in bytes, see RFC 4340, section 14.
DCCP_SOCKOPT_SEND_CSCOV and DCCP_SOCKOPT_RECV_CSCOV are used for setting the
partial checksum coverage (RFC 4340, sec. 9.2). The default is that checksums
@@ -50,12 +55,13 @@ be enabled at the receiver, too with suitable choice of CsCov.
DCCP_SOCKOPT_SEND_CSCOV sets the sender checksum coverage. Values in the
range 0..15 are acceptable. The default setting is 0 (full coverage),
values between 1..15 indicate partial coverage.
-DCCP_SOCKOPT_SEND_CSCOV is for the receiver and has a different meaning: it
+DCCP_SOCKOPT_RECV_CSCOV is for the receiver and has a different meaning: it
sets a threshold, where again values 0..15 are acceptable. The default
of 0 means that all packets with a partial coverage will be discarded.
Values in the range 1..15 indicate that packets with minimally such a
coverage value are also acceptable. The higher the number, the more
- restrictive this setting (see [RFC 4340, sec. 9.2.1]).
+ restrictive this setting (see [RFC 4340, sec. 9.2.1]). Partial coverage
+ settings are inherited to the child socket after accept().
The following two options apply to CCID 3 exclusively and are getsockopt()-only.
In either case, a TFRC info struct (defined in <linux/tfrc.h>) is returned.
@@ -112,9 +118,14 @@ tx_qlen = 5
The size of the transmit buffer in packets. A value of 0 corresponds
to an unbounded transmit buffer.
+sync_ratelimit = 125 ms
+ The timeout between subsequent DCCP-Sync packets sent in response to
+ sequence-invalid packets on the same socket (RFC 4340, 7.5.4). The unit
+ of this parameter is milliseconds; a value of 0 disables rate-limiting.
+
Notes
=====
DCCP does not travel through NAT successfully at present on many boxes. This is
-because the checksum covers the psuedo-header as per TCP and UDP. Linux NAT
+because the checksum covers the pseudo-header as per TCP and UDP. Linux NAT
support for DCCP has been added.
diff --git a/Documentation/networking/dgrs.txt b/Documentation/networking/dgrs.txt
deleted file mode 100644
index 1aa1bb3f94ab..000000000000
--- a/Documentation/networking/dgrs.txt
+++ /dev/null
@@ -1,52 +0,0 @@
- The Digi International RightSwitch SE-X (dgrs) Device Driver
-
-This is a Linux driver for the Digi International RightSwitch SE-X
-EISA and PCI boards. These are 4 (EISA) or 6 (PCI) port Ethernet
-switches and a NIC combined into a single board. This driver can
-be compiled into the kernel statically or as a loadable module.
-
-There is also a companion management tool, called "xrightswitch".
-The management tool lets you watch the performance graphically,
-as well as set the SNMP agent IP and IPX addresses, IEEE Spanning
-Tree, and Aging time. These can also be set from the command line
-when the driver is loaded. The driver command line options are:
-
- debug=NNN Debug printing level
- dma=0/1 Disable/Enable DMA on PCI card
- spantree=0/1 Disable/Enable IEEE spanning tree
- hashexpire=NNN Change address aging time (default 300 seconds)
- ipaddr=A,B,C,D Set SNMP agent IP address i.e. 199,86,8,221
- iptrap=A,B,C,D Set SNMP agent IP trap address i.e. 199,86,8,221
- ipxnet=NNN Set SNMP agent IPX network number
- nicmode=0/1 Disable/Enable multiple NIC mode
-
-There is also a tool for setting up input and output packet filters
-on each port, called "dgrsfilt".
-
-Both the management tool and the filtering tool are available
-separately from the following FTP site:
-
- ftp://ftp.dgii.com/drivers/rightswitch/linux/
-
-When nicmode=1, the board and driver operate as 4 or 6 individual
-NIC ports (eth0...eth5) instead of as a switch. All switching
-functions are disabled. In the future, the board firmware may include
-a routing cache when in this mode.
-
-Copyright 1995-1996 Digi International Inc.
-
-This software may be used and distributed according to the terms
-of the GNU General Public License, incorporated herein by reference.
-
-For information on purchasing a RightSwitch SE-4 or SE-6
-board, please contact Digi's sales department at 1-612-912-3444
-or 1-800-DIGIBRD. Outside the U.S., please check our Web page at:
-
- http://www.dgii.com
-
-for sales offices worldwide. Tech support is also available through
-the channels listed on the Web site, although as long as I am
-employed on networking products at Digi I will be happy to provide
-any bug fixes that may be needed.
-
--Rick Richardson, rick@dgii.com
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 32c2e9da5f3a..6f7872ba1def 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -180,13 +180,20 @@ tcp_fin_timeout - INTEGER
to live longer. Cf. tcp_max_orphans.
tcp_frto - INTEGER
- Enables F-RTO, an enhanced recovery algorithm for TCP retransmission
+ Enables Forward RTO-Recovery (F-RTO) defined in RFC4138.
+ F-RTO is an enhanced recovery algorithm for TCP retransmission
timeouts. It is particularly beneficial in wireless environments
where packet loss is typically due to random radio interference
- rather than intermediate router congestion. If set to 1, basic
- version is enabled. 2 enables SACK enhanced F-RTO, which is
- EXPERIMENTAL. The basic version can be used also when SACK is
- enabled for a flow through tcp_sack sysctl.
+ rather than intermediate router congestion. F-RTO is sender-side
+ only modification. Therefore it does not require any support from
+ the peer, but in a typical case, however, where wireless link is
+ the local access link and most of the data flows downlink, the
+ faraway servers should have F-RTO enabled to take advantage of it.
+ If set to 1, basic version is enabled. 2 enables SACK enhanced
+ F-RTO if flow uses SACK. The basic version can be used also when
+ SACK is in use though scenario(s) with it exists where F-RTO
+ interacts badly with the packet counting of the SACK enabled TCP
+ flow.
tcp_frto_response - INTEGER
When F-RTO has detected that a TCP retransmission timeout was
@@ -286,7 +293,7 @@ tcp_no_metrics_save - BOOLEAN
when the connection closes, so that connections established in the
near future can use these to set initial conditions. Usually, this
increases overall performance, but may sometimes cause performance
- degredation. If set, TCP will not cache metrics on closing
+ degradation. If set, TCP will not cache metrics on closing
connections.
tcp_orphan_retries - INTEGER
diff --git a/Documentation/networking/mac80211-injection.txt b/Documentation/networking/mac80211-injection.txt
index 53ef7a06f49c..84906ef3ed6e 100644
--- a/Documentation/networking/mac80211-injection.txt
+++ b/Documentation/networking/mac80211-injection.txt
@@ -13,15 +13,35 @@ The radiotap format is discussed in
./Documentation/networking/radiotap-headers.txt.
Despite 13 radiotap argument types are currently defined, most only make sense
-to appear on received packets. Currently three kinds of argument are used by
-the injection code, although it knows to skip any other arguments that are
-present (facilitating replay of captured radiotap headers directly):
+to appear on received packets. The following information is parsed from the
+radiotap headers and used to control injection:
- - IEEE80211_RADIOTAP_RATE - u8 arg in 500kbps units (0x02 --> 1Mbps)
+ * IEEE80211_RADIOTAP_RATE
- - IEEE80211_RADIOTAP_ANTENNA - u8 arg, 0x00 = ant1, 0x01 = ant2
+ rate in 500kbps units, automatic if invalid or not present
- - IEEE80211_RADIOTAP_DBM_TX_POWER - u8 arg, dBm
+
+ * IEEE80211_RADIOTAP_ANTENNA
+
+ antenna to use, automatic if not present
+
+
+ * IEEE80211_RADIOTAP_DBM_TX_POWER
+
+ transmit power in dBm, automatic if not present
+
+
+ * IEEE80211_RADIOTAP_FLAGS
+
+ IEEE80211_RADIOTAP_F_FCS: FCS will be removed and recalculated
+ IEEE80211_RADIOTAP_F_WEP: frame will be encrypted if key available
+ IEEE80211_RADIOTAP_F_FRAG: frame will be fragmented if longer than the
+ current fragmentation threshold. Note that
+ this flag is only reliable when software
+ fragmentation is enabled)
+
+The injection code can also skip all other currently defined radiotap fields
+facilitating replay of captured radiotap headers directly.
Here is an example valid radiotap header defining these three parameters
diff --git a/Documentation/networking/net-modules.txt b/Documentation/networking/net-modules.txt
deleted file mode 100644
index 98c4392dd0fd..000000000000
--- a/Documentation/networking/net-modules.txt
+++ /dev/null
@@ -1,315 +0,0 @@
-Wed 2-Aug-95 <matti.aarnio@utu.fi>
-
- Linux network driver modules
-
- Do not mistake this for "README.modules" at the top-level
- directory! That document tells about modules in general, while
- this one tells only about network device driver modules.
-
- This is a potpourri of INSMOD-time(*) configuration options
- (if such exists) and their default values of various modules
- in the Linux network drivers collection.
-
- Some modules have also hidden (= non-documented) tunable values.
- The choice of not documenting them is based on general belief, that
- the less the user needs to know, the better. (There are things that
- driver developers can use, others should not confuse themselves.)
-
- In many cases it is highly preferred that insmod:ing is done
- ONLY with defining an explicit address for the card, AND BY
- NOT USING AUTO-PROBING!
-
- Now most cards have some explicitly defined base address that they
- are compiled with (to avoid auto-probing, among other things).
- If that compiled value does not match your actual configuration,
- do use the "io=0xXXX" -parameter for the insmod, and give there
- a value matching your environment.
-
- If you are adventurous, you can ask the driver to autoprobe
- by using the "io=0" parameter, however it is a potentially dangerous
- thing to do in a live system. (If you don't know where the
- card is located, you can try autoprobing, and after possible
- crash recovery, insmod with proper IO-address..)
-
- --------------------------
- (*) "INSMOD-time" means when you load module with
- /sbin/insmod you can feed it optional parameters.
- See "man insmod".
- --------------------------
-
-
- 8390 based Network Modules (Paul Gortmaker, Nov 12, 1995)
- --------------------------
-
-(Includes: smc-ultra, ne, wd, 3c503, hp, hp-plus, e2100 and ac3200)
-
-The 8390 series of network drivers now support multiple card systems without
-reloading the same module multiple times (memory efficient!) This is done by
-specifying multiple comma separated values, such as:
-
- insmod 3c503.o io=0x280,0x300,0x330,0x350 xcvr=0,1,0,1
-
-The above would have the one module controlling four 3c503 cards, with card 2
-and 4 using external transceivers. The "insmod" manual describes the usage
-of comma separated value lists.
-
-It is *STRONGLY RECOMMENDED* that you supply "io=" instead of autoprobing.
-If an "io=" argument is not supplied, then the ISA drivers will complain
-about autoprobing being not recommended, and begrudgingly autoprobe for
-a *SINGLE CARD ONLY* -- if you want to use multiple cards you *have* to
-supply an "io=0xNNN,0xQQQ,..." argument.
-
-The ne module is an exception to the above. A NE2000 is essentially an
-8390 chip, some bus glue and some RAM. Because of this, the ne probe is
-more invasive than the rest, and so at boot we make sure the ne probe is
-done last of all the 8390 cards (so that it won't trip over other 8390 based
-cards) With modules we can't ensure that all other non-ne 8390 cards have
-already been found. Because of this, the ne module REQUIRES an "io=0xNNN"
-argument passed in via insmod. It will refuse to autoprobe.
-
-It is also worth noting that auto-IRQ probably isn't as reliable during
-the flurry of interrupt activity on a running machine. Cards such as the
-ne2000 that can't get the IRQ setting from an EEPROM or configuration
-register are probably best supplied with an "irq=M" argument as well.
-
-
-----------------------------------------------------------------------
-Card/Module List - Configurable Parameters and Default Values
-----------------------------------------------------------------------
-
-3c501.c:
- io = 0x280 IO base address
- irq = 5 IRQ
- (Probes ports: 0x280, 0x300)
-
-3c503.c:
- io = 0 (It will complain if you don't supply an "io=0xNNN")
- irq = 0 (IRQ software selected by driver using autoIRQ)
- xcvr = 0 (Use xcvr=1 to select external transceiver.)
- (Probes ports: 0x300, 0x310, 0x330, 0x350, 0x250, 0x280, 0x2A0, 0x2E0)
-
-3c505.c:
- io = 0
- irq = 0
- dma = 6 (not autoprobed)
- (Probes ports: 0x300, 0x280, 0x310)
-
-3c507.c:
- io = 0x300
- irq = 0
- (Probes ports: 0x300, 0x320, 0x340, 0x280)
-
-3c509.c:
- io = 0
- irq = 0
- ( Module load-time probing Works reliably only on EISA, ISA ID-PROBE
- IS NOT RELIABLE! Compile this driver statically into kernel for
- now, if you need it auto-probing on an ISA-bus machine. )
-
-8390.c:
- (No public options, several other modules need this one)
-
-a2065.c:
- Since this is a Zorro board, it supports full autoprobing, even for
- multiple boards. (m68k/Amiga)
-
-ac3200.c:
- io = 0 (Checks 0x1000 to 0x8fff in 0x1000 intervals)
- irq = 0 (Read from config register)
- (EISA probing..)
-
-apricot.c:
- io = 0x300 (Can't be altered!)
- irq = 10
-
-arcnet.c:
- io = 0
- irqnum = 0
- shmem = 0
- num = 0
- DO SET THESE MANUALLY AT INSMOD!
- (When probing, looks at the following possible addresses:
- Suggested ones:
- 0x300, 0x2E0, 0x2F0, 0x2D0
- Other ones:
- 0x200, 0x210, 0x220, 0x230, 0x240, 0x250, 0x260, 0x270,
- 0x280, 0x290, 0x2A0, 0x2B0, 0x2C0,
- 0x310, 0x320, 0x330, 0x340, 0x350, 0x360, 0x370,
- 0x380, 0x390, 0x3A0, 0x3E0, 0x3F0 )
-
-ariadne.c:
- Since this is a Zorro board, it supports full autoprobing, even for
- multiple boards. (m68k/Amiga)
-
-at1700.c:
- io = 0x260
- irq = 0
- (Probes ports: 0x260, 0x280, 0x2A0, 0x240, 0x340, 0x320, 0x380, 0x300)
-
-atarilance.c:
- Supports full autoprobing. (m68k/Atari)
-
-atp.c: *Not modularized*
- (Probes ports: 0x378, 0x278, 0x3BC;
- fixed IRQs: 5 and 7 )
-
-cops.c:
- io = 0x240
- irq = 5
- nodeid = 0 (AutoSelect = 0, NodeID 1-254 is hand selected.)
- (Probes ports: 0x240, 0x340, 0x200, 0x210, 0x220, 0x230, 0x260,
- 0x2A0, 0x300, 0x310, 0x320, 0x330, 0x350, 0x360)
-
-de4x5.c:
- io = 0x000b
- irq = 10
- is_not_dec = 0 -- For non-DEC card using DEC 21040/21041/21140 chip, set this to 1
- (EISA, and PCI probing)
-
-de600.c:
- de600_debug = 0
- (On port 0x378, irq 7 -- lpt1; compile time configurable)
-
-de620.c:
- bnc = 0, utp = 0 <-- Force media by setting either.
- io = 0x378 (also compile-time configurable)
- irq = 7
-
-depca.c:
- io = 0x200
- irq = 7
- (Probes ports: ISA: 0x300, 0x200;
- EISA: 0x0c00 )
-
-dummy.c:
- No options
-
-e2100.c:
- io = 0 (It will complain if you don't supply an "io=0xNNN")
- irq = 0 (IRQ software selected by driver)
- mem = 0 (Override default shared memory start of 0xd0000)
- xcvr = 0 (Use xcvr=1 to select external transceiver.)
- (Probes ports: 0x300, 0x280, 0x380, 0x220)
-
-eepro.c:
- io = 0x200
- irq = 0
- (Probes ports: 0x200, 0x240, 0x280, 0x2C0, 0x300, 0x320, 0x340, 0x360)
-
-eexpress.c:
- io = 0x300
- irq = 0 (IRQ value read from EEPROM)
- (Probes ports: 0x300, 0x270, 0x320, 0x340)
-
-eql.c:
- (No parameters)
-
-ewrk3.c:
- io = 0x300
- irq = 5
- (With module no autoprobing!
- On EISA-bus does EISA probing.
- Static linkage probes ports on ISA bus:
- 0x100, 0x120, 0x140, 0x160, 0x180, 0x1A0, 0x1C0,
- 0x200, 0x220, 0x240, 0x260, 0x280, 0x2A0, 0x2C0, 0x2E0,
- 0x300, 0x340, 0x360, 0x380, 0x3A0, 0x3C0)
-
-hp-plus.c:
- io = 0 (It will complain if you don't supply an "io=0xNNN")
- irq = 0 (IRQ read from configuration register)
- (Probes ports: 0x200, 0x240, 0x280, 0x2C0, 0x300, 0x320, 0x340)
-
-hp.c:
- io = 0 (It will complain if you don't supply an "io=0xNNN")
- irq = 0 (IRQ software selected by driver using autoIRQ)
- (Probes ports: 0x300, 0x320, 0x340, 0x280, 0x2C0, 0x200, 0x240)
-
-hp100.c:
- hp100_port = 0 (IO-base address)
- (Does EISA-probing, if on EISA-slot;
- On ISA-bus probes all ports from 0x100 thru to 0x3E0
- in increments of 0x020)
-
-hydra.c:
- Since this is a Zorro board, it supports full autoprobing, even for
- multiple boards. (m68k/Amiga)
-
-ibmtr.c:
- io = 0xa20, 0xa24 (autoprobed by default)
- irq = 0 (driver cannot select irq - read from hardware)
- mem = 0 (shared memory base set at 0xd0000 and not yet
- able to override thru mem= parameter.)
-
-lance.c: *Not modularized*
- (PCI, and ISA probing; "CONFIG_PCI" needed for PCI support)
- (Probes ISA ports: 0x300, 0x320, 0x340, 0x360)
-
-loopback.c: *Static kernel component*
-
-ne.c:
- io = 0 (Explicitly *requires* an "io=0xNNN" value)
- irq = 0 (Tries to determine configured IRQ via autoIRQ)
- (Probes ports: 0x300, 0x280, 0x320, 0x340, 0x360)
-
-net_init.c: *Static kernel component*
-
-ni52.c: *Not modularized*
- (Probes ports: 0x300, 0x280, 0x360, 0x320, 0x340
- mems: 0xD0000, 0xD2000, 0xC8000, 0xCA000,
- 0xD4000, 0xD6000, 0xD8000 )
-
-ni65.c: *Not modularized* **16MB MEMORY BARRIER BUG**
- (Probes ports: 0x300, 0x320, 0x340, 0x360)
-
-pi2.c: *Not modularized* (well, NON-STANDARD modularization!)
- Only one card supported at this time.
- (Probes ports: 0x380, 0x300, 0x320, 0x340, 0x360, 0x3A0)
-
-plip.c:
- io = 0
- irq = 0 (by default, uses IRQ 5 for port at 0x3bc, IRQ 7
- for port at 0x378, and IRQ 2 for port at 0x278)
- (Probes ports: 0x278, 0x378, 0x3bc)
-
-ppp.c:
- No options (ppp-2.2+ has some, this is based on non-dynamic
- version from ppp-2.1.2d)
-
-seeq8005.c: *Not modularized*
- (Probes ports: 0x300, 0x320, 0x340, 0x360)
-
-skeleton.c: *Skeleton*
-
-slhc.c:
- No configuration parameters
-
-slip.c:
- slip_maxdev = 256 (default value from SL_NRUNIT on slip.h)
-
-
-smc-ultra.c:
- io = 0 (It will complain if you don't supply an "io=0xNNN")
- irq = 0 (IRQ val. read from EEPROM)
- (Probes ports: 0x200, 0x220, 0x240, 0x280, 0x300, 0x340, 0x380)
-
-tulip.c: *Partial modularization*
- (init-time memory allocation makes problems..)
-
-tunnel.c:
- No insmod parameters
-
-wavelan.c:
- io = 0x390 (Settable, but change not recommended)
- irq = 0 (Not honoured, if changed..)
-
-wd.c:
- io = 0 (It will complain if you don't supply an "io=0xNNN")
- irq = 0 (IRQ val. read from EEPROM, ancient cards use autoIRQ)
- mem = 0 (Force shared-memory on address 0xC8000, or whatever..)
- mem_end = 0 (Force non-std. mem. size via supplying mem_end val.)
- (eg. for 32k WD8003EBT, use mem=0xd0000 mem_end=0xd8000)
- (Probes ports: 0x300, 0x280, 0x380, 0x240)
-
-znet.c: *Not modularized*
- (Only one device on Zenith Z-Note (notebook?) systems,
- configuration information from (EE)PROM)
diff --git a/Documentation/networking/netconsole.txt b/Documentation/networking/netconsole.txt
index 1caa6c734691..3c2f2b328638 100644
--- a/Documentation/networking/netconsole.txt
+++ b/Documentation/networking/netconsole.txt
@@ -3,6 +3,10 @@ started by Ingo Molnar <mingo@redhat.com>, 2001.09.17
2.6 port and netpoll api by Matt Mackall <mpm@selenic.com>, Sep 9 2003
Please send bug reports to Matt Mackall <mpm@selenic.com>
+and Satyam Sharma <satyam.sharma@gmail.com>
+
+Introduction:
+=============
This module logs kernel printk messages over UDP allowing debugging of
problem where disk logging fails and serial consoles are impractical.
@@ -13,6 +17,9 @@ the specified interface as soon as possible. While this doesn't allow
capture of early kernel panics, it does capture most of the boot
process.
+Sender and receiver configuration:
+==================================
+
It takes a string configuration parameter "netconsole" in the
following format:
@@ -34,21 +41,113 @@ Examples:
insmod netconsole netconsole=@/,@10.0.0.2/
+It also supports logging to multiple remote agents by specifying
+parameters for the multiple agents separated by semicolons and the
+complete string enclosed in "quotes", thusly:
+
+ modprobe netconsole netconsole="@/,@10.0.0.2/;@/eth1,6892@10.0.0.3/"
+
Built-in netconsole starts immediately after the TCP stack is
initialized and attempts to bring up the supplied dev at the supplied
address.
The remote host can run either 'netcat -u -l -p <port>' or syslogd.
+Dynamic reconfiguration:
+========================
+
+Dynamic reconfigurability is a useful addition to netconsole that enables
+remote logging targets to be dynamically added, removed, or have their
+parameters reconfigured at runtime from a configfs-based userspace interface.
+[ Note that the parameters of netconsole targets that were specified/created
+from the boot/module option are not exposed via this interface, and hence
+cannot be modified dynamically. ]
+
+To include this feature, select CONFIG_NETCONSOLE_DYNAMIC when building the
+netconsole module (or kernel, if netconsole is built-in).
+
+Some examples follow (where configfs is mounted at the /sys/kernel/config
+mountpoint).
+
+To add a remote logging target (target names can be arbitrary):
+
+ cd /sys/kernel/config/netconsole/
+ mkdir target1
+
+Note that newly created targets have default parameter values (as mentioned
+above) and are disabled by default -- they must first be enabled by writing
+"1" to the "enabled" attribute (usually after setting parameters accordingly)
+as described below.
+
+To remove a target:
+
+ rmdir /sys/kernel/config/netconsole/othertarget/
+
+The interface exposes these parameters of a netconsole target to userspace:
+
+ enabled Is this target currently enabled? (read-write)
+ dev_name Local network interface name (read-write)
+ local_port Source UDP port to use (read-write)
+ remote_port Remote agent's UDP port (read-write)
+ local_ip Source IP address to use (read-write)
+ remote_ip Remote agent's IP address (read-write)
+ local_mac Local interface's MAC address (read-only)
+ remote_mac Remote agent's MAC address (read-write)
+
+The "enabled" attribute is also used to control whether the parameters of
+a target can be updated or not -- you can modify the parameters of only
+disabled targets (i.e. if "enabled" is 0).
+
+To update a target's parameters:
+
+ cat enabled # check if enabled is 1
+ echo 0 > enabled # disable the target (if required)
+ echo eth2 > dev_name # set local interface
+ echo 10.0.0.4 > remote_ip # update some parameter
+ echo cb:a9:87:65:43:21 > remote_mac # update more parameters
+ echo 1 > enabled # enable target again
+
+You can also update the local interface dynamically. This is especially
+useful if you want to use interfaces that have newly come up (and may not
+have existed when netconsole was loaded / initialized).
+
+Miscellaneous notes:
+====================
+
WARNING: the default target ethernet setting uses the broadcast
ethernet address to send packets, which can cause increased load on
other systems on the same ethernet segment.
+TIP: some LAN switches may be configured to suppress ethernet broadcasts
+so it is advised to explicitly specify the remote agents' MAC addresses
+from the config parameters passed to netconsole.
+
+TIP: to find out the MAC address of, say, 10.0.0.2, you may try using:
+
+ ping -c 1 10.0.0.2 ; /sbin/arp -n | grep 10.0.0.2
+
+TIP: in case the remote logging agent is on a separate LAN subnet than
+the sender, it is suggested to try specifying the MAC address of the
+default gateway (you may use /sbin/route -n to find it out) as the
+remote MAC address instead.
+
NOTE: the network device (eth1 in the above case) can run any kind
of other network traffic, netconsole is not intrusive. Netconsole
might cause slight delays in other traffic if the volume of kernel
messages is high, but should have no other impact.
+NOTE: if you find that the remote logging agent is not receiving or
+printing all messages from the sender, it is likely that you have set
+the "console_loglevel" parameter (on the sender) to only send high
+priority messages to the console. You can change this at runtime using:
+
+ dmesg -n 8
+
+or by specifying "debug" on the kernel command line at boot, to send
+all kernel messages to the console. A specific value for this parameter
+can also be set using the "loglevel" kernel boot option. See the
+dmesg(8) man page and Documentation/kernel-parameters.txt for details.
+
Netconsole was designed to be as instantaneous as possible, to
enable the logging of even the most critical kernel bugs. It works
from IRQ contexts as well, and does not enable interrupts while
diff --git a/Documentation/networking/netdevices.txt b/Documentation/networking/netdevices.txt
index 37869295fc70..d0f71fc7f782 100644
--- a/Documentation/networking/netdevices.txt
+++ b/Documentation/networking/netdevices.txt
@@ -73,7 +73,8 @@ dev->hard_start_xmit:
has to lock by itself when needed. It is recommended to use a try lock
for this and return NETDEV_TX_LOCKED when the spin lock fails.
The locking there should also properly protect against
- set_multicast_list.
+ set_multicast_list. Note that the use of NETIF_F_LLTX is deprecated.
+ Dont use it for new drivers.
Context: Process with BHs disabled or BH (timer),
will be called with interrupts disabled by netconsole.
@@ -95,9 +96,13 @@ dev->set_multicast_list:
Synchronization: netif_tx_lock spinlock.
Context: BHs disabled
-dev->poll:
- Synchronization: __LINK_STATE_RX_SCHED bit in dev->state. See
- dev_close code and comments in net/core/dev.c for more info.
+struct napi_struct synchronization rules
+========================================
+napi->poll:
+ Synchronization: NAPI_STATE_SCHED bit in napi->state. Device
+ driver's dev->close method will invoke napi_disable() on
+ all NAPI instances which will do a sleeping poll on the
+ NAPI_STATE_SCHED napi->state bit, waiting for all pending
+ NAPI activity to cease.
Context: softirq
will be called with interrupts disabled by netconsole.
-
diff --git a/Documentation/networking/proc_net_tcp.txt b/Documentation/networking/proc_net_tcp.txt
index 5e21f7cb6383..4a79209e77a7 100644
--- a/Documentation/networking/proc_net_tcp.txt
+++ b/Documentation/networking/proc_net_tcp.txt
@@ -1,8 +1,9 @@
This document describes the interfaces /proc/net/tcp and /proc/net/tcp6.
+Note that these interfaces are deprecated in favor of tcp_diag.
These /proc interfaces provide information about currently active TCP
-connections, and are implemented by tcp_get_info() in net/ipv4/tcp_ipv4.c and
-tcp6_get_info() in net/ipv6/tcp_ipv6.c, respectively.
+connections, and are implemented by tcp4_seq_show() in net/ipv4/tcp_ipv4.c
+and tcp6_seq_show() in net/ipv6/tcp_ipv6.c, respectively.
It will first list all listening TCP sockets, and next list all established
TCP connections. A typical entry of /proc/net/tcp would look like this (split
diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt
index cae231b1c134..c3669a3fb4af 100644
--- a/Documentation/networking/rxrpc.txt
+++ b/Documentation/networking/rxrpc.txt
@@ -689,7 +689,7 @@ such as the AFS filesystem. This permits such a utility to:
buffers manipulated directly.
To use the RxRPC facility, a kernel utility must still open an AF_RXRPC socket,
-bind an addess as appropriate and listen if it's to be a server socket, but
+bind an address as appropriate and listen if it's to be a server socket, but
then it passes this to the kernel interface functions.
The kernel interface functions are as follows:
@@ -857,3 +857,10 @@ The kernel interface functions are as follows:
This is used to extract the error number from a message indicating either
a local error occurred or a network error occurred.
+
+ (*) Allocate a null key for doing anonymous security.
+
+ struct key *rxrpc_get_null_key(const char *keyname);
+
+ This is used to allocate a null RxRPC key that can be used to indicate
+ anonymous security for a particular domain.
diff --git a/Documentation/networking/tc-actions-env-rules.txt b/Documentation/networking/tc-actions-env-rules.txt
new file mode 100644
index 000000000000..01e716d185f4
--- /dev/null
+++ b/Documentation/networking/tc-actions-env-rules.txt
@@ -0,0 +1,29 @@
+
+The "enviromental" rules for authors of any new tc actions are:
+
+1) If you stealeth or borroweth any packet thou shalt be branching
+from the righteous path and thou shalt cloneth.
+
+For example if your action queues a packet to be processed later
+or intentionaly branches by redirecting a packet then you need to
+clone the packet.
+There are certain fields in the skb tc_verd that need to be reset so we
+avoid loops etc. A few are generic enough so much so that skb_act_clone()
+resets them for you. So invoke skb_act_clone() rather than skb_clone()
+
+2) If you munge any packet thou shalt call pskb_expand_head in the case
+someone else is referencing the skb. After that you "own" the skb.
+You must also tell us if it is ok to munge the packet (TC_OK2MUNGE),
+this way any action downstream can stomp on the packet.
+
+3) dropping packets you dont own is a nono. You simply return
+TC_ACT_SHOT to the caller and they will drop it.
+
+The "enviromental" rules for callers of actions (qdiscs etc) are:
+
+*) thou art responsible for freeing anything returned as being
+TC_ACT_SHOT/STOLEN/QUEUED. If none of TC_ACT_SHOT/STOLEN/QUEUED is
+returned then all is great and you dont need to do anything.
+
+Post on netdev if something is unclear.
+
diff --git a/Documentation/networking/udplite.txt b/Documentation/networking/udplite.txt
index 6be09ba24a36..b6409cab075c 100644
--- a/Documentation/networking/udplite.txt
+++ b/Documentation/networking/udplite.txt
@@ -12,7 +12,7 @@
For in-depth information, you can consult:
o The UDP-Lite Homepage: http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/
- Fom here you can also download some example application source code.
+ From here you can also download some example application source code.
o The UDP-Lite HOWTO on
http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/files/UDP-Lite-HOWTO.txt
@@ -223,7 +223,7 @@
While it is important that such cases are dealt with correctly, they
are (annoyingly) rare: UDP-Lite is designed for optimising multimedia
performance over wireless (or generally noisy) links and thus smaller
- coverage lenghts are likely to be expected.
+ coverage lengths are likely to be expected.
V) UDP-LITE RUNTIME STATISTICS AND THEIR MEANING
@@ -259,7 +259,7 @@
VI) IPTABLES
There is packet match support for UDP-Lite as well as support for the LOG target.
- If you copy and paste the following line into /etc/protcols,
+ If you copy and paste the following line into /etc/protocols,
udplite 136 UDP-Lite # UDP-Lite [RFC 3828]
diff --git a/Documentation/parport-lowlevel.txt b/Documentation/parport-lowlevel.txt
index 8f2302415eff..265fcdcb8e5f 100644
--- a/Documentation/parport-lowlevel.txt
+++ b/Documentation/parport-lowlevel.txt
@@ -25,7 +25,6 @@ Global functions:
parport_open
parport_close
parport_device_id
- parport_device_num
parport_device_coords
parport_find_class
parport_find_device
@@ -735,7 +734,7 @@ NULL is returned.
SEE ALSO
-parport_register_device, parport_device_num
+parport_register_device
parport_close - unregister device for particular device number
-------------
@@ -787,29 +786,7 @@ Many devices have ill-formed IEEE 1284 Device IDs.
SEE ALSO
-parport_find_class, parport_find_device, parport_device_num
-
-parport_device_num - convert device coordinates to device number
-------------------
-
-SYNOPSIS
-
-#include <linux/parport.h>
-
-int parport_device_num (int parport, int mux, int daisy);
-
-DESCRIPTION
-
-Convert between device coordinates (port, multiplexor, daisy chain
-address) and device number (zero-based).
-
-RETURN VALUE
-
-Device number, or -1 if no device at given coordinates.
-
-SEE ALSO
-
-parport_device_coords, parport_open, parport_device_id
+parport_find_class, parport_find_device
parport_device_coords - convert device number to device coordinates
------------------
@@ -833,7 +810,7 @@ Zero on success, in which case the coordinates are (*parport, *mux,
SEE ALSO
-parport_device_num, parport_open, parport_device_id
+parport_open, parport_device_id
parport_find_class - find a device by its class
------------------
diff --git a/Documentation/power/00-INDEX b/Documentation/power/00-INDEX
new file mode 100644
index 000000000000..8db4e41a052d
--- /dev/null
+++ b/Documentation/power/00-INDEX
@@ -0,0 +1,34 @@
+00-INDEX
+ - This file
+basic-pm-debugging.txt
+ - Debugging suspend and resume
+devices.txt
+ - How drivers interact with system-wide power management
+drivers-testing.txt
+ - Testing suspend and resume support in device drivers
+freezing-of-tasks.txt
+ - How processes and controlled during suspend
+interface.txt
+ - Power management user interface in /sys/power
+notifiers.txt
+ - Registering suspend notifiers in device drivers
+pci.txt
+ - How the PCI Subsystem Does Power Management
+s2ram.txt
+ - How to get suspend to ram working (and debug it when it isn't)
+states.txt
+ - System power management states
+swsusp-and-swap-files.txt
+ - Using swap files with software suspend (to disk)
+swsusp-dmcrypt.txt
+ - How to use dm-crypt and software suspend (to disk) together
+swsusp.txt
+ - Goals, implementation, and usage of software suspend (ACPI S3)
+tricks.txt
+ - How to trick software suspend (to disk) into working when it isn't
+userland-swsusp.txt
+ - Experimental implementation of software suspend in userspace
+video_extension.txt
+ - ACPI video extensions
+video.txt
+ - Video issues during resume from suspend
diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt
index 1a85e2b964dc..57aef2f6e0de 100644
--- a/Documentation/power/basic-pm-debugging.txt
+++ b/Documentation/power/basic-pm-debugging.txt
@@ -78,8 +78,8 @@ c) Advanced debugging
In case the STD does not work on your system even in the minimal configuration
and compiling more drivers as modules is not practical or some modules cannot
be unloaded, you can use one of the more advanced debugging techniques to find
-the problem. First, if there is a serial port in your box, you can set the
-CONFIG_DISABLE_CONSOLE_SUSPEND kernel configuration option and try to log kernel
+the problem. First, if there is a serial port in your box, you can boot the
+kernel with the 'no_console_suspend' parameter and try to log kernel
messages using the serial console. This may provide you with some information
about the reasons of the suspend (resume) failure. Alternatively, it may be
possible to use a FireWire port for debugging with firescope
diff --git a/Documentation/power/drivers-testing.txt b/Documentation/power/drivers-testing.txt
index 33016c2f18dd..e4bdcaee24e4 100644
--- a/Documentation/power/drivers-testing.txt
+++ b/Documentation/power/drivers-testing.txt
@@ -14,8 +14,8 @@ the machine's BIOS.
Of course, for this purpose the test system has to be known to suspend and
resume without the driver being tested. Thus, if possible, you should first
resolve all suspend/resume-related problems in the test system before you start
-testing the new driver. Please see Documents/power/basic-pm-debugging.txt for
-more information about the debugging of suspend/resume functionality.
+testing the new driver. Please see Documentation/power/basic-pm-debugging.txt
+for more information about the debugging of suspend/resume functionality.
2. Testing the driver
diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt
index 04dc1cf9d215..38b57248fd61 100644
--- a/Documentation/power/freezing-of-tasks.txt
+++ b/Documentation/power/freezing-of-tasks.txt
@@ -19,12 +19,13 @@ we only consider hibernation, but the description also applies to suspend).
Namely, as the first step of the hibernation procedure the function
freeze_processes() (defined in kernel/power/process.c) is called. It executes
try_to_freeze_tasks() that sets TIF_FREEZE for all of the freezable tasks and
-sends a fake signal to each of them. A task that receives such a signal and has
-TIF_FREEZE set, should react to it by calling the refrigerator() function
-(defined in kernel/power/process.c), which sets the task's PF_FROZEN flag,
-changes its state to TASK_UNINTERRUPTIBLE and makes it loop until PF_FROZEN is
-cleared for it. Then, we say that the task is 'frozen' and therefore the set of
-functions handling this mechanism is called 'the freezer' (these functions are
+either wakes them up, if they are kernel threads, or sends fake signals to them,
+if they are user space processes. A task that has TIF_FREEZE set, should react
+to it by calling the function called refrigerator() (defined in
+kernel/power/process.c), which sets the task's PF_FROZEN flag, changes its state
+to TASK_UNINTERRUPTIBLE and makes it loop until PF_FROZEN is cleared for it.
+Then, we say that the task is 'frozen' and therefore the set of functions
+handling this mechanism is referred to as 'the freezer' (these functions are
defined in kernel/power/process.c and include/linux/freezer.h). User space
processes are generally frozen before kernel threads.
@@ -35,21 +36,27 @@ task enter refrigerator() if the flag is set.
For user space processes try_to_freeze() is called automatically from the
signal-handling code, but the freezable kernel threads need to call it
-explicitly in suitable places. The code to do this may look like the following:
+explicitly in suitable places or use the wait_event_freezable() or
+wait_event_freezable_timeout() macros (defined in include/linux/freezer.h)
+that combine interruptible sleep with checking if TIF_FREEZE is set and calling
+try_to_freeze(). The main loop of a freezable kernel thread may look like the
+following one:
+ set_freezable();
do {
hub_events();
- wait_event_interruptible(khubd_wait,
- !list_empty(&hub_event_list));
- try_to_freeze();
- } while (!signal_pending(current));
+ wait_event_freezable(khubd_wait,
+ !list_empty(&hub_event_list) ||
+ kthread_should_stop());
+ } while (!kthread_should_stop() || !list_empty(&hub_event_list));
(from drivers/usb/core/hub.c::hub_thread()).
If a freezable kernel thread fails to call try_to_freeze() after the freezer has
set TIF_FREEZE for it, the freezing of tasks will fail and the entire
hibernation operation will be cancelled. For this reason, freezable kernel
-threads must call try_to_freeze() somewhere.
+threads must call try_to_freeze() somewhere or use one of the
+wait_event_freezable() and wait_event_freezable_timeout() macros.
After the system memory state has been restored from a hibernation image and
devices have been reinitialized, the function thaw_processes() is called in
@@ -81,7 +88,16 @@ hibernation image has been created and before the system is finally powered off.
The majority of these are user space processes, but if any of the kernel threads
may cause something like this to happen, they have to be freezable.
-2. The second reason is to prevent user space processes and some kernel threads
+2. Next, to create the hibernation image we need to free a sufficient amount of
+memory (approximately 50% of available RAM) and we need to do that before
+devices are deactivated, because we generally need them for swapping out. Then,
+after the memory for the image has been freed, we don't want tasks to allocate
+additional memory and we prevent them from doing that by freezing them earlier.
+[Of course, this also means that device drivers should not allocate substantial
+amounts of memory from their .suspend() callbacks before hibernation, but this
+is e separate issue.]
+
+3. The third reason is to prevent user space processes and some kernel threads
from interfering with the suspending and resuming of devices. A user space
process running on a second CPU while we are suspending devices may, for
example, be troublesome and without the freezing of tasks we would need some
@@ -111,7 +127,7 @@ frozen before the driver's .suspend() callback is executed and it will be
thawed after the driver's .resume() callback has run, so it won't be accessing
the device while it's suspended.
-3. Another reason for freezing tasks is to prevent user space processes from
+4. Another reason for freezing tasks is to prevent user space processes from
realizing that hibernation (or suspend) operation takes place. Ideally, user
space processes should not notice that such a system-wide operation has occurred
and should continue running without any problems after the restore (or resume
diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt
index fd5192a8fa8a..e67211fe0ee2 100644
--- a/Documentation/power/interface.txt
+++ b/Documentation/power/interface.txt
@@ -20,7 +20,7 @@ states.
/sys/power/disk controls the operating mode of the suspend-to-disk
mechanism. Suspend-to-disk can be handled in several ways. We have a
few options for putting the system to sleep - using the platform driver
-(e.g. ACPI or other pm_ops), powering off the system or rebooting the
+(e.g. ACPI or other suspend_ops), powering off the system or rebooting the
system (for testing).
Additionally, /sys/power/disk can be used to turn on one of the two testing
diff --git a/Documentation/power/swsusp-and-swap-files.txt b/Documentation/power/swsusp-and-swap-files.txt
index 06f911a5f885..f281886de490 100644
--- a/Documentation/power/swsusp-and-swap-files.txt
+++ b/Documentation/power/swsusp-and-swap-files.txt
@@ -39,7 +39,7 @@ resume=<swap_file_partition> resume_offset=<swap_file_offset>
where <swap_file_partition> is the partition on which the swap file is located
and <swap_file_offset> is the offset of the swap header determined by the
application in 2) (of course, this step may be carried out automatically
-by the same application that determies the swap file's header offset using the
+by the same application that determines the swap file's header offset using the
FIBMAP ioctl)
OR
diff --git a/Documentation/powerpc/00-INDEX b/Documentation/powerpc/00-INDEX
index d6d65b9bcfe3..94a3c577b083 100644
--- a/Documentation/powerpc/00-INDEX
+++ b/Documentation/powerpc/00-INDEX
@@ -5,6 +5,8 @@ please mail me.
00-INDEX
- this file
+booting-without-of.txt
+ - Booting the Linux/ppc kernel without Open Firmware
cpu_features.txt
- info on how we support a variety of CPUs with minimal compile-time
options.
@@ -14,6 +16,8 @@ hvcs.txt
- IBM "Hypervisor Virtual Console Server" Installation Guide
mpc52xx.txt
- Linux 2.6.x on MPC52xx family
+mpc52xx-device-tree-bindings.txt
+ - MPC5200 Device Tree Bindings
ppc_htab.txt
- info about the Linux/PPC /proc/ppc_htab entry
SBC8260_memory_mapping.txt
diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt
index 76733a3962f0..a96e85397eb7 100644
--- a/Documentation/powerpc/booting-without-of.txt
+++ b/Documentation/powerpc/booting-without-of.txt
@@ -50,7 +50,7 @@ Table of Contents
g) Freescale SOC SEC Security Engines
h) Board Control and Status (BCSR)
i) Freescale QUICC Engine module (QE)
- j) Flash chip nodes
+ j) CFI or JEDEC memory-mapped NOR flash
k) Global Utilities Block
VII - Specifying interrupt information for devices
@@ -1510,7 +1510,10 @@ platforms are moved over to use the flattened-device-tree model.
i) Freescale QUICC Engine module (QE)
This represents qe module that is installed on PowerQUICC II Pro.
- Hopefully it will merge backward compatibility with CPM/CPM2.
+
+ NOTE: This is an interim binding; it should be updated to fit
+ in with the CPM binding later in this document.
+
Basically, it is a bus of devices, that could act more or less
as a complete entity (UCC, USB etc ). All of them should be siblings on
the "root" qe node, using the common properties from there.
@@ -1548,7 +1551,7 @@ platforms are moved over to use the flattened-device-tree model.
Required properties:
- device_type : should be "spi".
- compatible : should be "fsl_spi".
- - mode : the SPI operation mode, it can be "cpu" or "qe".
+ - mode : the SPI operation mode, it can be "cpu" or "cpu-qe".
- reg : Offset and length of the register set for the device
- interrupts : <a b> where a is the interrupt number and b is a
field that represents an encoding of the sense and level
@@ -1757,45 +1760,69 @@ platforms are moved over to use the flattened-device-tree model.
};
};
- j) Flash chip nodes
+ j) CFI or JEDEC memory-mapped NOR flash
Flash chips (Memory Technology Devices) are often used for solid state
file systems on embedded devices.
- Required properties:
-
- - device_type : has to be "rom"
- - compatible : Should specify what this flash device is compatible with.
- Currently, this is most likely to be "direct-mapped" (which
- corresponds to the MTD physmap mapping driver).
- - reg : Offset and length of the register set (or memory mapping) for
- the device.
- - bank-width : Width of the flash data bus in bytes. Required
- for the NOR flashes (compatible == "direct-mapped" and others) ONLY.
-
- Recommended properties :
-
- - partitions : Several pairs of 32-bit values where the first value is
- partition's offset from the start of the device and the second one is
- partition size in bytes with LSB used to signify a read only
- partition (so, the partition size should always be an even number).
- - partition-names : The list of concatenated zero terminated strings
- representing the partition names.
- - probe-type : The type of probe which should be done for the chip
- (JEDEC vs CFI actually). Valid ONLY for NOR flashes.
+ - compatible : should contain the specific model of flash chip(s)
+ used, if known, followed by either "cfi-flash" or "jedec-flash"
+ - reg : Address range of the flash chip
+ - bank-width : Width (in bytes) of the flash bank. Equal to the
+ device width times the number of interleaved chips.
+ - device-width : (optional) Width of a single flash chip. If
+ omitted, assumed to be equal to 'bank-width'.
+ - #address-cells, #size-cells : Must be present if the flash has
+ sub-nodes representing partitions (see below). In this case
+ both #address-cells and #size-cells must be equal to 1.
+
+ For JEDEC compatible devices, the following additional properties
+ are defined:
+
+ - vendor-id : Contains the flash chip's vendor id (1 byte).
+ - device-id : Contains the flash chip's device id (1 byte).
+
+ In addition to the information on the flash bank itself, the
+ device tree may optionally contain additional information
+ describing partitions of the flash address space. This can be
+ used on platforms which have strong conventions about which
+ portions of the flash are used for what purposes, but which don't
+ use an on-flash partition table such as RedBoot.
+
+ Each partition is represented as a sub-node of the flash device.
+ Each node's name represents the name of the corresponding
+ partition of the flash device.
+
+ Flash partitions
+ - reg : The partition's offset and size within the flash bank.
+ - label : (optional) The label / name for this flash partition.
+ If omitted, the label is taken from the node name (excluding
+ the unit address).
+ - read-only : (optional) This parameter, if present, is a hint to
+ Linux that this flash partition should only be mounted
+ read-only. This is usually used for flash partitions
+ containing early-boot firmware images or data which should not
+ be clobbered.
- Example:
+ Example:
- flash@ff000000 {
- device_type = "rom";
- compatible = "direct-mapped";
- probe-type = "CFI";
- reg = <ff000000 01000000>;
- bank-width = <4>;
- partitions = <00000000 00f80000
- 00f80000 00080001>;
- partition-names = "fs\0firmware";
- };
+ flash@ff000000 {
+ compatible = "amd,am29lv128ml", "cfi-flash";
+ reg = <ff000000 01000000>;
+ bank-width = <4>;
+ device-width = <1>;
+ #address-cells = <1>;
+ #size-cells = <1>;
+ fs@0 {
+ label = "fs";
+ reg = <0 f80000>;
+ };
+ firmware@f80000 {
+ label ="firmware";
+ reg = <f80000 80000>;
+ read-only;
+ };
+ };
k) Global Utilities Block
@@ -1824,6 +1851,397 @@ platforms are moved over to use the flattened-device-tree model.
fsl,has-rstcr;
};
+ l) Freescale Communications Processor Module
+
+ NOTE: This is an interim binding, and will likely change slightly,
+ as more devices are supported. The QE bindings especially are
+ incomplete.
+
+ i) Root CPM node
+
+ Properties:
+ - compatible : "fsl,cpm1", "fsl,cpm2", or "fsl,qe".
+ - reg : A 48-byte region beginning with CPCR.
+
+ Example:
+ cpm@119c0 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ #interrupt-cells = <2>;
+ compatible = "fsl,mpc8272-cpm", "fsl,cpm2";
+ reg = <119c0 30>;
+ }
+
+ ii) Properties common to mulitple CPM/QE devices
+
+ - fsl,cpm-command : This value is ORed with the opcode and command flag
+ to specify the device on which a CPM command operates.
+
+ - fsl,cpm-brg : Indicates which baud rate generator the device
+ is associated with. If absent, an unused BRG
+ should be dynamically allocated. If zero, the
+ device uses an external clock rather than a BRG.
+
+ - reg : Unless otherwise specified, the first resource represents the
+ scc/fcc/ucc registers, and the second represents the device's
+ parameter RAM region (if it has one).
+
+ iii) Serial
+
+ Currently defined compatibles:
+ - fsl,cpm1-smc-uart
+ - fsl,cpm2-smc-uart
+ - fsl,cpm1-scc-uart
+ - fsl,cpm2-scc-uart
+ - fsl,qe-uart
+
+ Example:
+
+ serial@11a00 {
+ device_type = "serial";
+ compatible = "fsl,mpc8272-scc-uart",
+ "fsl,cpm2-scc-uart";
+ reg = <11a00 20 8000 100>;
+ interrupts = <28 8>;
+ interrupt-parent = <&PIC>;
+ fsl,cpm-brg = <1>;
+ fsl,cpm-command = <00800000>;
+ };
+
+ iii) Network
+
+ Currently defined compatibles:
+ - fsl,cpm1-scc-enet
+ - fsl,cpm2-scc-enet
+ - fsl,cpm1-fec-enet
+ - fsl,cpm2-fcc-enet (third resource is GFEMR)
+ - fsl,qe-enet
+
+ Example:
+
+ ethernet@11300 {
+ device_type = "network";
+ compatible = "fsl,mpc8272-fcc-enet",
+ "fsl,cpm2-fcc-enet";
+ reg = <11300 20 8400 100 11390 1>;
+ local-mac-address = [ 00 00 00 00 00 00 ];
+ interrupts = <20 8>;
+ interrupt-parent = <&PIC>;
+ phy-handle = <&PHY0>;
+ linux,network-index = <0>;
+ fsl,cpm-command = <12000300>;
+ };
+
+ iv) MDIO
+
+ Currently defined compatibles:
+ fsl,pq1-fec-mdio (reg is same as first resource of FEC device)
+ fsl,cpm2-mdio-bitbang (reg is port C registers)
+
+ Properties for fsl,cpm2-mdio-bitbang:
+ fsl,mdio-pin : pin of port C controlling mdio data
+ fsl,mdc-pin : pin of port C controlling mdio clock
+
+ Example:
+
+ mdio@10d40 {
+ device_type = "mdio";
+ compatible = "fsl,mpc8272ads-mdio-bitbang",
+ "fsl,mpc8272-mdio-bitbang",
+ "fsl,cpm2-mdio-bitbang";
+ reg = <10d40 14>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ fsl,mdio-pin = <12>;
+ fsl,mdc-pin = <13>;
+ };
+
+ v) Baud Rate Generators
+
+ Currently defined compatibles:
+ fsl,cpm-brg
+ fsl,cpm1-brg
+ fsl,cpm2-brg
+
+ Properties:
+ - reg : There may be an arbitrary number of reg resources; BRG
+ numbers are assigned to these in order.
+ - clock-frequency : Specifies the base frequency driving
+ the BRG.
+
+ Example:
+
+ brg@119f0 {
+ compatible = "fsl,mpc8272-brg",
+ "fsl,cpm2-brg",
+ "fsl,cpm-brg";
+ reg = <119f0 10 115f0 10>;
+ clock-frequency = <d#25000000>;
+ };
+
+ vi) Interrupt Controllers
+
+ Currently defined compatibles:
+ - fsl,cpm1-pic
+ - only one interrupt cell
+ - fsl,pq1-pic
+ - fsl,cpm2-pic
+ - second interrupt cell is level/sense:
+ - 2 is falling edge
+ - 8 is active low
+
+ Example:
+
+ interrupt-controller@10c00 {
+ #interrupt-cells = <2>;
+ interrupt-controller;
+ reg = <10c00 80>;
+ compatible = "mpc8272-pic", "fsl,cpm2-pic";
+ };
+
+ vii) USB (Universal Serial Bus Controller)
+
+ Properties:
+ - compatible : "fsl,cpm1-usb", "fsl,cpm2-usb", "fsl,qe-usb"
+
+ Example:
+ usb@11bc0 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "fsl,cpm2-usb";
+ reg = <11b60 18 8b00 100>;
+ interrupts = <b 8>;
+ interrupt-parent = <&PIC>;
+ fsl,cpm-command = <2e600000>;
+ };
+
+ viii) Multi-User RAM (MURAM)
+
+ The multi-user/dual-ported RAM is expressed as a bus under the CPM node.
+
+ Ranges must be set up subject to the following restrictions:
+
+ - Children's reg nodes must be offsets from the start of all muram, even
+ if the user-data area does not begin at zero.
+ - If multiple range entries are used, the difference between the parent
+ address and the child address must be the same in all, so that a single
+ mapping can cover them all while maintaining the ability to determine
+ CPM-side offsets with pointer subtraction. It is recommended that
+ multiple range entries not be used.
+ - A child address of zero must be translatable, even if no reg resources
+ contain it.
+
+ A child "data" node must exist, compatible with "fsl,cpm-muram-data", to
+ indicate the portion of muram that is usable by the OS for arbitrary
+ purposes. The data node may have an arbitrary number of reg resources,
+ all of which contribute to the allocatable muram pool.
+
+ Example, based on mpc8272:
+
+ muram@0 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges = <0 0 10000>;
+
+ data@0 {
+ compatible = "fsl,cpm-muram-data";
+ reg = <0 2000 9800 800>;
+ };
+ };
+
+ m) Chipselect/Local Bus
+
+ Properties:
+ - name : Should be localbus
+ - #address-cells : Should be either two or three. The first cell is the
+ chipselect number, and the remaining cells are the
+ offset into the chipselect.
+ - #size-cells : Either one or two, depending on how large each chipselect
+ can be.
+ - ranges : Each range corresponds to a single chipselect, and cover
+ the entire access window as configured.
+
+ Example:
+ localbus@f0010100 {
+ compatible = "fsl,mpc8272ads-localbus",
+ "fsl,mpc8272-localbus",
+ "fsl,pq2-localbus";
+ #address-cells = <2>;
+ #size-cells = <1>;
+ reg = <f0010100 40>;
+
+ ranges = <0 0 fe000000 02000000
+ 1 0 f4500000 00008000>;
+
+ flash@0,0 {
+ compatible = "jedec-flash";
+ reg = <0 0 2000000>;
+ bank-width = <4>;
+ device-width = <1>;
+ };
+
+ board-control@1,0 {
+ reg = <1 0 20>;
+ compatible = "fsl,mpc8272ads-bcsr";
+ };
+ };
+
+
+ n) 4xx/Axon EMAC ethernet nodes
+
+ The EMAC ethernet controller in IBM and AMCC 4xx chips, and also
+ the Axon bridge. To operate this needs to interact with a ths
+ special McMAL DMA controller, and sometimes an RGMII or ZMII
+ interface. In addition to the nodes and properties described
+ below, the node for the OPB bus on which the EMAC sits must have a
+ correct clock-frequency property.
+
+ i) The EMAC node itself
+
+ Required properties:
+ - device_type : "network"
+
+ - compatible : compatible list, contains 2 entries, first is
+ "ibm,emac-CHIP" where CHIP is the host ASIC (440gx,
+ 405gp, Axon) and second is either "ibm,emac" or
+ "ibm,emac4". For Axon, thus, we have: "ibm,emac-axon",
+ "ibm,emac4"
+ - interrupts : <interrupt mapping for EMAC IRQ and WOL IRQ>
+ - interrupt-parent : optional, if needed for interrupt mapping
+ - reg : <registers mapping>
+ - local-mac-address : 6 bytes, MAC address
+ - mal-device : phandle of the associated McMAL node
+ - mal-tx-channel : 1 cell, index of the tx channel on McMAL associated
+ with this EMAC
+ - mal-rx-channel : 1 cell, index of the rx channel on McMAL associated
+ with this EMAC
+ - cell-index : 1 cell, hardware index of the EMAC cell on a given
+ ASIC (typically 0x0 and 0x1 for EMAC0 and EMAC1 on
+ each Axon chip)
+ - max-frame-size : 1 cell, maximum frame size supported in bytes
+ - rx-fifo-size : 1 cell, Rx fifo size in bytes for 10 and 100 Mb/sec
+ operations.
+ For Axon, 2048
+ - tx-fifo-size : 1 cell, Tx fifo size in bytes for 10 and 100 Mb/sec
+ operations.
+ For Axon, 2048.
+ - fifo-entry-size : 1 cell, size of a fifo entry (used to calculate
+ thresholds).
+ For Axon, 0x00000010
+ - mal-burst-size : 1 cell, MAL burst size (used to calculate thresholds)
+ in bytes.
+ For Axon, 0x00000100 (I think ...)
+ - phy-mode : string, mode of operations of the PHY interface.
+ Supported values are: "mii", "rmii", "smii", "rgmii",
+ "tbi", "gmii", rtbi", "sgmii".
+ For Axon on CAB, it is "rgmii"
+ - mdio-device : 1 cell, required iff using shared MDIO registers
+ (440EP). phandle of the EMAC to use to drive the
+ MDIO lines for the PHY used by this EMAC.
+ - zmii-device : 1 cell, required iff connected to a ZMII. phandle of
+ the ZMII device node
+ - zmii-channel : 1 cell, required iff connected to a ZMII. Which ZMII
+ channel or 0xffffffff if ZMII is only used for MDIO.
+ - rgmii-device : 1 cell, required iff connected to an RGMII. phandle
+ of the RGMII device node.
+ For Axon: phandle of plb5/plb4/opb/rgmii
+ - rgmii-channel : 1 cell, required iff connected to an RGMII. Which
+ RGMII channel is used by this EMAC.
+ Fox Axon: present, whatever value is appropriate for each
+ EMAC, that is the content of the current (bogus) "phy-port"
+ property.
+
+ Recommended properties:
+ - linux,network-index : This is the intended "index" of this
+ network device. This is used by the bootwrapper to interpret
+ MAC addresses passed by the firmware when no information other
+ than indices is available to associate an address with a device.
+
+ Optional properties:
+ - phy-address : 1 cell, optional, MDIO address of the PHY. If absent,
+ a search is performed.
+ - phy-map : 1 cell, optional, bitmap of addresses to probe the PHY
+ for, used if phy-address is absent. bit 0x00000001 is
+ MDIO address 0.
+ For Axon it can be absent, thouugh my current driver
+ doesn't handle phy-address yet so for now, keep
+ 0x00ffffff in it.
+ - rx-fifo-size-gige : 1 cell, Rx fifo size in bytes for 1000 Mb/sec
+ operations (if absent the value is the same as
+ rx-fifo-size). For Axon, either absent or 2048.
+ - tx-fifo-size-gige : 1 cell, Tx fifo size in bytes for 1000 Mb/sec
+ operations (if absent the value is the same as
+ tx-fifo-size). For Axon, either absent or 2048.
+ - tah-device : 1 cell, optional. If connected to a TAH engine for
+ offload, phandle of the TAH device node.
+ - tah-channel : 1 cell, optional. If appropriate, channel used on the
+ TAH engine.
+
+ Example:
+
+ EMAC0: ethernet@40000800 {
+ linux,network-index = <0>;
+ device_type = "network";
+ compatible = "ibm,emac-440gp", "ibm,emac";
+ interrupt-parent = <&UIC1>;
+ interrupts = <1c 4 1d 4>;
+ reg = <40000800 70>;
+ local-mac-address = [00 04 AC E3 1B 1E];
+ mal-device = <&MAL0>;
+ mal-tx-channel = <0 1>;
+ mal-rx-channel = <0>;
+ cell-index = <0>;
+ max-frame-size = <5dc>;
+ rx-fifo-size = <1000>;
+ tx-fifo-size = <800>;
+ phy-mode = "rmii";
+ phy-map = <00000001>;
+ zmii-device = <&ZMII0>;
+ zmii-channel = <0>;
+ };
+
+ ii) McMAL node
+
+ Required properties:
+ - device_type : "dma-controller"
+ - compatible : compatible list, containing 2 entries, first is
+ "ibm,mcmal-CHIP" where CHIP is the host ASIC (like
+ emac) and the second is either "ibm,mcmal" or
+ "ibm,mcmal2".
+ For Axon, "ibm,mcmal-axon","ibm,mcmal2"
+ - interrupts : <interrupt mapping for the MAL interrupts sources:
+ 5 sources: tx_eob, rx_eob, serr, txde, rxde>.
+ For Axon: This is _different_ from the current
+ firmware. We use the "delayed" interrupts for txeob
+ and rxeob. Thus we end up with mapping those 5 MPIC
+ interrupts, all level positive sensitive: 10, 11, 32,
+ 33, 34 (in decimal)
+ - dcr-reg : < DCR registers range >
+ - dcr-parent : if needed for dcr-reg
+ - num-tx-chans : 1 cell, number of Tx channels
+ - num-rx-chans : 1 cell, number of Rx channels
+
+ iii) ZMII node
+
+ Required properties:
+ - compatible : compatible list, containing 2 entries, first is
+ "ibm,zmii-CHIP" where CHIP is the host ASIC (like
+ EMAC) and the second is "ibm,zmii".
+ For Axon, there is no ZMII node.
+ - reg : <registers mapping>
+
+ iv) RGMII node
+
+ Required properties:
+ - compatible : compatible list, containing 2 entries, first is
+ "ibm,rgmii-CHIP" where CHIP is the host ASIC (like
+ EMAC) and the second is "ibm,rgmii".
+ For Axon, "ibm,rgmii-axon","ibm,rgmii"
+ - reg : <registers mapping>
+ - revision : as provided by the RGMII new version register if
+ available.
+ For Axon: 0x0000012a
+
More devices will be defined as this spec matures.
VII - Specifying interrupt information for devices
diff --git a/Documentation/powerpc/eeh-pci-error-recovery.txt b/Documentation/powerpc/eeh-pci-error-recovery.txt
index 4530d1bf0286..df7afe43d462 100644
--- a/Documentation/powerpc/eeh-pci-error-recovery.txt
+++ b/Documentation/powerpc/eeh-pci-error-recovery.txt
@@ -36,8 +36,8 @@ Causes of EEH Errors
EEH was originally designed to guard against hardware failure, such
as PCI cards dying from heat, humidity, dust, vibration and bad
electrical connections. The vast majority of EEH errors seen in
-"real life" are due to eithr poorly seated PCI cards, or,
-unfortunately quite commonly, due device driver bugs, device firmware
+"real life" are due to either poorly seated PCI cards, or,
+unfortunately quite commonly, due to device driver bugs, device firmware
bugs, and sometimes PCI card hardware bugs.
The most common software bug, is one that causes the device to
diff --git a/Documentation/powerpc/mpc52xx-device-tree-bindings.txt b/Documentation/powerpc/mpc52xx-device-tree-bindings.txt
index e59fcbbe338c..5e03610e186f 100644
--- a/Documentation/powerpc/mpc52xx-device-tree-bindings.txt
+++ b/Documentation/powerpc/mpc52xx-device-tree-bindings.txt
@@ -17,12 +17,12 @@ passed by the boot loader to the kernel at boot time. The device tree
describes what devices are present on the board and how they are
connected. The device tree can either be passed as a binary blob (as
described in Documentation/powerpc/booting-without-of.txt), or passed
-by Open Firmare (IEEE 1275) compatible firmware using an OF compatible
+by Open Firmware (IEEE 1275) compatible firmware using an OF compatible
client interface API.
This document specifies the requirements on the device-tree for mpc5200
based boards. These requirements are above and beyond the details
-specified in either the OpenFirmware spec or booting-without-of.txt
+specified in either the Open Firmware spec or booting-without-of.txt
All new mpc5200-based boards are expected to match this document. In
cases where this document is not sufficient to support a new board port,
@@ -73,8 +73,8 @@ match on the compatible list; the 'most compatible' driver should be
selected.
The split between the MPC5200 and the MPC5200B leaves a bit of a
-connundrum. How should the compatible property be set up to provide
-maximum compatability information; but still acurately describe the
+conundrum. How should the compatible property be set up to provide
+maximum compatibility information; but still accurately describe the
chip? For the MPC5200; the answer is easy. Most of the SoC devices
originally appeared on the MPC5200. Since they didn't exist anywhere
else; the 5200 compatible properties will contain only one item;
@@ -84,7 +84,7 @@ The 5200B is almost the same as the 5200, but not quite. It fixes
silicon bugs and it adds a small number of enhancements. Most of the
devices either provide exactly the same interface as on the 5200. A few
devices have extra functions but still have a backwards compatible mode.
-To express this infomation as completely as possible, 5200B device trees
+To express this information as completely as possible, 5200B device trees
should have two items in the compatible list;
"mpc5200b-<device>\0mpc5200-<device>". It is *strongly* recommended
that 5200B device trees follow this convention (instead of only listing
@@ -185,7 +185,7 @@ bestcomm@<addr> dma-controller mpc5200-bestcomm 5200 pic also requires
Recommended soc5200 child nodes; populate as needed for your board
name device_type compatible Description
---- ----------- ---------- -----------
-gpt@<addr> gpt mpc5200-gpt General purpose timers
+gpt@<addr> gpt fsl,mpc5200-gpt General purpose timers
rtc@<addr> rtc mpc5200-rtc Real time clock
mscan@<addr> mscan mpc5200-mscan CAN bus controller
pci@<addr> pci mpc5200-pci PCI bridge
@@ -199,7 +199,7 @@ ethernet@<addr> network mpc5200-fec MPC5200 ethernet device
ata@<addr> ata mpc5200-ata IDE ATA interface
i2c@<addr> i2c mpc5200-i2c I2C controller
usb@<addr> usb-ohci-be mpc5200-ohci,ohci-be USB controller
-xlb@<addr> xlb mpc5200-xlb XLB arbritrator
+xlb@<addr> xlb mpc5200-xlb XLB arbitrator
Important child node properties
name type description
@@ -213,7 +213,7 @@ cell-index int When multiple devices are present, is the
5) General Purpose Timer nodes (child of soc5200 node)
On the mpc5200 and 5200b, GPT0 has a watchdog timer function. If the board
design supports the internal wdt, then the device node for GPT0 should
-include the empty property 'has-wdt'.
+include the empty property 'fsl,has-wdt'.
6) PSC nodes (child of soc5200 node)
PSC nodes can define the optional 'port-number' property to force assignment
diff --git a/Documentation/ramdisk.txt b/Documentation/ramdisk.txt
index 52f75b7d51c2..6c820baa19a6 100644
--- a/Documentation/ramdisk.txt
+++ b/Documentation/ramdisk.txt
@@ -22,16 +22,14 @@ The RAM disk dynamically grows as more space is required. It does this by using
RAM from the buffer cache. The driver marks the buffers it is using as dirty
so that the VM subsystem does not try to reclaim them later.
-Also, the RAM disk supports up to 16 RAM disks out of the box, and can
-be reconfigured to support up to 255 RAM disks - change "#define NUM_RAMDISKS"
-in drivers/block/rd.c. To use RAM disk support with your system, run
-'./MAKEDEV ram' from the /dev directory. RAM disks are all major number 1, and
-start with minor number 0 for /dev/ram0, etc. If used, modern kernels use
-/dev/ram0 for an initrd.
-
-The old "ramdisk=<ram_size>" has been changed to "ramdisk_size=<ram_size>" to
-make it clearer. The original "ramdisk=<ram_size>" has been kept around for
-compatibility reasons, but it may be removed in the future.
+The RAM disk supports up to 16 RAM disks by default, and can be reconfigured
+to support an unlimited number of RAM disks (at your own risk). Just change
+the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu
+and (re)build the kernel.
+
+To use RAM disk support with your system, run './MAKEDEV ram' from the /dev
+directory. RAM disks are all major number 1, and start with minor number 0
+for /dev/ram0, etc. If used, modern kernels use /dev/ram0 for an initrd.
The new RAM disk also has the ability to load compressed RAM disk images,
allowing one to squeeze more programs onto an average installation or
diff --git a/Documentation/rfkill.txt b/Documentation/rfkill.txt
new file mode 100644
index 000000000000..a83ff23cd68c
--- /dev/null
+++ b/Documentation/rfkill.txt
@@ -0,0 +1,89 @@
+rfkill - RF switch subsystem support
+====================================
+
+1 Implementation details
+2 Driver support
+3 Userspace support
+
+===============================================================================
+1: Implementation details
+
+The rfkill switch subsystem offers support for keys often found on laptops
+to enable wireless devices like WiFi and Bluetooth.
+
+This is done by providing the user 3 possibilities:
+ 1 - The rfkill system handles all events; userspace is not aware of events.
+ 2 - The rfkill system handles all events; userspace is informed about the events.
+ 3 - The rfkill system does not handle events; userspace handles all events.
+
+The buttons to enable and disable the wireless radios are important in
+situations where the user is for example using his laptop on a location where
+wireless radios _must_ be disabled (e.g. airplanes).
+Because of this requirement, userspace support for the keys should not be
+made mandatory. Because userspace might want to perform some additional smarter
+tasks when the key is pressed, rfkill still provides userspace the possibility
+to take over the task to handle the key events.
+
+The system inside the kernel has been split into 2 separate sections:
+ 1 - RFKILL
+ 2 - RFKILL_INPUT
+
+The first option enables rfkill support and will make sure userspace will
+be notified of any events through the input device. It also creates several
+sysfs entries which can be used by userspace. See section "Userspace support".
+
+The second option provides an rfkill input handler. This handler will
+listen to all rfkill key events and will toggle the radio accordingly.
+With this option enabled userspace could either do nothing or simply
+perform monitoring tasks.
+
+====================================
+2: Driver support
+
+To build a driver with rfkill subsystem support, the driver should
+depend on the Kconfig symbol RFKILL; it should _not_ depend on
+RKFILL_INPUT.
+
+Unless key events trigger an interrupt to which the driver listens, polling
+will be required to determine the key state changes. For this the input
+layer providers the input-polldev handler.
+
+A driver should implement a few steps to correctly make use of the
+rfkill subsystem. First for non-polling drivers:
+
+ - rfkill_allocate()
+ - input_allocate_device()
+ - rfkill_register()
+ - input_register_device()
+
+For polling drivers:
+
+ - rfkill_allocate()
+ - input_allocate_polled_device()
+ - rfkill_register()
+ - input_register_polled_device()
+
+When a key event has been detected, the correct event should be
+sent over the input device which has been registered by the driver.
+
+====================================
+3: Userspace support
+
+For each key an input device will be created which will send out the correct
+key event when the rfkill key has been pressed.
+
+The following sysfs entries will be created:
+
+ name: Name assigned by driver to this key (interface or driver name).
+ type: Name of the key type ("wlan", "bluetooth", etc).
+ state: Current state of the key. 1: On, 0: Off.
+ claim: 1: Userspace handles events, 0: Kernel handles events
+
+Both the "state" and "claim" entries are also writable. For the "state" entry
+this means that when 1 or 0 is written all radios, not yet in the requested
+state, will be will be toggled accordingly.
+For the "claim" entry writing 1 to it means that the kernel no longer handles
+key events even though RFKILL_INPUT input was enabled. When "claim" has been
+set to 0, userspace should make sure that it listens for the input events or
+check the sysfs "state" entry regularly to correctly perform the required
+tasks when the rkfill key is pressed.
diff --git a/Documentation/s390/00-INDEX b/Documentation/s390/00-INDEX
new file mode 100644
index 000000000000..3a2b96302ecc
--- /dev/null
+++ b/Documentation/s390/00-INDEX
@@ -0,0 +1,26 @@
+00-INDEX
+ - this file.
+3270.ChangeLog
+ - ChangeLog for the UTS Global 3270-support patch (outdated).
+3270.txt
+ - how to use the IBM 3270 display system support.
+cds.txt
+ - s390 common device support (common I/O layer).
+CommonIO
+ - common I/O layer command line parameters, procfs and debugfs entries
+config3270.sh
+ - example configuration for 3270 devices.
+DASD
+ - information on the DASD disk device driver.
+Debugging390.txt
+ - hints for debugging on s390 systems.
+driver-model.txt
+ - information on s390 devices and the driver model.
+monreader.txt
+ - information on accessing the z/VM monitor stream from Linux.
+s390dbf.txt
+ - information on using the s390 debug feature.
+TAPE
+ - information on the driver for channel-attached tapes.
+zfcpdump
+ - information on the s390 SCSI dump tool.
diff --git a/Documentation/s390/CommonIO b/Documentation/s390/CommonIO
index 22f82f21bc60..86320aa3fb0b 100644
--- a/Documentation/s390/CommonIO
+++ b/Documentation/s390/CommonIO
@@ -1,5 +1,5 @@
-S/390 common I/O-Layer - command line parameters and /proc entries
-==================================================================
+S/390 common I/O-Layer - command line parameters, procfs and debugfs entries
+============================================================================
Command line parameters
-----------------------
@@ -7,9 +7,9 @@ Command line parameters
* cio_msg = yes | no
Determines whether information on found devices and sensed device
- characteristics should be shown during startup, i. e. messages of the types
- "Detected device 0.0.4711 on subchannel 0.0.0042" and "SenseID: Device
- 0.0.4711 reports: ...".
+ characteristics should be shown during startup or when new devices are
+ found, i. e. messages of the types "Detected device 0.0.4711 on subchannel
+ 0.0.0042" and "SenseID: Device 0.0.4711 reports: ...".
Default is off.
@@ -26,8 +26,10 @@ Command line parameters
An ignored device can be un-ignored later; see the "/proc entries"-section for
details.
- The devices must be given either as bus ids (0.0.abcd) or as hexadecimal
- device numbers (0xabcd or abcd, for 2.4 backward compatibility).
+ The devices must be given either as bus ids (0.x.abcd) or as hexadecimal
+ device numbers (0xabcd or abcd, for 2.4 backward compatibility). If you
+ give a device number 0xabcd, it will be interpreted as 0.0.abcd.
+
You can use the 'all' keyword to ignore all devices.
The '!' operator will cause the I/O-layer to _not_ ignore a device.
The command line is parsed from left to right.
@@ -81,31 +83,36 @@ Command line parameters
will add 0.0.a000-0.0.accc and 0.0.af00-0.0.afff to the list of ignored
devices.
- The devices can be specified either by bus id (0.0.abcd) or, for 2.4 backward
- compatibility, by the device number in hexadecimal (0xabcd or abcd).
+ The devices can be specified either by bus id (0.x.abcd) or, for 2.4 backward
+ compatibility, by the device number in hexadecimal (0xabcd or abcd). Device
+ numbers given as 0xabcd will be interpreted as 0.0.abcd.
+
+* For some of the information present in the /proc filesystem in 2.4 (namely,
+ /proc/subchannels and /proc/chpids), see driver-model.txt.
+ Information formerly in /proc/irq_count is now in /proc/interrupts.
+
+debugfs entries
+---------------
-* /proc/s390dbf/cio_*/ (S/390 debug feature)
+* /sys/kernel/debug/s390dbf/cio_*/ (S/390 debug feature)
Some views generated by the debug feature to hold various debug outputs.
- - /proc/s390dbf/cio_crw/sprintf
+ - /sys/kernel/debug/s390dbf/cio_crw/sprintf
Messages from the processing of pending channel report words (machine check
- handling), which will also show when CONFIG_DEBUG_CRW is defined.
+ handling).
- - /proc/s390dbf/cio_msg/sprintf
- Various debug messages from the common I/O-layer; generally, messages which
- will also show when CONFIG_DEBUG_IO is defined.
+ - /sys/kernel/debug/s390dbf/cio_msg/sprintf
+ Various debug messages from the common I/O-layer, including messages
+ printed when cio_msg=yes.
- - /proc/s390dbf/cio_trace/hex_ascii
+ - /sys/kernel/debug/s390dbf/cio_trace/hex_ascii
Logs the calling of functions in the common I/O-layer and, if applicable,
which subchannel they were called for, as well as dumps of some data
structures (like irb in an error case).
The level of logging can be changed to be more or less verbose by piping to
- /proc/s390dbf/cio_*/level a number between 0 and 6; see the documentation on
- the S/390 debug feature (Documentation/s390/s390dbf.txt) for details.
-
-* For some of the information present in the /proc filesystem in 2.4 (namely,
- /proc/subchannels and /proc/chpids), see driver-model.txt.
- Information formerly in /proc/irq_count is now in /proc/interrupts.
+ /sys/kernel/debug/s390dbf/cio_*/level a number between 0 and 6; see the
+ documentation on the S/390 debug feature (Documentation/s390/s390dbf.txt)
+ for details.
diff --git a/Documentation/s390/cds.txt b/Documentation/s390/cds.txt
index 58919d6a593a..3081927cc2d6 100644
--- a/Documentation/s390/cds.txt
+++ b/Documentation/s390/cds.txt
@@ -286,10 +286,10 @@ first:
timeout value
-EIO: the common I/O layer terminated the request due to an error state
-If the concurrent sense flag in the extended status word in the irb is set, the
-field irb->scsw.count describes the number of device specific sense bytes
-available in the extended control word irb->scsw.ecw[0]. No device sensing by
-the device driver itself is required.
+If the concurrent sense flag in the extended status word (esw) in the irb is
+set, the field erw.scnt in the esw describes the number of device specific
+sense bytes available in the extended control word irb->scsw.ecw[]. No device
+sensing by the device driver itself is required.
The device interrupt handler can use the following definitions to investigate
the primary unit check source coded in sense byte 0 :
diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt
index 84901e7c0508..88bcb8767335 100644
--- a/Documentation/sched-design-CFS.txt
+++ b/Documentation/sched-design-CFS.txt
@@ -117,3 +117,70 @@ Some implementation details:
iterators of the scheduling modules are used. The balancing code got
quite a bit simpler as a result.
+
+Group scheduler extension to CFS
+================================
+
+Normally the scheduler operates on individual tasks and strives to provide
+fair CPU time to each task. Sometimes, it may be desirable to group tasks
+and provide fair CPU time to each such task group. For example, it may
+be desirable to first provide fair CPU time to each user on the system
+and then to each task belonging to a user.
+
+CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
+SCHED_NORMAL/BATCH tasks be be grouped and divides CPU time fairly among such
+groups. At present, there are two (mutually exclusive) mechanisms to group
+tasks for CPU bandwidth control purpose:
+
+ - Based on user id (CONFIG_FAIR_USER_SCHED)
+ In this option, tasks are grouped according to their user id.
+ - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
+ This options lets the administrator create arbitrary groups
+ of tasks, using the "cgroup" pseudo filesystem. See
+ Documentation/cgroups.txt for more information about this
+ filesystem.
+
+Only one of these options to group tasks can be chosen and not both.
+
+Group scheduler tunables:
+
+When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
+each new user and a "cpu_share" file is added in that directory.
+
+ # cd /sys/kernel/uids
+ # cat 512/cpu_share # Display user 512's CPU share
+ 1024
+ # echo 2048 > 512/cpu_share # Modify user 512's CPU share
+ # cat 512/cpu_share # Display user 512's CPU share
+ 2048
+ #
+
+CPU bandwidth between two users are divided in the ratio of their CPU shares.
+For ex: if you would like user "root" to get twice the bandwidth of user
+"guest", then set the cpu_share for both the users such that "root"'s
+cpu_share is twice "guest"'s cpu_share
+
+
+When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
+for each group created using the pseudo filesystem. See example steps
+below to create task groups and modify their CPU share using the "cgroups"
+pseudo filesystem
+
+ # mkdir /dev/cpuctl
+ # mount -t cgroup -ocpu none /dev/cpuctl
+ # cd /dev/cpuctl
+
+ # mkdir multimedia # create "multimedia" group of tasks
+ # mkdir browser # create "browser" group of tasks
+
+ # #Configure the multimedia group to receive twice the CPU bandwidth
+ # #that of browser group
+
+ # echo 2048 > multimedia/cpu.shares
+ # echo 1024 > browser/cpu.shares
+
+ # firefox & # Launch firefox and move it to "browser" group
+ # echo <firefox_pid> > browser/tasks
+
+ # #Launch gmplayer (or your favourite movie player)
+ # echo <movie_player_pid> > multimedia/tasks
diff --git a/Documentation/scsi/00-INDEX b/Documentation/scsi/00-INDEX
index 12354830c6b0..aa1f7e927834 100644
--- a/Documentation/scsi/00-INDEX
+++ b/Documentation/scsi/00-INDEX
@@ -2,14 +2,20 @@
- this file
53c700.txt
- info on driver for 53c700 based adapters
-AM53C974.txt
- - info on driver for AM53c974 based adapters
BusLogic.txt
- info on driver for adapters with BusLogic chips
-ChangeLog
+ChangeLog.1992-1997
- Changes to scsi files, if not listed elsewhere
+ChangeLog.arcmsr
+ - Changes to driver for ARECA's SATA RAID controller cards
ChangeLog.ips
- IBM ServeRAID driver Changelog
+ChangeLog.lpfc
+ - Changes to lpfc driver
+ChangeLog.megaraid
+ - Changes to LSI megaraid controller.
+ChangeLog.megaraid_sas
+ - Changes to serial attached scsi version of LSI megaraid controller.
ChangeLog.ncr53c8xx
- Changes to ncr53c8xx driver
ChangeLog.sym53c8xx
@@ -20,26 +26,44 @@ FlashPoint.txt
- info on driver for BusLogic FlashPoint adapters
LICENSE.FlashPoint
- Licence of the Flashpoint driver
+LICENSE.qla2xxx
+ - License for QLogic Linux Fibre Channel HBA Driver firmware.
Mylex.txt
- info on driver for Mylex adapters
NinjaSCSI.txt
- info on WorkBiT NinjaSCSI-32/32Bi driver
+aacraid.txt
+ - Driver supporting Adaptec RAID controllers
aha152x.txt
- info on driver for Adaptec AHA152x based adapters
+aic79xx.txt
+ - Adaptec Ultra320 SCSI host adapters
aic7xxx.txt
- info on driver for Adaptec controllers
aic7xxx_old.txt
- info on driver for Adaptec controllers, old generation
+arcmsr_spec.txt
+ - ARECA FIRMWARE SPEC (for IOP331 adapter)
+dc395x.txt
+ - README file for the dc395x SCSI driver
dpti.txt
- info on driver for DPT SmartRAID and Adaptec I2O RAID based adapters
dtc3x80.txt
- info on driver for DTC 2x80 based adapters
g_NCR5380.txt
- info on driver for NCR5380 and NCR53c400 based adapters
+hptiop.txt
+ - HIGHPOINT ROCKETRAID 3xxx RAID DRIVER
ibmmca.txt
- info on driver for IBM adapters with MCA bus
in2000.txt
- info on in2000 driver
+libsas.txt
+ - Serial Attached SCSI management layer.
+lpfc.txt
+ - LPFC driver release notes
+megaraid.txt
+ - Common Management Module, shared code handling ioctls for LSI drivers
ncr53c7xx.txt
- info on driver for NCR53c7xx based adapters
ncr53c8xx.txt
@@ -50,6 +74,8 @@ ppa.txt
- info on driver for IOmega zip drive
qlogicfas.txt
- info on driver for QLogic FASxxx based adapters
+scsi-changer.txt
+ - README for the SCSI media changer driver
scsi-generic.txt
- info on the sg driver for generic (non-disk/CD/tape) SCSI devices.
scsi.txt
@@ -58,6 +84,8 @@ scsi_mid_low_api.txt
- info on API between SCSI layer and low level drivers
scsi_eh.txt
- info on SCSI midlayer error handling infrastructure
+scsi_fc_transport.txt
+ - SCSI Fiber Channel Tansport
st.txt
- info on scsi tape driver
sym53c500_cs.txt
diff --git a/Documentation/scsi/ChangeLog.arcmsr b/Documentation/scsi/ChangeLog.arcmsr
index 162c47fdf45f..cd8403a33ee6 100644
--- a/Documentation/scsi/ChangeLog.arcmsr
+++ b/Documentation/scsi/ChangeLog.arcmsr
@@ -53,4 +53,19 @@
** for linux standard list
** enable usage of pci message signal interrupt
** follow Randy.Danlup kindness suggestion cleanup this code
-************************************************************************** \ No newline at end of file
+** 1.20.00.14 05/02/2007 Erich Chen & Nick Cheng
+** 1.implement PCI-Express error recovery function and AER capability
+** 2.implement the selection of ARCMSR_MAX_XFER_SECTORS_B=4096
+** if firmware version is newer than 1.42
+** 3.modify arcmsr_iop_reset to improve the ability
+** 4.modify the ISR, arcmsr_interrupt routine,to prevent the
+** inconsistency with sg_mod driver if application directly calls
+** the arcmsr driver w/o passing through scsi mid layer
+** specially thanks to Yanmin Zhang's openhanded help about AER
+** 1.20.00.15 08/30/2007 Erich Chen & Nick Cheng
+** 1. support ARC1200/1201/1202 SATA RAID adapter, which is named
+** ACB_ADAPTER_TYPE_B
+** 2. modify the arcmsr_pci_slot_reset function
+** 3. modify the arcmsr_pci_ers_disconnect_forepart function
+** 4. modify the arcmsr_pci_ers_need_reset_forepart function
+**************************************************************************
diff --git a/Documentation/scsi/ChangeLog.ncr53c8xx b/Documentation/scsi/ChangeLog.ncr53c8xx
index 7d03e9d5b5f7..a9f721aeb11c 100644
--- a/Documentation/scsi/ChangeLog.ncr53c8xx
+++ b/Documentation/scsi/ChangeLog.ncr53c8xx
@@ -195,9 +195,9 @@ Sun Feb 14:00 1999 Gerard Roudier (groudier@club-internet.fr)
Pointed out by Leonard Zubkoff.
- Allow to tune request_irq() flags from the boot command line using
ncr53c8xx=irqm:??, as follows:
- a) If bit 0x10 is set in irqm, SA_SHIRQ flag is not used.
- b) If bit 0x20 is set in irqm, SA_INTERRUPT flag is not used.
- By default the driver uses both SA_SHIRQ and SA_INTERRUPT.
+ a) If bit 0x10 is set in irqm, IRQF_SHARED flag is not used.
+ b) If bit 0x20 is set in irqm, IRQF_DISABLED flag is not used.
+ By default the driver uses both IRQF_SHARED and IRQF_DISABLED.
Option 'ncr53c8xx=irqm:0x20' may be used when an IRQ is shared by
a 53C8XX adapter and a network board.
- Tiny mispelling fixed (ABORT instead of ABRT). Was fortunately
diff --git a/Documentation/scsi/aacraid.txt b/Documentation/scsi/aacraid.txt
index cc12b55d4b3d..a8257840695a 100644
--- a/Documentation/scsi/aacraid.txt
+++ b/Documentation/scsi/aacraid.txt
@@ -38,10 +38,8 @@ Supported Cards/Chipsets
9005:0286:9005:02ac Adaptec 1800 (Typhoon44)
9005:0285:9005:02b5 Adaptec 5445 (Voodoo44)
9005:0285:15d9:02b5 SMC AOC-USAS-S4i
- 9005:0285:15d9:02c9 SMC AOC-USAS-S4iR
9005:0285:9005:02b6 Adaptec 5805 (Voodoo80)
9005:0285:15d9:02b6 SMC AOC-USAS-S8i
- 9005:0285:15d9:02ca SMC AOC-USAS-S8iR
9005:0285:9005:02b7 Adaptec 5085 (Voodoo08)
9005:0285:9005:02bb Adaptec 3405 (Marauder40LP)
9005:0285:9005:02bc Adaptec 3805 (Marauder80LP)
@@ -50,9 +48,14 @@ Supported Cards/Chipsets
9005:0285:9005:02be Adaptec 31605 (Marauder160)
9005:0285:9005:02c3 Adaptec 51205 (Voodoo120)
9005:0285:9005:02c4 Adaptec 51605 (Voodoo160)
+ 9005:0285:15d9:02c9 SMC AOC-USAS-S4iR
+ 9005:0285:15d9:02ca SMC AOC-USAS-S8iR
9005:0285:9005:02ce Adaptec 51245 (Voodoo124)
9005:0285:9005:02cf Adaptec 51645 (Voodoo164)
9005:0285:9005:02d0 Adaptec 52445 (Voodoo244)
+ 9005:0285:9005:02d1 Adaptec 5405 (Voodoo40)
+ 9005:0285:15d9:02d2 SMC AOC-USAS-S8i-LP
+ 9005:0285:15d9:02d3 SMC AOC-USAS-S8iR-LP
1011:0046:9005:0364 Adaptec 5400S (Mustang)
9005:0287:9005:0800 Adaptec Themisto (Jupiter)
9005:0200:9005:0200 Adaptec Themisto (Jupiter)
@@ -103,6 +106,7 @@ Supported Cards/Chipsets
9005:0285:108e:7aac SUN STK RAID REM (Voodoo44 Coyote)
9005:0285:108e:0286 SUN STK RAID INT (Cougar)
9005:0285:108e:0287 SUN STK RAID EXT (Prometheus)
+ 9005:0285:108e:7aae SUN STK RAID EM (Narvi)
People
-------------------------
diff --git a/Documentation/scsi/advansys.txt b/Documentation/scsi/advansys.txt
new file mode 100644
index 000000000000..4a3db62b7424
--- /dev/null
+++ b/Documentation/scsi/advansys.txt
@@ -0,0 +1,243 @@
+AdvanSys (Advanced System Products, Inc.) manufactures the following
+RISC-based, Bus-Mastering, Fast (10 Mhz) and Ultra (20 Mhz) Narrow
+(8-bit transfer) SCSI Host Adapters for the ISA, EISA, VL, and PCI
+buses and RISC-based, Bus-Mastering, Ultra (20 Mhz) Wide (16-bit
+transfer) SCSI Host Adapters for the PCI bus.
+
+The CDB counts below indicate the number of SCSI CDB (Command
+Descriptor Block) requests that can be stored in the RISC chip
+cache and board LRAM. A CDB is a single SCSI command. The driver
+detect routine will display the number of CDBs available for each
+adapter detected. The number of CDBs used by the driver can be
+lowered in the BIOS by changing the 'Host Queue Size' adapter setting.
+
+Laptop Products:
+ ABP-480 - Bus-Master CardBus (16 CDB)
+
+Connectivity Products:
+ ABP510/5150 - Bus-Master ISA (240 CDB)
+ ABP5140 - Bus-Master ISA PnP (16 CDB)
+ ABP5142 - Bus-Master ISA PnP with floppy (16 CDB)
+ ABP902/3902 - Bus-Master PCI (16 CDB)
+ ABP3905 - Bus-Master PCI (16 CDB)
+ ABP915 - Bus-Master PCI (16 CDB)
+ ABP920 - Bus-Master PCI (16 CDB)
+ ABP3922 - Bus-Master PCI (16 CDB)
+ ABP3925 - Bus-Master PCI (16 CDB)
+ ABP930 - Bus-Master PCI (16 CDB)
+ ABP930U - Bus-Master PCI Ultra (16 CDB)
+ ABP930UA - Bus-Master PCI Ultra (16 CDB)
+ ABP960 - Bus-Master PCI MAC/PC (16 CDB)
+ ABP960U - Bus-Master PCI MAC/PC Ultra (16 CDB)
+
+Single Channel Products:
+ ABP542 - Bus-Master ISA with floppy (240 CDB)
+ ABP742 - Bus-Master EISA (240 CDB)
+ ABP842 - Bus-Master VL (240 CDB)
+ ABP940 - Bus-Master PCI (240 CDB)
+ ABP940U - Bus-Master PCI Ultra (240 CDB)
+ ABP940UA/3940UA - Bus-Master PCI Ultra (240 CDB)
+ ABP970 - Bus-Master PCI MAC/PC (240 CDB)
+ ABP970U - Bus-Master PCI MAC/PC Ultra (240 CDB)
+ ABP3960UA - Bus-Master PCI MAC/PC Ultra (240 CDB)
+ ABP940UW/3940UW - Bus-Master PCI Ultra-Wide (253 CDB)
+ ABP970UW - Bus-Master PCI MAC/PC Ultra-Wide (253 CDB)
+ ABP3940U2W - Bus-Master PCI LVD/Ultra2-Wide (253 CDB)
+
+Multi-Channel Products:
+ ABP752 - Dual Channel Bus-Master EISA (240 CDB Per Channel)
+ ABP852 - Dual Channel Bus-Master VL (240 CDB Per Channel)
+ ABP950 - Dual Channel Bus-Master PCI (240 CDB Per Channel)
+ ABP950UW - Dual Channel Bus-Master PCI Ultra-Wide (253 CDB Per Channel)
+ ABP980 - Four Channel Bus-Master PCI (240 CDB Per Channel)
+ ABP980U - Four Channel Bus-Master PCI Ultra (240 CDB Per Channel)
+ ABP980UA/3980UA - Four Channel Bus-Master PCI Ultra (16 CDB Per Chan.)
+ ABP3950U2W - Bus-Master PCI LVD/Ultra2-Wide and Ultra-Wide (253 CDB)
+ ABP3950U3W - Bus-Master PCI Dual LVD2/Ultra3-Wide (253 CDB)
+
+Driver Compile Time Options and Debugging
+
+The following constants can be defined in the source file.
+
+1. ADVANSYS_ASSERT - Enable driver assertions (Def: Enabled)
+
+ Enabling this option adds assertion logic statements to the
+ driver. If an assertion fails a message will be displayed to
+ the console, but the system will continue to operate. Any
+ assertions encountered should be reported to the person
+ responsible for the driver. Assertion statements may proactively
+ detect problems with the driver and facilitate fixing these
+ problems. Enabling assertions will add a small overhead to the
+ execution of the driver.
+
+2. ADVANSYS_DEBUG - Enable driver debugging (Def: Disabled)
+
+ Enabling this option adds tracing functions to the driver and the
+ ability to set a driver tracing level at boot time. This option is
+ very useful for debugging the driver, but it will add to the size
+ of the driver execution image and add overhead to the execution of
+ the driver.
+
+ The amount of debugging output can be controlled with the global
+ variable 'asc_dbglvl'. The higher the number the more output. By
+ default the debug level is 0.
+
+ If the driver is loaded at boot time and the LILO Driver Option
+ is included in the system, the debug level can be changed by
+ specifying a 5th (ASC_NUM_IOPORT_PROBE + 1) I/O Port. The
+ first three hex digits of the pseudo I/O Port must be set to
+ 'deb' and the fourth hex digit specifies the debug level: 0 - F.
+ The following command line will look for an adapter at 0x330
+ and set the debug level to 2.
+
+ linux advansys=0x330,0,0,0,0xdeb2
+
+ If the driver is built as a loadable module this variable can be
+ defined when the driver is loaded. The following insmod command
+ will set the debug level to one.
+
+ insmod advansys.o asc_dbglvl=1
+
+ Debugging Message Levels:
+ 0: Errors Only
+ 1: High-Level Tracing
+ 2-N: Verbose Tracing
+
+ To enable debug output to console, please make sure that:
+
+ a. System and kernel logging is enabled (syslogd, klogd running).
+ b. Kernel messages are routed to console output. Check
+ /etc/syslog.conf for an entry similar to this:
+
+ kern.* /dev/console
+
+ c. klogd is started with the appropriate -c parameter
+ (e.g. klogd -c 8)
+
+ This will cause printk() messages to be be displayed on the
+ current console. Refer to the klogd(8) and syslogd(8) man pages
+ for details.
+
+ Alternatively you can enable printk() to console with this
+ program. However, this is not the 'official' way to do this.
+ Debug output is logged in /var/log/messages.
+
+ main()
+ {
+ syscall(103, 7, 0, 0);
+ }
+
+ Increasing LOG_BUF_LEN in kernel/printk.c to something like
+ 40960 allows more debug messages to be buffered in the kernel
+ and written to the console or log file.
+
+3. ADVANSYS_STATS - Enable statistics (Def: Enabled)
+
+ Enabling this option adds statistics collection and display
+ through /proc to the driver. The information is useful for
+ monitoring driver and device performance. It will add to the
+ size of the driver execution image and add minor overhead to
+ the execution of the driver.
+
+ Statistics are maintained on a per adapter basis. Driver entry
+ point call counts and transfer size counts are maintained.
+ Statistics are only available for kernels greater than or equal
+ to v1.3.0 with the CONFIG_PROC_FS (/proc) file system configured.
+
+ AdvanSys SCSI adapter files have the following path name format:
+
+ /proc/scsi/advansys/{0,1,2,3,...}
+
+ This information can be displayed with cat. For example:
+
+ cat /proc/scsi/advansys/0
+
+ When ADVANSYS_STATS is not defined the AdvanSys /proc files only
+ contain adapter and device configuration information.
+
+Driver LILO Option
+
+If init/main.c is modified as described in the 'Directions for Adding
+the AdvanSys Driver to Linux' section (B.4.) above, the driver will
+recognize the 'advansys' LILO command line and /etc/lilo.conf option.
+This option can be used to either disable I/O port scanning or to limit
+scanning to 1 - 4 I/O ports. Regardless of the option setting EISA and
+PCI boards will still be searched for and detected. This option only
+affects searching for ISA and VL boards.
+
+Examples:
+ 1. Eliminate I/O port scanning:
+ boot: linux advansys=
+ or
+ boot: linux advansys=0x0
+ 2. Limit I/O port scanning to one I/O port:
+ boot: linux advansys=0x110
+ 3. Limit I/O port scanning to four I/O ports:
+ boot: linux advansys=0x110,0x210,0x230,0x330
+
+For a loadable module the same effect can be achieved by setting
+the 'asc_iopflag' variable and 'asc_ioport' array when loading
+the driver, e.g.
+
+ insmod advansys.o asc_iopflag=1 asc_ioport=0x110,0x330
+
+If ADVANSYS_DEBUG is defined a 5th (ASC_NUM_IOPORT_PROBE + 1)
+I/O Port may be added to specify the driver debug level. Refer to
+the 'Driver Compile Time Options and Debugging' section above for
+more information.
+
+Credits (Chronological Order)
+
+Bob Frey <bfrey@turbolinux.com.cn> wrote the AdvanSys SCSI driver
+and maintained it up to 3.3F. He continues to answer questions
+and help maintain the driver.
+
+Nathan Hartwell <mage@cdc3.cdc.net> provided the directions and
+basis for the Linux v1.3.X changes which were included in the
+1.2 release.
+
+Thomas E Zerucha <zerucha@shell.portal.com> pointed out a bug
+in advansys_biosparam() which was fixed in the 1.3 release.
+
+Erik Ratcliffe <erik@caldera.com> has done testing of the
+AdvanSys driver in the Caldera releases.
+
+Rik van Riel <H.H.vanRiel@fys.ruu.nl> provided a patch to
+AscWaitTixISRDone() which he found necessary to make the
+driver work with a SCSI-1 disk.
+
+Mark Moran <mmoran@mmoran.com> has helped test Ultra-Wide
+support in the 3.1A driver.
+
+Doug Gilbert <dgilbert@interlog.com> has made changes and
+suggestions to improve the driver and done a lot of testing.
+
+Ken Mort <ken@mort.net> reported a DEBUG compile bug fixed
+in 3.2K.
+
+Tom Rini <trini@kernel.crashing.org> provided the CONFIG_ISA
+patch and helped with PowerPC wide and narrow board support.
+
+Philip Blundell <philb@gnu.org> provided an
+advansys_interrupts_enabled patch.
+
+Dave Jones <dave@denial.force9.co.uk> reported the compiler
+warnings generated when CONFIG_PROC_FS was not defined in
+the 3.2M driver.
+
+Jerry Quinn <jlquinn@us.ibm.com> fixed PowerPC support (endian
+problems) for wide cards.
+
+Bryan Henderson <bryanh@giraffe-data.com> helped debug narrow
+card error handling.
+
+Manuel Veloso <veloso@pobox.com> worked hard on PowerPC narrow
+board support and fixed a bug in AscGetEEPConfig().
+
+Arnaldo Carvalho de Melo <acme@conectiva.com.br> made
+save_flags/restore_flags changes.
+
+Andy Kellner <AKellner@connectcom.net> continued the Advansys SCSI
+driver development for ConnectCom (Version > 3.3F).
+
+Ken Witherow for extensive testing during the development of version 3.4.
diff --git a/Documentation/scsi/aic79xx.txt b/Documentation/scsi/aic79xx.txt
index 6aa9a891f3d0..683ccae00ad4 100644
--- a/Documentation/scsi/aic79xx.txt
+++ b/Documentation/scsi/aic79xx.txt
@@ -120,7 +120,7 @@ The following information is available in this file:
list size to avoid SCSI malloc pool fragmentation.
- Cleanup channel display in our /proc output.
- Workaround duplicate device entries in the mid-layer
- devlice list during add-single-device.
+ device list during add-single-device.
1.3.6 (March 28th, 2003)
- Correct a double free in the Domain Validation code.
diff --git a/Documentation/scsi/aic7xxx.txt b/Documentation/scsi/aic7xxx.txt
index 5f34d2ba69b4..b7e238cbb5a7 100644
--- a/Documentation/scsi/aic7xxx.txt
+++ b/Documentation/scsi/aic7xxx.txt
@@ -159,7 +159,7 @@ The following information is available in this file:
- Add support for 2.5.X's scsi_report_device_reset().
6.2.34 (May 5th, 2003)
- - Fix locking regression instroduced in 6.2.29 that
+ - Fix locking regression introduced in 6.2.29 that
could cause a lock order reversal between the io_request_lock
and our per-softc lock. This was only possible on RH9,
SuSE, and kernel.org 2.4.X kernels.
@@ -264,7 +264,7 @@ The following information is available in this file:
Option: tag_info:{{value[,value...]}[,{value[,value...]}...]}
Definition: Set the per-target tagged queue depth on a
per controller basis. Both controllers and targets
- may be ommitted indicating that they should retain
+ may be omitted indicating that they should retain
the default tag depth.
Examples: tag_info:{{16,32,32,64,8,8,,32,32,32,32,32,32,32,32,32}
On Controller 0
@@ -290,7 +290,7 @@ The following information is available in this file:
-----------------------------------------------------------------
Option: dv: {value[,value...]}
Definition: Set Domain Validation Policy on a per-controller basis.
- Controllers may be ommitted indicating that
+ Controllers may be omitted indicating that
they should retain the default read streaming setting.
Example: dv:{-1,0,,1,1,0}
On Controller 0 leave DV at its default setting.
diff --git a/Documentation/scsi/arcmsr_spec.txt b/Documentation/scsi/arcmsr_spec.txt
index 5e0042340fd3..45d9482c1517 100644
--- a/Documentation/scsi/arcmsr_spec.txt
+++ b/Documentation/scsi/arcmsr_spec.txt
@@ -3,7 +3,7 @@
*******************************************************************************
** Usage of IOP331 adapter
** (All In/Out is in IOP331's view)
-** 1. Message 0 --> InitThread message and retrun code
+** 1. Message 0 --> InitThread message and return code
** 2. Doorbell is used for RS-232 emulation
** inDoorBell : bit0 -- data in ready
** (DRIVER DATA WRITE OK)
diff --git a/Documentation/scsi/ibmmca.txt b/Documentation/scsi/ibmmca.txt
index 9707941704e3..a810421f1fb3 100644
--- a/Documentation/scsi/ibmmca.txt
+++ b/Documentation/scsi/ibmmca.txt
@@ -21,7 +21,7 @@
versions older than 4.0 do not work with kernels 2.4.0 or later! If you
try to compile your kernel with the wrong driver source, the
compilation is aborted and you get a corresponding error message. This is
- no bug in the driver. It prevents you from using the wrong sourcecode
+ no bug in the driver; it prevents you from using the wrong source code
with the wrong kernel version.
Authors of this Driver
@@ -58,7 +58,7 @@
5 Users' Manual
5.1 Commandline Parameters
5.2 Troubleshooting
- 5.3 Bugreports
+ 5.3 Bug reports
5.4 Support WWW-page
6 References
7 Credits to
@@ -71,13 +71,13 @@
1 Abstract
----------
- This README-file describes the IBM SCSI-subsystem low level driver for
- Linux. The descriptions which were formerly kept in the source-code have
- been taken out to this file to easify the codes' readability. The driver
+ This README-file describes the IBM SCSI-subsystem low level driver for
+ Linux. The descriptions which were formerly kept in the source code have
+ been taken out of this file to simplify the codes readability. The driver
description has been updated, as most of the former description was already
- quite outdated. The history of the driver development is also kept inside
- here. Multiple historical developments have been summarized to shorten the
- textsize a bit. At the end of this file you can find a small manual for
+ quite outdated. The history of the driver development is also kept inside
+ here. Multiple historical developments have been summarized to shorten the
+ text size a bit. At the end of this file you can find a small manual for
this driver and hints to get it running on your machine.
2 Driver Description
@@ -186,7 +186,7 @@
between 0 and 7). The IBM SCSI-2 F/W adapter offers this on up to two
busses and provides support for 30 logical devices at the same time, where
in wide-addressing mode you can have 16 puns with 32 luns on each device.
- This section dexribes you the handling of devices on non-F/W adapters.
+ This section describes the handling of devices on non-F/W adapters.
Just imagine, that you can have 16 * 32 = 512 devices on a F/W adapter
which means a lot of possible devices for such a small machine.
@@ -209,10 +209,10 @@
--------------------------------------------------------
One consequence of information hiding is that the real (pun,lun)
numbers are also hidden. The two possibilities to get around this problem
- is to offer fake pun/lun combinations to the operating system or to
+ are to offer fake pun/lun combinations to the operating system or to
delete the whole mapping of the adapter and to reassign the ldns, using
the immediate assign command of the SCSI-subsystem for probing through
- all possible pun/lun combinations. a ldn is a "logical device number"
+ all possible pun/lun combinations. An ldn is a "logical device number"
which is used by IBM SCSI-subsystems to access some valid SCSI-device.
At the beginning of the development of this driver, the following approach
was used:
@@ -251,9 +251,9 @@
lun>0 or to non-existing devices, in order to satisfy the subsystem, if
there are less than 15 SCSI-devices connected. In the case of more than 15
devices, the dynamical mapping goes active. If the get_scsi[][] reports a
- device to be existant, but it has no ldn assigned, it gets a ldn out of 7
- to 14. The numbers are assigned in cyclic order. Therefore it takes 8
- dynamical reassignments on the SCSI-devices, until a certain device
+ device to be existent, but it has no ldn assigned, it gets an ldn out of 7
+ to 14. The numbers are assigned in cyclic order, therefore it takes 8
+ dynamical reassignments on the SCSI-devices until a certain device
loses its ldn again. This assures that dynamical remapping is avoided
during intense I/O between up to 15 SCSI-devices (means pun,lun
combinations). A further advantage of this method is that people who
@@ -551,7 +551,7 @@
than devices are available, they are assigned to non existing pun,lun
combinations to satisfy the adapter. With this, the dynamical mapping
was possible to implement. (For further info see the text in the
- source-code and in the description below. Read the description
+ source code and in the description below. Read the description
below BEFORE installing this driver on your system!)
2) Changed the name IBMMCA_DRIVER_VERSION to IBMMCA_SCSI_DRIVER_VERSION.
3) The LED-display shows on PS/2-95 no longer the ldn, but the SCSI-ID
@@ -762,9 +762,9 @@
- Michael Lang
Apr 23, 2000 (v3.2pre1)
- 1) During a very long time, I collected a huge amount of bugreports from
+ 1) During a very long time, I collected a huge amount of bug reports from
various people, trying really quite different things on their SCSI-
- PS/2s. Today, all these bugreports are taken into account and should be
+ PS/2s. Today, all these bug reports are taken into account and should be
mostly solved. The major topics were:
- Driver crashes during boottime by no obvious reason.
- Driver panics while the midlevel-SCSI-driver is trying to inquire
@@ -819,7 +819,7 @@
- Michael Lang
July 17, 2000 (v3.2pre8)
- A long period of collecting bugreports from all corners of the world
+ A long period of collecting bug reports from all corners of the world
now lead to the following corrections to the code:
1) SCSI-2 F/W support crashed with a COMMAND ERROR. The reason for this
was that it is possible to disable Fast-SCSI for the external bus.
@@ -873,7 +873,7 @@
July 26, 2000 (v3.2pre11)
1) I passed a horrible weekend getting mad with NMIs on kernel 2.2.14 and
a model 9595. Asking around in the community, nobody except of me has
- seen such errors. Weired, but I am trying to recompile everything on
+ seen such errors. Weird, but I am trying to recompile everything on
the model 9595. Maybe, as I use a specially modified gcc, that could
cause problems. But, it was not the reason. The true background was,
that the kernel was compiled for i386 and the 9595 has a 486DX-2.
@@ -886,7 +886,7 @@
alive rotator during boottime. This makes sense, when no monitor is
connected to the system. You can get rid of all display activity, if
you do not use any parameter or just ibmmcascsi=activity, for the
- harddrive activity LED, existant on all PS/2, except models 8595-XXX.
+ harddrive activity LED, existent on all PS/2, except models 8595-XXX.
If no monitor is available, please use ibmmcascsi=display, which works
fine together with the linuxinfo utility for the LED-panel.
- Michael Lang
@@ -1115,7 +1115,7 @@
If this really happens, do also send e-mail to the maintainer, as
forced detection should be never necessary. Forced detection is in
principal some flaw of the driver adapter detection and goes into
- bugreports.
+ bug reports.
Q: The driver screws up, if it starts to probe SCSI-devices, is there
some way out of it?
A: Yes, that was some recognition problem of the correct SCSI-adapter
@@ -1172,7 +1172,7 @@
recommended version is 3.2 or later. Here, the F/W support is in
a stable and reliable condition. Wide-addressing is in addition
supported.
- Q: I get a Ooops message and something like "killing interrupt".
+ Q: I get an Oops message and something like "killing interrupt".
A: The reason for this is that the IBM SCSI-subsystem only sends a
termination status back, if some error appeared. In former releases
of the driver, it was not checked, if the termination status block
@@ -1188,7 +1188,7 @@
and 15 get ignored by the driver & adapter!
Q: I have a 9595 and I get a NMI during heavy SCSI I/O e.g. during fsck.
A COMMAND ERROR is reported and characters on the screen are missing.
- Warm reboot is not possible. Things look like quite weired.
+ Warm reboot is not possible. Things look like quite weird.
A: Check the processor type of your 9595. If you have an 80486 or 486DX-2
processor complex on your mainboard and you compiled a kernel that
supports 80386 processors, it is possible, that the kernel cannot
@@ -1213,21 +1213,21 @@
problem. Not yet tried, but guessing that it could work. To get this,
set unchecked_isa_dma argument of ibmmca.h from 0 to 1.
- 5.3 Bugreports
+ 5.3 Bug reports
--------------
- If you really find bugs in the sourcecode or the driver will successfully
+ If you really find bugs in the source code or the driver will successfully
refuse to work on your machine, you should send a bug report to me. The
best for this is to follow the instructions on the WWW-page for this
driver. Fill out the bug-report form, placed on the WWW-page and ship it,
so the bugs can be taken into account with maximum efforts. But, please
do not send bug reports about this driver to Linus Torvalds or Leonard
- Zubkoff, as Linus is burried in E-Mail and Leonard is supervising all
+ Zubkoff, as Linus is buried in E-Mail and Leonard is supervising all
SCSI-drivers and won't have the time left to look inside every single
driver to fix a bug and especially DO NOT send modified code to Linus
Torvalds or Alan J. Cox which has not been checked here!!! They are both
- quite burried in E-mail (as me, sometimes, too) and one should first check
+ quite buried in E-mail (as me, sometimes, too) and one should first check
for problems on my local teststand. Recently, I got a lot of
- bugreports for errors in the ibmmca.c code, which I could not imagine, but
+ bug reports for errors in the ibmmca.c code, which I could not imagine, but
a look inside some Linux-distribution showed me quite often some modified
code, which did no longer work on most other machines than the one of the
modifier. Ok, so now that there is maintenance service available for this
@@ -1261,7 +1261,7 @@
some e-mail directly, but at least with the same information as required by
the formular.
- If you have extensive bugreports, including Ooops messages and
+ If you have extensive bug reports, including Oops messages and
screen-shots, please feel free to send it directly to the address
of the maintainer, too. The current address of the maintainer is:
@@ -1318,7 +1318,7 @@
detailed bug reports and ideas for this driver (and his
patience ;-)).
Alan J. Cox
- for his bugreports and his bold activities in cross-checking
+ for his bug reports and his bold activities in cross-checking
the driver-code with his teststand.
7.2 Sponsors & Supporters
diff --git a/Documentation/scsi/link_power_management_policy.txt b/Documentation/scsi/link_power_management_policy.txt
new file mode 100644
index 000000000000..d18993d01884
--- /dev/null
+++ b/Documentation/scsi/link_power_management_policy.txt
@@ -0,0 +1,19 @@
+This parameter allows the user to set the link (interface) power management.
+There are 3 possible options:
+
+Value Effect
+----------------------------------------------------------------------------
+min_power Tell the controller to try to make the link use the
+ least possible power when possible. This may
+ sacrifice some performance due to increased latency
+ when coming out of lower power states.
+
+max_performance Generally, this means no power management. Tell
+ the controller to have performance be a priority
+ over power management.
+
+medium_power Tell the controller to enter a lower power state
+ when possible, but do not enter the lowest power
+ state, thus improving latency over min_power setting.
+
+
diff --git a/Documentation/scsi/ncr53c8xx.txt b/Documentation/scsi/ncr53c8xx.txt
index 39d409a8efe5..230e30846ef2 100644
--- a/Documentation/scsi/ncr53c8xx.txt
+++ b/Documentation/scsi/ncr53c8xx.txt
@@ -785,8 +785,8 @@ port address 0x1400.
irqm:0 always open drain
irqm:1 same as initial settings (assumed BIOS settings)
irqm:2 always totem pole
- irqm:0x10 driver will not use SA_SHIRQ flag when requesting irq
- irqm:0x20 driver will not use SA_INTERRUPT flag when requesting irq
+ irqm:0x10 driver will not use IRQF_SHARED flag when requesting irq
+ irqm:0x20 driver will not use IRQF_DISABLED flag when requesting irq
(Bits 0x10 and 0x20 can be combined with hardware irq mode option)
@@ -1236,15 +1236,15 @@ when the SCSI DATA IN phase is reentered after a phase mismatch.
When an IRQ is shared by devices that are handled by different drivers, it
may happen that one driver complains about the request of the IRQ having
failed. Inder Linux-2.0, this may be due to one driver having requested the
-IRQ using the SA_INTERRUPT flag but some other having requested the same IRQ
+IRQ using the IRQF_DISABLED flag but some other having requested the same IRQ
without this flag. Under both Linux-2.0 and linux-2.2, this may be caused by
-one driver not having requested the IRQ with the SA_SHIRQ flag.
+one driver not having requested the IRQ with the IRQF_SHARED flag.
By default, the ncr53c8xx and sym53c8xx drivers request IRQs with both the
-SA_INTERRUPT and the SA_SHIRQ flag under Linux-2.0 and with only the SA_SHIRQ
+IRQF_DISABLED and the IRQF_SHARED flag under Linux-2.0 and with only the IRQF_SHARED
flag under Linux-2.2.
-Under Linux-2.0, you can disable use of SA_INTERRUPT flag from the boot
+Under Linux-2.0, you can disable use of IRQF_DISABLED flag from the boot
command line by using the following option:
ncr53c8xx=irqm:0x20 (for the generic ncr53c8xx driver)
@@ -1252,7 +1252,7 @@ command line by using the following option:
If this does not fix the problem, then you may want to check how all other
drivers are requesting the IRQ and report the problem. Note that if at least
-a single driver does not request the IRQ with the SA_SHIRQ flag (share IRQ),
+a single driver does not request the IRQ with the IRQF_SHARED flag (share IRQ),
then the request of the IRQ obviously will not succeed for all the drivers.
15. SCSI problem troubleshooting
diff --git a/Documentation/scsi/sym53c8xx_2.txt b/Documentation/scsi/sym53c8xx_2.txt
index 3d9f06bb3d00..49ea5c58c6bc 100644
--- a/Documentation/scsi/sym53c8xx_2.txt
+++ b/Documentation/scsi/sym53c8xx_2.txt
@@ -449,25 +449,14 @@ options as above.
cmd_per_lun=#tags (#tags > 1) tagged command queuing enabled
#tags will be truncated to the max queued commands configuration parameter.
-10.2.2 Detailed control of tagged commands
- This option allows you to specify a command queue depth for each device
- that supports tagged command queueing.
- Example:
- tag_ctrl=10/t2t3q16-t5q24/t1u2q32
- will set devices queue depth as follow:
- - controller #0 target #2 and target #3 -> 16 commands,
- - controller #0 target #5 -> 24 commands,
- - controller #1 target #1 logical unit #2 -> 32 commands,
- - all other logical units (all targets, all controllers) -> 10 commands.
-
-10.2.3 Burst max
+10.2.2 Burst max
burst=0 burst disabled
burst=255 get burst length from initial IO register settings.
burst=#x burst enabled (1<<#x burst transfers max)
#x is an integer value which is log base 2 of the burst transfers max.
By default the driver uses the maximum value supported by the chip.
-10.2.4 LED support
+10.2.3 LED support
led=1 enable LED support
led=0 disable LED support
Do not enable LED support if your scsi board does not use SDMS BIOS.
@@ -560,9 +549,9 @@ Previously, the sym2 driver accepted arguments of the form
sym53c8xx=tags:4,sync:10,debug:0x200
As a result of the new module parameters, this is no longer available.
-Most of the options have remained the same, but tags has split into
-cmd_per_lun and tag_ctrl for its two different purposes. The sample above
-would be specified as:
+Most of the options have remained the same, but tags has become
+cmd_per_lun to reflect its different purposes. The sample above would
+be specified as:
modprobe sym53c8xx cmd_per_lun=4 sync=10 debug=0x200
or on the kernel boot line as:
diff --git a/Documentation/sharedsubtree.txt b/Documentation/sharedsubtree.txt
index ccf1cebe744f..736540045dc7 100644
--- a/Documentation/sharedsubtree.txt
+++ b/Documentation/sharedsubtree.txt
@@ -153,6 +153,7 @@ replicas continue to be exactly same.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
+ #include <string.h>
#include <sys/mount.h>
#include <sys/fsuid.h>
diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt
index 241e26c4ff92..4b48c2e82c3c 100644
--- a/Documentation/sound/alsa/ALSA-Configuration.txt
+++ b/Documentation/sound/alsa/ALSA-Configuration.txt
@@ -365,13 +365,14 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
Module snd-cmipci
-----------------
- Module for C-Media CMI8338 and 8738 PCI sound cards.
+ Module for C-Media CMI8338/8738/8768/8770 PCI sound cards.
- mpu_port - 0x300,0x310,0x320,0x330 = legacy port,
- 1 = integrated PCI port,
+ mpu_port - port address of MIDI interface (8338 only):
+ 0x300,0x310,0x320,0x330 = legacy port,
0 = disable (default)
- fm_port - 0x388 = legacy port,
- 1 = integrated PCI port (default),
+ fm_port - port address of OPL-3 FM synthesizer (8x38 only):
+ 0x388 = legacy port,
+ 1 = integrated PCI port (default on 8738),
0 = disable
soft_ac3 - Software-conversion of raw SPDIF packets (model 033 only)
(default = 1)
@@ -768,6 +769,10 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
single_cmd - Use single immediate commands to communicate with
codecs (for debugging only)
enable_msi - Enable Message Signaled Interrupt (MSI) (default = off)
+ power_save - Automatic power-saving timtout (in second, 0 =
+ disable)
+ power_save_controller - Reset HD-audio controller in power-saving mode
+ (default = on)
This module supports one card and autoprobe.
@@ -828,6 +833,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
ALC268
3stack 3-stack model
+ toshiba Toshiba A205
+ acer Acer laptops
auto auto-config reading BIOS (default)
ALC662
@@ -842,7 +849,11 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
3stack-dig 3-jack with SPDIF I/O
6stack-dig 6-jack digital with SPDIF I/O
arima Arima W820Di1
+ targa Targa T8, MSI-1049 T8
+ asus-a7j ASUS A7J
+ asus-a7m ASUS A7M
macpro MacPro support
+ mbp3 Macbook Pro rev3
imac24 iMac 24'' with jack detection
w2jc ASUS W2JC
auto auto-config reading BIOS (default)
@@ -854,6 +865,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
3stack-6ch-dig 3-jack 6-channel with SPDIF I/O
6stack-dig-demo 6-jack digital for Intel demo board
acer Acer laptops (Travelmate 3012WTMi, Aspire 5600, etc)
+ acer-aspire Acer Aspire 9810
medion Medion Laptops
medion-md2 Medion MD2
targa-dig Targa/MSI
@@ -862,6 +874,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
lenovo-101e Lenovo 101E
lenovo-nb0763 Lenovo NB0763
lenovo-ms7195-dig Lenovo MS7195
+ haier-w66 Haier W66
6stack-hp HP machines with 6stack (Nettle boards)
3stack-hp HP machines with 3stack (Lucknow, Samba boards)
auto auto-config reading BIOS (default)
@@ -885,6 +898,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
3stack-660-digout 3-jack with SPDIF OUT (for ALC660VD)
lenovo Lenovo 3000 C200
dallas Dallas laptops
+ hp HP TX1000
auto auto-config reading BIOS (default)
CMI9880
@@ -920,6 +934,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
3stack 3-stack, shared surrounds
laptop 2-channel only (FSC V2060, Samsung M50)
laptop-eapd 2-channel with EAPD (Samsung R65, ASUS A6J)
+ laptop-automute 2-channel with EAPD and HP-automute (Lenovo N100)
ultra 2-channel with EAPD (Samsung Ultra tablet PC)
AD1988
@@ -945,14 +960,30 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
can be adjusted. Appearing only when compiled with
$CONFIG_SND_DEBUG=y
- STAC9200/9205/9254
+ STAC9200
ref Reference board
+ dell-d21 Dell (unknown)
+ dell-d22 Dell (unknown)
+ dell-d23 Dell (unknown)
+ dell-m21 Dell Inspiron 630m, Dell Inspiron 640m
+ dell-m22 Dell Latitude D620, Dell Latitude D820
+ dell-m23 Dell XPS M1710, Dell Precision M90
+ dell-m24 Dell Latitude 120L
+ dell-m25 Dell Inspiron E1505n
+ dell-m26 Dell Inspiron 1501
+ dell-m27 Dell Inspiron E1705/9400
+ gateway Gateway laptops with EAPD control
+
+ STAC9205/9254
+ ref Reference board
+ dell-m42 Dell (unknown)
+ dell-m43 Dell Precision
+ dell-m44 Dell Inspiron
STAC9220/9221
ref Reference board
3stack D945 3stack
5stack D945 5stack + SPDIF
- dell Dell XPS M1210
intel-mac-v1 Intel Mac Type 1
intel-mac-v2 Intel Mac Type 2
intel-mac-v3 Intel Mac Type 3
@@ -964,6 +995,10 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
macbook-pro Intel Mac Book Pro 2nd generation (eq. type 3)
imac-intel Intel iMac (eq. type 2)
imac-intel-20 Intel iMac (newer version) (eq. type 3)
+ dell-d81 Dell (unknown)
+ dell-d82 Dell (unknown)
+ dell-m81 Dell (unknown)
+ dell-m82 Dell XPS M1210
STAC9202/9250/9251
ref Reference board, base config
@@ -975,6 +1010,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
ref Reference board
3stack D965 3stack
5stack D965 5stack + SPDIF
+ dell-3stack Dell Dimension E520
STAC9872
vaio Setup for VAIO FE550G/SZ110
@@ -989,6 +1025,9 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
subsystem ID (output of "lspci -nv") to ALSA BTS or alsa-devel
ML (see the section "Links and Addresses").
+ power_save and power_save_controller options are for power-saving
+ mode. See powersave.txt for details.
+
Note 2: If you get click noises on output, try the module option
position_fix=1 or 2. position_fix=1 will use the SD_LPIB
register value without FIFO size correction as the current
@@ -1349,7 +1388,6 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
port - port number or -1 (disable)
irq - IRQ number or -1 (disable)
pnp - PnP detection - 0 = disable, 1 = enable (default)
- uart_enter - Issue UART_ENTER command at open - bool, default = on
This module supports multiple devices and PnP.
@@ -1630,6 +1668,21 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
The power-management is supported.
+ Module snd-sc6000
+ -----------------
+
+ Module for Gallant SC-6000 soundcard.
+
+ port - Port # (0x220 or 0x240)
+ mss_port - MSS Port # (0x530 or 0xe80)
+ irq - IRQ # (5,7,9,10,11)
+ mpu_irq - MPU-401 IRQ # (5,7,9,10) ,0 - no MPU-401 irq
+ dma - DMA # (1,3,0)
+
+ This module supports multiple cards.
+
+ This card is also known as Audio Excel DSP 16 or Zoltrix AV302.
+
Module snd-sgalaxy
------------------
@@ -1650,9 +1703,11 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
Module for ENSONIQ SoundScape PnP cards.
port - Port # (PnP setup)
+ wss_port - WSS Port # (PnP setup)
irq - IRQ # (PnP setup)
mpu_irq - MPU-401 IRQ # (PnP setup)
dma - DMA # (PnP setup)
+ dma2 - 2nd DMA # (PnP setup, -1 to disable)
This module supports multiple cards. ISA PnP must be enabled.
You need sscape_ctl tool in alsa-tools package for loading
@@ -1697,8 +1752,52 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
dma2 - DMA2 # for CS4232 PCM interface.
isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
+ The below are options for wavefront_synth features:
+ wf_raw - Assume that we need to boot the OS (default:no)
+ If yes, then during driver loading, the state of the board is
+ ignored, and we reset the board and load the firmware anyway.
+ fx_raw - Assume that the FX process needs help (default:yes)
+ If false, we'll leave the FX processor in whatever state it is
+ when the driver is loaded. The default is to download the
+ microprogram and associated coefficients to set it up for
+ "default" operation, whatever that means.
+ debug_default - Debug parameters for card initialization
+ wait_usecs - How long to wait without sleeping, usecs
+ (default:150)
+ This magic number seems to give pretty optimal throughput
+ based on my limited experimentation.
+ If you want to play around with it and find a better value, be
+ my guest. Remember, the idea is to get a number that causes us
+ to just busy wait for as many WaveFront commands as possible,
+ without coming up with a number so large that we hog the whole
+ CPU.
+ Specifically, with this number, out of about 134,000 status
+ waits, only about 250 result in a sleep.
+ sleep_interval - How long to sleep when waiting for reply
+ (default: 100)
+ sleep_tries - How many times to try sleeping during a wait
+ (default: 50)
+ ospath - Pathname to processed ICS2115 OS firmware
+ (default:wavefront.os)
+ The path name of the ISC2115 OS firmware. In the recent
+ version, it's handled via firmware loader framework, so it
+ must be installed in the proper path, typically,
+ /lib/firmware.
+ reset_time - How long to wait for a reset to take effect
+ (default:2)
+ ramcheck_time - How many seconds to wait for the RAM test
+ (default:20)
+ osrun_time - How many seconds to wait for the ICS2115 OS
+ (default:10)
+
This module supports multiple cards and ISA PnP.
+ Note: the firmware file "wavefront.os" was located in the earlier
+ version in /etc. Now it's loaded via firmware loader, and
+ must be in the proper firmware path, such as /lib/firmware.
+ Copy (or symlink) the file appropriately if you get an error
+ regarding firmware downloading after upgrading the kernel.
+
Module snd-sonicvibes
---------------------
diff --git a/Documentation/sound/alsa/CMIPCI.txt b/Documentation/sound/alsa/CMIPCI.txt
index 4b2b15387056..16935c8561f7 100644
--- a/Documentation/sound/alsa/CMIPCI.txt
+++ b/Documentation/sound/alsa/CMIPCI.txt
@@ -1,5 +1,5 @@
- Brief Notes on C-Media 8738/8338 Driver
- =======================================
+ Brief Notes on C-Media 8338/8738/8768/8770 Driver
+ =================================================
Takashi Iwai <tiwai@suse.de>
@@ -209,10 +209,13 @@ In addition to the standard SB mixer, CM8x38 provides more functions.
MIDI CONTROLLER
---------------
-The MPU401-UART interface is disabled as default. You need to set
-module option "mpu_port" with a valid I/O port address to enable the
-MIDI support. The valid I/O ports are 0x300, 0x310, 0x320 and 0x330.
-Choose the value which doesn't conflict with other cards.
+With CMI8338 chips, the MPU401-UART interface is disabled as default.
+You need to set the module option "mpu_port" to a valid I/O port address
+to enable MIDI support. Valid I/O ports are 0x300, 0x310, 0x320 and
+0x330. Choose a value that doesn't conflict with other cards.
+
+With CMI8738 and newer chips, the MIDI interface is enabled by default
+and the driver automatically chooses a port address.
There is _no_ hardware wavetable function on this chip (except for
OPL3 synth below).
@@ -230,6 +233,8 @@ Set "fm_port" module option for more cards.
The output quality of FM OPL/3 is, however, very weird.
I don't know why..
+CMI8768 and newer chips do not have the FM synth.
+
Joystick and Modem
------------------
diff --git a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
index 74d3a35b59bc..2c3fc3cb3b6b 100644
--- a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
+++ b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
@@ -18,8 +18,8 @@
</affiliation>
</author>
- <date>November 17, 2005</date>
- <edition>0.3.6</edition>
+ <date>September 10, 2007</date>
+ <edition>0.3.7</edition>
<abstract>
<para>
@@ -405,8 +405,9 @@
/* definition of the chip-specific record */
struct mychip {
struct snd_card *card;
- // rest of implementation will be in the section
- // "PCI Resource Managements"
+ /* rest of implementation will be in the section
+ * "PCI Resource Managements"
+ */
};
/* chip-specific destructor
@@ -414,7 +415,7 @@
*/
static int snd_mychip_free(struct mychip *chip)
{
- .... // will be implemented later...
+ .... /* will be implemented later... */
}
/* component-destructor
@@ -440,8 +441,9 @@
*rchip = NULL;
- // check PCI availability here
- // (see "PCI Resource Managements")
+ /* check PCI availability here
+ * (see "PCI Resource Managements")
+ */
....
/* allocate a chip-specific data with zero filled */
@@ -451,12 +453,13 @@
chip->card = card;
- // rest of initialization here; will be implemented
- // later, see "PCI Resource Managements"
+ /* rest of initialization here; will be implemented
+ * later, see "PCI Resource Managements"
+ */
....
- if ((err = snd_device_new(card, SNDRV_DEV_LOWLEVEL,
- chip, &ops)) < 0) {
+ err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops);
+ if (err < 0) {
snd_mychip_free(chip);
return err;
}
@@ -490,7 +493,8 @@
return -ENOMEM;
/* (3) */
- if ((err = snd_mychip_create(card, pci, &chip)) < 0) {
+ err = snd_mychip_create(card, pci, &chip);
+ if (err < 0) {
snd_card_free(card);
return err;
}
@@ -502,10 +506,11 @@
card->shortname, chip->ioport, chip->irq);
/* (5) */
- .... // implemented later
+ .... /* implemented later */
/* (6) */
- if ((err = snd_card_register(card)) < 0) {
+ err = snd_card_register(card);
+ if (err < 0) {
snd_card_free(card);
return err;
}
@@ -605,7 +610,8 @@
<![CDATA[
struct mychip *chip;
....
- if ((err = snd_mychip_create(card, pci, &chip)) < 0) {
+ err = snd_mychip_create(card, pci, &chip);
+ if (err < 0) {
snd_card_free(card);
return err;
}
@@ -666,7 +672,8 @@
<informalexample>
<programlisting>
<![CDATA[
- if ((err = snd_card_register(card)) < 0) {
+ err = snd_card_register(card);
+ if (err < 0) {
snd_card_free(card);
return err;
}
@@ -1091,7 +1098,7 @@
static int snd_mychip_free(struct mychip *chip)
{
/* disable hardware here if any */
- .... // (not implemented in this document)
+ .... /* (not implemented in this document) */
/* release the irq */
if (chip->irq >= 0)
@@ -1119,7 +1126,8 @@
*rchip = NULL;
/* initialize the PCI entry */
- if ((err = pci_enable_device(pci)) < 0)
+ err = pci_enable_device(pci);
+ if (err < 0)
return err;
/* check PCI availability (28bit DMA) */
if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 ||
@@ -1141,7 +1149,8 @@
chip->irq = -1;
/* (1) PCI resource allocation */
- if ((err = pci_request_regions(pci, "My Chip")) < 0) {
+ err = pci_request_regions(pci, "My Chip");
+ if (err < 0) {
kfree(chip);
pci_disable_device(pci);
return err;
@@ -1156,10 +1165,10 @@
chip->irq = pci->irq;
/* (2) initialization of the chip hardware */
- .... // (not implemented in this document)
+ .... /* (not implemented in this document) */
- if ((err = snd_device_new(card, SNDRV_DEV_LOWLEVEL,
- chip, &ops)) < 0) {
+ err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops);
+ if (err < 0) {
snd_mychip_free(chip);
return err;
}
@@ -1233,7 +1242,8 @@
<informalexample>
<programlisting>
<![CDATA[
- if ((err = pci_enable_device(pci)) < 0)
+ err = pci_enable_device(pci);
+ if (err < 0)
return err;
if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 ||
pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) {
@@ -1294,7 +1304,8 @@
<informalexample>
<programlisting>
<![CDATA[
- if ((err = pci_request_regions(pci, "My Chip")) < 0) {
+ err = pci_request_regions(pci, "My Chip");
+ if (err < 0) {
kfree(chip);
pci_disable_device(pci);
return err;
@@ -1322,7 +1333,7 @@
<programlisting>
<![CDATA[
if (request_irq(pci->irq, snd_mychip_interrupt,
- IRQF_DISABLED|IRQF_SHARED, "My Chip", chip)) {
+ IRQF_SHARED, "My Chip", chip)) {
printk(KERN_ERR "cannot grab irq %d\n", pci->irq);
snd_mychip_free(chip);
return -EBUSY;
@@ -1773,7 +1784,8 @@
struct snd_pcm_runtime *runtime = substream->runtime;
runtime->hw = snd_mychip_playback_hw;
- // more hardware-initialization will be done here
+ /* more hardware-initialization will be done here */
+ ....
return 0;
}
@@ -1781,7 +1793,8 @@
static int snd_mychip_playback_close(struct snd_pcm_substream *substream)
{
struct mychip *chip = snd_pcm_substream_chip(substream);
- // the hardware-specific codes will be here
+ /* the hardware-specific codes will be here */
+ ....
return 0;
}
@@ -1793,7 +1806,8 @@
struct snd_pcm_runtime *runtime = substream->runtime;
runtime->hw = snd_mychip_capture_hw;
- // more hardware-initialization will be done here
+ /* more hardware-initialization will be done here */
+ ....
return 0;
}
@@ -1801,7 +1815,8 @@
static int snd_mychip_capture_close(struct snd_pcm_substream *substream)
{
struct mychip *chip = snd_pcm_substream_chip(substream);
- // the hardware-specific codes will be here
+ /* the hardware-specific codes will be here */
+ ....
return 0;
}
@@ -1844,10 +1859,12 @@
{
switch (cmd) {
case SNDRV_PCM_TRIGGER_START:
- // do something to start the PCM engine
+ /* do something to start the PCM engine */
+ ....
break;
case SNDRV_PCM_TRIGGER_STOP:
- // do something to stop the PCM engine
+ /* do something to stop the PCM engine */
+ ....
break;
default:
return -EINVAL;
@@ -1900,8 +1917,8 @@
struct snd_pcm *pcm;
int err;
- if ((err = snd_pcm_new(chip->card, "My Chip", 0, 1, 1,
- &pcm)) < 0)
+ err = snd_pcm_new(chip->card, "My Chip", 0, 1, 1, &pcm);
+ if (err < 0)
return err;
pcm->private_data = chip;
strcpy(pcm->name, "My Chip");
@@ -1939,8 +1956,8 @@
struct snd_pcm *pcm;
int err;
- if ((err = snd_pcm_new(chip->card, "My Chip", 0, 1, 1,
- &pcm)) < 0)
+ err = snd_pcm_new(chip->card, "My Chip", 0, 1, 1, &pcm);
+ if (err < 0)
return err;
pcm->private_data = chip;
strcpy(pcm->name, "My Chip");
@@ -2097,7 +2114,7 @@
struct mychip *chip = snd_pcm_chip(pcm);
/* free your own data */
kfree(chip->my_private_pcm_data);
- // do what you like else
+ /* do what you like else */
....
}
@@ -2884,10 +2901,10 @@ struct _snd_pcm_runtime {
<![CDATA[
switch (cmd) {
case SNDRV_PCM_TRIGGER_START:
- // do something to start the PCM engine
+ /* do something to start the PCM engine */
break;
case SNDRV_PCM_TRIGGER_STOP:
- // do something to stop the PCM engine
+ /* do something to stop the PCM engine */
break;
default:
return -EINVAL;
@@ -3071,7 +3088,7 @@ struct _snd_pcm_runtime {
spin_unlock(&chip->lock);
snd_pcm_period_elapsed(chip->substream);
spin_lock(&chip->lock);
- // acknowledge the interrupt if necessary
+ /* acknowledge the interrupt if necessary */
}
....
spin_unlock(&chip->lock);
@@ -3134,7 +3151,7 @@ struct _snd_pcm_runtime {
snd_pcm_period_elapsed(substream);
spin_lock(&chip->lock);
}
- // acknowledge the interrupt if necessary
+ /* acknowledge the interrupt if necessary */
}
....
spin_unlock(&chip->lock);
@@ -3456,6 +3473,13 @@ struct _snd_pcm_runtime {
</para>
<para>
+ The <structfield>tlv</structfield> field can be used to provide
+ metadata about the control; see the
+ <link linkend="control-interface-tlv">
+ <citetitle>Metadata</citetitle></link> subsection.
+ </para>
+
+ <para>
The other three are
<link linkend="control-interface-callbacks"><citetitle>
callback functions</citetitle></link>.
@@ -3604,7 +3628,7 @@ struct _snd_pcm_runtime {
<title>Example of info callback</title>
<programlisting>
<![CDATA[
- static int snd_myctl_info(struct snd_kcontrol *kcontrol,
+ static int snd_myctl_mono_info(struct snd_kcontrol *kcontrol,
struct snd_ctl_elem_info *uinfo)
{
uinfo->type = SNDRV_CTL_ELEM_TYPE_BOOLEAN;
@@ -3639,7 +3663,7 @@ struct _snd_pcm_runtime {
<informalexample>
<programlisting>
<![CDATA[
- static int snd_myctl_info(struct snd_kcontrol *kcontrol,
+ static int snd_myctl_enum_info(struct snd_kcontrol *kcontrol,
struct snd_ctl_elem_info *uinfo)
{
static char *texts[4] = {
@@ -3658,6 +3682,16 @@ struct _snd_pcm_runtime {
</programlisting>
</informalexample>
</para>
+
+ <para>
+ Some common info callbacks are prepared for easy use:
+ <function>snd_ctl_boolean_mono_info()</function> and
+ <function>snd_ctl_boolean_stereo_info()</function>.
+ Obviously, the former is an info callback for a mono channel
+ boolean item, just like <function>snd_myctl_mono_info</function>
+ above, and the latter is for a stereo channel boolean item.
+ </para>
+
</section>
<section id="control-interface-callbacks-get">
@@ -3794,7 +3828,8 @@ struct _snd_pcm_runtime {
<informalexample>
<programlisting>
<![CDATA[
- if ((err = snd_ctl_add(card, snd_ctl_new1(&my_control, chip))) < 0)
+ err = snd_ctl_add(card, snd_ctl_new1(&my_control, chip));
+ if (err < 0)
return err;
]]>
</programlisting>
@@ -3843,6 +3878,56 @@ struct _snd_pcm_runtime {
</para>
</section>
+ <section id="control-interface-tlv">
+ <title>Metadata</title>
+ <para>
+ To provide information about the dB values of a mixer control, use
+ on of the <constant>DECLARE_TLV_xxx</constant> macros from
+ <filename>&lt;sound/tlv.h&gt;</filename> to define a variable
+ containing this information, set the<structfield>tlv.p
+ </structfield> field to point to this variable, and include the
+ <constant>SNDRV_CTL_ELEM_ACCESS_TLV_READ</constant> flag in the
+ <structfield>access</structfield> field; like this:
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static DECLARE_TLV_DB_SCALE(db_scale_my_control, -4050, 150, 0);
+
+ static struct snd_kcontrol_new my_control __devinitdata = {
+ ...
+ .access = SNDRV_CTL_ELEM_ACCESS_READWRITE |
+ SNDRV_CTL_ELEM_ACCESS_TLV_READ,
+ ...
+ .tlv.p = db_scale_my_control,
+ };
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The <function>DECLARE_TLV_DB_SCALE</function> macro defines
+ information about a mixer control where each step in the control's
+ value changes the dB value by a constant dB amount.
+ The first parameter is the name of the variable to be defined.
+ The second parameter is the minimum value, in units of 0.01 dB.
+ The third parameter is the step size, in units of 0.01 dB.
+ Set the fourth parameter to 1 if the minimum value actually mutes
+ the control.
+ </para>
+
+ <para>
+ The <function>DECLARE_TLV_DB_LINEAR</function> macro defines
+ information about a mixer control where the control's value affects
+ the output linearly.
+ The first parameter is the name of the variable to be defined.
+ The second parameter is the minimum value, in units of 0.01 dB.
+ The third parameter is the maximum value, in units of 0.01 dB.
+ If the minimum value mutes the control, set the second parameter to
+ <constant>TLV_DB_GAIN_MUTE</constant>.
+ </para>
+ </section>
+
</chapter>
@@ -3880,7 +3965,7 @@ struct _snd_pcm_runtime {
{
struct mychip *chip = ac97->private_data;
....
- // read a register value here from the codec
+ /* read a register value here from the codec */
return the_register_value;
}
@@ -3889,7 +3974,7 @@ struct _snd_pcm_runtime {
{
struct mychip *chip = ac97->private_data;
....
- // write the given register value to the codec
+ /* write the given register value to the codec */
}
static int snd_mychip_ac97(struct mychip *chip)
@@ -3902,7 +3987,8 @@ struct _snd_pcm_runtime {
.read = snd_mychip_ac97_read,
};
- if ((err = snd_ac97_bus(chip->card, 0, &ops, NULL, &bus)) < 0)
+ err = snd_ac97_bus(chip->card, 0, &ops, NULL, &bus);
+ if (err < 0)
return err;
memset(&ac97, 0, sizeof(ac97));
ac97.private_data = chip;
@@ -4447,10 +4533,10 @@ struct _snd_pcm_runtime {
<informalexample>
<programlisting>
<![CDATA[
- struct list_head *list;
struct snd_rawmidi_substream *substream;
- list_for_each(list, &rmidi->streams[SNDRV_RAWMIDI_STREAM_OUTPUT].substreams) {
- substream = list_entry(list, struct snd_rawmidi_substream, list);
+ list_for_each_entry(substream,
+ &rmidi->streams[SNDRV_RAWMIDI_STREAM_OUTPUT].substreams,
+ list {
sprintf(substream->name, "My MIDI Port %d", substream->number + 1);
}
/* same for SNDRV_RAWMIDI_STREAM_INPUT */
diff --git a/Documentation/sound/alsa/OSS-Emulation.txt b/Documentation/sound/alsa/OSS-Emulation.txt
index bfa0c9aacb4b..022aaeb0e9dd 100644
--- a/Documentation/sound/alsa/OSS-Emulation.txt
+++ b/Documentation/sound/alsa/OSS-Emulation.txt
@@ -303,10 +303,3 @@ ICE1712 supports only the unconventional format, interleaved
the buffer as the conventional (mono or 2-channels, 8 or 16bit) format
on OSS.
-USB devices
------------
-Some USB devices support only 24bit format packed in 3bytes. This
-format is not supported by OSS and no conversion is provided by kernel
-OSS emulation. You can use the user-space OSS emulation via libaoss
-instead.
-
diff --git a/Documentation/sound/alsa/hda_codec.txt b/Documentation/sound/alsa/hda_codec.txt
index 4eaae2a45534..8e1b02526698 100644
--- a/Documentation/sound/alsa/hda_codec.txt
+++ b/Documentation/sound/alsa/hda_codec.txt
@@ -49,6 +49,9 @@ struct hda_bus_ops {
unsigned int verb, unsigned int parm);
unsigned int (*get_response)(struct hda_codec *codec);
void (*private_free)(struct hda_bus *);
+#ifdef CONFIG_SND_HDA_POWER_SAVE
+ void (*pm_notify)(struct hda_codec *codec);
+#endif
};
The command callback is called when the codec module needs to send a
@@ -56,9 +59,16 @@ VERB to the controller. It's always a single command.
The get_response callback is called when the codec requires the answer
for the last command. These two callbacks are mandatory and have to
be given.
-The last, private_free callback, is optional. It's called in the
+The third, private_free callback, is optional. It's called in the
destructor to release any necessary data in the lowlevel driver.
+The pm_notify callback is available only with
+CONFIG_SND_HDA_POWER_SAVE kconfig. It's called when the codec needs
+to power up or may power down. The controller should check the all
+belonging codecs on the bus whether they are actually powered off
+(check codec->power_on), and optionally the driver may power down the
+contoller side, too.
+
The bus instance is created via snd_hda_bus_new(). You need to pass
the card instance, the template, and the pointer to store the
resultant bus instance.
@@ -86,10 +96,8 @@ resultant codec instance (can be NULL if not needed).
The codec is stored in a linked list of bus instance. You can follow
the codec list like:
- struct list_head *p;
struct hda_codec *codec;
- list_for_each(p, &bus->codec_list) {
- codec = list_entry(p, struct hda_codec, list);
+ list_for_each_entry(codec, &bus->codec_list, list) {
...
}
@@ -100,10 +108,15 @@ initialization sequence is called when the controls are built later.
Codec Access
============
-To access codec, use snd_codec_read() and snd_codec_write().
+To access codec, use snd_hda_codec_read() and snd_hda_codec_write().
snd_hda_param_read() is for reading parameters.
For writing a sequence of verbs, use snd_hda_sequence_write().
+There are variants of cached read/write, snd_hda_codec_write_cache(),
+snd_hda_sequence_write_cache(). These are used for recording the
+register states for the power-mangement resume. When no PM is needed,
+these are equivalent with non-cached version.
+
To retrieve the number of sub nodes connected to the given node, use
snd_hda_get_sub_nodes(). The connection list can be obtained via
snd_hda_get_connections() call.
@@ -239,6 +252,10 @@ set the codec->patch_ops field. This is defined as below:
int (*suspend)(struct hda_codec *codec, pm_message_t state);
int (*resume)(struct hda_codec *codec);
#endif
+ #ifdef CONFIG_SND_HDA_POWER_SAVE
+ int (*check_power_status)(struct hda_codec *codec,
+ hda_nid_t nid);
+ #endif
};
The build_controls callback is called from snd_hda_build_controls().
@@ -251,6 +268,18 @@ The unsol_event callback is called when an unsolicited event is
received.
The suspend and resume callbacks are for power management.
+They can be NULL if no special sequence is required. When the resume
+callback is NULL, the driver calls the init callback and resumes the
+registers from the cache. If other handling is needed, you'd need to
+write your own resume callback. There, the amp values can be resumed
+via
+ void snd_hda_codec_resume_amp(struct hda_codec *codec);
+and the other codec registers via
+ void snd_hda_codec_resume_cache(struct hda_codec *codec);
+
+The check_power_status callback is called when the amp value of the
+given widget NID is changed. The codec code can turn on/off the power
+appropriately from this information.
Each entry can be NULL if not necessary to be called.
@@ -267,8 +296,7 @@ Digital I/O
===========
Call snd_hda_create_spdif_out_ctls() from the patch to create controls
-related with SPDIF out. In the patch resume callback, call
-snd_hda_resume_spdif().
+related with SPDIF out.
Helper Functions
@@ -284,12 +312,7 @@ as a module parameter, and PCI subsystem IDs. If the matching entry
is found, it returns the config field value.
snd_hda_add_new_ctls() can be used to create and add control entries.
-Pass the zero-terminated array of struct snd_kcontrol_new. The same array
-can be passed to snd_hda_resume_ctls() for resume.
-Note that this will call control->put callback of these entries. So,
-put callback should check codec->in_resume and force to restore the
-given value if it's non-zero even if the value is identical with the
-cached value.
+Pass the zero-terminated array of struct snd_kcontrol_new
Macros HDA_CODEC_VOLUME(), HDA_CODEC_MUTE() and their variables can be
used for the entry of struct snd_kcontrol_new.
diff --git a/Documentation/sound/alsa/powersave.txt b/Documentation/sound/alsa/powersave.txt
new file mode 100644
index 000000000000..9657e8099228
--- /dev/null
+++ b/Documentation/sound/alsa/powersave.txt
@@ -0,0 +1,41 @@
+Notes on Power-Saving Mode
+==========================
+
+AC97 and HD-audio drivers have the automatic power-saving mode.
+This feature is enabled via Kconfig CONFIG_SND_AC97_POWER_SAVE
+and CONFIG_SND_HDA_POWER_SAVE options, respectively.
+
+With the automatic power-saving, the driver turns off the codec power
+appropriately when no operation is required. When no applications use
+the device and/or no analog loopback is set, the power disablement is
+done fully or partially. It'll save a certain power consumption, thus
+good for laptops (even for desktops).
+
+The time-out for automatic power-off can be specified via power_save
+module option of snd-ac97-codec and snd-hda-intel modules. Specify
+the time-out value in seconds. 0 means to disable the automatic
+power-saving. The default value of timeout is given via
+CONFIG_SND_AC97_POWER_SAVE_DEFAULT and
+CONFIG_SND_HDA_POWER_SAVE_DEFAULT Kconfig options. Setting this to 1
+(the minimum value) isn't recommended because many applications try to
+reopen the device frequently. 10 would be a good choice for normal
+operations.
+
+The power_save option is exported as writable. This means you can
+adjust the value via sysfs on the fly. For example, to turn on the
+automatic power-save mode with 10 seconds, write to
+/sys/modules/snd_ac97_codec/parameters/power_save (usually as root):
+
+ # echo 10 > /sys/modules/snd_ac97_codec/parameters/power_save
+
+
+Note that you might hear click noise/pop when changing the power
+state. Also, it often takes certain time to wake up from the
+power-down to the active state. These are often hardly to fix, so
+don't report extra bug reports unless you have a fix patch ;-)
+
+For HD-audio interface, there is another module option,
+power_save_controller. This enables/disables the power-save mode of
+the controller side. Setting this on may reduce a bit more power
+consumption, but might result in longer wake-up time and click noise.
+Try to turn it off when you experience such a thing too often.
diff --git a/Documentation/sound/alsa/soc/DAI.txt b/Documentation/sound/alsa/soc/DAI.txt
index 58cbfd01ea8f..3feeb9ecdec4 100644
--- a/Documentation/sound/alsa/soc/DAI.txt
+++ b/Documentation/sound/alsa/soc/DAI.txt
@@ -20,12 +20,12 @@ I2S
===
I2S is a common 4 wire DAI used in HiFi, STB and portable devices. The Tx and
-Rx lines are used for audio transmision, whilst the bit clock (BCLK) and
+Rx lines are used for audio transmission, whilst the bit clock (BCLK) and
left/right clock (LRC) synchronise the link. I2S is flexible in that either the
controller or CODEC can drive (master) the BCLK and LRC clock lines. Bit clock
usually varies depending on the sample rate and the master system clock
(SYSCLK). LRCLK is the same as the sample rate. A few devices support separate
-ADC and DAC LRCLK's, this allows for similtanious capture and playback at
+ADC and DAC LRCLK's, this allows for simultaneous capture and playback at
different sample rates.
I2S has several different operating modes:-
@@ -41,12 +41,12 @@ I2S has several different operating modes:-
PCM
===
-PCM is another 4 wire interface, very similar to I2S, that can support a more
+PCM is another 4 wire interface, very similar to I2S, which can support a more
flexible protocol. It has bit clock (BCLK) and sync (SYNC) lines that are used
to synchronise the link whilst the Tx and Rx lines are used to transmit and
receive the audio data. Bit clock usually varies depending on sample rate
whilst sync runs at the sample rate. PCM also supports Time Division
-Multiplexing (TDM) in that several devices can use the bus similtaniuosly (This
+Multiplexing (TDM) in that several devices can use the bus simultaneously (this
is sometimes referred to as network mode).
Common PCM operating modes:-
diff --git a/Documentation/sound/alsa/soc/clocking.txt b/Documentation/sound/alsa/soc/clocking.txt
index e93960d53a1e..14930887c25f 100644
--- a/Documentation/sound/alsa/soc/clocking.txt
+++ b/Documentation/sound/alsa/soc/clocking.txt
@@ -2,20 +2,20 @@ Audio Clocking
==============
This text describes the audio clocking terms in ASoC and digital audio in
-general. Note: Audio clocking can be complex !
+general. Note: Audio clocking can be complex!
Master Clock
------------
-Every audio subsystem is driven by a master clock (sometimes refered to as MCLK
+Every audio subsystem is driven by a master clock (sometimes referred to as MCLK
or SYSCLK). This audio master clock can be derived from a number of sources
(e.g. crystal, PLL, CPU clock) and is responsible for producing the correct
audio playback and capture sample rates.
-Some master clocks (e.g. PLL's and CPU based clocks) are configuarble in that
+Some master clocks (e.g. PLL's and CPU based clocks) are configurable in that
their speed can be altered by software (depending on the system use and to save
-power). Other master clocks are fixed at at set frequency (i.e. crystals).
+power). Other master clocks are fixed at a set frequency (i.e. crystals).
DAI Clocks
@@ -44,7 +44,7 @@ This relationship depends on the codec or SoC CPU in particular. In general
it's best to configure BCLK to the lowest possible speed (depending on your
rate, number of channels and wordsize) to save on power.
-It's also desireable to use the codec (if possible) to drive (or master) the
+It's also desirable to use the codec (if possible) to drive (or master) the
audio clocks as it's usually gives more accurate sample rates than the CPU.
diff --git a/Documentation/sound/alsa/soc/codec.txt b/Documentation/sound/alsa/soc/codec.txt
index 48983c75aad9..1e766ad0ebd1 100644
--- a/Documentation/sound/alsa/soc/codec.txt
+++ b/Documentation/sound/alsa/soc/codec.txt
@@ -19,7 +19,7 @@ Optionally, codec drivers can also provide:-
6) DAPM event handler.
7) DAC Digital mute control.
-It's probably best to use this guide in conjuction with the existing codec
+It's probably best to use this guide in conjunction with the existing codec
driver code in sound/soc/codecs/
ASoC Codec driver breakdown
@@ -28,7 +28,7 @@ ASoC Codec driver breakdown
1 - Codec DAI and PCM configuration
-----------------------------------
Each codec driver must have a struct snd_soc_codec_dai to define it's DAI and
-PCM's capablities and operations. This struct is exported so that it can be
+PCM's capabilities and operations. This struct is exported so that it can be
registered with the core by your machine driver.
e.g.
@@ -67,7 +67,7 @@ EXPORT_SYMBOL_GPL(wm8731_dai);
2 - Codec control IO
--------------------
-The codec can ususally be controlled via an I2C or SPI style interface (AC97
+The codec can usually be controlled via an I2C or SPI style interface (AC97
combines control with data in the DAI). The codec drivers will have to provide
functions to read and write the codec registers along with supplying a register
cache:-
diff --git a/Documentation/sound/alsa/soc/dapm.txt b/Documentation/sound/alsa/soc/dapm.txt
index c11877f5b4a1..ab0766fd7869 100644
--- a/Documentation/sound/alsa/soc/dapm.txt
+++ b/Documentation/sound/alsa/soc/dapm.txt
@@ -11,7 +11,7 @@ other PM systems.
DAPM is also completely transparent to all user space applications as all power
switching is done within the ASoC core. No code changes or recompiling are
-required for user space applications. DAPM makes power switching descisions based
+required for user space applications. DAPM makes power switching decisions based
upon any audio stream (capture/playback) activity and audio mixer settings
within the device.
@@ -38,7 +38,7 @@ There are 4 power domains within DAPM
Enabled and disabled when stream playback/capture is started and
stopped respectively. e.g. aplay, arecord.
-All DAPM power switching descisons are made automatically by consulting an audio
+All DAPM power switching decisions are made automatically by consulting an audio
routing map of the whole machine. This map is specific to each machine and
consists of the interconnections between every audio component (including
internal codec components). All audio components that effect power are called
diff --git a/Documentation/sound/alsa/soc/overview.txt b/Documentation/sound/alsa/soc/overview.txt
index 753c5cc5984a..c47ce9530677 100644
--- a/Documentation/sound/alsa/soc/overview.txt
+++ b/Documentation/sound/alsa/soc/overview.txt
@@ -2,18 +2,19 @@ ALSA SoC Layer
==============
The overall project goal of the ALSA System on Chip (ASoC) layer is to provide
-better ALSA support for embedded system on chip procesors (e.g. pxa2xx, au1x00,
+better ALSA support for embedded system-on-chip processors (e.g. pxa2xx, au1x00,
iMX, etc) and portable audio codecs. Currently there is some support in the
kernel for SoC audio, however it has some limitations:-
* Currently, codec drivers are often tightly coupled to the underlying SoC
- cpu. This is not ideal and leads to code duplication i.e. Linux now has 4
+ CPU. This is not ideal and leads to code duplication i.e. Linux now has 4
different wm8731 drivers for 4 different SoC platforms.
- * There is no standard method to signal user initiated audio events.
- e.g. Headphone/Mic insertion, Headphone/Mic detection after an insertion
- event. These are quite common events on portable devices and ofter require
- machine specific code to re route audio, enable amps etc after such an event.
+ * There is no standard method to signal user initiated audio events (e.g.
+ Headphone/Mic insertion, Headphone/Mic detection after an insertion
+ event). These are quite common events on portable devices and often require
+ machine specific code to re-route audio, enable amps, etc., after such an
+ event.
* Current drivers tend to power up the entire codec when playing
(or recording) audio. This is fine for a PC, but tends to waste a lot of
@@ -44,7 +45,7 @@ features :-
signals the codec when to change power states.
* Machine specific controls: Allow machines to add controls to the sound card
- e.g. volume control for speaker amp.
+ (e.g. volume control for speaker amp).
To achieve all this, ASoC basically splits an embedded audio system into 3
components :-
@@ -57,7 +58,7 @@ components :-
interface drivers (e.g. I2S, AC97, PCM) for that platform.
* Machine driver: The machine driver handles any machine specific controls and
- audio events. i.e. turing on an amp at start of playback.
+ audio events (e.g. turning on an amp at start of playback).
Documentation
diff --git a/Documentation/sound/alsa/soc/platform.txt b/Documentation/sound/alsa/soc/platform.txt
index e95b16d5a53b..d4678b4dc6c6 100644
--- a/Documentation/sound/alsa/soc/platform.txt
+++ b/Documentation/sound/alsa/soc/platform.txt
@@ -20,7 +20,7 @@ struct snd_soc_ops {
int (*trigger)(struct snd_pcm_substream *, int);
};
-The platform driver exports it's DMA functionailty via struct snd_soc_platform:-
+The platform driver exports its DMA functionality via struct snd_soc_platform:-
struct snd_soc_platform {
char *name;
diff --git a/Documentation/sound/alsa/soc/pops_clicks.txt b/Documentation/sound/alsa/soc/pops_clicks.txt
index 2cf7ee5b3d74..3371bd9d7cfa 100644
--- a/Documentation/sound/alsa/soc/pops_clicks.txt
+++ b/Documentation/sound/alsa/soc/pops_clicks.txt
@@ -2,7 +2,7 @@ Audio Pops and Clicks
=====================
Pops and clicks are unwanted audio artifacts caused by the powering up and down
-of components within the audio subsystem. This is noticable on PC's when an
+of components within the audio subsystem. This is noticeable on PCs when an
audio module is either loaded or unloaded (at module load time the sound card is
powered up and causes a popping noise on the speakers).
@@ -16,7 +16,7 @@ Minimising Playback Pops and Clicks
===================================
Playback pops in portable audio subsystems cannot be completely eliminated atm,
-however future audio codec hardware will have better pop and click supression.
+however future audio codec hardware will have better pop and click suppression.
Pops can be reduced within playback by powering the audio components in a
specific order. This order is different for startup and shutdown and follows
some basic rules:-
@@ -33,7 +33,7 @@ Minimising Capture Pops and Clicks
==================================
Capture artifacts are somewhat easier to get rid as we can delay activating the
-ADC until all the pops have occured. This follows similar power rules to
+ADC until all the pops have occurred. This follows similar power rules to
playback in that components are powered in a sequence depending upon stream
startup or shutdown.
diff --git a/Documentation/sound/oss/es1371 b/Documentation/sound/oss/es1371
deleted file mode 100644
index c3151266771c..000000000000
--- a/Documentation/sound/oss/es1371
+++ /dev/null
@@ -1,64 +0,0 @@
-/proc/sound, /dev/sndstat
--------------------------
-
-/proc/sound and /dev/sndstat is not supported by the
-driver. To find out whether the driver succeeded loading,
-check the kernel log (dmesg).
-
-
-ALaw/uLaw sample formats
-------------------------
-
-This driver does not support the ALaw/uLaw sample formats.
-ALaw is the default mode when opening a sound device
-using OSS/Free. The reason for the lack of support is
-that the hardware does not support these formats, and adding
-conversion routines to the kernel would lead to very ugly
-code in the presence of the mmap interface to the driver.
-And since xquake uses mmap, mmap is considered important :-)
-and no sane application uses ALaw/uLaw these days anyway.
-In short, playing a Sun .au file as follows:
-
-cat my_file.au > /dev/dsp
-
-does not work. Instead, you may use the play script from
-Chris Bagwell's sox-12.14 package (available from the URL
-below) to play many different audio file formats.
-The script automatically determines the audio format
-and does do audio conversions if necessary.
-http://home.sprynet.com/sprynet/cbagwell/projects.html
-
-
-Blocking vs. nonblocking IO
----------------------------
-
-Unlike OSS/Free this driver honours the O_NONBLOCK file flag
-not only during open, but also during read and write.
-This is an effort to make the sound driver interface more
-regular. Timidity has problems with this; a patch
-is available from http://www.ife.ee.ethz.ch/~sailer/linux/pciaudio.html.
-(Timidity patched will also run on OSS/Free).
-
-
-MIDI UART
----------
-
-The driver supports a simple MIDI UART interface, with
-no ioctl's supported.
-
-
-MIDI synthesizer
-----------------
-
-This soundcard does not have any hardware MIDI synthesizer;
-MIDI synthesis has to be done in software. To allow this
-the driver/soundcard supports two PCM (/dev/dsp) interfaces.
-
-There is a freely available software package that allows
-MIDI file playback on this soundcard called Timidity.
-See http://www.cgs.fi/~tt/timidity/.
-
-
-
-Thomas Sailer
-t.sailer@alumni.ethz.ch
diff --git a/Documentation/sparc/sbus_drivers.txt b/Documentation/sparc/sbus_drivers.txt
index 8418d35484fc..eb1e28ad8822 100644
--- a/Documentation/sparc/sbus_drivers.txt
+++ b/Documentation/sparc/sbus_drivers.txt
@@ -67,10 +67,12 @@ probe in an SBUS driver under Linux:
MODULE_DEVICE_TABLE(of, mydevice_match);
static struct of_platform_driver mydevice_driver = {
- .name = "mydevice",
.match_table = mydevice_match,
.probe = mydevice_probe,
.remove = __devexit_p(mydevice_remove),
+ .driver = {
+ .name = "mydevice",
+ },
};
static int __init mydevice_init(void)
diff --git a/Documentation/spi/pxa2xx b/Documentation/spi/pxa2xx
index 215e3b8e7266..f3853cc37bde 100644
--- a/Documentation/spi/pxa2xx
+++ b/Documentation/spi/pxa2xx
@@ -1,4 +1,4 @@
-PXA2xx SPI on SSP driver HOWTO
+PXA2xx SPI on SSP driver HOWTO
===================================================
This a mini howto on the pxa2xx_spi driver. The driver turns a PXA2xx
synchronous serial port into a SPI master controller
diff --git a/Documentation/spi/spi-summary b/Documentation/spi/spi-summary
index 76ea6c837be5..8861e47e5a2d 100644
--- a/Documentation/spi/spi-summary
+++ b/Documentation/spi/spi-summary
@@ -156,21 +156,29 @@ using the driver model to connect controller and protocol drivers using
device tables provided by board specific initialization code. SPI
shows up in sysfs in several locations:
+ /sys/devices/.../CTLR ... physical node for a given SPI controller
+
/sys/devices/.../CTLR/spiB.C ... spi_device on bus "B",
chipselect C, accessed through CTLR.
+ /sys/bus/spi/devices/spiB.C ... symlink to that physical
+ .../CTLR/spiB.C device
+
/sys/devices/.../CTLR/spiB.C/modalias ... identifies the driver
that should be used with this device (for hotplug/coldplug)
- /sys/bus/spi/devices/spiB.C ... symlink to the physical
- spiB.C device
-
/sys/bus/spi/drivers/D ... driver for one or more spi*.* devices
- /sys/class/spi_master/spiB ... class device for the controller
- managing bus "B". All the spiB.* devices share the same
+ /sys/class/spi_master/spiB ... symlink (or actual device node) to
+ a logical node which could hold class related state for the
+ controller managing bus "B". All spiB.* devices share one
physical SPI bus segment, with SCLK, MOSI, and MISO.
+Note that the actual location of the controller's class state depends
+on whether you enabled CONFIG_SYSFS_DEPRECATED or not. At this time,
+the only class-specific state is the bus number ("B" in "spiB"), so
+those /sys/class entries are only useful to quickly identify busses.
+
How does board-specific init code declare SPI devices?
------------------------------------------------------
@@ -337,7 +345,8 @@ SPI protocol drivers somewhat resemble platform device drivers:
The driver core will autmatically attempt to bind this driver to any SPI
device whose board_info gave a modalias of "CHIP". Your probe() code
-might look like this unless you're creating a class_device:
+might look like this unless you're creating a device which is managing
+a bus (appearing under /sys/class/spi_master).
static int __devinit CHIP_probe(struct spi_device *spi)
{
@@ -442,7 +451,7 @@ An SPI controller will probably be registered on the platform_bus; write
a driver to bind to the device, whichever bus is involved.
The main task of this type of driver is to provide an "spi_master".
-Use spi_alloc_master() to allocate the master, and class_get_devdata()
+Use spi_alloc_master() to allocate the master, and spi_master_get_devdata()
to get the driver-private data allocated for that device.
struct spi_master *master;
@@ -452,7 +461,7 @@ to get the driver-private data allocated for that device.
if (!master)
return -ENODEV;
- c = class_get_devdata(&master->cdev);
+ c = spi_master_get_devdata(master);
The driver will initialize the fields of that spi_master, including the
bus number (maybe the same as the platform device ID) and three methods
diff --git a/Documentation/spi/spidev_test.c b/Documentation/spi/spidev_test.c
index 218e86215297..cf0e3ce0d526 100644
--- a/Documentation/spi/spidev_test.c
+++ b/Documentation/spi/spidev_test.c
@@ -29,7 +29,7 @@ static void pabort(const char *s)
abort();
}
-static char *device = "/dev/spidev1.1";
+static const char *device = "/dev/spidev1.1";
static uint8_t mode;
static uint8_t bits = 8;
static uint32_t speed = 500000;
@@ -69,7 +69,7 @@ static void transfer(int fd)
puts("");
}
-void print_usage(char *prog)
+void print_usage(const char *prog)
{
printf("Usage: %s [-DsbdlHOLC3]\n", prog);
puts(" -D --device device to use (default /dev/spidev1.1)\n"
@@ -88,7 +88,7 @@ void print_usage(char *prog)
void parse_opts(int argc, char *argv[])
{
while (1) {
- static struct option lopts[] = {
+ static const struct option lopts[] = {
{ "device", 1, 0, 'D' },
{ "speed", 1, 0, 's' },
{ "delay", 1, 0, 'd' },
diff --git a/Documentation/sysctl/00-INDEX b/Documentation/sysctl/00-INDEX
new file mode 100644
index 000000000000..a20a9066dc4c
--- /dev/null
+++ b/Documentation/sysctl/00-INDEX
@@ -0,0 +1,16 @@
+00-INDEX
+ - this file.
+README
+ - general information about /proc/sys/ sysctl files.
+abi.txt
+ - documentation for /proc/sys/abi/*.
+ctl_unnumbered.txt
+ - explanation of why one should not add new binary sysctl numbers.
+fs.txt
+ - documentation for /proc/sys/fs/*.
+kernel.txt
+ - documentation for /proc/sys/kernel/*.
+sunrpc.txt
+ - documentation for /proc/sys/sunrpc/*.
+vm.txt
+ - documentation for /proc/sys/vm/*.
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 111fd28727ec..8984a5396271 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -320,6 +320,14 @@ kernel. This value defaults to SHMMAX.
==============================================================
+softlockup_thresh:
+
+This value can be used to lower the softlockup tolerance
+threshold. The default threshold is 10s. If a cpu is locked up
+for 10s, the kernel complains. Valid values are 1-60s.
+
+==============================================================
+
tainted:
Non-zero if the kernel has been tainted. Numeric values, which
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index a0ccc5b60260..b89570c30434 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -31,6 +31,7 @@ Currently, these files are in /proc/sys/vm:
- min_unmapped_ratio
- min_slab_ratio
- panic_on_oom
+- oom_kill_allocating_task
- mmap_min_address
- numa_zonelist_order
@@ -111,6 +112,12 @@ of kilobytes free. The VM uses this number to compute a pages_min
value for each lowmem zone in the system. Each lowmem zone gets
a number of reserved free pages based proportionally on its size.
+Some minimal ammount of memory is needed to satisfy PF_MEMALLOC
+allocations; if you set this to lower than 1024KB, your system will
+become subtly broken, and prone to deadlock under high loads.
+
+Setting this too high will OOM your machine instantly.
+
==============================================================
percpu_pagelist_fraction
@@ -220,6 +227,27 @@ The default value is 0.
1 and 2 are for failover of clustering. Please select either
according to your policy of failover.
+=============================================================
+
+oom_kill_allocating_task
+
+This enables or disables killing the OOM-triggering task in
+out-of-memory situations.
+
+If this is set to zero, the OOM killer will scan through the entire
+tasklist and select a task based on heuristics to kill. This normally
+selects a rogue memory-hogging task that frees up a large amount of
+memory when killed.
+
+If this is set to non-zero, the OOM killer simply kills the task that
+triggered the out-of-memory condition. This avoids the expensive
+tasklist scan.
+
+If panic_on_oom is selected, it takes precedence over whatever value
+is used in oom_kill_allocating_task.
+
+The default value is 0.
+
==============================================================
mmap_min_addr
diff --git a/Documentation/telephony/00-INDEX b/Documentation/telephony/00-INDEX
new file mode 100644
index 000000000000..4ffe0ed5b6fb
--- /dev/null
+++ b/Documentation/telephony/00-INDEX
@@ -0,0 +1,4 @@
+00-INDEX
+ - this file.
+ixj.txt
+ - document describing the Quicknet drivers.
diff --git a/Documentation/thinkpad-acpi.txt b/Documentation/thinkpad-acpi.txt
index 60953d6c919d..ec499265deca 100644
--- a/Documentation/thinkpad-acpi.txt
+++ b/Documentation/thinkpad-acpi.txt
@@ -105,10 +105,15 @@ The version of thinkpad-acpi's sysfs interface is exported by the driver
as a driver attribute (see below).
Sysfs driver attributes are on the driver's sysfs attribute space,
-for 2.6.20 this is /sys/bus/platform/drivers/thinkpad_acpi/.
+for 2.6.23 this is /sys/bus/platform/drivers/thinkpad_acpi/ and
+/sys/bus/platform/drivers/thinkpad_hwmon/
-Sysfs device attributes are on the driver's sysfs attribute space,
-for 2.6.20 this is /sys/devices/platform/thinkpad_acpi/.
+Sysfs device attributes are on the thinkpad_acpi device sysfs attribute
+space, for 2.6.23 this is /sys/devices/platform/thinkpad_acpi/.
+
+Sysfs device attributes for the sensors and fan are on the
+thinkpad_hwmon device's sysfs attribute space, but you should locate it
+looking for a hwmon device with the name attribute of "thinkpad".
Driver version
--------------
@@ -766,7 +771,7 @@ Temperature sensors
-------------------
procfs: /proc/acpi/ibm/thermal
-sysfs device attributes: (hwmon) temp*_input
+sysfs device attributes: (hwmon "thinkpad") temp*_input
Most ThinkPads include six or more separate temperature sensors but only
expose the CPU temperature through the standard ACPI methods. This
@@ -989,7 +994,9 @@ Fan control and monitoring: fan speed, fan enable/disable
---------------------------------------------------------
procfs: /proc/acpi/ibm/fan
-sysfs device attributes: (hwmon) fan_input, pwm1, pwm1_enable
+sysfs device attributes: (hwmon "thinkpad") fan1_input, pwm1,
+ pwm1_enable
+sysfs hwmon driver attributes: fan_watchdog
NOTE NOTE NOTE: fan control operations are disabled by default for
safety reasons. To enable them, the module parameter "fan_control=1"
@@ -1028,7 +1035,7 @@ enable it if necessary to avoid overheating.
An enabled fan in level "auto" may stop spinning if the EC decides the
ThinkPad is cool enough and doesn't need the extra airflow. This is
-normal, and the EC will spin the fan up if the varios thermal readings
+normal, and the EC will spin the fan up if the various thermal readings
rise too much.
On the X40, this seems to depend on the CPU and HDD temperatures.
@@ -1131,7 +1138,7 @@ hwmon device attribute fan1_input:
which can take up to two minutes. May return rubbish on older
ThinkPads.
-driver attribute fan_watchdog:
+hwmon driver attribute fan_watchdog:
Fan safety watchdog timer interval, in seconds. Minimum is
1 second, maximum is 120 seconds. 0 disables the watchdog.
@@ -1196,7 +1203,7 @@ for example:
Enabling debugging output
-------------------------
-The module takes a debug paramater which can be used to selectively
+The module takes a debug parameter which can be used to selectively
enable various classes of debugging output, for example:
modprobe ibm_acpi debug=0xffff
@@ -1233,3 +1240,9 @@ Sysfs interface changelog:
layer, the radio switch generates input event EV_RADIO,
and the driver enables hot key handling by default in
the firmware.
+
+0x020000: ABI fix: added a separate hwmon platform device and
+ driver, which must be located by name (thinkpad)
+ and the hwmon class for libsensors4 (lm-sensors 3)
+ compatibility. Moved all hwmon attributes to this
+ new platform device.
diff --git a/Documentation/usb/authorization.txt b/Documentation/usb/authorization.txt
new file mode 100644
index 000000000000..2af400609498
--- /dev/null
+++ b/Documentation/usb/authorization.txt
@@ -0,0 +1,92 @@
+
+Authorizing (or not) your USB devices to connect to the system
+
+(C) 2007 Inaky Perez-Gonzalez <inaky@linux.intel.com> Intel Corporation
+
+This feature allows you to control if a USB device can be used (or
+not) in a system. This feature will allow you to implement a lock-down
+of USB devices, fully controlled by user space.
+
+As of now, when a USB device is connected it is configured and
+it's interfaces inmediately made available to the users. With this
+modification, only if root authorizes the device to be configured will
+then it be possible to use it.
+
+Usage:
+
+Authorize a device to connect:
+
+$ echo 1 > /sys/usb/devices/DEVICE/authorized
+
+Deauthorize a device:
+
+$ echo 0 > /sys/usb/devices/DEVICE/authorized
+
+Set new devices connected to hostX to be deauthorized by default (ie:
+lock down):
+
+$ echo 0 > /sys/bus/devices/usbX/authorized_default
+
+Remove the lock down:
+
+$ echo 1 > /sys/bus/devices/usbX/authorized_default
+
+By default, Wired USB devices are authorized by default to
+connect. Wireless USB hosts deauthorize by default all new connected
+devices (this is so because we need to do an authentication phase
+before authorizing).
+
+
+Example system lockdown (lame)
+-----------------------
+
+Imagine you want to implement a lockdown so only devices of type XYZ
+can be connected (for example, it is a kiosk machine with a visible
+USB port):
+
+boot up
+rc.local ->
+
+ for host in /sys/bus/devices/usb*
+ do
+ echo 0 > $host/authorized_default
+ done
+
+Hookup an script to udev, for new USB devices
+
+ if device_is_my_type $DEV
+ then
+ echo 1 > $device_path/authorized
+ done
+
+
+Now, device_is_my_type() is where the juice for a lockdown is. Just
+checking if the class, type and protocol match something is the worse
+security verification you can make (or the best, for someone willing
+to break it). If you need something secure, use crypto and Certificate
+Authentication or stuff like that. Something simple for an storage key
+could be:
+
+function device_is_my_type()
+{
+ echo 1 > authorized # temporarily authorize it
+ # FIXME: make sure none can mount it
+ mount DEVICENODE /mntpoint
+ sum=$(md5sum /mntpoint/.signature)
+ if [ $sum = $(cat /etc/lockdown/keysum) ]
+ then
+ echo "We are good, connected"
+ umount /mntpoint
+ # Other stuff so others can use it
+ else
+ echo 0 > authorized
+ fi
+}
+
+
+Of course, this is lame, you'd want to do a real certificate
+verification stuff with PKI, so you don't depend on a shared secret,
+etc, but you get the idea. Anybody with access to a device gadget kit
+can fake descriptors and device info. Don't trust that. You are
+welcome.
+
diff --git a/Documentation/usb/power-management.txt b/Documentation/usb/power-management.txt
new file mode 100644
index 000000000000..97842deec471
--- /dev/null
+++ b/Documentation/usb/power-management.txt
@@ -0,0 +1,517 @@
+ Power Management for USB
+
+ Alan Stern <stern@rowland.harvard.edu>
+
+ October 5, 2007
+
+
+
+ What is Power Management?
+ -------------------------
+
+Power Management (PM) is the practice of saving energy by suspending
+parts of a computer system when they aren't being used. While a
+component is "suspended" it is in a nonfunctional low-power state; it
+might even be turned off completely. A suspended component can be
+"resumed" (returned to a functional full-power state) when the kernel
+needs to use it. (There also are forms of PM in which components are
+placed in a less functional but still usable state instead of being
+suspended; an example would be reducing the CPU's clock rate. This
+document will not discuss those other forms.)
+
+When the parts being suspended include the CPU and most of the rest of
+the system, we speak of it as a "system suspend". When a particular
+device is turned off while the system as a whole remains running, we
+call it a "dynamic suspend" (also known as a "runtime suspend" or
+"selective suspend"). This document concentrates mostly on how
+dynamic PM is implemented in the USB subsystem, although system PM is
+covered to some extent (see Documentation/power/*.txt for more
+information about system PM).
+
+Note: Dynamic PM support for USB is present only if the kernel was
+built with CONFIG_USB_SUSPEND enabled. System PM support is present
+only if the kernel was built with CONFIG_SUSPEND or CONFIG_HIBERNATION
+enabled.
+
+
+ What is Remote Wakeup?
+ ----------------------
+
+When a device has been suspended, it generally doesn't resume until
+the computer tells it to. Likewise, if the entire computer has been
+suspended, it generally doesn't resume until the user tells it to, say
+by pressing a power button or opening the cover.
+
+However some devices have the capability of resuming by themselves, or
+asking the kernel to resume them, or even telling the entire computer
+to resume. This capability goes by several names such as "Wake On
+LAN"; we will refer to it generically as "remote wakeup". When a
+device is enabled for remote wakeup and it is suspended, it may resume
+itself (or send a request to be resumed) in response to some external
+event. Examples include a suspended keyboard resuming when a key is
+pressed, or a suspended USB hub resuming when a device is plugged in.
+
+
+ When is a USB device idle?
+ --------------------------
+
+A device is idle whenever the kernel thinks it's not busy doing
+anything important and thus is a candidate for being suspended. The
+exact definition depends on the device's driver; drivers are allowed
+to declare that a device isn't idle even when there's no actual
+communication taking place. (For example, a hub isn't considered idle
+unless all the devices plugged into that hub are already suspended.)
+In addition, a device isn't considered idle so long as a program keeps
+its usbfs file open, whether or not any I/O is going on.
+
+If a USB device has no driver, its usbfs file isn't open, and it isn't
+being accessed through sysfs, then it definitely is idle.
+
+
+ Forms of dynamic PM
+ -------------------
+
+Dynamic suspends can occur in two ways: manual and automatic.
+"Manual" means that the user has told the kernel to suspend a device,
+whereas "automatic" means that the kernel has decided all by itself to
+suspend a device. Automatic suspend is called "autosuspend" for
+short. In general, a device won't be autosuspended unless it has been
+idle for some minimum period of time, the so-called idle-delay time.
+
+Of course, nothing the kernel does on its own initiative should
+prevent the computer or its devices from working properly. If a
+device has been autosuspended and a program tries to use it, the
+kernel will automatically resume the device (autoresume). For the
+same reason, an autosuspended device will usually have remote wakeup
+enabled, if the device supports remote wakeup.
+
+It is worth mentioning that many USB drivers don't support
+autosuspend. In fact, at the time of this writing (Linux 2.6.23) the
+only drivers which do support it are the hub driver, kaweth, asix,
+usblp, usblcd, and usb-skeleton (which doesn't count). If a
+non-supporting driver is bound to a device, the device won't be
+autosuspended. In effect, the kernel pretends the device is never
+idle.
+
+We can categorize power management events in two broad classes:
+external and internal. External events are those triggered by some
+agent outside the USB stack: system suspend/resume (triggered by
+userspace), manual dynamic suspend/resume (also triggered by
+userspace), and remote wakeup (triggered by the device). Internal
+events are those triggered within the USB stack: autosuspend and
+autoresume.
+
+
+ The user interface for dynamic PM
+ ---------------------------------
+
+The user interface for controlling dynamic PM is located in the power/
+subdirectory of each USB device's sysfs directory, that is, in
+/sys/bus/usb/devices/.../power/ where "..." is the device's ID. The
+relevant attribute files are: wakeup, level, and autosuspend.
+
+ power/wakeup
+
+ This file is empty if the device does not support
+ remote wakeup. Otherwise the file contains either the
+ word "enabled" or the word "disabled", and you can
+ write those words to the file. The setting determines
+ whether or not remote wakeup will be enabled when the
+ device is next suspended. (If the setting is changed
+ while the device is suspended, the change won't take
+ effect until the following suspend.)
+
+ power/level
+
+ This file contains one of three words: "on", "auto",
+ or "suspend". You can write those words to the file
+ to change the device's setting.
+
+ "on" means that the device should be resumed and
+ autosuspend is not allowed. (Of course, system
+ suspends are still allowed.)
+
+ "auto" is the normal state in which the kernel is
+ allowed to autosuspend and autoresume the device.
+
+ "suspend" means that the device should remain
+ suspended, and autoresume is not allowed. (But remote
+ wakeup may still be allowed, since it is controlled
+ separately by the power/wakeup attribute.)
+
+ power/autosuspend
+
+ This file contains an integer value, which is the
+ number of seconds the device should remain idle before
+ the kernel will autosuspend it (the idle-delay time).
+ The default is 2. 0 means to autosuspend as soon as
+ the device becomes idle, and -1 means never to
+ autosuspend. You can write a number to the file to
+ change the autosuspend idle-delay time.
+
+Writing "-1" to power/autosuspend and writing "on" to power/level do
+essentially the same thing -- they both prevent the device from being
+autosuspended. Yes, this is a redundancy in the API.
+
+(In 2.6.21 writing "0" to power/autosuspend would prevent the device
+from being autosuspended; the behavior was changed in 2.6.22. The
+power/autosuspend attribute did not exist prior to 2.6.21, and the
+power/level attribute did not exist prior to 2.6.22.)
+
+
+ Changing the default idle-delay time
+ ------------------------------------
+
+The default autosuspend idle-delay time is controlled by a module
+parameter in usbcore. You can specify the value when usbcore is
+loaded. For example, to set it to 5 seconds instead of 2 you would
+do:
+
+ modprobe usbcore autosuspend=5
+
+Equivalently, you could add to /etc/modprobe.conf a line saying:
+
+ options usbcore autosuspend=5
+
+Some distributions load the usbcore module very early during the boot
+process, by means of a program or script running from an initramfs
+image. To alter the parameter value you would have to rebuild that
+image.
+
+If usbcore is compiled into the kernel rather than built as a loadable
+module, you can add
+
+ usbcore.autosuspend=5
+
+to the kernel's boot command line.
+
+Finally, the parameter value can be changed while the system is
+running. If you do:
+
+ echo 5 >/sys/module/usbcore/parameters/autosuspend
+
+then each new USB device will have its autosuspend idle-delay
+initialized to 5. (The idle-delay values for already existing devices
+will not be affected.)
+
+Setting the initial default idle-delay to -1 will prevent any
+autosuspend of any USB device. This is a simple alternative to
+disabling CONFIG_USB_SUSPEND and rebuilding the kernel, and it has the
+added benefit of allowing you to enable autosuspend for selected
+devices.
+
+
+ Warnings
+ --------
+
+The USB specification states that all USB devices must support power
+management. Nevertheless, the sad fact is that many devices do not
+support it very well. You can suspend them all right, but when you
+try to resume them they disconnect themselves from the USB bus or
+they stop working entirely. This seems to be especially prevalent
+among printers and scanners, but plenty of other types of device have
+the same deficiency.
+
+For this reason, by default the kernel disables autosuspend (the
+power/level attribute is initialized to "on") for all devices other
+than hubs. Hubs, at least, appear to be reasonably well-behaved in
+this regard.
+
+(In 2.6.21 and 2.6.22 this wasn't the case. Autosuspend was enabled
+by default for almost all USB devices. A number of people experienced
+problems as a result.)
+
+This means that non-hub devices won't be autosuspended unless the user
+or a program explicitly enables it. As of this writing there aren't
+any widespread programs which will do this; we hope that in the near
+future device managers such as HAL will take on this added
+responsibility. In the meantime you can always carry out the
+necessary operations by hand or add them to a udev script. You can
+also change the idle-delay time; 2 seconds is not the best choice for
+every device.
+
+Sometimes it turns out that even when a device does work okay with
+autosuspend there are still problems. For example, there are
+experimental patches adding autosuspend support to the usbhid driver,
+which manages keyboards and mice, among other things. Tests with a
+number of keyboards showed that typing on a suspended keyboard, while
+causing the keyboard to do a remote wakeup all right, would
+nonetheless frequently result in lost keystrokes. Tests with mice
+showed that some of them would issue a remote-wakeup request in
+response to button presses but not to motion, and some in response to
+neither.
+
+The kernel will not prevent you from enabling autosuspend on devices
+that can't handle it. It is even possible in theory to damage a
+device by suspending it at the wrong time -- for example, suspending a
+USB hard disk might cause it to spin down without parking the heads.
+(Highly unlikely, but possible.) Take care.
+
+
+ The driver interface for Power Management
+ -----------------------------------------
+
+The requirements for a USB driver to support external power management
+are pretty modest; the driver need only define
+
+ .suspend
+ .resume
+ .reset_resume
+
+methods in its usb_driver structure, and the reset_resume method is
+optional. The methods' jobs are quite simple:
+
+ The suspend method is called to warn the driver that the
+ device is going to be suspended. If the driver returns a
+ negative error code, the suspend will be aborted. Normally
+ the driver will return 0, in which case it must cancel all
+ outstanding URBs (usb_kill_urb()) and not submit any more.
+
+ The resume method is called to tell the driver that the
+ device has been resumed and the driver can return to normal
+ operation. URBs may once more be submitted.
+
+ The reset_resume method is called to tell the driver that
+ the device has been resumed and it also has been reset.
+ The driver should redo any necessary device initialization,
+ since the device has probably lost most or all of its state
+ (although the interfaces will be in the same altsettings as
+ before the suspend).
+
+The reset_resume method is used by the USB Persist facility (see
+Documentation/usb/persist.txt) and it can also be used under certain
+circumstances when CONFIG_USB_PERSIST is not enabled. Currently, if a
+device is reset during a resume and the driver does not have a
+reset_resume method, the driver won't receive any notification about
+the resume. Later kernels will call the driver's disconnect method;
+2.6.23 doesn't do this.
+
+USB drivers are bound to interfaces, so their suspend and resume
+methods get called when the interfaces are suspended or resumed. In
+principle one might want to suspend some interfaces on a device (i.e.,
+force the drivers for those interface to stop all activity) without
+suspending the other interfaces. The USB core doesn't allow this; all
+interfaces are suspended when the device itself is suspended and all
+interfaces are resumed when the device is resumed. It isn't possible
+to suspend or resume some but not all of a device's interfaces. The
+closest you can come is to unbind the interfaces' drivers.
+
+
+ The driver interface for autosuspend and autoresume
+ ---------------------------------------------------
+
+To support autosuspend and autoresume, a driver should implement all
+three of the methods listed above. In addition, a driver indicates
+that it supports autosuspend by setting the .supports_autosuspend flag
+in its usb_driver structure. It is then responsible for informing the
+USB core whenever one of its interfaces becomes busy or idle. The
+driver does so by calling these three functions:
+
+ int usb_autopm_get_interface(struct usb_interface *intf);
+ void usb_autopm_put_interface(struct usb_interface *intf);
+ int usb_autopm_set_interface(struct usb_interface *intf);
+
+The functions work by maintaining a counter in the usb_interface
+structure. When intf->pm_usage_count is > 0 then the interface is
+deemed to be busy, and the kernel will not autosuspend the interface's
+device. When intf->pm_usage_count is <= 0 then the interface is
+considered to be idle, and the kernel may autosuspend the device.
+
+(There is a similar pm_usage_count field in struct usb_device,
+associated with the device itself rather than any of its interfaces.
+This field is used only by the USB core.)
+
+The driver owns intf->pm_usage_count; it can modify the value however
+and whenever it likes. A nice aspect of the usb_autopm_* routines is
+that the changes they make are protected by the usb_device structure's
+PM mutex (udev->pm_mutex); however drivers may change pm_usage_count
+without holding the mutex.
+
+ usb_autopm_get_interface() increments pm_usage_count and
+ attempts an autoresume if the new value is > 0 and the
+ device is suspended.
+
+ usb_autopm_put_interface() decrements pm_usage_count and
+ attempts an autosuspend if the new value is <= 0 and the
+ device isn't suspended.
+
+ usb_autopm_set_interface() leaves pm_usage_count alone.
+ It attempts an autoresume if the value is > 0 and the device
+ is suspended, and it attempts an autosuspend if the value is
+ <= 0 and the device isn't suspended.
+
+There also are a couple of utility routines drivers can use:
+
+ usb_autopm_enable() sets pm_usage_cnt to 1 and then calls
+ usb_autopm_set_interface(), which will attempt an autoresume.
+
+ usb_autopm_disable() sets pm_usage_cnt to 0 and then calls
+ usb_autopm_set_interface(), which will attempt an autosuspend.
+
+The conventional usage pattern is that a driver calls
+usb_autopm_get_interface() in its open routine and
+usb_autopm_put_interface() in its close or release routine. But
+other patterns are possible.
+
+The autosuspend attempts mentioned above will often fail for one
+reason or another. For example, the power/level attribute might be
+set to "on", or another interface in the same device might not be
+idle. This is perfectly normal. If the reason for failure was that
+the device hasn't been idle for long enough, a delayed workqueue
+routine is automatically set up to carry out the operation when the
+autosuspend idle-delay has expired.
+
+Autoresume attempts also can fail. This will happen if power/level is
+set to "suspend" or if the device doesn't manage to resume properly.
+Unlike autosuspend, there's no delay for an autoresume.
+
+
+ Other parts of the driver interface
+ -----------------------------------
+
+Sometimes a driver needs to make sure that remote wakeup is enabled
+during autosuspend. For example, there's not much point
+autosuspending a keyboard if the user can't cause the keyboard to do a
+remote wakeup by typing on it. If the driver sets
+intf->needs_remote_wakeup to 1, the kernel won't autosuspend the
+device if remote wakeup isn't available or has been disabled through
+the power/wakeup attribute. (If the device is already autosuspended,
+though, setting this flag won't cause the kernel to autoresume it.
+Normally a driver would set this flag in its probe method, at which
+time the device is guaranteed not to be autosuspended.)
+
+The usb_autopm_* routines have to run in a sleepable process context;
+they must not be called from an interrupt handler or while holding a
+spinlock. In fact, the entire autosuspend mechanism is not well geared
+toward interrupt-driven operation. However there is one thing a
+driver can do in an interrupt handler:
+
+ usb_mark_last_busy(struct usb_device *udev);
+
+This sets udev->last_busy to the current time. udev->last_busy is the
+field used for idle-delay calculations; updating it will cause any
+pending autosuspend to be moved back. The usb_autopm_* routines will
+also set the last_busy field to the current time.
+
+Calling urb_mark_last_busy() from within an URB completion handler is
+subject to races: The kernel may have just finished deciding the
+device has been idle for long enough but not yet gotten around to
+calling the driver's suspend method. The driver would have to be
+responsible for synchronizing its suspend method with its URB
+completion handler and causing the autosuspend to fail with -EBUSY if
+an URB had completed too recently.
+
+External suspend calls should never be allowed to fail in this way,
+only autosuspend calls. The driver can tell them apart by checking
+udev->auto_pm; this flag will be set to 1 for internal PM events
+(autosuspend or autoresume) and 0 for external PM events.
+
+Many of the ingredients in the autosuspend framework are oriented
+towards interfaces: The usb_interface structure contains the
+pm_usage_cnt field, and the usb_autopm_* routines take an interface
+pointer as their argument. But somewhat confusingly, a few of the
+pieces (usb_mark_last_busy() and udev->auto_pm) use the usb_device
+structure instead. Drivers need to keep this straight; they can call
+interface_to_usbdev() to find the device structure for a given
+interface.
+
+
+ Locking requirements
+ --------------------
+
+All three suspend/resume methods are always called while holding the
+usb_device's PM mutex. For external events -- but not necessarily for
+autosuspend or autoresume -- the device semaphore (udev->dev.sem) will
+also be held. This implies that external suspend/resume events are
+mutually exclusive with calls to probe, disconnect, pre_reset, and
+post_reset; the USB core guarantees that this is true of internal
+suspend/resume events as well.
+
+If a driver wants to block all suspend/resume calls during some
+critical section, it can simply acquire udev->pm_mutex.
+Alternatively, if the critical section might call some of the
+usb_autopm_* routines, the driver can avoid deadlock by doing:
+
+ down(&udev->dev.sem);
+ rc = usb_autopm_get_interface(intf);
+
+and at the end of the critical section:
+
+ if (!rc)
+ usb_autopm_put_interface(intf);
+ up(&udev->dev.sem);
+
+Holding the device semaphore will block all external PM calls, and the
+usb_autopm_get_interface() will prevent any internal PM calls, even if
+it fails. (Exercise: Why?)
+
+The rules for locking order are:
+
+ Never acquire any device semaphore while holding any PM mutex.
+
+ Never acquire udev->pm_mutex while holding the PM mutex for
+ a device that isn't a descendant of udev.
+
+In other words, PM mutexes should only be acquired going up the device
+tree, and they should be acquired only after locking all the device
+semaphores you need to hold. These rules don't matter to drivers very
+much; they usually affect just the USB core.
+
+Still, drivers do need to be careful. For example, many drivers use a
+private mutex to synchronize their normal I/O activities with their
+disconnect method. Now if the driver supports autosuspend then it
+must call usb_autopm_put_interface() from somewhere -- maybe from its
+close method. It should make the call while holding the private mutex,
+since a driver shouldn't call any of the usb_autopm_* functions for an
+interface from which it has been unbound.
+
+But the usb_autpm_* routines always acquire the device's PM mutex, and
+consequently the locking order has to be: private mutex first, PM
+mutex second. Since the suspend method is always called with the PM
+mutex held, it mustn't try to acquire the private mutex. It has to
+synchronize with the driver's I/O activities in some other way.
+
+
+ Interaction between dynamic PM and system PM
+ --------------------------------------------
+
+Dynamic power management and system power management can interact in
+a couple of ways.
+
+Firstly, a device may already be manually suspended or autosuspended
+when a system suspend occurs. Since system suspends are supposed to
+be as transparent as possible, the device should remain suspended
+following the system resume. The 2.6.23 kernel obeys this principle
+for manually suspended devices but not for autosuspended devices; they
+do get resumed when the system wakes up. (Presumably they will be
+autosuspended again after their idle-delay time expires.) In later
+kernels this behavior will be fixed.
+
+(There is an exception. If a device would undergo a reset-resume
+instead of a normal resume, and the device is enabled for remote
+wakeup, then the reset-resume takes place even if the device was
+already suspended when the system suspend began. The justification is
+that a reset-resume is a kind of remote-wakeup event. Or to put it
+another way, a device which needs a reset won't be able to generate
+normal remote-wakeup signals, so it ought to be resumed immediately.)
+
+Secondly, a dynamic power-management event may occur as a system
+suspend is underway. The window for this is short, since system
+suspends don't take long (a few seconds usually), but it can happen.
+For example, a suspended device may send a remote-wakeup signal while
+the system is suspending. The remote wakeup may succeed, which would
+cause the system suspend to abort. If the remote wakeup doesn't
+succeed, it may still remain active and thus cause the system to
+resume as soon as the system suspend is complete. Or the remote
+wakeup may fail and get lost. Which outcome occurs depends on timing
+and on the hardware and firmware design.
+
+More interestingly, a device might undergo a manual resume or
+autoresume during system suspend. With current kernels this shouldn't
+happen, because manual resumes must be initiated by userspace and
+autoresumes happen in response to I/O requests, but all user processes
+and I/O should be quiescent during a system suspend -- thanks to the
+freezer. However there are plans to do away with the freezer, which
+would mean these things would become possible. If and when this comes
+about, the USB core will carefully arrange matters so that either type
+of resume will block until the entire system has resumed.
diff --git a/Documentation/usb/usb-serial.txt b/Documentation/usb/usb-serial.txt
index 5b635ae84944..8b077e43eee7 100644
--- a/Documentation/usb/usb-serial.txt
+++ b/Documentation/usb/usb-serial.txt
@@ -338,7 +338,7 @@ MCT USB Single Port Serial Adapter U232
This driver is for the MCT USB-RS232 Converter (25 pin, Model No.
U232-P25) from Magic Control Technology Corp. (there is also a 9 pin
Model No. U232-P9). More information about this device can be found at
- the manufacture's web-site: http://www.mct.com.tw.
+ the manufacturer's web-site: http://www.mct.com.tw.
The driver is generally working, though it still needs some more testing.
It is derived from the Belkin USB Serial Adapter F5U103 driver and its
@@ -428,6 +428,17 @@ Options supported:
See http://www.uuhaus.de/linux/palmconnect.html for up-to-date
information on this driver.
+Winchiphead CH341 Driver
+
+ This driver is for the Winchiphead CH341 USB-RS232 Converter. This chip
+ also implements an IEEE 1284 parallel port, I2C and SPI, but that is not
+ supported by the driver. The protocol was analyzed from the behaviour
+ of the Windows driver, no datasheet is available at present.
+ The manufacturer's website: http://www.winchiphead.com/.
+ For any questions or problems with this driver, please contact
+ frank@kingswood-consulting.co.uk.
+
+
Generic Serial driver
If your device is not one of the above listed devices, compatible with
diff --git a/Documentation/usb/usbmon.txt b/Documentation/usb/usbmon.txt
index 53ae866ae37b..2917ce4ffdc4 100644
--- a/Documentation/usb/usbmon.txt
+++ b/Documentation/usb/usbmon.txt
@@ -34,9 +34,12 @@ if usbmon is built into the kernel.
Verify that bus sockets are present.
# ls /sys/kernel/debug/usbmon
-1s 1t 1u 2s 2t 2u 3s 3t 3u 4s 4t 4u
+0s 0t 0u 1s 1t 1u 2s 2t 2u 3s 3t 3u 4s 4t 4u
#
+Now you can choose to either use the sockets numbered '0' (to capture packets on
+all buses), and skip to step #3, or find the bus used by your device with step #2.
+
2. Find which bus connects to the desired device
Run "cat /proc/bus/usb/devices", and find the T-line which corresponds to
@@ -56,6 +59,10 @@ Bus=03 means it's bus 3.
# cat /sys/kernel/debug/usbmon/3u > /tmp/1.mon.out
+to listen on a single bus, otherwise, to listen on all buses, type:
+
+# cat /sys/kernel/debug/usbmon/0u > /tmp/1.mon.out
+
This process will be reading until killed. Naturally, the output can be
redirected to a desirable location. This is preferred, because it is going
to be quite long.
diff --git a/Documentation/video4linux/CARDLIST.bttv b/Documentation/video4linux/CARDLIST.bttv
index 177159c5f4c4..d97cf7cc6088 100644
--- a/Documentation/video4linux/CARDLIST.bttv
+++ b/Documentation/video4linux/CARDLIST.bttv
@@ -147,3 +147,4 @@
146 -> SSAI Ultrasound Video Interface [414a:5353]
147 -> VoodooTV 200 (USA) [121a:3000]
148 -> DViCO FusionHDTV 2 [dbc0:d200]
+149 -> Typhoon TV-Tuner PCI (50684)
diff --git a/Documentation/video4linux/CARDLIST.cx23885 b/Documentation/video4linux/CARDLIST.cx23885
new file mode 100644
index 000000000000..00cb646a4bde
--- /dev/null
+++ b/Documentation/video4linux/CARDLIST.cx23885
@@ -0,0 +1,5 @@
+ 0 -> UNKNOWN/GENERIC [0070:3400]
+ 1 -> Hauppauge WinTV-HVR1800lp [0070:7600]
+ 2 -> Hauppauge WinTV-HVR1800 [0070:7800,0070:7801]
+ 3 -> Hauppauge WinTV-HVR1250 [0070:7911]
+ 4 -> DViCO FusionHDTV5 Express [18ac:d500]
diff --git a/Documentation/video4linux/CARDLIST.saa7134 b/Documentation/video4linux/CARDLIST.saa7134
index 3f8aeab50a10..a14545300e4c 100644
--- a/Documentation/video4linux/CARDLIST.saa7134
+++ b/Documentation/video4linux/CARDLIST.saa7134
@@ -88,11 +88,11 @@
87 -> ADS Instant TV Duo Cardbus PTV331 [0331:1421]
88 -> Tevion/KWorld DVB-T 220RF [17de:7201]
89 -> ELSA EX-VISION 700TV [1048:226c]
- 90 -> Kworld ATSC110 [17de:7350]
+ 90 -> Kworld ATSC110/115 [17de:7350,17de:7352]
91 -> AVerMedia A169 B [1461:7360]
92 -> AVerMedia A169 B1 [1461:6360]
93 -> Medion 7134 Bridge #2 [16be:0005]
- 94 -> LifeView FlyDVB-T Hybrid Cardbus [5168:3306,5168:3502]
+ 94 -> LifeView FlyDVB-T Hybrid Cardbus/MSI TV @nywhere A/D NB [5168:3306,5168:3502,4e42:3502]
95 -> LifeView FlyVIDEO3000 (NTSC) [5169:0138]
96 -> Medion Md8800 Quadro [16be:0007,16be:0008]
97 -> LifeView FlyDVB-S /Acorp TV134DS [5168:0300,4e42:0300]
@@ -115,3 +115,4 @@
114 -> KWorld DVB-T 210 [17de:7250]
115 -> Sabrent PCMCIA TV-PCB05 [0919:2003]
116 -> 10MOONS TM300 TV Card [1131:2304]
+117 -> Avermedia Super 007 [1461:f01d]
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
new file mode 100644
index 000000000000..2131b00b63f6
--- /dev/null
+++ b/Documentation/vm/00-INDEX
@@ -0,0 +1,20 @@
+00-INDEX
+ - this file.
+balance
+ - various information on memory balancing.
+hugetlbpage.txt
+ - a brief summary of hugetlbpage support in the Linux kernel.
+locking
+ - info on how locking and synchronization is done in the Linux vm code.
+numa
+ - information about NUMA specific code in the Linux vm.
+numa_memory_policy.txt
+ - documentation of concepts and APIs of the 2.6 memory policy support.
+overcommit-accounting
+ - description of the Linux kernels overcommit handling modes.
+page_migration
+ - description of page migration in NUMA systems.
+slabinfo.c
+ - source code for a tool to get reports about slabs.
+slub.txt
+ - a short users guide for SLUB.
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
index 8242f52d0f22..dd4986497996 100644
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -302,31 +302,30 @@ MEMORY POLICIES AND CPUSETS
Memory policies work within cpusets as described above. For memory policies
that require a node or set of nodes, the nodes are restricted to the set of
-nodes whose memories are allowed by the cpuset constraints. If the
-intersection of the set of nodes specified for the policy and the set of nodes
-allowed by the cpuset is the empty set, the policy is considered invalid and
-cannot be installed.
+nodes whose memories are allowed by the cpuset constraints. If the nodemask
+specified for the policy contains nodes that are not allowed by the cpuset, or
+the intersection of the set of nodes specified for the policy and the set of
+nodes with memory is the empty set, the policy is considered invalid
+and cannot be installed.
The interaction of memory policies and cpusets can be problematic for a
couple of reasons:
-1) the memory policy APIs take physical node id's as arguments. However, the
- memory policy APIs do not provide a way to determine what nodes are valid
- in the context where the application is running. An application MAY consult
- the cpuset file system [directly or via an out of tree, and not generally
- available, libcpuset API] to obtain this information, but then the
- application must be aware that it is running in a cpuset and use what are
- intended primarily as administrative APIs.
-
- However, as long as the policy specifies at least one node that is valid
- in the controlling cpuset, the policy can be used.
+1) the memory policy APIs take physical node id's as arguments. As mentioned
+ above, it is illegal to specify nodes that are not allowed in the cpuset.
+ The application must query the allowed nodes using the get_mempolicy()
+ API with the MPOL_F_MEMS_ALLOWED flag to determine the allowed nodes and
+ restrict itself to those nodes. However, the resources available to a
+ cpuset can be changed by the system administrator, or a workload manager
+ application, at any time. So, a task may still get errors attempting to
+ specify policy nodes, and must query the allowed memories again.
2) when tasks in two cpusets share access to a memory region, such as shared
memory segments created by shmget() of mmap() with the MAP_ANONYMOUS and
MAP_SHARED flags, and any of the tasks install shared policy on the region,
only nodes whose memories are allowed in both cpusets may be used in the
- policies. Again, obtaining this information requires "stepping outside"
- the memory policy APIs, as well as knowing in what cpusets other task might
- be attaching to the shared region, to use the cpuset information.
+ policies. Obtaining this information requires "stepping outside" the
+ memory policy APIs to use the cpuset information and requires that one
+ know in what cpusets other task might be attaching to the shared region.
Furthermore, if the cpusets' allowed memory sets are disjoint, "local"
allocation is the only valid policy.
diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c
index 1af7bd5a2183..7047696c47a1 100644
--- a/Documentation/vm/slabinfo.c
+++ b/Documentation/vm/slabinfo.c
@@ -11,6 +11,7 @@
#include <stdlib.h>
#include <sys/types.h>
#include <dirent.h>
+#include <strings.h>
#include <string.h>
#include <unistd.h>
#include <stdarg.h>
@@ -84,7 +85,7 @@ void fatal(const char *x, ...)
va_start(ap, x);
vfprintf(stderr, x, ap);
va_end(ap);
- exit(1);
+ exit(EXIT_FAILURE);
}
void usage(void)
@@ -119,14 +120,14 @@ void usage(void)
);
}
-unsigned long read_obj(char *name)
+unsigned long read_obj(const char *name)
{
FILE *f = fopen(name, "r");
if (!f)
buffer[0] = 0;
else {
- if (!fgets(buffer,sizeof(buffer), f))
+ if (!fgets(buffer, sizeof(buffer), f))
buffer[0] = 0;
fclose(f);
if (buffer[strlen(buffer)] == '\n')
@@ -139,7 +140,7 @@ unsigned long read_obj(char *name)
/*
* Get the contents of an attribute
*/
-unsigned long get_obj(char *name)
+unsigned long get_obj(const char *name)
{
if (!read_obj(name))
return 0;
@@ -147,7 +148,7 @@ unsigned long get_obj(char *name)
return atol(buffer);
}
-unsigned long get_obj_and_str(char *name, char **x)
+unsigned long get_obj_and_str(const char *name, char **x)
{
unsigned long result = 0;
char *p;
@@ -166,12 +167,12 @@ unsigned long get_obj_and_str(char *name, char **x)
return result;
}
-void set_obj(struct slabinfo *s, char *name, int n)
+void set_obj(struct slabinfo *s, const char *name, int n)
{
char x[100];
FILE *f;
- sprintf(x, "%s/%s", s->name, name);
+ snprintf(x, 100, "%s/%s", s->name, name);
f = fopen(x, "w");
if (!f)
fatal("Cannot write to %s\n", x);
@@ -180,13 +181,13 @@ void set_obj(struct slabinfo *s, char *name, int n)
fclose(f);
}
-unsigned long read_slab_obj(struct slabinfo *s, char *name)
+unsigned long read_slab_obj(struct slabinfo *s, const char *name)
{
char x[100];
FILE *f;
- int l;
+ size_t l;
- sprintf(x, "%s/%s", s->name, name);
+ snprintf(x, 100, "%s/%s", s->name, name);
f = fopen(x, "r");
if (!f) {
buffer[0] = 0;
@@ -453,7 +454,7 @@ void slabcache(struct slabinfo *s)
return;
store_size(size_str, slab_size(s));
- sprintf(dist_str,"%lu/%lu/%d", s->slabs, s->partial, s->cpu_slabs);
+ snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs, s->partial, s->cpu_slabs);
if (!line++)
first_line();
@@ -1062,6 +1063,7 @@ void read_slab_dir(void)
slab->partial = get_obj("partial");
slab->partial = get_obj_and_str("partial", &t);
decode_numa_list(slab->numa_partial, t);
+ free(t);
slab->poison = get_obj("poison");
slab->reclaim_account = get_obj("reclaim_account");
slab->red_zone = get_obj("red_zone");
@@ -1069,6 +1071,7 @@ void read_slab_dir(void)
slab->slab_size = get_obj("slab_size");
slab->slabs = get_obj_and_str("slabs", &t);
decode_numa_list(slab->numa, t);
+ free(t);
slab->store_user = get_obj("store_user");
slab->trace = get_obj("trace");
chdir("..");
@@ -1148,7 +1151,7 @@ int main(int argc, char *argv[])
while ((c = getopt_long(argc, argv, "ad::efhil1noprstvzTS",
opts, NULL)) != -1)
- switch(c) {
+ switch (c) {
case '1':
show_single_ref = 1;
break;
diff --git a/Documentation/w1/00-INDEX b/Documentation/w1/00-INDEX
new file mode 100644
index 000000000000..5270cf4cb109
--- /dev/null
+++ b/Documentation/w1/00-INDEX
@@ -0,0 +1,8 @@
+00-INDEX
+ - This file
+masters/
+ - Individual chips providing 1-wire busses.
+w1.generic
+ - The 1-wire (w1) bus
+w1.netlink
+ - Userspace communication protocol over connector [1].
diff --git a/Documentation/w1/masters/00-INDEX b/Documentation/w1/masters/00-INDEX
new file mode 100644
index 000000000000..752613c4cea2
--- /dev/null
+++ b/Documentation/w1/masters/00-INDEX
@@ -0,0 +1,6 @@
+00-INDEX
+ - This file
+ds2482
+ - The Maxim/Dallas Semiconductor DS2482 provides 1-wire busses.
+ds2490
+ - The Maxim/Dallas Semiconductor DS2490 builds USB <-> W1 bridges.
diff --git a/Documentation/w1/masters/ds2482 b/Documentation/w1/masters/ds2482
index c5d5478d90b2..9210d6fa5024 100644
--- a/Documentation/w1/masters/ds2482
+++ b/Documentation/w1/masters/ds2482
@@ -15,7 +15,7 @@ Author: Ben Gardner <bgardner@wabtec.com>
Description
-----------
-The Maixm/Dallas Semiconductor DS2482 is a I2C device that provides
+The Maxim/Dallas Semiconductor DS2482 is a I2C device that provides
one (DS2482-100) or eight (DS2482-800) 1-wire busses.
diff --git a/Documentation/w1/masters/ds2490 b/Documentation/w1/masters/ds2490
index 44a4918bd7f2..239f9ae01843 100644
--- a/Documentation/w1/masters/ds2490
+++ b/Documentation/w1/masters/ds2490
@@ -10,7 +10,7 @@ Author: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Description
-----------
-The Maixm/Dallas Semiconductor DS2490 is a chip
+The Maxim/Dallas Semiconductor DS2490 is a chip
which allows to build USB <-> W1 bridges.
DS9490(R) is a USB <-> W1 bus master device
diff --git a/Documentation/watchdog/src/watchdog-simple.c b/Documentation/watchdog/src/watchdog-simple.c
index 47801bc7e742..4cf72f3fa8e9 100644
--- a/Documentation/watchdog/src/watchdog-simple.c
+++ b/Documentation/watchdog/src/watchdog-simple.c
@@ -3,15 +3,25 @@
#include <unistd.h>
#include <fcntl.h>
-int main(int argc, const char *argv[]) {
+int main(void)
+{
int fd = open("/dev/watchdog", O_WRONLY);
+ int ret = 0;
if (fd == -1) {
perror("watchdog");
- exit(1);
+ exit(EXIT_FAILURE);
}
while (1) {
- write(fd, "\0", 1);
- fsync(fd);
+ ret = write(fd, "\0", 1);
+ if (ret != 1) {
+ ret = -1;
+ break;
+ }
+ ret = fsync(fd);
+ if (ret)
+ break;
sleep(10);
}
+ close(fd);
+ return ret;
}
diff --git a/Documentation/x86_64/mm.txt b/Documentation/x86_64/mm.txt
index f42798ed1c54..b89b6d2bebfa 100644
--- a/Documentation/x86_64/mm.txt
+++ b/Documentation/x86_64/mm.txt
@@ -9,6 +9,7 @@ ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole
ffff810000000000 - ffffc0ffffffffff (=46 bits) direct mapping of all phys. memory
ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole
ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space
+ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
ffffffff80000000 - ffffffff82800000 (=40 MB) kernel text mapping, from phys 0
... unused hole ...
diff --git a/Documentation/xterm-linux.xpm b/Documentation/xterm-linux.xpm
deleted file mode 100644
index f469c1a18e6e..000000000000
--- a/Documentation/xterm-linux.xpm
+++ /dev/null
@@ -1,61 +0,0 @@
-/* XPM */
-/*****************************************************************************/
-/** This pixmap was made by Torsten Poulin - 1996 - torsten@diku.dk **/
-/** It was made by combining xterm-blank.xpm with **/
-/** the wonderfully cute Linux penguin mascot by Larry Ewing. **/
-/** I had to change Larry's penguin a little to make it fit. **/
-/** xterm-blank.xpm contained the following comment: **/
-/** This pixmap is kindly offered by Ion Cionca - 1992 - **/
-/** Swiss Federal Institute of Technology **/
-/** Central Computing Service **/
-/*****************************************************************************/
-static char * image_name [] = {
-/**/
-"64 38 8 1",
-/**/
-" s mask c none",
-". c gray70",
-"X c gray85",
-"o c gray50",
-"O c yellow",
-"+ c darkolivegreen",
-"@ c white",
-"# c black",
-" ###### ",
-" ######## ",
-" ########## ........................... ",
-" ########### .XXXXXXXXXXXXXXXXXXXXXXXXXXX. ",
-" ########### .XXXXXXXXXXXXXXXXXXXXXXXXXXXXXoo ",
-" #@@@#@@@### .XX+++++++++++++++++++++++XXXXoo ",
-" #@#@#@#@### .XX++++++++++++++++++++++++XXXooo ",
-" #@#####@### .XX++@@+@++@+@@@@++@+++++++XXXooo ",
-" ###OOO######.XX++++++++++++++++++++++++XXXoooo ",
-" ##OOOOOO####.XX++@@@@+@@+@@@+++++++++++XXXoooo ",
-" #O#OOO#O####.XX++++++++++++++++++++++++XXXooooo ",
-" ##O###OO####.XX++@@@@@@@@@@+@@@@@++++++XXXooooo ",
-" ###OOOO@#####XX++++++++++++++++++++++++XXXooooo ",
-" ##@###@@@@####XX++@@@+@@@@+@@++@@@++++++XXXooooo ",
-" #@@@@@@@@@@####X++++++++++++++++++++++++XXXooooo ",
-" ##@@@@@@@@@@#####++@+++++++++++++++++++++XXXooooo ",
-" ###@@@@@@@@@@######+++++++++++++++++++++++XXXooooo ",
-" ####@@@@@@@@@@@#####+@@@@+@+@@@+@++++++++++XXXooooo ",
-" ###@@@@@@@@@@@@######++++++++++++++++++++++XXXooooo ",
-" ##@@@@@@@@@@@@@@#####@+@@@@++++++++++++++++XXXooooo ",
-" ###@@@@@@@@@@@@@@######++++++++++++++++++++XXXXoooo ",
-" ###@@@@@@@@@@@@@@######XXXXXXXXXXXXXXXXXXXXXXXXooo ",
-" ###@@@@@@@@@@@@@@@######XXXXXXXXXXXXXXXXXXXXXXXooo ",
-" ###@@@@@@@@@@@@@@@@#####ooooooooooooooooooooooo...oo ",
-" ###@@@@@@@@@@@@@@@######.........................ooo ",
-" #OO##@@@@@@@@@@@@@#######oooooooooooooooooooooooooooo ",
-" #OOO##@@@@@@@@@@@#OO####O#XXXXXXXXXXXXXXXXXXXXXXXoooo.. .. ",
-" ###OOOOO##@@@@@@@@@@#OOO#OOO#XXXXXXXXXXXXXX#######XXoooo . .",
-" #OOOOOOOO###@@@@@@@@@#OOOOOOO#ooooooooooooooooooooXXXooo . ",
-" #OOOOOOOOO###@@@@@@@@@#OOOOOOO##XXXXXXXXXXXXXXXXXooooo . ",
-" #OOOOOOOOO#@@@@@@@@###OOOOOOOOO#XXXXXXXXXXXXXXXoo oooooo ",
-" #OOOOOOOOO#@@@@@@@####OOOOOOOO#@@@@@@@@@@@XXXXXoo ooooo...o ",
-" #OOOOOOOOOOO###########OOOOOO##XXXXXXXXXXXXXXXXoo ooXXXoo..o ",
-" ##OOOOOOOOO###########OOOO##@@@@@@@@@@@@@XXXXoo oXXXXX..o ",
-" ###OOOO### oXX##OOO#XXXXXXXXXXXXXXXXXXoo o.....oo ",
-" #### oooo####oooooooooooooooooooo ooooooo ",
-" ",
-" "};