| // Copyright The Prometheus Authors |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package blockdevice |
| |
| import ( |
| "bufio" |
| "errors" |
| "fmt" |
| "io" |
| "os" |
| "strings" |
| |
| "github.com/prometheus/procfs" |
| "github.com/prometheus/procfs/internal/fs" |
| "github.com/prometheus/procfs/internal/util" |
| ) |
| |
| // Info contains identifying information for a block device such as a disk drive. |
| type Info struct { |
| MajorNumber uint32 |
| MinorNumber uint32 |
| DeviceName string |
| } |
| |
| // IOStats models the iostats data described in the kernel documentation. |
| // - https://www.kernel.org/doc/Documentation/iostats.txt, |
| // - https://www.kernel.org/doc/Documentation/block/stat.txt |
| // - https://www.kernel.org/doc/Documentation/ABI/testing/procfs-diskstats |
| type IOStats struct { |
| // ReadIOs is the number of reads completed successfully. |
| ReadIOs uint64 |
| // ReadMerges is the number of reads merged. Reads and writes |
| // which are adjacent to each other may be merged for efficiency. |
| ReadMerges uint64 |
| // ReadSectors is the total number of sectors read successfully. |
| ReadSectors uint64 |
| // ReadTicks is the total number of milliseconds spent by all reads. |
| ReadTicks uint64 |
| // WriteIOs is the total number of writes completed successfully. |
| WriteIOs uint64 |
| // WriteMerges is the number of reads merged. |
| WriteMerges uint64 |
| // WriteSectors is the total number of sectors written successfully. |
| WriteSectors uint64 |
| // WriteTicks is the total number of milliseconds spent by all writes. |
| WriteTicks uint64 |
| // IOsInProgress is number of I/Os currently in progress. |
| IOsInProgress uint64 |
| // IOsTotalTicks is the number of milliseconds spent doing I/Os. |
| // This field increases so long as IosInProgress is nonzero. |
| IOsTotalTicks uint64 |
| // WeightedIOTicks is the weighted number of milliseconds spent doing I/Os. |
| // This can also be used to estimate average queue wait time for requests. |
| WeightedIOTicks uint64 |
| // DiscardIOs is the total number of discards completed successfully. |
| DiscardIOs uint64 |
| // DiscardMerges is the number of discards merged. |
| DiscardMerges uint64 |
| // DiscardSectors is the total number of sectors discarded successfully. |
| DiscardSectors uint64 |
| // DiscardTicks is the total number of milliseconds spent by all discards. |
| DiscardTicks uint64 |
| // FlushRequestsCompleted is the total number of flush request completed successfully. |
| FlushRequestsCompleted uint64 |
| // TimeSpentFlushing is the total number of milliseconds spent flushing. |
| TimeSpentFlushing uint64 |
| } |
| |
| // Diskstats combines the device Info and IOStats. |
| type Diskstats struct { |
| Info |
| IOStats |
| // IoStatsCount contains the number of io stats read. For kernel versions 5.5+, |
| // there should be 20 fields read. For kernel versions 4.18+, |
| // there should be 18 fields read. For earlier kernel versions this |
| // will be 14 because the discard values are not available. |
| IoStatsCount int |
| } |
| |
| // BlockQueueStats models the queue files that are located in the sysfs tree for each block device |
| // and described in the kernel documentation: |
| // https://www.kernel.org/doc/Documentation/block/queue-sysfs.txt |
| // https://www.kernel.org/doc/html/latest/block/queue-sysfs.html |
| type BlockQueueStats struct { |
| // AddRandom is the status of a disk entropy (1 is on, 0 is off). |
| AddRandom uint64 |
| // Dax indicates whether the device supports Direct Access (DAX) (1 is on, 0 is off). |
| DAX uint64 |
| // DiscardGranularity is the size of internal allocation of the device in bytes, 0 means device |
| // does not support the discard functionality. |
| DiscardGranularity uint64 |
| // DiscardMaxHWBytes is the hardware maximum number of bytes that can be discarded in a single operation, |
| // 0 means device does not support the discard functionality. |
| DiscardMaxHWBytes uint64 |
| // DiscardMaxBytes is the software maximum number of bytes that can be discarded in a single operation. |
| DiscardMaxBytes uint64 |
| // HWSectorSize is the sector size of the device, in bytes. |
| HWSectorSize uint64 |
| // IOPoll indicates if polling is enabled (1 is on, 0 is off). |
| IOPoll uint64 |
| // IOPollDelay indicates how polling will be performed, -1 for classic polling, 0 for hybrid polling, |
| // with greater than 0 the kernel will put process issuing IO to sleep for this amount of time in |
| // microseconds before entering classic polling. |
| IOPollDelay int64 |
| // IOTimeout is the request timeout in milliseconds. |
| IOTimeout uint64 |
| // IOStats indicates if iostats accounting is used for the disk (1 is on, 0 is off). |
| IOStats uint64 |
| // LogicalBlockSize is the logical block size of the device, in bytes. |
| LogicalBlockSize uint64 |
| // MaxHWSectorsKB is the maximum number of kilobytes supported in a single data transfer. |
| MaxHWSectorsKB uint64 |
| // MaxIntegritySegments is the max limit of integrity segments as set by block layer which a hardware controller |
| // can handle. |
| MaxIntegritySegments uint64 |
| // MaxSectorsKB is the maximum number of kilobytes that the block layer will allow for a filesystem request. |
| MaxSectorsKB uint64 |
| // MaxSegments is the number of segments on the device. |
| MaxSegments uint64 |
| // MaxSegmentsSize is the maximum segment size of the device. |
| MaxSegmentSize uint64 |
| // MinimumIOSize is the smallest preferred IO size reported by the device. |
| MinimumIOSize uint64 |
| // NoMerges shows the lookup logic involved with IO merging requests in the block layer. 0 all merges are |
| // enabled, 1 only simple one hit merges are tried, 2 no merge algorithms will be tried. |
| NoMerges uint64 |
| // NRRequests is the number of how many requests may be allocated in the block layer for read or write requests. |
| NRRequests uint64 |
| // OptimalIOSize is the optimal IO size reported by the device. |
| OptimalIOSize uint64 |
| // PhysicalBlockSize is the physical block size of device, in bytes. |
| PhysicalBlockSize uint64 |
| // ReadAHeadKB is the maximum number of kilobytes to read-ahead for filesystems on this block device. |
| ReadAHeadKB uint64 |
| // Rotational indicates if the device is of rotational type or non-rotational type. |
| Rotational uint64 |
| // RQAffinity indicates affinity policy of device, if 1 the block layer will migrate request completions to the |
| // cpu “group” that originally submitted the request, if 2 forces the completion to run on the requesting cpu. |
| RQAffinity uint64 |
| // SchedulerList contains list of available schedulers for this block device. |
| SchedulerList []string |
| // SchedulerCurrent is the current scheduler for this block device. |
| SchedulerCurrent string |
| // WriteCache shows the type of cache for block device, "write back" or "write through". |
| WriteCache string |
| // WriteSameMaxBytes is the number of bytes the device can write in a single write-same command. |
| // A value of ‘0’ means write-same is not supported by this device. |
| WriteSameMaxBytes uint64 |
| // WBTLatUSec is the target minimum read latency, 0 means feature is disables. |
| WBTLatUSec int64 |
| // ThrottleSampleTime is the time window that blk-throttle samples data, in millisecond. Optional |
| // exists only if CONFIG_BLK_DEV_THROTTLING_LOW is enabled. |
| ThrottleSampleTime *uint64 |
| // Zoned indicates if the device is a zoned block device and the zone model of the device if it is indeed zoned. |
| // Possible values are: none, host-aware, host-managed for zoned block devices. |
| Zoned string |
| // NRZones indicates the total number of zones of the device, always zero for regular block devices. |
| NRZones uint64 |
| // ChunksSectors for RAID is the size in 512B sectors of the RAID volume stripe segment, |
| // for zoned host device is the size in 512B sectors. |
| ChunkSectors uint64 |
| // FUA indicates whether the device supports Force Unit Access for write requests. |
| FUA uint64 |
| // MaxDiscardSegments is the maximum number of DMA entries in a discard request. |
| MaxDiscardSegments uint64 |
| // WriteZeroesMaxBytes the maximum number of bytes that can be zeroed at once. |
| // The value 0 means that REQ_OP_WRITE_ZEROES is not supported. |
| WriteZeroesMaxBytes uint64 |
| } |
| |
| type IODeviceStats struct { |
| IODoneCount uint64 |
| IOErrCount uint64 |
| } |
| |
| // DeviceMapperInfo models the devicemapper files that are located in the sysfs tree for each block device |
| // and described in the kernel documentation: |
| // https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-block-dm |
| type DeviceMapperInfo struct { |
| // Name is the string containing mapped device name. |
| Name string |
| // RqBasedSeqIOMergeDeadline determines how long (in microseconds) a request that is a reasonable merge |
| // candidate can be queued on the request queue. |
| RqBasedSeqIOMergeDeadline uint64 |
| // Suspended indicates if the device is suspended (1 is on, 0 is off). |
| Suspended uint64 |
| // UseBlkMQ indicates if the device is using the request-based blk-mq I/O path mode (1 is on, 0 is off). |
| UseBlkMQ uint64 |
| // UUID is the DM-UUID string or empty string if DM-UUID is not set. |
| UUID string |
| } |
| |
| // UnderlyingDevices models the list of devices that this device is built from. |
| type UnderlyingDeviceInfo struct { |
| // DeviceNames is the list of devices names |
| DeviceNames []string |
| } |
| |
| const ( |
| procDiskstatsPath = "diskstats" |
| procDiskstatsFormat = "%d %d %s %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d" |
| sysBlockPath = "block" |
| sysBlockStatFormat = "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d" |
| sysBlockQueue = "queue" |
| sysBlockDM = "dm" |
| sysUnderlyingDev = "slaves" |
| sysBlockSize = "size" |
| sysDevicePath = "device" |
| ) |
| |
| // FS represents the pseudo-filesystems proc and sys, which provides an |
| // interface to kernel data structures. |
| type FS struct { |
| proc *fs.FS |
| sys *fs.FS |
| } |
| |
| // NewDefaultFS returns a new blockdevice fs using the default mountPoints for proc and sys. |
| // It will error if either of these mount points can't be read. |
| func NewDefaultFS() (FS, error) { |
| return NewFS(fs.DefaultProcMountPoint, fs.DefaultSysMountPoint) |
| } |
| |
| // NewFS returns a new blockdevice fs using the given mountPoints for proc and sys. |
| // It will error if either of these mount points can't be read. |
| func NewFS(procMountPoint string, sysMountPoint string) (FS, error) { |
| if strings.TrimSpace(procMountPoint) == "" { |
| procMountPoint = fs.DefaultProcMountPoint |
| } |
| procfs, err := fs.NewFS(procMountPoint) |
| if err != nil { |
| return FS{}, err |
| } |
| if strings.TrimSpace(sysMountPoint) == "" { |
| sysMountPoint = fs.DefaultSysMountPoint |
| } |
| sysfs, err := fs.NewFS(sysMountPoint) |
| if err != nil { |
| return FS{}, err |
| } |
| return FS{&procfs, &sysfs}, nil |
| } |
| |
| // ProcDiskstats reads the diskstats file and returns |
| // an array of Diskstats (one per line/device). |
| func (fs FS) ProcDiskstats() ([]Diskstats, error) { |
| file, err := os.Open(fs.proc.Path(procDiskstatsPath)) |
| if err != nil { |
| return nil, err |
| } |
| defer file.Close() |
| return parseProcDiskstats(file) |
| } |
| |
| func parseProcDiskstats(r io.Reader) ([]Diskstats, error) { |
| var ( |
| diskstats []Diskstats |
| scanner = bufio.NewScanner(r) |
| err error |
| ) |
| for scanner.Scan() { |
| d := &Diskstats{} |
| d.IoStatsCount, err = fmt.Sscanf(scanner.Text(), procDiskstatsFormat, |
| &d.MajorNumber, |
| &d.MinorNumber, |
| &d.DeviceName, |
| &d.ReadIOs, |
| &d.ReadMerges, |
| &d.ReadSectors, |
| &d.ReadTicks, |
| &d.WriteIOs, |
| &d.WriteMerges, |
| &d.WriteSectors, |
| &d.WriteTicks, |
| &d.IOsInProgress, |
| &d.IOsTotalTicks, |
| &d.WeightedIOTicks, |
| &d.DiscardIOs, |
| &d.DiscardMerges, |
| &d.DiscardSectors, |
| &d.DiscardTicks, |
| &d.FlushRequestsCompleted, |
| &d.TimeSpentFlushing, |
| ) |
| // The io.EOF error can be safely ignored because it just means we read fewer than |
| // the full 20 fields. |
| if err != nil && !errors.Is(err, io.EOF) { |
| return diskstats, err |
| } |
| if d.IoStatsCount >= 14 { |
| diskstats = append(diskstats, *d) |
| } |
| } |
| return diskstats, scanner.Err() |
| } |
| |
| // SysBlockDevices lists the device names from /sys/block/<dev>. |
| func (fs FS) SysBlockDevices() ([]string, error) { |
| deviceDirs, err := os.ReadDir(fs.sys.Path(sysBlockPath)) |
| if err != nil { |
| return nil, err |
| } |
| devices := []string{} |
| for _, deviceDir := range deviceDirs { |
| devices = append(devices, deviceDir.Name()) |
| } |
| return devices, nil |
| } |
| |
| // SysBlockDeviceStat returns stats for the block device read from /sys/block/<device>/stat. |
| // The number of stats read will be 15 if the discard stats are available (kernel 4.18+) |
| // and 11 if they are not available. |
| func (fs FS) SysBlockDeviceStat(device string) (IOStats, int, error) { |
| bytes, err := os.ReadFile(fs.sys.Path(sysBlockPath, device, "stat")) |
| if err != nil { |
| return IOStats{}, 0, err |
| } |
| return parseSysBlockDeviceStat(bytes) |
| } |
| |
| func parseSysBlockDeviceStat(data []byte) (IOStats, int, error) { |
| stat := IOStats{} |
| count, err := fmt.Sscanf(strings.TrimSpace(string(data)), sysBlockStatFormat, |
| &stat.ReadIOs, |
| &stat.ReadMerges, |
| &stat.ReadSectors, |
| &stat.ReadTicks, |
| &stat.WriteIOs, |
| &stat.WriteMerges, |
| &stat.WriteSectors, |
| &stat.WriteTicks, |
| &stat.IOsInProgress, |
| &stat.IOsTotalTicks, |
| &stat.WeightedIOTicks, |
| &stat.DiscardIOs, |
| &stat.DiscardMerges, |
| &stat.DiscardSectors, |
| &stat.DiscardTicks, |
| &stat.FlushRequestsCompleted, |
| &stat.TimeSpentFlushing, |
| ) |
| // An io.EOF error is ignored because it just means we read fewer than the full 15 fields. |
| if errors.Is(err, io.EOF) { |
| return stat, count, nil |
| } |
| return stat, count, err |
| } |
| |
| // SysBlockDeviceQueueStats returns stats for /sys/block/xxx/queue where xxx is a device name. |
| func (fs FS) SysBlockDeviceQueueStats(device string) (BlockQueueStats, error) { |
| stat := BlockQueueStats{} |
| // Files with uint64 fields |
| for file, p := range map[string]*uint64{ |
| "add_random": &stat.AddRandom, |
| "dax": &stat.DAX, |
| "discard_granularity": &stat.DiscardGranularity, |
| "discard_max_hw_bytes": &stat.DiscardMaxHWBytes, |
| "discard_max_bytes": &stat.DiscardMaxBytes, |
| "hw_sector_size": &stat.HWSectorSize, |
| "io_poll": &stat.IOPoll, |
| "io_timeout": &stat.IOTimeout, |
| "iostats": &stat.IOStats, |
| "logical_block_size": &stat.LogicalBlockSize, |
| "max_hw_sectors_kb": &stat.MaxHWSectorsKB, |
| "max_integrity_segments": &stat.MaxIntegritySegments, |
| "max_sectors_kb": &stat.MaxSectorsKB, |
| "max_segments": &stat.MaxSegments, |
| "max_segment_size": &stat.MaxSegmentSize, |
| "minimum_io_size": &stat.MinimumIOSize, |
| "nomerges": &stat.NoMerges, |
| "nr_requests": &stat.NRRequests, |
| "optimal_io_size": &stat.OptimalIOSize, |
| "physical_block_size": &stat.PhysicalBlockSize, |
| "read_ahead_kb": &stat.ReadAHeadKB, |
| "rotational": &stat.Rotational, |
| "rq_affinity": &stat.RQAffinity, |
| "write_same_max_bytes": &stat.WriteSameMaxBytes, |
| "nr_zones": &stat.NRZones, |
| "chunk_sectors": &stat.ChunkSectors, |
| "fua": &stat.FUA, |
| "max_discard_segments": &stat.MaxDiscardSegments, |
| "write_zeroes_max_bytes": &stat.WriteZeroesMaxBytes, |
| } { |
| val, err := util.ReadUintFromFile(fs.sys.Path(sysBlockPath, device, sysBlockQueue, file)) |
| if err != nil { |
| return BlockQueueStats{}, err |
| } |
| *p = val |
| } |
| // Files with int64 fields |
| for file, p := range map[string]*int64{ |
| "io_poll_delay": &stat.IOPollDelay, |
| "wbt_lat_usec": &stat.WBTLatUSec, |
| } { |
| val, err := util.ReadIntFromFile(fs.sys.Path(sysBlockPath, device, sysBlockQueue, file)) |
| if err != nil { |
| return BlockQueueStats{}, err |
| } |
| *p = val |
| } |
| // Files with string fields |
| for file, p := range map[string]*string{ |
| "write_cache": &stat.WriteCache, |
| "zoned": &stat.Zoned, |
| } { |
| val, err := util.SysReadFile(fs.sys.Path(sysBlockPath, device, sysBlockQueue, file)) |
| if err != nil { |
| return BlockQueueStats{}, err |
| } |
| *p = val |
| } |
| scheduler, err := util.SysReadFile(fs.sys.Path(sysBlockPath, device, sysBlockQueue, "scheduler")) |
| if err != nil { |
| return BlockQueueStats{}, err |
| } |
| var schedulers []string |
| for s := range strings.SplitSeq(scheduler, " ") { |
| if strings.HasPrefix(s, "[") && strings.HasSuffix(s, "]") { |
| s = s[1 : len(s)-1] |
| stat.SchedulerCurrent = s |
| } |
| schedulers = append(schedulers, s) |
| } |
| stat.SchedulerList = schedulers |
| // optional |
| throttleSampleTime, err := util.ReadUintFromFile(fs.sys.Path(sysBlockPath, device, sysBlockQueue, "throttle_sample_time")) |
| if err == nil { |
| stat.ThrottleSampleTime = &throttleSampleTime |
| } |
| return stat, nil |
| } |
| |
| func (fs FS) SysBlockDeviceMapperInfo(device string) (DeviceMapperInfo, error) { |
| info := DeviceMapperInfo{} |
| // Files with uint64 fields |
| for file, p := range map[string]*uint64{ |
| "rq_based_seq_io_merge_deadline": &info.RqBasedSeqIOMergeDeadline, |
| "suspended": &info.Suspended, |
| "use_blk_mq": &info.UseBlkMQ, |
| } { |
| val, err := util.ReadUintFromFile(fs.sys.Path(sysBlockPath, device, sysBlockDM, file)) |
| if err != nil { |
| return DeviceMapperInfo{}, err |
| } |
| *p = val |
| } |
| // Files with string fields |
| for file, p := range map[string]*string{ |
| "name": &info.Name, |
| "uuid": &info.UUID, |
| } { |
| val, err := util.SysReadFile(fs.sys.Path(sysBlockPath, device, sysBlockDM, file)) |
| if err != nil { |
| return DeviceMapperInfo{}, err |
| } |
| *p = val |
| } |
| return info, nil |
| } |
| |
| func (fs FS) SysBlockDeviceUnderlyingDevices(device string) (UnderlyingDeviceInfo, error) { |
| underlyingDir, err := os.Open(fs.sys.Path(sysBlockPath, device, sysUnderlyingDev)) |
| if err != nil { |
| return UnderlyingDeviceInfo{}, err |
| } |
| underlying, err := underlyingDir.Readdirnames(0) |
| if err != nil { |
| return UnderlyingDeviceInfo{}, err |
| } |
| return UnderlyingDeviceInfo{DeviceNames: underlying}, nil |
| |
| } |
| |
| // SysBlockDeviceSize returns the size of the block device from /sys/block/<device>/size |
| // in bytes by multiplying the value by the Linux sector length of 512. |
| func (fs FS) SysBlockDeviceSize(device string) (uint64, error) { |
| size, err := util.ReadUintFromFile(fs.sys.Path(sysBlockPath, device, sysBlockSize)) |
| if err != nil { |
| return 0, err |
| } |
| return procfs.SectorSize * size, nil |
| } |
| |
| // SysBlockDeviceIO returns stats for the block device io counters |
| // IO done count: /sys/block/<disk>/device/iodone_cnt |
| // IO error count: /sys/block/<disk>/device/ioerr_cnt. |
| func (fs FS) SysBlockDeviceIOStat(device string) (IODeviceStats, error) { |
| var ( |
| ioDeviceStats IODeviceStats |
| err error |
| ) |
| for file, p := range map[string]*uint64{ |
| "iodone_cnt": &ioDeviceStats.IODoneCount, |
| "ioerr_cnt": &ioDeviceStats.IOErrCount, |
| } { |
| var val uint64 |
| val, err = util.ReadHexFromFile(fs.sys.Path(sysBlockPath, device, sysDevicePath, file)) |
| if err != nil { |
| return IODeviceStats{}, err |
| } |
| *p = val |
| } |
| return ioDeviceStats, nil |
| } |