blob: 3a5d62d5eed6feb56328ba70168379edbbc1126d [file] [log] [blame]
// Copyright 2024 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package artifacts
import (
	"bytes"
	"fmt"
	"math"
	"mime"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"time"

	"google.golang.org/protobuf/proto"
	"google.golang.org/protobuf/types/known/timestamppb"

	"go.chromium.org/luci/common/errors"
	pb "go.chromium.org/luci/resultdb/proto/v1"
)
// ToLogLines splits the content of an artifact into log lines.
//
// The artifact must be recognized as a log artifact from its artifactID and
// contentType, otherwise an error is returned.
//
// The content is split on "\n". For every line a best-effort attempt is made
// to extract its timestamp and its severity; lines for which nothing can be
// extracted are still returned, just without that information.
// `year` is the fallback used when a line's timestamp does not carry a year.
//
// At most maxLines lines are processed; if maxLines <= 0 every line is
// processed. Lines are appended to the result until the accumulated proto
// size of the lines would exceed maxBytes. If not even the first line fits
// within maxBytes an error is returned.
func ToLogLines(artifactID string, contentType string, content []byte, year, maxLines, maxBytes int) ([]*pb.ArtifactLine, error) {
	if !isLogSupportedArtifact(artifactID, contentType) {
		return nil, errors.New("unsupported file type")
	}

	rawLines := bytes.Split(content, []byte("\n"))
	count := len(rawLines)
	if maxLines > 0 && maxLines < count {
		count = maxLines
	}

	lines := make([]*pb.ArtifactLine, 0, count)
	usedBytes := 0
	for i := 0; i < count; i++ {
		raw := rawLines[i]
		text := string(raw)
		// Extraction is best effort: a line without a recognizable
		// timestamp is returned with no timestamp set.
		ts, _ := extractTimestamp(text, year)
		entry := &pb.ArtifactLine{
			Number:    int64(i) + 1,
			Timestamp: ts,
			Severity:  extractSeverity(text),
			Content:   raw,
		}
		usedBytes += proto.Size(entry)
		if usedBytes > maxBytes {
			// This line would push the result over the size budget.
			break
		}
		lines = append(lines, entry)
	}

	// The file contains lines but not even the first one fits the budget.
	if len(rawLines) > 0 && len(lines) == 0 {
		return nil, errors.New(fmt.Sprintf("first file line content exceeds maximum size limit: %d bytes", maxBytes))
	}
	return lines, nil
}
// SupportedContentTypes is the set of content types, besides those starting
// with "text", that isSupportedContentType accepts as possibly holding logs.
var SupportedContentTypes = map[string]bool{
	"application/x-gzip":       true,
	"application/octet-stream": true,
}
// isLogSupportedArtifact reports whether an artifact can be processed as a
// log: its content type must be supported and its ID must not look like a
// non-text file. The ID is only inspected when the content type is accepted.
func isLogSupportedArtifact(artifactID string, contentType string) bool {
	if !isSupportedContentType(contentType) {
		return false
	}
	return !isNonTextFile(artifactID)
}
// isSupportedContentType reports whether contentType may hold log text.
//
// An empty content type is accepted so that the other checks can decide
// whether the file is supported. Otherwise the content type must either
// start with "text" or be a key of SupportedContentTypes.
func isSupportedContentType(contentType string) bool {
	switch {
	case contentType == "":
		return true
	case strings.HasPrefix(contentType, "text"):
		return true
	}
	_, listed := SupportedContentTypes[contentType]
	return listed
}
// isNonTextFile reports whether the extension of filePath is registered with
// a MIME type that is not text/*.
//
// Paths without an extension, and extensions with no registered MIME type,
// are not considered non-text, so that they remain eligible for log
// processing.
func isNonTextFile(filePath string) bool {
	ext := filepath.Ext(filePath)
	if ext == "" {
		return false
	}
	mimeType := mime.TypeByExtension(ext)
	return mimeType != "" && !strings.HasPrefix(mimeType, "text/")
}
// logTimestamp describes one timestamp format that can appear in a log line
// and how the matched text is converted into a time.
type logTimestamp struct {
	// Regexp locates the timestamp within a log line.
	Regexp *regexp.Regexp
	// Layout is the layout string, in the format understood by the time
	// package, used to parse the text matched by Regexp.
	Layout string
	// Zone is the name of the time zone the timestamp is expressed in,
	// e.g. America/Los_Angeles.
	Zone string
	// IsEpoch marks timestamps written in sec.millisec epoch format, which
	// are converted directly instead of being parsed with Layout.
	IsEpoch bool
	// YearMissing marks timestamps that do not include the year.
	YearMissing bool
}
const (
	// RFC3339FullDate is the time layout for a year-month-day date.
	RFC3339FullDate = "2006-01-02"
	// PstTimeZone is the time zone name America/Los_Angeles.
	PstTimeZone = "America/Los_Angeles"
	// UtcTimeZone is the time zone name UTC.
	UtcTimeZone = "UTC"
)
// supportedLogTimestamps lists the timestamp formats recognized by
// extractTimestamp.
//
// The order of the entries matters: extractTimestamp tries them from top to
// bottom and the first entry whose Regexp matches the line decides the
// result, even when parsing the matched text then fails.
var supportedLogTimestamps = []logTimestamp{
// tast and upstart log style: 2021-12-17T07:33:58.266026Z
{
Regexp: regexp.MustCompile(
`(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2}).(\d{3,6})[A-Z]`),
Layout: "2006-01-02T15:04:05.999999Z07:00",
Zone: UtcTimeZone,
},
// gzipped files timestamps: 03-12 19:08:48.511
// The pattern is anchored to the start of the line.
{
Regexp: regexp.MustCompile(
`^(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2}).(\d{3})`),
Layout: "2006/01-02 15:04:05.000",
Zone: UtcTimeZone,
YearMissing: true,
},
// tast steam out timestamps: 2022-10-18 15:10:19
{
Regexp: regexp.MustCompile(`(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})`),
Layout: "2006-01-02 15:04:05",
Zone: PstTimeZone,
YearMissing: false,
},
// tast log.txt timestamps: 2022/10/18 15:10:19
{
Regexp: regexp.MustCompile(`(\d{4})\/(\d{2})\/(\d{2}) (\d{2}):(\d{2}):(\d{2})`),
Layout: "2006/01/02 15:04:05",
Zone: PstTimeZone,
YearMissing: false,
},
// <tast_test>_logs.txt timestamps: Jun 9 22:15:15
{
Regexp: regexp.MustCompile(`(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (\d{1,}) (\d{2}):(\d{2}):(\d{2})`),
Layout: "2006/Jan 2 15:04:05",
Zone: PstTimeZone,
YearMissing: true,
},
// Format for timestamps like: 12/16 22:34:29.742
// NOTE(review): the regexp treats the fractional part as optional and of
// any length, while the layout spells out exactly three fractional digits
// - confirm that lines without milliseconds are parsed as intended.
{
Regexp: regexp.MustCompile(`(\d{2})\/(\d{2}) (\d{2}):(\d{2}):(\d{2})(?:.(\d+))?`),
Layout: "2006/01/02 15:04:05.000",
Zone: PstTimeZone,
YearMissing: true,
},
// ISO UTC format: 2019-09-30T09:49:45.355431+00:00
{
Regexp: regexp.MustCompile(
`(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2}).(\d{3})\d+[\+\-]\d{2}:\d{2}`),
Layout: "2006-01-02T15:04:05.000000-07:00",
Zone: UtcTimeZone,
},
// chrome log style timestamps, eg: 1119/064727.250343
{
Regexp: regexp.MustCompile(`(\d{2})(\d{2})\/(\d{2})(\d{2})(\d{2})\.(\d{6})`),
Layout: "2006/0102/150405.000000",
Zone: PstTimeZone,
YearMissing: true,
},
// audit.log style timestamps: msg=audit(1571729288.142:53)
// This is the only epoch entry; it has no Layout because epoch entries are
// converted from their capture groups instead of being parsed.
// NOTE(review): YearMissing looks unused here, extractTimestamp only
// consults it for non-epoch entries - confirm.
{
Regexp: regexp.MustCompile(`msg=audit\((\d+)\.(\d+):(\d+)`),
IsEpoch: true,
Zone: UtcTimeZone,
YearMissing: true,
},
// journal.log style dates: Nov 01 02:58:43
// NOTE(review): this regexp accepts the same text as the
// <tast_test>_logs.txt entry above, which is tried first, so this entry
// looks unreachable - confirm.
{
Regexp: regexp.MustCompile(
`(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) ([0-9]{1,}) ([0-9]{2}):([0-9]{2}):([0-9]{2})`),
Layout: "2006/Jan 02 15:04:05",
Zone: PstTimeZone,
YearMissing: true,
},
// vmlog timestamps: 0620/065447
{
Regexp: regexp.MustCompile(`(\d{2})(\d{2})/(\d{2})(\d{2})(\d{2})`),
Layout: "2006/0102/150405",
Zone: PstTimeZone,
YearMissing: true,
},
// labstation timestamps: Jun19 20:18
{
Regexp: regexp.MustCompile(
`(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)([0-9]{2}) ([0-9]{2}):([0-9]{2})`),
Layout: "2006/Jan02 15:04",
Zone: PstTimeZone,
YearMissing: true,
},
}
// extractTimestamp looks for a supported timestamp in a log line and
// converts it to a proto timestamp.
//
// The formats in supportedLogTimestamps are tried in order and the first
// one whose regexp matches decides the result:
//   - Epoch formats (sec.millisec) are converted directly from the first two
//     capture groups. Any further group is not part of the time and is
//     ignored: in audit.log records `msg=audit(1571729288.142:53)` the value
//     after the colon is the record's serial number, so adding it to the
//     timestamp would skew the time.
//   - All other formats are parsed with the format's layout in the format's
//     time zone. When the format has no year, `year` is prepended to the
//     matched text, which is why such layouts start with "2006/".
//
// An error is returned if no format matches the line or if the matched text
// cannot be converted.
func extractTimestamp(line string, year int) (*timestamppb.Timestamp, error) {
	for _, l := range supportedLogTimestamps {
		m := l.Regexp.FindStringSubmatch(line)

		if l.IsEpoch {
			// The seconds and milliseconds groups are both required.
			if len(m) < 3 {
				continue
			}
			// The groups only contain digits, so parsing can only fail
			// on overflow; in that case give up on the line.
			sec, err := strconv.ParseInt(m[1], 10, 64)
			if err != nil {
				break
			}
			milliSec, err := strconv.ParseInt(m[2], 10, 64)
			if err != nil {
				break
			}
			nanoSec := milliSec * int64(math.Pow10(6))
			return timestamppb.New(time.Unix(sec, nanoSec)), nil
		}

		if len(m) <= 1 {
			continue
		}
		ts := m[0]
		if l.YearMissing {
			ts = fmt.Sprintf("%d/%s", year, ts)
		}
		t, err := time.ParseInLocation(l.Layout, ts, zone(l.Zone))
		if err != nil {
			return nil, err
		}
		return timestamppb.New(t), nil
	}
	return nil, errors.New("invalid timestamp format provided.")
}
// zoneCache memoizes the locations successfully resolved by zone, keyed by
// zone name (string -> *time.Location).
var zoneCache sync.Map

// zone gets the timestamp location from a specific zone name.
// If an invalid zone is provided it defaults to UTC.
//
// zone is called for every log line that carries a timestamp, and
// time.LoadLocation resolves the name against the time zone database each
// time it is called, so successful lookups are cached and shared. Failed
// lookups are not cached.
func zone(name string) *time.Location {
	if cached, ok := zoneCache.Load(name); ok {
		return cached.(*time.Location)
	}
	loc, err := time.LoadLocation(name)
	if err != nil {
		return time.UTC
	}
	// LoadOrStore keeps a single location per name even if two goroutines
	// resolve the same zone at the same time.
	actual, _ := zoneCache.LoadOrStore(name, loc)
	return actual.(*time.Location)
}
// severityRegexes are the recognized log severity patterns found in
// most of the artifacts that we process, severity extraction is best effort
// and more severities can be added in the future.
//
// Each pattern has two alternatives:
//   - a full or abbreviated severity word, optionally preceded by
//     whitespace, colons `:` or an opening parenthesis, optionally followed
//     by a closing parenthesis or colons, and then followed by whitespace
//     or a `|` character;
//   - a single severity letter surrounded by whitespace, with other text
//     before and after it.
//
// The recognized labels are:
// * FATAL = FATAL or F.
// * ERROR = ERROR or ERR or E.
// * WARNING = WARNING or WARNIN or WARNI or WARN or W.
// * NOTICE = NOTICE or NOTIC or N.
// * INFO = INFO or I.
// * DEBUG = DEBUG or D.
// * VERBOSE = VERBOSE or VERBOSE1 or V.
var severityRegexes = []regexp.Regexp{
	*regexp.MustCompile(`\s*\:*\(?(FATAL)\)?\:*[\s\|]+|.+\s(F)\s.+`),
	*regexp.MustCompile(`\s*\:*\(?(ERROR|ERR)\)?\:*[\s\|]+|.+\s(E)\s.+`),
	// WARN is listed last so that the longer spellings are captured in full.
	*regexp.MustCompile(`\s*\:*\(?(WARNING|WARNIN|WARNI|WARN)\)?\:*[\s\|]+|.+\s(W)\s.+`),
	*regexp.MustCompile(`\s*\:*\(?(NOTICE|NOTIC)\)?\:*[\s\|]+|.+\s(N)\s.+`),
	*regexp.MustCompile(`\s*\:*\(?(INFO)\)?\:*[\s\|]+|.+\s(I)\s.+`),
	*regexp.MustCompile(`\s*\:*\(?(DEBUG)\)?\:*[\s|]+|.+\s(D)\s.+`),
	*regexp.MustCompile(`\s*\:*\(?(VERBOSE|VERBOSE1)\)?\:*[\s|]+|.+\s(V)\s.+`),
}
// extractSeverity makes a best-effort guess of the severity of a log line.
//
// Every pattern in severityRegexes is evaluated against the line and the
// non-empty capture group of a matching pattern becomes the severity label.
// When several patterns match, the label of the pattern listed last in
// severityRegexes is the one that is kept. The label is then mapped to a
// severity by its first letter.
// NOTE(review): with this precedence a severity listed later (e.g. INFO)
// overrides one listed earlier (e.g. ERROR) when both appear in the line -
// confirm this is intended.
//
// SEVERITY_UNSPECIFIED is returned when no pattern matches.
func extractSeverity(logLine string) pb.ArtifactLine_Severity {
	label := ""
	for i := range severityRegexes {
		groups := severityRegexes[i].FindStringSubmatch(logLine)
		if len(groups) < 2 {
			continue
		}
		// Only the alternative that matched has a non-empty group.
		for _, g := range groups[1:] {
			if g != "" {
				label = g
				break
			}
		}
	}
	if label == "" {
		return pb.ArtifactLine_SEVERITY_UNSPECIFIED
	}

	switch label[0] {
	case 'F':
		return pb.ArtifactLine_FATAL
	case 'E':
		return pb.ArtifactLine_ERROR
	case 'W':
		return pb.ArtifactLine_WARNING
	case 'N':
		return pb.ArtifactLine_NOTICE
	case 'I':
		return pb.ArtifactLine_INFO
	case 'D':
		return pb.ArtifactLine_DEBUG
	case 'V':
		return pb.ArtifactLine_VERBOSE
	}
	return pb.ArtifactLine_SEVERITY_UNSPECIFIED
}