blob: ebc7a2fc8025ea39a0d5cf7159184cf2c0afcf25 [file] [log] [blame]
// Copyright 2019 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Package document provides common utilities for performing comparison between
// documents produced during testing.
package document
import (
"context"
"io/ioutil"
"regexp"
"github.com/kylelemons/godebug/diff"
"chromiumos/tast/errors"
"chromiumos/tast/testing"
)
// TODO(crbug.com/973637): Investigate why CUPS is inconsistent in setting the
// values for the "For" and "Title" fields in the resulting PDF. Once the root
// cause is determined and fixed, we should perform the PDF comparison without
// stripping the fields.
// cleanRegex is used to clear away PDF/PS document fields which cause
// discrepancies when attempting to perform a diff between documents. These
// fields have no bearing on the actual content of the document, so it is safe
// to clear them away.
//
// The pattern is a single alternation with two kinds of alternatives:
//   - The first group (from "%%Creator" through "@PJL SET TIME") is anchored
//     to the start of a line by (?m)^ and also consumes one trailing \r or \n
//     character, so a matching line is removed together with a line
//     terminator.
//   - The remaining alternatives (from "usrcode" onwards) are not anchored and
//     match anywhere in the text; only the matched field itself is removed.
//
// NOTE(review): because the first group requires a trailing \r or \n, a
// matching line at the very end of the input with no line terminator is not
// matched - confirm that this is acceptable for all inputs.
var cleanRegex = regexp.MustCompile(
// Matches the embedded ghostscript version in the PS file.
// This gets outdated on every gs uprev, so we strip it out.
`(?m)(^(%%Creator: GPL Ghostscript .*` +
// This matches the "For" comment field which specifies the user that created the
// file.
`|%%For: \(\w+\)` +
// This matches the "Title" comment field which specifies the title of the
// original document.
`|%%Title: \([\w\.]+\)` +
// This matches the comment field which specifies the version of poppler used to
// produce the pdf.
`|%Produced by poppler.*` +
// Removes the postscript creation date.
`|%%CreationDate: D:.*` +
// Removes the ghostscript invocation command.
`|%%Invocation: .*` +
// Removes additional (continuation) lines of the ghostscript invocation command.
`|%%\+ .*` +
// Removes time metadata for PCLm Jobs.
`|% *job-start-time: .*` +
// Removes PDF xref table entries (they contain byte offsets).
`|\d{10} \d{5} [fn] *` +
// Removes the "startxref" section through "%%EOF"; the number in between is a
// byte offset.
`|startxref[\r\n]+\d+[\r\n]+%%EOF` +
// For Brother jobs, jobtime and printlog item 2 contain
// time-specific values.
`|@PJL SET JOBTIME = .*` +
`|@PJL PRINTLOG ITEM = 2,.*` +
// For HP jobs, JobAcct4, JobAcct5 and DMINFO contain
// time-specific values.
`|@PJL SET JOBATTR="JobAcct[45]=.*` +
`|@PJL DMINFO ASCIIHEX=".*` +
// For Ricoh jobs, the SET DATE/TIME values are time-specific.
`|@PJL SET DATE=".*` +
// This is the last line-anchored alternative: the group closes here and is
// followed by the single line-terminator character that gets consumed.
`|@PJL SET TIME=".*)[\r\n])` +
// For Ricoh jobs, "usrcode (\d+)" contains the date
// and time of the print job.
`|usrcode \(\d+\)` +
// For Ricoh PS jobs, the time is contained here.
`|/Time \(\d+\)` +
// For Ricoh jobs, "(\d+) lppswd" contains the date
// and time of the print job.
`|\(\d+\) lppswd` +
// The ID tag contains two md5 hashes which uniquely identify the document.
// It is usually stored in the following form:
// [<81b14aafa313db63dbd6f981e49f94f4><81b14aafa313db63dbd6f981e49f94f4>].
// However, occasionally when storing it as a string with escape sequences saves
// space, Ghostscript (s_write_ps_string() in base/spsdf.c) will store it as follows:
// [(\271\022'7\220\243~!~W8i'7\334#)(\271\022'7\220\243~!~W8i'7\334#)].
// See https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf
// section 7.5.5 File Trailer and section 7.3.4.2 Literal Strings for more information.
// For Ricoh jobs, the /ID tag is time-specific.
// The match is greedy: it runs to the last "]" on the same line.
`|/ID \[.*\]` +
// This matches the "CreationDate" field embedded in the PDF file.
`|/CreationDate\(D:[0-9]{14}[-+Z][0-9]{2}'[0-9]{2}'\)` +
// This matches the "ModDate" field embedded in the PDF file.
`|/ModDate\(D:[0-9]{14}[-+Z][0-9]{2}'[0-9]{2}'\)` +
// This matches the "Producer" field which specifies the Ghostscript
// version for a PDF generated by Ghostscript.
`|/Producer\(GPL Ghostscript [\d.]+\)` +
// This matches the "CreationDate" field embedded in a PDF file generated by
// Tesseract (See: https://github.com/tesseract-ocr/tesseract/blob/
// d33edbc4b19b794a1f979551f89f083d398abe19/src/api/pdfrenderer.cpp#L916).
`|/CreationDate \(D:[0-9]{14}[a-zA-Z0-9\-]{3}'[0-9]{2}'\)` +
// This matches the "Producer" comment field which specifies the Tesseract
// version for a PDF generated by Tesseract.
`|/Producer \(Tesseract [\w\-\.]+\)` +
// This matches the "Producer" comment field added by Skia.
`|/Producer \(Skia/PDF [\w\-\.]+\)`)
// cleanBaseFontRegex is used to clear away the font 'IDs' in the
// FontDescriptor fields which may differ between systems. Thus if two
// FontDescriptor lines refer to the same font we can ignore any difference
// between the IDs.
//
// The pattern has three capture groups: the "/BaseFont/" key, the ID (six
// uppercase letters) together with its trailing "+", and the font name up to
// and including "/FontDescriptor". cleanFontDescriptors keeps groups 1 and 3
// and drops group 2.
//
// For example, in the given FontDescriptor field:
// <</BaseFont/WDZDNS+Symbola/FontDescriptor 23 0 R/Type/Font
// The "WDZDNS+" prefix will be removed.
//
// NOTE(review): the font name is matched with [a-zA-Z,]+, so names containing
// digits or "-" are left untouched - confirm that this is intended.
var cleanBaseFontRegex = regexp.MustCompile(
`(/BaseFont/)([A-Z]{6}\+)([a-zA-Z,]+/FontDescriptor)`)
// cleanFontNameRegex is the same as cleanBaseFontRegex except it matches a
// different form of the FontDescriptor fields: the font name follows a
// "/FontName/" key and is followed by "/FontBBox".
//
// The pattern has three capture groups: the "/FontName/" key, the ID (six
// uppercase letters) together with its trailing "+", and the font name up to
// and including "/FontBBox". cleanFontDescriptors keeps groups 1 and 3 and
// drops group 2.
//
// For example, in the given FontDescriptor field:
// <</Type/FontDescriptor/FontName/ZQPAHQ+Webdings/FontBBox[0 -200 1000 799]/Flags 4
// The "ZQPAHQ+" prefix will be removed.
//
// NOTE(review): the font name is matched with [a-zA-Z,]+, so names containing
// digits or "-" are left untouched - confirm that this is intended.
var cleanFontNameRegex = regexp.MustCompile(
`(/FontName/)([A-Z]{6}\+)([a-zA-Z,]+/FontBBox)`)
// cleanFontDescriptors removes the text captured by the second capture group
// from every match of re in contents, keeping only what groups 1 and 3
// captured. Text outside of any match is returned unchanged.
//
// re is expected to define at least three capture groups, with group 2
// holding the text to drop (the font 'ID' and its trailing "+" for the
// regexes in this file).
func cleanFontDescriptors(contents string, re *regexp.Regexp) string {
	// Expand the capture groups straight from the original match instead of
	// re-running re on each isolated matched substring. This avoids matching
	// everything twice, and it cannot fail when the pattern depends on the
	// text surrounding the match (e.g. \b, \B or ^), where a re-match on the
	// substring alone may not succeed.
	return re.ReplaceAllString(contents, "${1}${3}")
}
// CleanContents returns contents with the font 'IDs' and the other PDF/PS
// metadata fields matched by cleanRegex stripped out, so that a diff between
// two cleaned documents is stable.
func CleanContents(contents string) string {
	// Drop the font IDs first: the /BaseFont form, then the /FontName form.
	withoutFontIDs := cleanFontDescriptors(
		cleanFontDescriptors(contents, cleanBaseFontRegex),
		cleanFontNameRegex,
	)
	// Then erase every remaining field that cleanRegex matches.
	return cleanRegex.ReplaceAllLiteralString(withoutFontIDs, "")
}
// CompareFileContents cleans the string contents given by output and golden
// with CleanContents, diffs the results, and returns an error if they differ.
// When they differ, the diff is written to diffPath; a failure to write the
// diff is only logged and does not change the returned error. It returns nil
// when the cleaned contents are identical.
func CompareFileContents(ctx context.Context, output, golden, diffPath string) error {
	cleanedOutput := CleanContents(output)
	cleanedGolden := CleanContents(golden)

	testing.ContextLog(ctx, "Comparing output with golden file")
	delta := diff.Diff(cleanedOutput, cleanedGolden)
	if delta == "" {
		return nil
	}

	testing.ContextLog(ctx, "Dumping diff to ", diffPath)
	if writeErr := ioutil.WriteFile(diffPath, []byte(delta), 0644); writeErr != nil {
		// Saving the diff is best-effort; the mismatch is still reported below.
		testing.ContextLog(ctx, "Failed to dump diff: ", writeErr)
	}
	return errors.New("result file did not match the expected file")
}
// CompareFiles reads the files at the output and golden paths (in that order)
// and hands their contents to CompareFileContents. It returns an error if
// either file cannot be read, or if the contents differ, in which case the
// diff is written to diffPath.
func CompareFiles(ctx context.Context, output, golden, diffPath string) error {
	var contents [2]string
	for i, path := range [...]string{output, golden} {
		data, err := ioutil.ReadFile(path)
		if err != nil {
			return errors.Wrapf(err, "failed to read file %s", path)
		}
		contents[i] = string(data)
	}
	return CompareFileContents(ctx, contents[0], contents[1], diffPath)
}