tools/license2rtf.js - v8/node - Git at Google


 var assert = require('assert'),
     Stream = require('stream'),
     inherits = require('util').inherits;


 /*
  * This filter consumes a stream of characters and emits one string per line.
  *
  * "\r\n", "\n\r", "\n" and "\r" are all accepted as line terminators. Each
  * line is emitted as a 'data' event without its terminator.
  */
 function LineSplitter() {
   var self = this,
       buffer = '';

   Stream.call(this);
   this.writable = true;

   // Buffers `data` and emits every line that is known to be complete.
   // Always returns true.
   this.write = function(data) {
     // A lone "\r" or "\n" at the very end of the buffered text is not split
     // on yet: it may be the first half of a two-character terminator whose
     // second half only arrives with the next chunk, and splitting early
     // would emit a spurious empty line.
     var lines = (buffer + data).split(/\r\n|\n\r|\n(?!$)|\r(?!$)/),
         last = lines.length - 1;
     for (var i = 0; i < last; i++) {
       self.emit('data', lines[i]);
     }
     buffer = lines[last];
     return true;
   };

   // Flushes everything still buffered (plus the optional final chunk) and
   // then emits 'end'. A trailing unterminated line is emitted only if it is
   // not empty.
   this.end = function(data) {
     // No more input can follow, so every terminator is final now.
     var lines = (buffer + (data || '')).split(/\r\n|\n\r|\n|\r/),
         last = lines.length - 1;
     buffer = '';
     for (var i = 0; i < last; i++) {
       self.emit('data', lines[i]);
     }
     if (lines[last]) {
       self.emit('data', lines[last]);
     }
     self.emit('end');
   };
 }
 inherits(LineSplitter, Stream);


 /*
  * This filter consumes lines and emits paragraph objects.
  *
  * An emitted paragraph looks like { li, in_license_block, lines, level }:
  *   li               - list marker that introduced the paragraph ('' if none)
  *   in_license_block - whether the paragraph was found inside a license block
  *   lines            - the text lines, stripped of comment markers, leading
  *                      whitespace and list marker
  *   level            - indentation level of the paragraph's first line
  *                      (leading whitespace length / 2, rounded down)
  *
  * A line consisting of three or more '=' or '"' characters toggles between
  * "license block" and "not a license block"; parsing starts outside of one.
  */
 function ParagraphParser() {
   var self = this,
       block_is_license_block = false,
       block_has_c_style_comment,
       paragraph_line_indent,
       paragraph;

   Stream.call(this);
   this.writable = true;

   resetBlock(false);

   // Parses a single line. Always returns true.
   this.write = function(data) {
     parseLine(data + '');
     return true;
   };

   // Parses the optional final line, flushes the pending paragraph and then
   // emits 'end'.
   this.end = function(data) {
     if (data) {
       parseLine(data + '');
     }
     flushParagraph();
     self.emit('end');
   };

   // Starts a fresh, empty paragraph inside the current block.
   function resetParagraph() {
     // -1 means that no reference indentation has been recorded yet.
     paragraph_line_indent = -1;

     paragraph = {
       li: '',
       in_license_block: block_is_license_block,
       lines: []
     };
   }

   // Starts a new block (license or non-license) with an empty paragraph.
   function resetBlock(is_license_block) {
     block_is_license_block = is_license_block;
     block_has_c_style_comment = false;
     resetParagraph();
   }

   // Emits the pending paragraph unless it is completely empty, then starts
   // a new one.
   function flushParagraph() {
     if (paragraph.lines.length || paragraph.li) {
       self.emit('data', paragraph);
     }
     resetParagraph();
   }

   function parseLine(line) {
     // Strip trailing whitespace
     line = line.replace(/\s*$/, '');

     // Detect block separator
     if (/^\s*(=|"){3,}\s*$/.test(line)) {
       flushParagraph();
       resetBlock(!block_is_license_block);
       return;
     }

     // Strip comments around block
     if (block_is_license_block) {
       if (!block_has_c_style_comment)
         block_has_c_style_comment = /^\s*(\/\*)/.test(line);
       if (block_has_c_style_comment) {
         var prev = line;
         line = line.replace(/^(\s*?)(?:\s?\*\/|\/\*\s|\s\*\s?)/, '$1');
         // No comment marker found: drop two columns instead so the line
         // stays aligned with the lines that did have a marker.
         if (prev === line)
           line = line.replace(/^\s{2}/, '');
         if (/\*\//.test(prev))
           block_has_c_style_comment = false;
       } else {
         // Strip C++ and perl style comments.
         line = line.replace(/^(\s*)(?:\/\/\s?|#\s?)/, '$1');
       }
     }

     // Detect blank line (paragraph separator)
     if (!/\S/.test(line)) {
       flushParagraph();
       return;
     }

     // Detect separator "lines" within a block. These mark a paragraph break
     // and are stripped from the output.
     if (/^\s*[=*\-]{5,}\s*$/.test(line)) {
       flushParagraph();
       return;
     }

     // Find out indentation level and the start of a bulleted or numbered
     // list.
     var result = /^(\s*)(\d+\.|\*|-)?\s*/.exec(line);
     assert.ok(result);
     // The number of characters that will be stripped from the beginning of
     // the line.
     var line_strip_length = result[0].length;
     // The indentation size that will be used to detect indentation jumps.
     // Fudge by 1 space.
     var line_indent = Math.floor(result[0].length / 2) * 2;
     // The indentation level that will be exported
     var level = Math.floor(result[1].length / 2);
     // The list indicator that precedes the actual content, if any.
     var line_li = result[2];

     // Flush the paragraph when there is a li or an indentation jump
     if (line_li || (line_indent !== paragraph_line_indent &&
                     paragraph_line_indent !== -1)) {
       flushParagraph();
       paragraph.li = line_li || '';
     }

     // Set the paragraph indent that we use to detect indentation jumps. When
     // we just detected a list indicator, wait for the next line to arrive
     // before setting this. It is recorded only once per paragraph, while it
     // still holds the "not recorded" value -1.
     if (!line_li && paragraph_line_indent === -1) {
       paragraph_line_indent = line_indent;
     }

     // Set the output indent level if it has not been set yet.
     if (paragraph.level === undefined)
       paragraph.level = level;

     // Strip leading whitespace and li.
     line = line.slice(line_strip_length);

     if (line)
       paragraph.lines.push(line);
   }
 }
 inherits(ParagraphParser, Stream);


 /*
  * This filter consumes paragraph objects and emits modified paragraph objects.
  * The lines within the paragraph are unwrapped where appropriate. It also
  * replaces multiple consecutive whitespace characters by a single one.
  */
 function Unwrapper() {
   var self = this;

   Stream.call(this);
   this.writable = true;

   this.write = function(paragraph) {
     var lines = paragraph.lines,
         break_after = [],
         i;

     for (i = 0; i < lines.length - 1; i++) {
       var line = lines[i];

       // When a line is really short, the line was probably kept separate for a
       // reason.
       if (line.length < 50)  {
         // If the first word on the next line really didn't fit after the line,
         // it probably was just ordinary wrapping after all.
         var next_first_word_length = lines[i + 1].replace(/\s.*$/, '').length;
         if (line.length + next_first_word_length < 60) {
           break_after[i] = true;
         }
       }
     }

     for (i = 0; i < lines.length - 1; ) {
       if (!break_after[i]) {
         lines[i] += ' ' + lines.splice(i + 1, 1)[0];
       } else {
         i++;
       }
     }

     for (i = 0; i < lines.length; i++) {
       // Replace multiple whitespace characters by a single one, and strip
       // trailing whitespace.
       lines[i] = lines[i].replace(/\s+/g, ' ').replace(/\s+$/, '');
     }

     self.emit('data', paragraph);
   };

   this.end = function(data) {
     if (data)
       self.write(data);
     self.emit('end');
   };
 }
 inherits(Unwrapper, Stream);


 /*
  * This filter generates an rtf document from a stream of paragraph objects.
  *
  * The document header is emitted right before the first paragraph and the
  * closing brace when the stream ends. When no paragraph was written at all,
  * neither header nor footer is emitted.
  */
 function RtfGenerator() {
   var self = this,
       did_write_anything = false;

   Stream.call(this);
   this.writable = true;

   // Renders one paragraph object ({ li, level, in_license_block, lines }) as
   // a single chunk of rtf and emits it. Always returns true.
   this.write = function(paragraph) {
     if (!did_write_anything) {
       emitHeader();
       did_write_anything = true;
     }

     var li = paragraph.li,
         // List items are indented one level deeper than their marker.
         level = paragraph.level + (li ? 1 : 0),
         lic = paragraph.in_license_block;

     var rtf = '\\pard';
     rtf += '\\sa150\\sl300\\slmult1';
     if (level > 0)
       rtf += '\\li' + (level * 240);
     if (li) {
       // Tab stop at the text indent plus a negative first-line indent, so
       // that the list marker hangs in front of the text.
       rtf += '\\tx' + (level) * 240;
       rtf += '\\fi-240';
     }
     if (lic)
       rtf += '\\ri240';
     // Everything outside of a license block is rendered in bold.
     if (!lic)
       rtf += '\\b';
     if (li)
       rtf += ' ' + li + '\\tab';
     rtf += ' ';
     rtf += paragraph.lines.map(rtfEscape).join('\\line ');
     if (!lic)
       rtf += '\\b0';
     rtf += '\\par\n';

     self.emit('data', rtf);
     return true;
   };

   // Renders the optional final paragraph, closes the document if one was
   // opened, and then emits 'end'.
   this.end = function(data) {
     if (data)
       self.write(data);
     if (did_write_anything)
       emitFooter();
     self.emit('end');
   };

   // Returns `number` as lowercase hex, left-padded with zeros to `length`.
   function toHex(number, length) {
     var hex = (~~number).toString(16);
     while (hex.length < length)
       hex = '0' + hex;
     return hex;
   }

   // Escapes one line of text for inclusion in the rtf body.
   function rtfEscape(string) {
     return string
       .replace(/[\\\{\}]/g, function(m) {
         return '\\' + m;
       })
       .replace(/\t/g, function() {
         return '\\tab ';
       })
       .replace(/[\x00-\x1f\x7f-\xff]/g, function(m) {
         return '\\\'' + toHex(m.charCodeAt(0), 2);
       })
       .replace(/\ufeff/g, '')
       .replace(/[\u0100-\uffff]/g, function(m) {
         // The \uN control word takes a signed 16-bit DECIMAL number, so
         // code units above 32767 have to be written as negative values.
         // The '?' is the one fallback character announced by \uc1 in the
         // header.
         var code = m.charCodeAt(0);
         if (code > 0x7fff)
           code -= 0x10000;
         return '\\u' + code + '?';
       });
   }

   function emitHeader() {
     self.emit('data', '{\\rtf1\\ansi\\ansicpg1252\\uc1\\deff0\\deflang1033' +
                       '{\\fonttbl{\\f0\\fswiss\\fcharset0 Tahoma;}}\\fs20\n' +
                       '{\\*\\generator txt2rtf 0.0.1;}\n');
   }

   function emitFooter() {
     self.emit('data', '}');
   }
 }
 inherits(RtfGenerator, Stream);


 // Build the conversion pipeline:
 //   stdin -> lines -> paragraphs -> unwrapped paragraphs -> rtf -> stdout
 var stdin = process.stdin;
 var stdout = process.stdout;
 var line_splitter = new LineSplitter();
 var paragraph_parser = new ParagraphParser();
 var unwrapper = new Unwrapper();
 var rtf_generator = new RtfGenerator();

 // Have stdin deliver strings rather than Buffers, and start it flowing.
 stdin.setEncoding('utf-8');
 stdin.resume();

 // pipe() hands back its destination, so the stages can be chained.
 stdin
   .pipe(line_splitter)
   .pipe(paragraph_parser)
   .pipe(unwrapper)
   .pipe(rtf_generator)
   .pipe(stdout);

	var assert = require('assert'),
	Stream = require('stream'),
	inherits = require('util').inherits;


	/*
	 * This filter consumes a stream of characters and emits one string per line.
	 *
	 * "\r\n", "\n\r", "\n" and "\r" are all accepted as line terminators. Each
	 * line is emitted as a 'data' event without its terminator.
	 */
	function LineSplitter() {
	  var self = this,
	      buffer = '';

	  Stream.call(this);
	  this.writable = true;

	  // Buffers `data` and emits every line that is known to be complete.
	  // Always returns true.
	  this.write = function(data) {
	    // A lone "\r" or "\n" at the very end of the buffered text is not split
	    // on yet: it may be the first half of a two-character terminator whose
	    // second half only arrives with the next chunk, and splitting early
	    // would emit a spurious empty line.
	    var lines = (buffer + data).split(/\r\n|\n\r|\n(?!$)|\r(?!$)/),
	        last = lines.length - 1;
	    for (var i = 0; i < last; i++) {
	      self.emit('data', lines[i]);
	    }
	    buffer = lines[last];
	    return true;
	  };

	  // Flushes everything still buffered (plus the optional final chunk) and
	  // then emits 'end'. A trailing unterminated line is emitted only if it is
	  // not empty.
	  this.end = function(data) {
	    // No more input can follow, so every terminator is final now.
	    var lines = (buffer + (data || '')).split(/\r\n|\n\r|\n|\r/),
	        last = lines.length - 1;
	    buffer = '';
	    for (var i = 0; i < last; i++) {
	      self.emit('data', lines[i]);
	    }
	    if (lines[last]) {
	      self.emit('data', lines[last]);
	    }
	    self.emit('end');
	  };
	}
	inherits(LineSplitter, Stream);


	/*
	 * This filter consumes lines and emits paragraph objects.
	 *
	 * An emitted paragraph looks like { li, in_license_block, lines, level }:
	 *   li               - list marker that introduced the paragraph ('' if none)
	 *   in_license_block - whether the paragraph was found inside a license block
	 *   lines            - the text lines, stripped of comment markers, leading
	 *                      whitespace and list marker
	 *   level            - indentation level of the paragraph's first line
	 *                      (leading whitespace length / 2, rounded down)
	 *
	 * A line consisting of three or more '=' or '"' characters toggles between
	 * "license block" and "not a license block"; parsing starts outside of one.
	 */
	function ParagraphParser() {
	  var self = this,
	      block_is_license_block = false,
	      block_has_c_style_comment,
	      paragraph_line_indent,
	      paragraph;

	  Stream.call(this);
	  this.writable = true;

	  resetBlock(false);

	  // Parses a single line. Always returns true.
	  this.write = function(data) {
	    parseLine(data + '');
	    return true;
	  };

	  // Parses the optional final line, flushes the pending paragraph and then
	  // emits 'end'.
	  this.end = function(data) {
	    if (data) {
	      parseLine(data + '');
	    }
	    flushParagraph();
	    self.emit('end');
	  };

	  // Starts a fresh, empty paragraph inside the current block.
	  function resetParagraph() {
	    // -1 means that no reference indentation has been recorded yet.
	    paragraph_line_indent = -1;

	    paragraph = {
	      li: '',
	      in_license_block: block_is_license_block,
	      lines: []
	    };
	  }

	  // Starts a new block (license or non-license) with an empty paragraph.
	  function resetBlock(is_license_block) {
	    block_is_license_block = is_license_block;
	    block_has_c_style_comment = false;
	    resetParagraph();
	  }

	  // Emits the pending paragraph unless it is completely empty, then starts
	  // a new one.
	  function flushParagraph() {
	    if (paragraph.lines.length || paragraph.li) {
	      self.emit('data', paragraph);
	    }
	    resetParagraph();
	  }

	  function parseLine(line) {
	    // Strip trailing whitespace
	    line = line.replace(/\s*$/, '');

	    // Detect block separator
	    if (/^\s*(=|"){3,}\s*$/.test(line)) {
	      flushParagraph();
	      resetBlock(!block_is_license_block);
	      return;
	    }

	    // Strip comments around block
	    if (block_is_license_block) {
	      if (!block_has_c_style_comment)
	        block_has_c_style_comment = /^\s*(\/\*)/.test(line);
	      if (block_has_c_style_comment) {
	        var prev = line;
	        line = line.replace(/^(\s*?)(?:\s?\*\/|\/\*\s|\s\*\s?)/, '$1');
	        // No comment marker found: drop two columns instead so the line
	        // stays aligned with the lines that did have a marker.
	        if (prev === line)
	          line = line.replace(/^\s{2}/, '');
	        if (/\*\//.test(prev))
	          block_has_c_style_comment = false;
	      } else {
	        // Strip C++ and perl style comments.
	        line = line.replace(/^(\s*)(?:\/\/\s?|#\s?)/, '$1');
	      }
	    }

	    // Detect blank line (paragraph separator)
	    if (!/\S/.test(line)) {
	      flushParagraph();
	      return;
	    }

	    // Detect separator "lines" within a block. These mark a paragraph break
	    // and are stripped from the output.
	    if (/^\s*[=*\-]{5,}\s*$/.test(line)) {
	      flushParagraph();
	      return;
	    }

	    // Find out indentation level and the start of a bulleted or numbered
	    // list.
	    var result = /^(\s*)(\d+\.|\*|-)?\s*/.exec(line);
	    assert.ok(result);
	    // The number of characters that will be stripped from the beginning of
	    // the line.
	    var line_strip_length = result[0].length;
	    // The indentation size that will be used to detect indentation jumps.
	    // Fudge by 1 space.
	    var line_indent = Math.floor(result[0].length / 2) * 2;
	    // The indentation level that will be exported
	    var level = Math.floor(result[1].length / 2);
	    // The list indicator that precedes the actual content, if any.
	    var line_li = result[2];

	    // Flush the paragraph when there is a li or an indentation jump
	    if (line_li || (line_indent !== paragraph_line_indent &&
	                    paragraph_line_indent !== -1)) {
	      flushParagraph();
	      paragraph.li = line_li || '';
	    }

	    // Set the paragraph indent that we use to detect indentation jumps. When
	    // we just detected a list indicator, wait for the next line to arrive
	    // before setting this. It is recorded only once per paragraph, while it
	    // still holds the "not recorded" value -1.
	    if (!line_li && paragraph_line_indent === -1) {
	      paragraph_line_indent = line_indent;
	    }

	    // Set the output indent level if it has not been set yet.
	    if (paragraph.level === undefined)
	      paragraph.level = level;

	    // Strip leading whitespace and li.
	    line = line.slice(line_strip_length);

	    if (line)
	      paragraph.lines.push(line);
	  }
	}
	inherits(ParagraphParser, Stream);


	/*
	* This filter consumes paragraph objects and emits modified paragraph objects.
	* The lines within the paragraph are unwrapped where appropriate. It also
	* replaces multiple consecutive whitespace characters by a single one.
	*/
	function Unwrapper() {
	var self = this;

	Stream.call(this);
	this.writable = true;

	this.write = function(paragraph) {
	var lines = paragraph.lines,
	break_after = [],
	i;

	for (i = 0; i < lines.length - 1; i++) {
	var line = lines[i];

	// When a line is really short, the line was probably kept separate for a
	// reason.
	if (line.length < 50) {
	// If the first word on the next line really didn't fit after the line,
	// it probably was just ordinary wrapping after all.
	var next_first_word_length = lines[i + 1].replace(/\s.*$/, '').length;
	if (line.length + next_first_word_length < 60) {
	break_after[i] = true;
	}
	}
	}

	for (i = 0; i < lines.length - 1; ) {
	if (!break_after[i]) {
	lines[i] += ' ' + lines.splice(i + 1, 1)[0];
	} else {
	i++;
	}
	}

	for (i = 0; i < lines.length; i++) {
	// Replace multiple whitespace characters by a single one, and strip
	// trailing whitespace.
	lines[i] = lines[i].replace(/\s+/g, ' ').replace(/\s+$/, '');
	}

	self.emit('data', paragraph);
	};

	this.end = function(data) {
	if (data)
	self.write(data);
	self.emit('end');
	};
	}
	inherits(Unwrapper, Stream);


	/*
	 * This filter generates an rtf document from a stream of paragraph objects.
	 *
	 * The document header is emitted right before the first paragraph and the
	 * closing brace when the stream ends. When no paragraph was written at all,
	 * neither header nor footer is emitted.
	 */
	function RtfGenerator() {
	  var self = this,
	      did_write_anything = false;

	  Stream.call(this);
	  this.writable = true;

	  // Renders one paragraph object ({ li, level, in_license_block, lines }) as
	  // a single chunk of rtf and emits it. Always returns true.
	  this.write = function(paragraph) {
	    if (!did_write_anything) {
	      emitHeader();
	      did_write_anything = true;
	    }

	    var li = paragraph.li,
	        // List items are indented one level deeper than their marker.
	        level = paragraph.level + (li ? 1 : 0),
	        lic = paragraph.in_license_block;

	    var rtf = '\\pard';
	    rtf += '\\sa150\\sl300\\slmult1';
	    if (level > 0)
	      rtf += '\\li' + (level * 240);
	    if (li) {
	      // Tab stop at the text indent plus a negative first-line indent, so
	      // that the list marker hangs in front of the text.
	      rtf += '\\tx' + (level) * 240;
	      rtf += '\\fi-240';
	    }
	    if (lic)
	      rtf += '\\ri240';
	    // Everything outside of a license block is rendered in bold.
	    if (!lic)
	      rtf += '\\b';
	    if (li)
	      rtf += ' ' + li + '\\tab';
	    rtf += ' ';
	    rtf += paragraph.lines.map(rtfEscape).join('\\line ');
	    if (!lic)
	      rtf += '\\b0';
	    rtf += '\\par\n';

	    self.emit('data', rtf);
	    return true;
	  };

	  // Renders the optional final paragraph, closes the document if one was
	  // opened, and then emits 'end'.
	  this.end = function(data) {
	    if (data)
	      self.write(data);
	    if (did_write_anything)
	      emitFooter();
	    self.emit('end');
	  };

	  // Returns `number` as lowercase hex, left-padded with zeros to `length`.
	  function toHex(number, length) {
	    var hex = (~~number).toString(16);
	    while (hex.length < length)
	      hex = '0' + hex;
	    return hex;
	  }

	  // Escapes one line of text for inclusion in the rtf body.
	  function rtfEscape(string) {
	    return string
	      .replace(/[\\\{\}]/g, function(m) {
	        return '\\' + m;
	      })
	      .replace(/\t/g, function() {
	        return '\\tab ';
	      })
	      .replace(/[\x00-\x1f\x7f-\xff]/g, function(m) {
	        return '\\\'' + toHex(m.charCodeAt(0), 2);
	      })
	      .replace(/\ufeff/g, '')
	      .replace(/[\u0100-\uffff]/g, function(m) {
	        // The \uN control word takes a signed 16-bit DECIMAL number, so
	        // code units above 32767 have to be written as negative values.
	        // The '?' is the one fallback character announced by \uc1 in the
	        // header.
	        var code = m.charCodeAt(0);
	        if (code > 0x7fff)
	          code -= 0x10000;
	        return '\\u' + code + '?';
	      });
	  }

	  function emitHeader() {
	    self.emit('data', '{\\rtf1\\ansi\\ansicpg1252\\uc1\\deff0\\deflang1033' +
	                      '{\\fonttbl{\\f0\\fswiss\\fcharset0 Tahoma;}}\\fs20\n' +
	                      '{\\*\\generator txt2rtf 0.0.1;}\n');
	  }

	  function emitFooter() {
	    self.emit('data', '}');
	  }
	}
	inherits(RtfGenerator, Stream);


	// Build the conversion pipeline:
	//   stdin -> lines -> paragraphs -> unwrapped paragraphs -> rtf -> stdout
	var stdin = process.stdin;
	var stdout = process.stdout;
	var line_splitter = new LineSplitter();
	var paragraph_parser = new ParagraphParser();
	var unwrapper = new Unwrapper();
	var rtf_generator = new RtfGenerator();

	// Have stdin deliver strings rather than Buffers, and start it flowing.
	stdin.setEncoding('utf-8');
	stdin.resume();

	// pipe() hands back its destination, so the stages can be chained.
	stdin
	  .pipe(line_splitter)
	  .pipe(paragraph_parser)
	  .pipe(unwrapper)
	  .pipe(rtf_generator)
	  .pipe(stdout);