src/third_party/icu/source/tools/escapesrc/escapesrc.cpp - external/github.com/google/proto-quic - Git at Google

 // © 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html

 #include <stdio.h>
 #include <string>
 #include <stdlib.h>
 #include <unistd.h>
 #include <errno.h>
 #include <string.h>
 #include <iostream>
 #include <fstream>

 // with caution:
 #include "unicode/utf8.h"

 static const char
   kSPACE   = 0x20,
   kTAB     = 0x09,
   kLF      = 0x0A,
   kCR      = 0x0D,
   // kHASH    = 0x23,
   // kSLASH   = 0x2f,
   kBKSLASH = 0x5C,
   // kSTAR    = 0x2A,
   kL_U     = 0x75,
   kU_U     = 0x55,
   kQUOT    = 0x27,
   kDBLQ    = 0x22;

 # include "cptbl.h"

 # define cp1047_to_8859(c) cp1047_8859_1[c]

 std::string prog;

 void usage() {
   fprintf(stderr, "%s: usage: %s infile.cpp outfile.cpp\n", prog.c_str(), prog.c_str());
 }


 int cleanup(const std::string &outfile) {
   const char *outstr = outfile.c_str();
   if(outstr && *outstr) {
     int rc = unlink(outstr);
     if(rc == 0) {
       fprintf(stderr, "%s: deleted %s\n", prog.c_str(), outstr);
       return 0;
     } else {
       if( errno == ENOENT ) {
         return 0; // File did not exist - no error.
       } else {
         perror("unlink");
         return 1;
       }
     }
   }
   return 0;
 }

 // inline bool hasNonAscii(const char *line, size_t len) {
 //   const unsigned char *uline = reinterpret_cast<const unsigned char*>(line);
 //   for(size_t i=0;i<len; i++) {
 //     if( uline[i] > 0x7F) {
 //       return true;
 //     }
 //   }
 //   return false;
 // }

 inline const char *skipws(const char *p, const char *e) {
   for(;p<e;p++) {
     switch(*p) {
     case kSPACE:
     case kTAB:
     case kLF:
     case kCR:
       break;
     default:
       return p; // non ws
     }
   }
   return p;
 }

 // inline bool isCommentOrEmpty(const char* line, size_t len) {
 //   const char *p = line;
 //   const char *e = line+len;
 //   p = skipws(p,e);
 //   if(p==e) {
 //     return true; // whitespace only
 //   }
 //   p++;
 //   switch(*p) {
 //   case kHASH: return true; // #directive
 //   case kSLASH:
 //     p++;
 //     if(p==e) return false; // single slash
 //     switch(*p) {
 //     case kSLASH: // '/ /'
 //     case kSTAR: // '/ *'
 //       return true; // start of comment
 //     default: return false; // something else
 //     }
 //   default: return false; // something else
 //   }
 //   /*NOTREACHED*/
 // }

 void appendByte(std::string &outstr,
                 uint8_t byte) {
     char tmp2[5];
     sprintf(tmp2, "\\x%02X", 0xFF & (int)(byte));
     outstr += tmp2;
 }

 /**
  * @return true on failure
  */
 bool appendUtf8(std::string &outstr,
                 const std::string &linestr,
                 size_t &pos,
                 size_t chars) {
   char tmp[9];
   for(size_t i=0;i<chars;i++) {
     tmp[i] = linestr[++pos];
   }
   tmp[chars] = 0;
   unsigned int c;
   sscanf(tmp, "%X", &c);
   UChar32 ch = c & 0x1FFFFF;

   // now to append \\x%% etc
   uint8_t bytesNeeded = U8_LENGTH(ch);
   if(bytesNeeded == 0) {
     fprintf(stderr, "Illegal code point U+%X\n", ch);
     return true;
   }
   uint8_t bytes[4];
   uint8_t *s = bytes;
   size_t i = 0;
   U8_APPEND_UNSAFE(s, i, ch);
   for(size_t t = 0; t<i; t++) {
     appendByte(outstr, s[t]);
   }
   return false;
 }

 /**
  * @param linestr string to mutate. Already escaped into \u format.
  * @param origpos beginning, points to 'u8"'
  * @param pos end, points to "
  * @return false for no-problem, true for failure!
  */
 bool fixu8(std::string &linestr, size_t origpos, size_t &endpos) {
   size_t pos = origpos + 3;
   std::string outstr;
   outstr += '\"'; // local encoding
   for(;pos<endpos;pos++) {
     char c = linestr[pos];
     if(c == '\\') {
       char c2 = linestr[++pos];
       switch(c2) {
       case '\'':
       case '"':
 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
         c2 = cp1047_to_8859(c2);
 #endif
         appendByte(outstr, c2);
         break;
       case 'u':
         appendUtf8(outstr, linestr, pos, 4);
         break;
       case 'U':
         appendUtf8(outstr, linestr, pos, 8);
         break;
       }
     } else {
 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
       c = cp1047_to_8859(c);
 #endif
       appendByte(outstr, c);
     }
   }
   outstr += ('\"');

   linestr.replace(origpos, (endpos-origpos+1), outstr);

   return false; // OK
 }

 /**
  * fix the string at the position
  * false = no err
  * true = had err
  */
 bool fixAt(std::string &linestr, size_t pos) {
   size_t origpos = pos;

   if(linestr[pos] != 'u') {
     fprintf(stderr, "Not a 'u'?");
     return true;
   }

   pos++; // past 'u'

   bool utf8 = false;

   if(linestr[pos] == '8') { // u8"
     utf8 = true;
     pos++;
   }

   char quote = linestr[pos];

   if(quote != '\'' && quote != '\"') {
     fprintf(stderr, "Quote is '%c' - not sure what to do.\n", quote);
     return true;
   }

   if(quote == '\'' && utf8) {
     fprintf(stderr, "Cannot do u8'...'\n");
     return true;
   }

   pos ++;

   //printf("u%c…%c\n", quote, quote);

   for(; pos < linestr.size(); pos++) {
     if(linestr[pos] == quote) {
       if(utf8) {
         return fixu8(linestr, origpos, pos); // fix u8"..."
       } else {
         return false; // end of quote
       }
     }
     if(linestr[pos] == '\\') {
       pos++;
       if(linestr[pos] == quote) continue; // quoted quote
       if(linestr[pos] == 'u') continue; // for now ... unicode escape
       if(linestr[pos] == '\\') continue;
       // some other escape… ignore
     } else {
       size_t old_pos = pos;
       int32_t i = pos;
 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
       // mogrify 1-4 bytes from 1047 'back' to utf-8
       char old_byte = linestr[pos];
       linestr[pos] = cp1047_to_8859(linestr[pos]);
       // how many more?
       int32_t trail = U8_COUNT_TRAIL_BYTES(linestr[pos]);
       for(size_t pos2 = pos+1; trail>0; pos2++,trail--) {
         linestr[pos2] = cp1047_to_8859(linestr[pos2]);
         if(linestr[pos2] == 0x0A) {
           linestr[pos2] = 0x85; // NL is ambiguous here
         }
       }
 #endif

       // Proceed to decode utf-8
       const uint8_t *s = (const uint8_t*) (linestr.c_str());
       int32_t length = linestr.size();
       UChar32 c;
       if(U8_IS_SINGLE((uint8_t)s[i]) && oldIllegal[s[i]]) {
 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
         linestr[pos] = old_byte; // put it back
 #endif
         continue; // single code point not previously legal for \u escaping
       }

       // otherwise, convert it to \u / \U
       {
         U8_NEXT(s, i, length, c);
       }
       if(c<0) {
         fprintf(stderr, "Illegal utf-8 sequence at Column: %d\n", old_pos);
         fprintf(stderr, "Line: >>%s<<\n", linestr.c_str());
         return true;
       }

       size_t seqLen = (i-pos);

       //printf("U+%04X pos %d [len %d]\n", c, pos, seqLen);fflush(stdout);

       char newSeq[20];
       if( c <= 0xFFFF) {
         sprintf(newSeq, "\\u%04X", c);
       } else {
         sprintf(newSeq, "\\U%08X", c);
       }
       linestr.replace(pos, seqLen, newSeq);
       pos += strlen(newSeq) - 1;
     }
   }

   return false;
 }

 /**
  * false = no err
  * true = had err
  */
 bool fixLine(int /*no*/, std::string &linestr) {
   const char *line = linestr.c_str();
   size_t len = linestr.size();

   // no u' in the line?
   if(!strstr(line, "u'") && !strstr(line, "u\"") && !strstr(line, "u8\"")) {
     return false; // Nothing to do. No u' or u" detected
   }

   // lines such as u8"\u0308" are all ASCII.
   // // Quick Check: all ascii?
   // if(!hasNonAscii(line, len)) {
   //   return false; // ASCII
   // }

   // // comment or empty line?
   // if(isCommentOrEmpty(line, len)) {
   //   return false; // Comment or just empty
   // }

   // start from the end and find all u" cases
   size_t pos = len = linestr.size();
   while((pos>0) && (pos = linestr.rfind("u\"", pos)) != std::string::npos) {
     //printf("found doublequote at %d\n", pos);
     if(fixAt(linestr, pos)) return true;
     if(pos == 0) break;
     pos--;
   }

   // reset and find all u' cases
   pos = len = linestr.size();
   while((pos>0) && (pos = linestr.rfind("u'", pos)) != std::string::npos) {
     //printf("found singlequote at %d\n", pos);
     if(fixAt(linestr, pos)) return true;
     if(pos == 0) break;
     pos--;
   }

   // reset and find all u8" cases
   pos = len = linestr.size();
   while((pos>0) && (pos = linestr.rfind("u8\"", pos)) != std::string::npos) {
     if(fixAt(linestr, pos)) return true;
     if(pos == 0) break;
     pos--;
   }

   //fprintf(stderr, "%d - fixed\n", no);
   return false;
 }

 int convert(const std::string &infile, const std::string &outfile) {
   fprintf(stderr, "escapesrc: %s -> %s\n", infile.c_str(), outfile.c_str());

   std::ifstream inf;

   inf.open(infile.c_str(), std::ios::in);

   if(!inf.is_open()) {
     fprintf(stderr, "%s: could not open input file %s\n", prog.c_str(), infile.c_str());
     cleanup(outfile);
     return 1;
   }

   std::ofstream outf;

   outf.open(outfile.c_str(), std::ios::out);

   if(!outf.is_open()) {
     fprintf(stderr, "%s: could not open output file %s\n", prog.c_str(), outfile.c_str());
     return 1;
   }

   // TODO: any platform variations of #line?
   outf << "#line 1 \"" << infile << "\"" << '\n';

   int no = 0;
   std::string linestr;
   while( getline( inf, linestr)) {
     no++;
     if(fixLine(no, linestr)) {
       outf.close();
       fprintf(stderr, "%s:%d: Fixup failed by %s\n", infile.c_str(), no, prog.c_str());
       cleanup(outfile);
       return 1;
     }
     outf << linestr << '\n';
   }

   return 0;
 }

 int main(int argc, const char *argv[]) {
   prog = argv[0];

   if(argc != 3) {
     usage();
     return 1;
   }

   std::string infile = argv[1];
   std::string outfile = argv[2];

   return convert(infile, outfile);
 }


 #include "utf_impl.cpp"
	// © 2016 and later: Unicode, Inc. and others.
	// License & terms of use: http://www.unicode.org/copyright.html

	#include <stdio.h>
	#include <string>
	#include <stdlib.h>
	#include <unistd.h>
	#include <errno.h>
	#include <string.h>
	#include <iostream>
	#include <fstream>

	// with caution:
	#include "unicode/utf8.h"

	static const char
	kSPACE = 0x20,
	kTAB = 0x09,
	kLF = 0x0A,
	kCR = 0x0D,
	// kHASH = 0x23,
	// kSLASH = 0x2f,
	kBKSLASH = 0x5C,
	// kSTAR = 0x2A,
	kL_U = 0x75,
	kU_U = 0x55,
	kQUOT = 0x27,
	kDBLQ = 0x22;

	# include "cptbl.h"

	# define cp1047_to_8859(c) cp1047_8859_1[c]

	std::string prog;

	void usage() {
	fprintf(stderr, "%s: usage: %s infile.cpp outfile.cpp\n", prog.c_str(), prog.c_str());
	}


	int cleanup(const std::string &outfile) {
	const char *outstr = outfile.c_str();
	if(outstr && *outstr) {
	int rc = unlink(outstr);
	if(rc == 0) {
	fprintf(stderr, "%s: deleted %s\n", prog.c_str(), outstr);
	return 0;
	} else {
	if( errno == ENOENT ) {
	return 0; // File did not exist - no error.
	} else {
	perror("unlink");
	return 1;
	}
	}
	}
	return 0;
	}

	// inline bool hasNonAscii(const char *line, size_t len) {
	// const unsigned char uline = reinterpret_cast<const unsigned char>(line);
	// for(size_t i=0;i<len; i++) {
	// if( uline[i] > 0x7F) {
	// return true;
	// }
	// }
	// return false;
	// }

	inline const char skipws(const char p, const char *e) {
	for(;p<e;p++) {
	switch(*p) {
	case kSPACE:
	case kTAB:
	case kLF:
	case kCR:
	break;
	default:
	return p; // non ws
	}
	}
	return p;
	}

	// inline bool isCommentOrEmpty(const char* line, size_t len) {
	// const char *p = line;
	// const char *e = line+len;
	// p = skipws(p,e);
	// if(p==e) {
	// return true; // whitespace only
	// }
	// p++;
	// switch(*p) {
	// case kHASH: return true; // #directive
	// case kSLASH:
	// p++;
	// if(p==e) return false; // single slash
	// switch(*p) {
	// case kSLASH: // '/ /'
	// case kSTAR: // '/ *'
	// return true; // start of comment
	// default: return false; // something else
	// }
	// default: return false; // something else
	// }
	// /NOTREACHED/
	// }

	void appendByte(std::string &outstr,
	uint8_t byte) {
	char tmp2[5];
	sprintf(tmp2, "\\x%02X", 0xFF & (int)(byte));
	outstr += tmp2;
	}

	/**
	* @return true on failure
	*/
	bool appendUtf8(std::string &outstr,
	const std::string &linestr,
	size_t &pos,
	size_t chars) {
	char tmp[9];
	for(size_t i=0;i<chars;i++) {
	tmp[i] = linestr[++pos];
	}
	tmp[chars] = 0;
	unsigned int c;
	sscanf(tmp, "%X", &c);
	UChar32 ch = c & 0x1FFFFF;

	// now to append \\x%% etc
	uint8_t bytesNeeded = U8_LENGTH(ch);
	if(bytesNeeded == 0) {
	fprintf(stderr, "Illegal code point U+%X\n", ch);
	return true;
	}
	uint8_t bytes[4];
	uint8_t *s = bytes;
	size_t i = 0;
	U8_APPEND_UNSAFE(s, i, ch);
	for(size_t t = 0; t<i; t++) {
	appendByte(outstr, s[t]);
	}
	return false;
	}

	/**
	* @param linestr string to mutate. Already escaped into \u format.
	* @param origpos beginning, points to 'u8"'
	* @param pos end, points to "
	* @return false for no-problem, true for failure!
	*/
	bool fixu8(std::string &linestr, size_t origpos, size_t &endpos) {
	size_t pos = origpos + 3;
	std::string outstr;
	outstr += '\"'; // local encoding
	for(;pos<endpos;pos++) {
	char c = linestr[pos];
	if(c == '\\') {
	char c2 = linestr[++pos];
	switch(c2) {
	case '\'':
	case '"':
	#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
	c2 = cp1047_to_8859(c2);
	#endif
	appendByte(outstr, c2);
	break;
	case 'u':
	appendUtf8(outstr, linestr, pos, 4);
	break;
	case 'U':
	appendUtf8(outstr, linestr, pos, 8);
	break;
	}
	} else {
	#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
	c = cp1047_to_8859(c);
	#endif
	appendByte(outstr, c);
	}
	}
	outstr += ('\"');

	linestr.replace(origpos, (endpos-origpos+1), outstr);

	return false; // OK
	}

	/**
	* fix the string at the position
	* false = no err
	* true = had err
	*/
	bool fixAt(std::string &linestr, size_t pos) {
	size_t origpos = pos;

	if(linestr[pos] != 'u') {
	fprintf(stderr, "Not a 'u'?");
	return true;
	}

	pos++; // past 'u'

	bool utf8 = false;

	if(linestr[pos] == '8') { // u8"
	utf8 = true;
	pos++;
	}

	char quote = linestr[pos];

	if(quote != '\'' && quote != '\"') {
	fprintf(stderr, "Quote is '%c' - not sure what to do.\n", quote);
	return true;
	}

	if(quote == '\'' && utf8) {
	fprintf(stderr, "Cannot do u8'...'\n");
	return true;
	}

	pos ++;

	//printf("u%c…%c\n", quote, quote);

	for(; pos < linestr.size(); pos++) {
	if(linestr[pos] == quote) {
	if(utf8) {
	return fixu8(linestr, origpos, pos); // fix u8"..."
	} else {
	return false; // end of quote
	}
	}
	if(linestr[pos] == '\\') {
	pos++;
	if(linestr[pos] == quote) continue; // quoted quote
	if(linestr[pos] == 'u') continue; // for now ... unicode escape
	if(linestr[pos] == '\\') continue;
	// some other escape… ignore
	} else {
	size_t old_pos = pos;
	int32_t i = pos;
	#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
	// mogrify 1-4 bytes from 1047 'back' to utf-8
	char old_byte = linestr[pos];
	linestr[pos] = cp1047_to_8859(linestr[pos]);
	// how many more?
	int32_t trail = U8_COUNT_TRAIL_BYTES(linestr[pos]);
	for(size_t pos2 = pos+1; trail>0; pos2++,trail--) {
	linestr[pos2] = cp1047_to_8859(linestr[pos2]);
	if(linestr[pos2] == 0x0A) {
	linestr[pos2] = 0x85; // NL is ambiguous here
	}
	}
	#endif

	// Proceed to decode utf-8
	const uint8_t s = (const uint8_t) (linestr.c_str());
	int32_t length = linestr.size();
	UChar32 c;
	if(U8_IS_SINGLE((uint8_t)s[i]) && oldIllegal[s[i]]) {
	#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
	linestr[pos] = old_byte; // put it back
	#endif
	continue; // single code point not previously legal for \u escaping
	}

	// otherwise, convert it to \u / \U
	{
	U8_NEXT(s, i, length, c);
	}
	if(c<0) {
	fprintf(stderr, "Illegal utf-8 sequence at Column: %d\n", old_pos);
	fprintf(stderr, "Line: >>%s<<\n", linestr.c_str());
	return true;
	}

	size_t seqLen = (i-pos);

	//printf("U+%04X pos %d [len %d]\n", c, pos, seqLen);fflush(stdout);

	char newSeq[20];
	if( c <= 0xFFFF) {
	sprintf(newSeq, "\\u%04X", c);
	} else {
	sprintf(newSeq, "\\U%08X", c);
	}
	linestr.replace(pos, seqLen, newSeq);
	pos += strlen(newSeq) - 1;
	}
	}

	return false;
	}

	/**
	* false = no err
	* true = had err
	*/
	bool fixLine(int /no/, std::string &linestr) {
	const char *line = linestr.c_str();
	size_t len = linestr.size();

	// no u' in the line?
	if(!strstr(line, "u'") && !strstr(line, "u\"") && !strstr(line, "u8\"")) {
	return false; // Nothing to do. No u' or u" detected
	}

	// lines such as u8"\u0308" are all ASCII.
	// // Quick Check: all ascii?
	// if(!hasNonAscii(line, len)) {
	// return false; // ASCII
	// }

	// // comment or empty line?
	// if(isCommentOrEmpty(line, len)) {
	// return false; // Comment or just empty
	// }

	// start from the end and find all u" cases
	size_t pos = len = linestr.size();
	while((pos>0) && (pos = linestr.rfind("u\"", pos)) != std::string::npos) {
	//printf("found doublequote at %d\n", pos);
	if(fixAt(linestr, pos)) return true;
	if(pos == 0) break;
	pos--;
	}

	// reset and find all u' cases
	pos = len = linestr.size();
	while((pos>0) && (pos = linestr.rfind("u'", pos)) != std::string::npos) {
	//printf("found singlequote at %d\n", pos);
	if(fixAt(linestr, pos)) return true;
	if(pos == 0) break;
	pos--;
	}

	// reset and find all u8" cases
	pos = len = linestr.size();
	while((pos>0) && (pos = linestr.rfind("u8\"", pos)) != std::string::npos) {
	if(fixAt(linestr, pos)) return true;
	if(pos == 0) break;
	pos--;
	}

	//fprintf(stderr, "%d - fixed\n", no);
	return false;
	}

	int convert(const std::string &infile, const std::string &outfile) {
	fprintf(stderr, "escapesrc: %s -> %s\n", infile.c_str(), outfile.c_str());

	std::ifstream inf;

	inf.open(infile.c_str(), std::ios::in);

	if(!inf.is_open()) {
	fprintf(stderr, "%s: could not open input file %s\n", prog.c_str(), infile.c_str());
	cleanup(outfile);
	return 1;
	}

	std::ofstream outf;

	outf.open(outfile.c_str(), std::ios::out);

	if(!outf.is_open()) {
	fprintf(stderr, "%s: could not open output file %s\n", prog.c_str(), outfile.c_str());
	return 1;
	}

	// TODO: any platform variations of #line?
	outf << "#line 1 \"" << infile << "\"" << '\n';

	int no = 0;
	std::string linestr;
	while( getline( inf, linestr)) {
	no++;
	if(fixLine(no, linestr)) {
	outf.close();
	fprintf(stderr, "%s:%d: Fixup failed by %s\n", infile.c_str(), no, prog.c_str());
	cleanup(outfile);
	return 1;
	}
	outf << linestr << '\n';
	}

	return 0;
	}

	int main(int argc, const char *argv[]) {
	prog = argv[0];

	if(argc != 3) {
	usage();
	return 1;
	}

	std::string infile = argv[1];
	std::string outfile = argv[2];

	return convert(infile, outfile);
	}


	#include "utf_impl.cpp"