blob: 36dd6b4b3cc63a1a5657f3a1276b3fe47633b18a [file] [log] [blame]
/*
* Copyright (C) 2012-2017 Reece H. Dunn
*
* This file is part of ucd-tools.
*
* ucd-tools is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* ucd-tools is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
*/
#include "config.h"
#include "ucd/ucd.h"
#include <locale.h>
#include <string.h>
#include <stdio.h>
#include <wchar.h>
#include <wctype.h>
#ifndef HAVE_ISWBLANK
/* Replacement for iswblank, compiled only when HAVE_ISWBLANK is not defined.
 *
 * A wide character counts as blank when iswspace accepts it and its value
 * lies outside the range 0x0A..0x0D. Returns 1 for blank, 0 otherwise.
 */
static int iswblank(wint_t c)
{
	/* 0x0A..0x0D are excluded regardless of what iswspace reports. */
	if (c >= 0x0A && c <= 0x0D)
		return 0;
	return iswspace(c) ? 1 : 0;
}
#endif
/* Write the codepoint c to out as a UTF-8 byte sequence.
 *
 * Values below 0x80 take one byte, below 0x800 two, below 0x10000 three and
 * below 0x200000 four. Larger values produce no output.
 */
static void fput_utf8c(FILE *out, codepoint_t c)
{
	uint8_t bytes[4];
	int count = 0;
	int i;

	if (c < 0x80)
	{
		bytes[count++] = (uint8_t)c;
	}
	else if (c < 0x800)
	{
		bytes[count++] = (uint8_t)(0xC0 | (c >> 6));
		bytes[count++] = (uint8_t)(0x80 | (c & 0x3F));
	}
	else if (c < 0x10000)
	{
		bytes[count++] = (uint8_t)(0xE0 | (c >> 12));
		bytes[count++] = (uint8_t)(0x80 | ((c >> 6) & 0x3F));
		bytes[count++] = (uint8_t)(0x80 | (c & 0x3F));
	}
	else if (c < 0x200000)
	{
		bytes[count++] = (uint8_t)(0xF0 | (c >> 18));
		bytes[count++] = (uint8_t)(0x80 | ((c >> 12) & 0x3F));
		bytes[count++] = (uint8_t)(0x80 | ((c >> 6) & 0x3F));
		bytes[count++] = (uint8_t)(0x80 | (c & 0x3F));
	}

	/* Emit the collected bytes in order, one fputc per byte. */
	for (i = 0; i < count; ++i)
		fputc(bytes[i], out);
}
/* Read one UTF-8 encoded codepoint from in and store it in *c.
 *
 * Returns 1 when a codepoint was stored and 0 when the end of the stream (or
 * a read error) was reached, including in the middle of a multi-byte
 * sequence; in that case *c is left untouched.
 *
 * Malformed input is reported as U+FFFD instead of being decoded into an
 * arbitrary value:
 *   - a byte that cannot start a sequence (a stray continuation byte
 *     0x80..0xBF, or 0xF8..0xFF) yields U+FFFD on its own;
 *   - a byte inside a sequence that is not a continuation byte ends the
 *     sequence with U+FFFD and is pushed back so the next call decodes it.
 * Overlong forms and surrogate values are not rejected.
 */
static int fget_utf8c(FILE *in, codepoint_t *c)
{
	int ch;
	int trailing;
	uint8_t lead;
	codepoint_t value;

	if ((ch = fgetc(in)) == EOF) return 0;
	lead = (uint8_t)ch;

	if (lead < 0x80)
	{
		*c = lead;
		return 1;
	}

	/* The lead byte determines the payload bits and the sequence length. */
	if ((lead & 0xE0) == 0xC0)
	{
		value = lead & 0x1F;
		trailing = 1;
	}
	else if ((lead & 0xF0) == 0xE0)
	{
		value = lead & 0x0F;
		trailing = 2;
	}
	else if ((lead & 0xF8) == 0xF0)
	{
		value = lead & 0x07;
		trailing = 3;
	}
	else
	{
		/* Not a valid lead byte: consume just this byte. */
		*c = 0xFFFD;
		return 1;
	}

	while (trailing-- > 0)
	{
		if ((ch = fgetc(in)) == EOF) return 0;
		if (((uint8_t)ch & 0xC0) != 0x80)
		{
			/* Keep the byte for the next call so a valid character that
			 * follows a broken sequence is not swallowed. */
			ungetc(ch, in);
			*c = 0xFFFD;
			return 1;
		}
		value = (value << 6) + ((uint8_t)ch & 0x3F);
	}

	*c = value;
	return 1;
}
/* Write the codepoint c to out in the representation selected by mode.
 *
 *   'c'  the character itself as UTF-8, except that tab, carriage return
 *        and newline are written as the two-character escapes \t, \r, \n
 *   'h'  six lower-case hexadecimal digits, zero padded
 *   'H'  six upper-case hexadecimal digits, zero padded
 *
 * Any other mode writes nothing.
 */
static void uprintf_codepoint(FILE *out, codepoint_t c, char mode)
{
	if (mode == 'h')
	{
		fprintf(out, "%06x", c);
	}
	else if (mode == 'H')
	{
		fprintf(out, "%06X", c);
	}
	else if (mode == 'c')
	{
		/* Escape the characters that would disturb line/column layout. */
		if (c == '\t')
			fputs("\\t", out);
		else if (c == '\r')
			fputs("\\r", out);
		else if (c == '\n')
			fputs("\\n", out);
		else
			fput_utf8c(out, c);
	}
}
/* Write '1' or '0' to out according to whether the codepoint c satisfies
 * the wide-character classification selected by mode.
 *
 *   'A' iswalnum   'a' iswalpha   'b' iswblank   'c' iswcntrl
 *   'd' iswdigit   'g' iswgraph   'l' iswlower   'P' iswprint
 *   'p' iswpunct   's' iswspace   'u' iswupper   'x' iswxdigit
 *
 * Any other mode writes nothing.
 */
static void uprintf_is(FILE *out, codepoint_t c, char mode)
{
	int matched;

	switch (mode)
	{
	case 'A': matched = iswalnum(c);  break;
	case 'a': matched = iswalpha(c);  break;
	case 'b': matched = iswblank(c);  break;
	case 'c': matched = iswcntrl(c);  break;
	case 'd': matched = iswdigit(c);  break;
	case 'g': matched = iswgraph(c);  break;
	case 'l': matched = iswlower(c);  break;
	case 'P': matched = iswprint(c);  break;
	case 'p': matched = iswpunct(c);  break;
	case 's': matched = iswspace(c);  break;
	case 'u': matched = iswupper(c);  break;
	case 'x': matched = iswxdigit(c); break;
	default:
		/* Unknown classification: emit nothing at all. */
		return;
	}

	fputc(matched ? '1' : '0', out);
}
/* Write the text of format to out, expanding the sequences below with data
 * about the codepoint c.
 *
 * Single-character specifiers:
 *   %c   category string
 *   %C   category group string
 *   %s   script string
 *   %P   properties, as 16 zero-padded lower-case hexadecimal digits
 *
 * Specifiers followed by a mode character M:
 *   %pM  the codepoint itself        (M as accepted by uprintf_codepoint)
 *   %LM  its lower case mapping      (M as accepted by uprintf_codepoint)
 *   %TM  its title case mapping      (M as accepted by uprintf_codepoint)
 *   %UM  its upper case mapping      (M as accepted by uprintf_codepoint)
 *   %iM  a character class test      (M as accepted by uprintf_is)
 *
 * Backslash escapes: \t, \r and \n write tab, carriage return and newline;
 * a backslash followed by any other character writes that character.
 *
 * An unrecognised % specifier is skipped without output. A format that ends
 * in the middle of a sequence (a trailing '%', '\' or a specifier missing
 * its mode character) simply ends the output; the scan never advances past
 * the terminating NUL.
 */
static void uprintf(FILE *out, codepoint_t c, const char *format)
{
	while (*format != '\0')
	{
		char ch = *format++;

		if (ch == '%')
		{
			char spec = *format;
			char mode;

			if (spec == '\0')
				return; /* lone '%' at the end of the format */
			++format;

			switch (spec)
			{
			case 'c': /* category */
				fputs(ucd_get_category_string(ucd_lookup_category(c)), out);
				break;
			case 'C': /* category group */
				fputs(ucd_get_category_group_string(ucd_lookup_category_group(c)), out);
				break;
			case 'P': /* properties */
				/* The cast makes the argument match %llx whatever integer
				 * type ucd_properties returns. */
				fprintf(out, "%016llx",
				        (unsigned long long)ucd_properties(c, ucd_lookup_category(c)));
				break;
			case 's': /* script */
				fputs(ucd_get_script_string(ucd_lookup_script(c)), out);
				break;
			case 'p': /* codepoint */
			case 'i': /* is* */
			case 'L': /* lowercase */
			case 'T': /* titlecase */
			case 'U': /* uppercase */
				mode = *format;
				if (mode == '\0')
					return; /* specifier is missing its mode character */
				++format;
				switch (spec)
				{
				case 'p':
					uprintf_codepoint(out, c, mode);
					break;
				case 'i':
					uprintf_is(out, c, mode);
					break;
				case 'L':
					uprintf_codepoint(out, ucd_tolower(c), mode);
					break;
				case 'T':
					uprintf_codepoint(out, ucd_totitle(c), mode);
					break;
				default: /* 'U' */
					uprintf_codepoint(out, ucd_toupper(c), mode);
					break;
				}
				break;
			default:
				/* Unknown specifier: consumed, nothing written. */
				break;
			}
		}
		else if (ch == '\\')
		{
			char esc = *format;

			if (esc == '\0')
				return; /* lone '\' at the end of the format */
			++format;

			switch (esc)
			{
			case 't':
				fputc('\t', out);
				break;
			case 'r':
				fputc('\r', out);
				break;
			case 'n':
				fputc('\n', out);
				break;
			default:
				fputc(esc, out);
				break;
			}
		}
		else
		{
			fputc(ch, out);
		}
	}
}
/* Decode every codepoint from in and print one record per codepoint to
 * stdout.
 *
 * format is the uprintf format to use for each record; when it is NULL a
 * built-in tab-separated layout is used instead.
 */
static void print_file(FILE *in, const char *format)
{
	const char *record_format = format;
	codepoint_t c = 0;

	if (record_format == NULL)
		record_format = "%pc\t%pH\t%s\t%c\t%Uc\t%Lc\t%Tc\t%is\n";

	for (;;)
	{
		/* fget_utf8c returns 0 once no further codepoint can be read. */
		if (!fget_utf8c(in, &c))
			break;
		uprintf(stdout, c, record_format);
	}
}
/* Program entry point.
 *
 * Arguments:
 *   --stdin, -       read UTF-8 text from standard input
 *   --format=FORMAT  record format passed to uprintf
 *   --locale=NAME    set the LC_CTYPE locale to NAME
 *   FILE             read UTF-8 text from FILE (only the first input source
 *                    named this way is opened)
 *
 * With an input source, one record is printed per decoded codepoint. With no
 * input source, one record is printed for every codepoint from 0 to 0x10FFFF.
 *
 * Returns 0 on success and 1 when the input file cannot be opened.
 */
int main(int argc, char **argv)
{
	FILE *in = NULL;
	const char *format = NULL;
	int argn;
	for (argn = 1; argn != argc; ++argn)
	{
		const char *arg = argv[argn];
		if (!strcmp(arg, "--stdin") || !strcmp(arg, "-"))
		{
			/* Do not leak a file opened by an earlier argument. */
			if (in != NULL && in != stdin)
				fclose(in);
			in = stdin;
		}
		else if (!strncmp(arg, "--format=", 9))
			format = arg + 9;
		else if (!strncmp(arg, "--locale=", 9))
			setlocale(LC_CTYPE, arg + 9);
		else if (in == NULL)
		{
			in = fopen(arg, "r");
			if (!in)
			{
				/* Name the argument that failed (not argv[1]) and stop:
				 * continuing would dump the whole codepoint table as if
				 * no input had been requested. */
				fprintf(stderr, "cannot open `%s`\n", arg);
				return 1;
			}
		}
	}
	if (in == stdin)
		print_file(stdin, format);
	else if (in != NULL)
	{
		print_file(in, format);
		fclose(in);
	}
	else
	{
		codepoint_t c;
		for (c = 0; c <= 0x10FFFF; ++c)
			uprintf(stdout, c, format ? format :
			        "%pH %s %C %c %UH %LH %TH %id %ix %ic %is %ib %ip %iP %ig %iA %ia %iu %il %P\n");
	}
	return 0;
}