// Source-viewer residue (not part of the program): blob b3f37394a7b3ce64541851afd0cb9ccdea1dc61a
#include "license.hunspell"
#include "license.myspell"
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "hunspell.hxx"
#include "hunspell.h"
#ifndef HUNSPELL_CHROME_CLIENT
#ifndef MOZILLA_CLIENT
# include "config.h"
#endif
#endif
#include "csutil.hxx"
// Builds a spell checker.
//  - Chrome build: the dictionary comes from an in-memory BDICT image
//    (bdict_data / bdict_length).
//  - Other builds: affpath names the affix file, dpath the dictionary file
//    and key is handed through to the hash and affix managers.
// The hash manager is created first because the affix manager is given
// access to the hash manager table; the suggestion manager is created last
// because it needs the affix manager.
#ifdef HUNSPELL_CHROME_CLIENT
Hunspell::Hunspell(const unsigned char* bdict_data, size_t bdict_length)
#else
Hunspell::Hunspell(const char * affpath, const char * dpath, const char * key)
#endif
{
  // defaults that hold until the affix manager has been queried
  utf8 = 0;
  complexprefixes = 0;
  csconv = NULL;
  encoding = NULL;
  maxdic = 0;

#ifdef HUNSPELL_CHROME_CLIENT
  bdict_reader = new hunspell::BDictReader;
  bdict_reader->Init(bdict_data, bdict_length);

  pHMgr[0] = new HashMgr(bdict_reader);
  if (pHMgr[0]) maxdic = 1;

  pAMgr = new AffixMgr(bdict_reader, pHMgr, &maxdic);
#else
  // remembered so that add_dic() can load further dictionaries later
  affixpath = mystrdup(affpath);

  /* the hash manager comes first */
  pHMgr[0] = new HashMgr(dpath, affpath, key);
  if (pHMgr[0]) maxdic = 1;

  /* the affix manager needs the hash manager lookup methods */
  pAMgr = new AffixMgr(affpath, pHMgr, &maxdic, key);
#endif

  /* fetch the try string and the dictionary settings */
  /* from the affix manager */
  char * try_string = pAMgr->get_try_string();
  encoding = pAMgr->get_encoding();
  langnum = pAMgr->get_langnum();
  utf8 = pAMgr->get_utf8();
  // a character conversion table is only looked up for 8-bit encodings
  if (!utf8)
    csconv = get_current_cs(encoding);
  complexprefixes = pAMgr->get_complexprefixes();
  wordbreak = pAMgr->get_breaktable();

  /* the suggestion manager is set up last */
#ifdef HUNSPELL_CHROME_CLIENT
  pSMgr = new SuggestMgr(bdict_reader, try_string, MAXSUGGESTION, pAMgr);
#else
  pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr);
#endif

  // free(NULL) is a no-op, so no test is needed
  free(try_string);
}
// Releases the managers, every loaded dictionary and the strings owned by
// this object, and resets the members afterwards.
Hunspell::~Hunspell()
{
  // deleting a null pointer is a no-op, so the managers need no test
  delete pSMgr;
  delete pAMgr;
  for (int d = 0; d < maxdic; d++) {
    delete pHMgr[d];
  }
  maxdic = 0;
  pSMgr = NULL;
  pAMgr = NULL;

  // the conversion table is only deleted here in the Mozilla build
#ifdef MOZILLA_CLIENT
  delete [] csconv;
#endif
  csconv = NULL;

  free(encoding);
  encoding = NULL;

#ifdef HUNSPELL_CHROME_CLIENT
  delete bdict_reader;
  bdict_reader = NULL;
#else
  free(affixpath);
  affixpath = NULL;
#endif
}
#ifndef HUNSPELL_CHROME_CLIENT
// load extra dictionaries
int Hunspell::add_dic(const char * dpath, const char * key) {
if (maxdic == MAXDIC || !affixpath) return 1;
pHMgr[maxdic] = new HashMgr(dpath, affixpath, key);
if (pHMgr[maxdic]) maxdic++; else return 1;
return 0;
}
#endif
// Copies src into dest without its leading blanks and without its trailing
// periods. The number of removed periods is stored in *pabbrev and the
// capitalization type of what remains in *pcaptype.
// For UTF-8 dictionaries dest_utf receives the decoded characters and *nc
// their count; otherwise *nc is the byte length.
// Returns the byte length of the cleaned word, or 0 when nothing is left
// or the word is too long to be checked.
int Hunspell::cleanword2(char * dest, const char * src,
    w_char * dest_utf, int * nc, int * pcaptype, int * pabbrev)
{
  // step over the leading blanks
  const char * first = src;
  while (*first == ' ') ++first;

  // count (and drop) the trailing periods
  *pabbrev = 0;
  int len = strlen(first);
  for (; (len > 0) && (first[len - 1] == '.'); --len) {
    ++(*pabbrev);
  }

  // an empty word cannot be capitalized
  if (len <= 0) {
    *pcaptype = NOCAP;
    *dest = '\0';
    return 0;
  }

  strncpy(dest, first, len);
  dest[len] = '\0';
  len = strlen(dest);

  if (!utf8) {
    *pcaptype = get_captype(dest, len, csconv);
    *nc = len;
    return len;
  }

  *nc = u8_u16(dest_utf, MAXWORDLEN, dest);
  // don't check too long words
  // TODO(rouslan): Remove the interim change below when this patch lands:
  // http://sf.net/tracker/?func=detail&aid=3595024&group_id=143754&atid=756395.
  if (*nc >= MAXWORDLEN - 1) return 0;
  if (*nc == -1) { // big Unicode character (non BMP area)
    *pcaptype = NOCAP;
    return len;
  }
  *pcaptype = get_captype_utf8(dest_utf, *nc, langnum);
  return len;
}
// Copies src into dest without its leading blanks and without its trailing
// periods, and classifies the capitalization of what remains.
//   dest      receives the cleaned word (must be able to hold it)
//   pcaptype  receives NOCAP, INITCAP, ALLCAP, HUHINITCAP or HUHCAP
//   pabbrev   receives the number of trailing periods that were removed
// Returns the byte length of the cleaned word (0 when nothing is left).
int Hunspell::cleanword(char * dest, const char * src,
    int * pcaptype, int * pabbrev)
{
  unsigned char * p = (unsigned char *) dest;
  const unsigned char * q = (const unsigned char * ) src;
  int firstcap = 0;

  // first skip over any leading blanks
  while ((*q != '\0') && (*q == ' ')) q++;

  // now strip off any trailing periods (recording their presence)
  *pabbrev = 0;
  int nl = strlen((const char *)q);
  while ((nl > 0) && (*(q+nl-1)=='.')) {
    nl--;
    (*pabbrev)++;
  }

  // if no characters are left it can't be capitalized
  if (nl <= 0) {
    *pcaptype = NOCAP;
    *p = '\0';
    return 0;
  }

  // now determine the capitalization type of the first nl letters
  int ncap = 0;      // letters that are in upper case
  int nneutral = 0;  // characters without a case distinction
  int nc = 0;        // characters examined
  if (!utf8) {
    while (nl > 0) {
      nc++;
      if (csconv[(*q)].ccase) ncap++;
      if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++;
      *p++ = *q++;
      nl--;
    }
    // remember to terminate the destination string
    *p = '\0';
    firstcap = csconv[(unsigned char)(*dest)].ccase;
  } else {
    unsigned short idx;
    w_char t[MAXWORDLEN];
    // Work on the trimmed span [q, q + nl) rather than on src, so that the
    // UTF-8 path drops the blanks and periods just like the 8-bit path.
    // memmove tolerates overlapping buffers.
    memmove(dest, q, nl);
    dest[nl] = '\0';
    nc = u8_u16(t, MAXWORDLEN, dest);
    if (nc < 0) {
      // the word could not be converted (cleanword2 treats -1 as a
      // non-BMP character): keep the trimmed bytes, report no capitals
      *pcaptype = NOCAP;
      return strlen(dest);
    }
    for (int i = 0; i < nc; i++) {
      idx = (t[i].h << 8) + t[i].l;
      unsigned short low = unicodetolower(idx, langnum);
      if (idx != low) ncap++;
      if (unicodetoupper(idx, langnum) == low) nneutral++;
    }
    u16_u8(dest, MAXWORDUTF8LEN, t, nc);
    if (ncap) {
      idx = (t[0].h << 8) + t[0].l;
      firstcap = (idx != unicodetolower(idx, langnum));
    }
  }

  // now finally set the captype
  if (ncap == 0) {
    *pcaptype = NOCAP;
  } else if ((ncap == 1) && firstcap) {
    *pcaptype = INITCAP;
  } else if ((ncap == nc) || ((ncap + nneutral) == nc)){
    *pcaptype = ALLCAP;
  } else if ((ncap > 1) && firstcap) {
    *pcaptype = HUHINITCAP;
  } else {
    *pcaptype = HUHCAP;
  }
  return strlen(dest);
}
void Hunspell::mkallcap(char * p)
{
if (utf8) {
w_char u[MAXWORDLEN];
int nc = u8_u16(u, MAXWORDLEN, p);
unsigned short idx;
for (int i = 0; i < nc; i++) {
idx = (u[i].h << 8) + u[i].l;
if (idx != unicodetoupper(idx, langnum)) {
u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8);
u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF);
}
}
u16_u8(p, MAXWORDUTF8LEN, u, nc);
} else {
while (*p != '\0') {
*p = csconv[((unsigned char) *p)].cupper;
p++;
}
}
}
int Hunspell::mkallcap2(char * p, w_char * u, int nc)
{
if (utf8) {
unsigned short idx;
for (int i = 0; i < nc; i++) {
idx = (u[i].h << 8) + u[i].l;
unsigned short up = unicodetoupper(idx, langnum);
if (idx != up) {
u[i].h = (unsigned char) (up >> 8);
u[i].l = (unsigned char) (up & 0x00FF);
}
}
u16_u8(p, MAXWORDUTF8LEN, u, nc);
return strlen(p);
} else {
while (*p != '\0') {
*p = csconv[((unsigned char) *p)].cupper;
p++;
}
}
return nc;
}
void Hunspell::mkallsmall(char * p)
{
while (*p != '\0') {
*p = csconv[((unsigned char) *p)].clower;
p++;
}
}
int Hunspell::mkallsmall2(char * p, w_char * u, int nc)
{
if (utf8) {
unsigned short idx;
for (int i = 0; i < nc; i++) {
idx = (u[i].h << 8) + u[i].l;
unsigned short low = unicodetolower(idx, langnum);
if (idx != low) {
u[i].h = (unsigned char) (low >> 8);
u[i].l = (unsigned char) (low & 0x00FF);
}
}
u16_u8(p, MAXWORDUTF8LEN, u, nc);
return strlen(p);
} else {
while (*p != '\0') {
*p = csconv[((unsigned char) *p)].clower;
p++;
}
}
return nc;
}
// Copies source to dest, turning UTF-8 sharp s into its Latin-1 code:
// whenever a 0x9F byte is copied (from the second byte on), the byte
// written just before it is replaced by 0xDF and the 0x9F is dropped.
// Returns dest.
char * Hunspell::sharps_u8_l1(char * dest, char * source) {
  char * out = dest;
  *out = *source;
  // *source is always the byte that was copied last
  while (*source) {
    ++out;
    ++source;
    *out = *source;
    if (*source == '\x9F') {
      --out;
      *out = '\xDF';
    }
  }
  return dest;
}
// Recursively tries every combination of "ss" and UTF-8 sharp s in base,
// starting the search at pos.
//   n       number of "ss" occurrences handled so far (limited by MAXSHARPS)
//   repnum  number of occurrences currently replaced by sharp s
//   tmp     scratch buffer for the Latin-1 form used with 8-bit dictionaries
// A candidate is only checked when at least one replacement was made.
// Returns the dictionary entry of the first accepted variant, else NULL.
hentry * Hunspell::spellsharps(char * base, char * pos, int n,
    int repnum, char * tmp, int * info, char **root) {
  pos = strstr(pos, "ss");

  // no further "ss" (or the limit is reached): test the current variant
  if (!pos || (n >= MAXSHARPS)) {
    if (repnum <= 0) return NULL;
    if (utf8) return checkword(base, info, root);
    return checkword(sharps_u8_l1(tmp, base), info, root);
  }

  // first try this occurrence written as sharp s ...
  pos[0] = '\xC3';
  pos[1] = '\x9F';
  hentry * found = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp, info, root);
  if (found) return found;

  // ... then restore it and try the remainder with plain "ss"
  pos[0] = 's';
  pos[1] = 's';
  found = spellsharps(base, pos + 2, n + 1, repnum, tmp, info, root);
  if (found) return found;
  return NULL;
}
// Returns 1 when the entry rv carries the KEEPCASE flag configured in the
// affix manager, otherwise 0.
int Hunspell::is_keepcase(const hentry * rv) {
  if (!pAMgr || !rv->astr) return 0;
  if (!pAMgr->get_keepcase()) return 0;
  return TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen) ? 1 : 0;
}
/* Insert a copy of word at the beginning of the suggestion array and
 * return the new number of suggestions.
 *   slst  address of the suggestion array
 *   ns    number of suggestions currently stored
 * When the array already holds MAXSUGGESTION entries the last one is
 * freed to make room. ns is returned unchanged when there is no array to
 * insert into, when ns is negative (the callers use -1 as an error
 * marker) or when the word cannot be duplicated. */
int Hunspell::insert_sug(char ***slst, char * word, int ns) {
  // without an allocated list there is nowhere to store the word
  if (!slst || !*slst || ns < 0) return ns;
  char * dup = mystrdup(word);
  if (!dup) return ns;
  if (ns == MAXSUGGESTION) {
    ns--;
    free((*slst)[ns]);
  }
  // shift the existing entries up by one slot
  for (int k = ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1];
  (*slst)[0] = dup;
  return ns + 1;
}
// Checks the spelling of word.
//   info  optional; receives SPELL_* bit flags describing the result
//   root  optional; receives a newly allocated copy of the root word when
//         checkword() provides one (NULL otherwise)
// Returns 0 for a rejected word and non-zero for an accepted one
// (HUNSPELL_OK, HUNSPELL_OK_WARN, or 1 for the special cases: the XML API
// marker, an empty word, no dictionary, a plain number, or a word accepted
// after splitting at a break pattern).
//
// The SPELL_* values are combined as bit flags, so they are set with |= and
// cleared with &= ~flag; adding them arithmetically would corrupt the mask
// whenever a flag is set twice (ALLCAP falls through into INITCAP below).
int Hunspell::spell(const char * word, int * info, char ** root)
{
#ifdef HUNSPELL_CHROME_CLIENT
  if (pHMgr) pHMgr[0]->EmptyHentryCache();
#endif
  struct hentry * rv=NULL;
  // need larger vector. For example, Turkish capital letter I converted a
  // 2-byte UTF-8 character (dotless i) by mkallsmall.
  char cw[MAXWORDUTF8LEN];
  char wspace[MAXWORDUTF8LEN];
  w_char unicw[MAXWORDLEN];

  // reset the outputs before the first early return, so that callers never
  // read an untouched *info or *root
  int info2 = 0;
  if (!info) info = &info2; else *info = 0;
  if (root) *root = NULL;

  // Hunspell supports XML input of the simplified API (see manual)
  if (strcmp(word, SPELL_XML) == 0) return 1;
  int nc = strlen(word);
  int wl2 = 0;
  if (utf8) {
    if (nc >= MAXWORDUTF8LEN) return 0;
  } else {
    if (nc >= MAXWORDLEN) return 0;
  }
  int captype = 0;
  int abbv = 0;
  int wl = 0;

  // input conversion
  RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
  if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
  else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);

  if (wl == 0 || maxdic == 0) return 1;

  // allow numbers with dots, dashes and commas (but forbid double separators: "..", "--" etc.)
  enum { NBEGIN, NNUM, NSEP };
  int nstate = NBEGIN;
  int i;
  for (i = 0; (i < wl); i++) {
    if ((cw[i] <= '9') && (cw[i] >= '0')) {
      nstate = NNUM;
    } else if ((cw[i] == ',') || (cw[i] == '.') || (cw[i] == '-')) {
      if ((nstate == NSEP) || (i == 0)) break;
      nstate = NSEP;
    } else break;
  }
  if ((i == wl) && (nstate == NNUM)) return 1;

  switch(captype) {
    case HUHCAP:
    case HUHINITCAP:
      *info |= SPELL_ORIGCAP;
      // fall through
    case NOCAP: {
      rv = checkword(cw, info, root);
      if ((abbv) && !(rv)) {
        // retry with one trailing period restored
        memcpy(wspace,cw,wl);
        *(wspace+wl) = '.';
        *(wspace+wl+1) = '\0';
        rv = checkword(wspace, info, root);
      }
      break;
    }
    case ALLCAP: {
      *info |= SPELL_ORIGCAP;
      rv = checkword(cw, info, root);
      if (rv) break;
      if (abbv) {
        memcpy(wspace,cw,wl);
        *(wspace+wl) = '.';
        *(wspace+wl+1) = '\0';
        rv = checkword(wspace, info, root);
        if (rv) break;
      }
      // Spec. prefix handling for Catalan, French, Italian:
      // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
      if (pAMgr && strchr(cw, '\'')) {
        wl = mkallsmall2(cw, unicw, nc);
        //There are no really sane circumstances where this could fail,
        //but anyway...
        if (char * apostrophe = strchr(cw, '\'')) {
          if (utf8) {
            w_char tmpword[MAXWORDLEN];
            *apostrophe = '\0';
            wl2 = u8_u16(tmpword, MAXWORDLEN, cw);
            *apostrophe = '\'';
            if (wl2 < nc) {
              mkinitcap2(apostrophe + 1, unicw + wl2 + 1, nc - wl2 - 1);
              rv = checkword(cw, info, root);
              if (rv) break;
            }
          } else {
            mkinitcap2(apostrophe + 1, unicw, nc);
            rv = checkword(cw, info, root);
            if (rv) break;
          }
        }
        mkinitcap2(cw, unicw, nc);
        rv = checkword(cw, info, root);
        if (rv) break;
      }
      if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) {
        char tmpword[MAXWORDUTF8LEN];
        wl = mkallsmall2(cw, unicw, nc);
        memcpy(wspace,cw,(wl+1));
        rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
        if (!rv) {
          wl2 = mkinitcap2(cw, unicw, nc);
          rv = spellsharps(cw, cw, 0, 0, tmpword, info, root);
        }
        if ((abbv) && !(rv)) {
          *(wspace+wl) = '.';
          *(wspace+wl+1) = '\0';
          rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
          if (!rv) {
            memcpy(wspace, cw, wl2);
            *(wspace+wl2) = '.';
            *(wspace+wl2+1) = '\0';
            rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
          }
        }
        if (rv) break;
      }
    }
    // fall through: an all-capital word is also tried in its
    // initial-capital and lower-case forms
    case INITCAP: {
      *info |= SPELL_ORIGCAP;
      wl = mkallsmall2(cw, unicw, nc);
      memcpy(wspace,cw,(wl+1));
      wl2 = mkinitcap2(cw, unicw, nc);
      // SPELL_INITCAP is only raised for the duration of the lookup
      if (captype == INITCAP) *info |= SPELL_INITCAP;
      rv = checkword(cw, info, root);
      if (captype == INITCAP) *info &= ~SPELL_INITCAP;
      // forbid bad capitalization
      // (for example, ijs -> Ijs instead of IJs in Dutch)
      // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag)
      if (*info & SPELL_FORBIDDEN) {
        rv = NULL;
        break;
      }
      if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL;
      if (rv) break;

      rv = checkword(wspace, info, root);
      if (abbv && !rv) {
        *(wspace+wl) = '.';
        *(wspace+wl+1) = '\0';
        rv = checkword(wspace, info, root);
        if (!rv) {
          memcpy(wspace, cw, wl2);
          *(wspace+wl2) = '.';
          *(wspace+wl2+1) = '\0';
          if (captype == INITCAP) *info |= SPELL_INITCAP;
          rv = checkword(wspace, info, root);
          if (captype == INITCAP) *info &= ~SPELL_INITCAP;
          if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL;
          break;
        }
      }
      if (rv && is_keepcase(rv) &&
          ((captype == ALLCAP) ||
           // if CHECKSHARPS: KEEPCASE words with \xDF are allowed
           // in INITCAP form, too.
           !(pAMgr->get_checksharps() &&
             ((utf8 && strstr(wspace, "\xC3\x9F")) ||
              (!utf8 && strchr(wspace, '\xDF')))))) rv = NULL;
      break;
    }
  }

  if (rv) {
    if (pAMgr && pAMgr->get_warn() && rv->astr &&
        TESTAFF(rv->astr, pAMgr->get_warn(), rv->alen)) {
      *info |= SPELL_WARN;
      if (pAMgr->get_forbidwarn()) return 0;
      return HUNSPELL_OK_WARN;
    }
    return HUNSPELL_OK;
  }

  // recursive breaking at break points
  if (wordbreak) {
    char * s;
    char r;
    int nbr = 0;
    wl = strlen(cw);
    int numbreak = pAMgr ? pAMgr->get_numbreak() : 0;

    // calculate break points for recursion limit
    for (int j = 0; j < numbreak; j++) {
      s = cw;
      do {
        s = (char *) strstr(s, wordbreak[j]);
        if (s) {
          nbr++;
          s++;
        }
      } while (s);
    }
    if (nbr >= 10) return 0;

    // check boundary patterns (^begin and end$)
    for (int j = 0; j < numbreak; j++) {
      int plen = strlen(wordbreak[j]);
      if (plen == 1 || plen > wl) continue;
      if (wordbreak[j][0] == '^' && strncmp(cw, wordbreak[j] + 1, plen - 1) == 0
          && spell(cw + plen - 1)) return 1;
      if (wordbreak[j][plen - 1] == '$' &&
          strncmp(cw + wl - plen + 1, wordbreak[j], plen - 1) == 0) {
        r = cw[wl - plen + 1];
        cw[wl - plen + 1] = '\0';
        if (spell(cw)) return 1;
        cw[wl - plen + 1] = r;
      }
    }

    // other patterns
    for (int j = 0; j < numbreak; j++) {
      int plen = strlen(wordbreak[j]);
      s=(char *) strstr(cw, wordbreak[j]);
      if (s && (s > cw) && (s < cw + wl - plen)) {
        if (!spell(s + plen)) continue;
        r = *s;
        *s = '\0';
        // examine 2 sides of the break point
        if (spell(cw)) return 1;
        *s = r;

        // LANG_hu: spec. dash rule
        if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) {
          r = s[1];
          s[1] = '\0';
          if (spell(cw)) return 1; // check the first part with dash
          s[1] = r;
        }
        // end of LANG specific region
      }
    }
  }

  return 0;
}
struct hentry * Hunspell::checkword(const char * w, int * info, char ** root)
{
struct hentry * he = NULL;
int len, i;
char w2[MAXWORDUTF8LEN];
const char * word;
char * ignoredchars = pAMgr->get_ignore();
if (ignoredchars != NULL) {
strcpy(w2, w);
if (utf8) {
int ignoredchars_utf16_len;
unsigned short * ignoredchars_utf16 = pAMgr->get_ignore_utf16(&ignoredchars_utf16_len);
remove_ignored_chars_utf(w2, ignoredchars_utf16, ignoredchars_utf16_len);
} else {
remove_ignored_chars(w2,ignoredchars);
}
word = w2;
} else word = w;
len = strlen(word);
if (!len)
return NULL;
#ifdef HUNSPELL_CHROME_CLIENT
// We need to check if the word length is valid to make coverity (Event
// fixed_size_dest: Possible overrun of N byte fixed size buffer) happy.
if ((utf8 && strlen(word) >= MAXWORDUTF8LEN) || (!utf8 && strlen(word) >= MAXWORDLEN))
return NULL;
#endif
// word reversing wrapper for complex prefixes
if (complexprefixes) {
if (word != w2) {
strcpy(w2, word);
word = w2;
}
if (utf8) reverseword_utf(w2); else reverseword(w2);
}
// look word in hash table
for (i = 0; (i < maxdic) && !he; i ++) {
he = (pHMgr[i])->lookup(word);
// check forbidden and onlyincompound words
if ((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
if (info) *info += SPELL_FORBIDDEN;
// LANG_hu section: set dash information for suggestions
if (langnum == LANG_hu) {
if (pAMgr->get_compoundflag() &&
TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) {
if (info) *info += SPELL_COMPOUND;
}
}
return NULL;
}
// he = next not needaffix, onlyincompound homonym or onlyupcase word
while (he && (he->astr) &&
((pAMgr->get_needaffix() && TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) ||
(pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
(info && (*info & SPELL_INITCAP) && TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen))
)) he = he->next_homonym;
}
// check with affixes
if (!he && pAMgr) {
// try stripping off affixes */
he = pAMgr->affix_check(word, len, 0);
// check compound restriction and onlyupcase
if (he && he->astr && (
(pAMgr->get_onlyincompound() &&
TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
(info && (*info & SPELL_INITCAP) &&
TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) {
he = NULL;
}
if (he) {
if ((he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
if (info) *info += SPELL_FORBIDDEN;
return NULL;
}
if (root) {
*root = mystrdup(he->word);
if (*root && complexprefixes) {
if (utf8) reverseword_utf(*root); else reverseword(*root);
}
}
// try check compound word
} else if (pAMgr->get_compound()) {
he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 0, info);
// LANG_hu section: `moving rule' with last dash
if ((!he) && (langnum == LANG_hu) && (word[len-1] == '-')) {
char * dup = mystrdup(word);
if (!dup) return NULL;
dup[len-1] = '\0';
he = pAMgr->compound_check(dup, len-1, -5, 0, 100, 0, NULL, 1, 0, info);
free(dup);
}
// end of LANG speficic region
if (he) {
if (root) {
*root = mystrdup(he->word);
if (*root && complexprefixes) {
if (utf8) reverseword_utf(*root); else reverseword(*root);
}
}
if (info) *info += SPELL_COMPOUND;
}
}
}
return he;
}
int Hunspell::suggest(char*** slst, const char * word)
{
#ifdef HUNSPELL_CHROME_CLIENT
if (pHMgr) pHMgr[0]->EmptyHentryCache();
#endif
int onlycmpdsug = 0;
char cw[MAXWORDUTF8LEN];
char wspace[MAXWORDUTF8LEN];
if (!pSMgr || maxdic == 0) return 0;
w_char unicw[MAXWORDLEN];
*slst = NULL;
// process XML input of the simplified API (see manual)
if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) {
return spellml(slst, word);
}
int nc = strlen(word);
if (utf8) {
if (nc >= MAXWORDUTF8LEN) return 0;
} else {
if (nc >= MAXWORDLEN) return 0;
}
int captype = 0;
int abbv = 0;
int wl = 0;
// input conversion
RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
if (wl == 0) return 0;
int ns = 0;
int capwords = 0;
// check capitalized form for FORCEUCASE
if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) {
int info = SPELL_ORIGCAP;
char ** wlst;
if (checkword(cw, &info, NULL)) {
if (*slst) {
wlst = *slst;
} else {
wlst = (char **) malloc(MAXSUGGESTION * sizeof(char *));
if (wlst == NULL) return -1;
*slst = wlst;
for (int i = 0; i < MAXSUGGESTION; i++) {
wlst[i] = NULL;
}
}
wlst[0] = mystrdup(cw);
mkinitcap(wlst[0]);
return 1;
}
}
switch(captype) {
case NOCAP: {
ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
break;
}
case INITCAP: {
capwords = 1;
ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
if (ns == -1) break;
memcpy(wspace,cw,(wl+1));
mkallsmall2(wspace, unicw, nc);
ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
break;
}
case HUHINITCAP:
capwords = 1;
case HUHCAP: {
ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
if (ns != -1) {
int prevns;
// something.The -> something. The
char * dot = strchr(cw, '.');
if (dot && (dot > cw)) {
int captype_;
if (utf8) {
w_char w_[MAXWORDLEN];
int wl_ = u8_u16(w_, MAXWORDLEN, dot + 1);
captype_ = get_captype_utf8(w_, wl_, langnum);
} else captype_ = get_captype(dot+1, strlen(dot+1), csconv);
if (captype_ == INITCAP) {
char * st = mystrdup(cw);
if (st) st = (char *) realloc(st, wl + 2);
if (st) {
st[(dot - cw) + 1] = ' ';
strcpy(st + (dot - cw) + 2, dot + 1);
ns = insert_sug(slst, st, ns);
free(st);
}
}
}
if (captype == HUHINITCAP) {
// TheOpenOffice.org -> The OpenOffice.org
memcpy(wspace,cw,(wl+1));
mkinitsmall2(wspace, unicw, nc);
ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
}
memcpy(wspace,cw,(wl+1));
mkallsmall2(wspace, unicw, nc);
if (spell(wspace)) ns = insert_sug(slst, wspace, ns);
prevns = ns;
ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
if (captype == HUHINITCAP) {
mkinitcap2(wspace, unicw, nc);
if (spell(wspace)) ns = insert_sug(slst, wspace, ns);
ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
}
// aNew -> "a New" (instead of "a new")
for (int j = prevns; j < ns; j++) {
char * space = strchr((*slst)[j],' ');
if (space) {
int slen = strlen(space + 1);
// different case after space (need capitalisation)
if ((slen < wl) && strcmp(cw + wl - slen, space + 1)) {
w_char w[MAXWORDLEN];
int wc = 0;
char * r = (*slst)[j];
if (utf8) wc = u8_u16(w, MAXWORDLEN, space + 1);
mkinitcap2(space + 1, w, wc);
// set as first suggestion
for (int k = j; k > 0; k--) (*slst)[k] = (*slst)[k - 1];
(*slst)[0] = r;
}
}
}
}
break;
}
case ALLCAP: {
memcpy(wspace, cw, (wl+1));
mkallsmall2(wspace, unicw, nc);
ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
if (ns == -1) break;
if (pAMgr && pAMgr->get_keepcase() && spell(wspace))
ns = insert_sug(slst, wspace, ns);
mkinitcap2(wspace, unicw, nc);
ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
for (int j=0; j < ns; j++) {
mkallcap((*slst)[j]);
if (pAMgr && pAMgr->get_checksharps()) {
char * pos;
if (utf8) {
pos = strstr((*slst)[j], "\xC3\x9F");
while (pos) {
*pos = 'S';
*(pos+1) = 'S';
pos = strstr(pos+2, "\xC3\x9F");
}
} else {
pos = strchr((*slst)[j], '\xDF');
while (pos) {
(*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 2);
mystrrep((*slst)[j], "\xDF", "SS");
pos = strchr((*slst)[j], '\xDF');
}
}
}
}
break;
}
}
// LANG_hu section: replace '-' with ' ' in Hungarian
if (langnum == LANG_hu) {
for (int j=0; j < ns; j++) {
char * pos = strchr((*slst)[j],'-');
if (pos) {
int info;
char w[MAXWORDUTF8LEN];
*pos = '\0';
strcpy(w, (*slst)[j]);
strcat(w, pos + 1);
spell(w, &info, NULL);
if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
*pos = ' ';
} else *pos = '-';
}
}
}
// END OF LANG_hu section
// try ngram approach since found nothing or only compound words
if (pAMgr && (ns == 0 || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0) && (*slst)) {
switch(captype) {
case NOCAP: {
ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic);
break;
}
case HUHINITCAP:
capwords = 1;
case HUHCAP: {
memcpy(wspace,cw,(wl+1));
mkallsmall2(wspace, unicw, nc);
ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
break;
}
case INITCAP: {
capwords = 1;
memcpy(wspace,cw,(wl+1));
mkallsmall2(wspace, unicw, nc);
ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
break;
}
case ALLCAP: {
memcpy(wspace,cw,(wl+1));
mkallsmall2(wspace, unicw, nc);
int oldns = ns;
ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
for (int j = oldns; j < ns; j++)
mkallcap((*slst)[j]);
break;
}
}
}
// try dash suggestion (Afo-American -> Afro-American)
if (char * pos = strchr(cw, '-')) {
char * ppos = cw;
int nodashsug = 1;
char ** nlst = NULL;
int nn = 0;
int last = 0;
if (*slst) {
for (int j = 0; j < ns && nodashsug == 1; j++) {
if (strchr((*slst)[j], '-')) nodashsug = 0;
}
}
while (nodashsug && !last) {
if (*pos == '\0') last = 1; else *pos = '\0';
if (!spell(ppos)) {
nn = suggest(&nlst, ppos);
for (int j = nn - 1; j >= 0; j--) {
strncpy(wspace, cw, ppos - cw);
strcpy(wspace + (ppos - cw), nlst[j]);
if (!last) {
strcat(wspace, "-");
strcat(wspace, pos + 1);
}
ns = insert_sug(slst, wspace, ns);
free(nlst[j]);
}
if (nlst != NULL) free(nlst);
nodashsug = 0;
}
if (!last) {
*pos = '-';
ppos = pos + 1;
pos = strchr(ppos, '-');
}
if (!pos) pos = cw + strlen(cw);
}
}
// word reversing wrapper for complex prefixes
if (complexprefixes) {
for (int j = 0; j < ns; j++) {
if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]);
}
}
// capitalize
if (capwords) for (int j=0; j < ns; j++) {
mkinitcap((*slst)[j]);
}
// expand suggestions with dot(s)
if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
for (int j = 0; j < ns; j++) {
(*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv);
strcat((*slst)[j], word + strlen(word) - abbv);
}
}
// remove bad capitalized and forbidden forms
if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) {
switch (captype) {
case INITCAP:
case ALLCAP: {
int l = 0;
for (int j=0; j < ns; j++) {
if (!strchr((*slst)[j],' ') && !spell((*slst)[j])) {
char s[MAXSWUTF8L];
w_char w[MAXSWL];
int len;
if (utf8) {
len = u8_u16(w, MAXSWL, (*slst)[j]);
} else {
strcpy(s, (*slst)[j]);
len = strlen(s);
}
mkallsmall2(s, w, len);
free((*slst)[j]);
if (spell(s)) {
(*slst)[l] = mystrdup(s);
if ((*slst)[l]) l++;
} else {
mkinitcap2(s, w, len);
if (spell(s)) {
(*slst)[l] = mystrdup(s);
if ((*slst)[l]) l++;
}
}
} else {
(*slst)[l] = (*slst)[j];
l++;
}
}
ns = l;
}
}
}
// remove duplications
int l = 0;
for (int j = 0; j < ns; j++) {
(*slst)[l] = (*slst)[j];
for (int k = 0; k < l; k++) {
if (strcmp((*slst)[k], (*slst)[j]) == 0) {
free((*slst)[j]);
l--;
break;
}
}
l++;
}
ns = l;
// output conversion
rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
for (int j = 0; rl && j < ns; j++) {
if (rl->conv((*slst)[j], wspace)) {
free((*slst)[j]);
(*slst)[j] = mystrdup(wspace);
}
}
// if suggestions removed by nosuggest, onlyincompound parameters
if (l == 0 && *slst) {
free(*slst);
*slst = NULL;
}
return l;
}
// Member-function wrapper around the freelist() helper: passes the list
// pointer slst and the entry count n through unchanged.
void Hunspell::free_list(char *** slst, int n) {
freelist(slst, n);
}
// Returns the dictionary encoding string that the constructor obtained
// from the affix manager. The string stays owned by this object (the
// destructor frees it), so callers must not free it.
char * Hunspell::get_dic_encoding()
{
return encoding;
}
#ifdef HUNSPELL_EXPERIMENTAL
// XXX need UTF-8 support
// Experimental variant of suggest() built on SuggestMgr::suggest_auto().
//   slst  receives the suggestion array (NULL when there is none)
// Returns the number of suggestions, or 0 when the word cannot be handled
// (no suggestion manager, no dictionary, too long or empty after cleaning).
int Hunspell::suggest_auto(char*** slst, const char * word)
{
  char cw[MAXWORDUTF8LEN];
  char wspace[MAXWORDUTF8LEN];
  if (!pSMgr || maxdic == 0) return 0;
  int wl = strlen(word);
  if (utf8) {
    if (wl >= MAXWORDUTF8LEN) return 0;
  } else {
    if (wl >= MAXWORDLEN) return 0;
  }
  int captype = 0;
  int abbv = 0;
  wl = cleanword(cw, word, &captype, &abbv);
  if (wl == 0) return 0;
  int ns = 0;
  *slst = NULL; // HU, nsug in pSMgr->suggest

  switch(captype) {
    case NOCAP: {
      ns = pSMgr->suggest_auto(slst, cw, ns);
      break;
    }

    case INITCAP: {
      // suggestions for the lower-case form are re-capitalized,
      // then the original form is tried as well
      memcpy(wspace,cw,(wl+1));
      mkallsmall(wspace);
      ns = pSMgr->suggest_auto(slst, wspace, ns);
      for (int j=0; j < ns; j++)
        mkinitcap((*slst)[j]);
      ns = pSMgr->suggest_auto(slst, cw, ns);
      break;
    }

    case HUHINITCAP:
    case HUHCAP: {
      ns = pSMgr->suggest_auto(slst, cw, ns);
      if (ns == 0) {
        memcpy(wspace,cw,(wl+1));
        mkallsmall(wspace);
        ns = pSMgr->suggest_auto(slst, wspace, ns);
      }
      break;
    }

    case ALLCAP: {
      memcpy(wspace,cw,(wl+1));
      mkallsmall(wspace);
      ns = pSMgr->suggest_auto(slst, wspace, ns);

      mkinitcap(wspace);
      ns = pSMgr->suggest_auto(slst, wspace, ns);

      for (int j=0; j < ns; j++)
        mkallcap((*slst)[j]);
      break;
    }
  }

  // word reversing wrapper for complex prefixes
  if (complexprefixes) {
    for (int j = 0; j < ns; j++) {
      if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]);
    }
  }

  // expand suggestions with dot(s)
  if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
    for (int j = 0; j < ns; j++) {
      char * grown = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv);
      if (!grown) continue; // out of memory: keep the form without dots
      (*slst)[j] = grown;
      strcat((*slst)[j], word + strlen(word) - abbv);
    }
  }

  // LANG_hu section: replace '-' with ' ' in Hungarian
  if (langnum == LANG_hu) {
    for (int j=0; j < ns; j++) {
      char * pos = strchr((*slst)[j],'-');
      if (pos) {
        // initialised because spell() may return before it writes *info
        int info = 0;
        char w[MAXWORDUTF8LEN];
        *pos = '\0';
        strcpy(w, (*slst)[j]);
        strcat(w, pos + 1);
        spell(w, &info, NULL);
        if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
          *pos = ' ';
        } else *pos = '-';
      }
    }
  }
  // END OF LANG_hu section
  return ns;
}
#endif
int Hunspell::stem(char*** slst, char ** desc, int n)
{
char result[MAXLNLEN];
char result2[MAXLNLEN];
*slst = NULL;
if (n == 0) return 0;
*result2 = '\0';
for (int i = 0; i < n; i++) {
*result = '\0';
// add compound word parts (except the last one)
char * s = (char *) desc[i];
char * part = strstr(s, MORPH_PART);
if (part) {
char * nextpart = strstr(part + 1, MORPH_PART);
while (nextpart) {
copy_field(result + strlen(result), part, MORPH_PART);
part = nextpart;
nextpart = strstr(part + 1, MORPH_PART);
}
s = part;
}
char **pl;
char tok[MAXLNLEN];
strcpy(tok, s);
char * alt = strstr(tok, " | ");
while (alt) {
alt[1] = MSEP_ALT;
alt = strstr(alt, " | ");
}
int pln = line_tok(tok, &pl, MSEP_ALT);
for (int k = 0; k < pln; k++) {
// add derivational suffixes
if (strstr(pl[k], MORPH_DERI_SFX)) {
// remove inflectional suffixes
char * is = strstr(pl[k], MORPH_INFL_SFX);
if (is) *is = '\0';
char * sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]);
if (sg) {
char ** gen;
int genl = line_tok(sg, &gen, MSEP_REC);
free(sg);
for (int j = 0; j < genl; j++) {
sprintf(result2 + strlen(result2), "%c%s%s",
MSEP_REC, result, gen[j]);
}
freelist(&gen, genl);
}
} else {
sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result);
if (strstr(pl[k], MORPH_SURF_PFX)) {
copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX);
}
copy_field(result2 + strlen(result2), pl[k], MORPH_STEM);
}
}
freelist(&pl, pln);
}
int sln = line_tok(result2, slst, MSEP_REC);
return uniqlist(*slst, sln);
}
int Hunspell::stem(char*** slst, const char * word)
{
char ** pl;
int pln = analyze(&pl, word);
int pln2 = stem(slst, pl, pln);
freelist(&pl, pln);
return pln2;
}
#ifdef HUNSPELL_EXPERIMENTAL
// Experimental: asks SuggestMgr::suggest_pos_stems() for the word, retrying
// with other letter-case forms (and, for abbreviations, with a trailing
// period) depending on the capitalization type of the input.
//   slst  receives the result list (NULL when there is none)
// Returns the number of entries, or 0 when the word cannot be handled.
int Hunspell::suggest_pos_stems(char*** slst, const char * word)
{
  char cw[MAXWORDUTF8LEN];
  char wspace[MAXWORDUTF8LEN];
  if (! pSMgr || maxdic == 0) return 0;

  int wl = strlen(word);
  const int limit = utf8 ? MAXWORDUTF8LEN : MAXWORDLEN;
  if (wl >= limit) return 0;

  int captype = 0;
  int abbv = 0;
  wl = cleanword(cw, word, &captype, &abbv);
  if (wl == 0) return 0;

  int ns = 0; // ns=0 = normalized input
  *slst = NULL; // HU, nsug in pSMgr->suggest

  if ((captype == HUHCAP) || (captype == NOCAP)) {
    ns = pSMgr->suggest_pos_stems(slst, cw, ns);
    if ((abbv) && (ns == 0)) {
      // retry with one trailing period restored
      memcpy(wspace, cw, wl);
      wspace[wl] = '.';
      wspace[wl + 1] = '\0';
      ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
    }
  } else if (captype == INITCAP) {
    ns = pSMgr->suggest_pos_stems(slst, cw, ns);
    if (ns == 0 || ((*slst)[0][0] == '#')) {
      memcpy(wspace, cw, (wl + 1));
      mkallsmall(wspace);
      ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
    }
  } else if (captype == ALLCAP) {
    ns = pSMgr->suggest_pos_stems(slst, cw, ns);
    if (ns == 0) {
      // lower-case form first, then the initial-capital form
      memcpy(wspace, cw, (wl + 1));
      mkallsmall(wspace);
      ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
      if (ns == 0) {
        mkinitcap(wspace);
        ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
      }
    }
  }

  return ns;
}
#endif // END OF HUNSPELL_EXPERIMENTAL CODE
// Returns whatever the affix manager reports from get_wordchars().
// pAMgr is dereferenced without a NULL test.
const char * Hunspell::get_wordchars()
{
return pAMgr->get_wordchars();
}
// Returns whatever the affix manager reports from get_wordchars_utf16();
// len is passed through to it (presumably it receives the number of
// entries - confirm against AffixMgr).
// pAMgr is dereferenced without a NULL test.
unsigned short * Hunspell::get_wordchars_utf16(int * len)
{
return pAMgr->get_wordchars_utf16(len);
}
// Upper-cases the first character of the string p in place.
// 8-bit encodings use the csconv table. UTF-8 text is decoded, its first
// character is mapped with unicodetoupper() and the text is encoded back
// into p (up to MAXWORDUTF8LEN bytes). An empty string, or one that
// u8_u16() cannot convert, is left untouched.
void Hunspell::mkinitcap(char * p)
{
  if (!utf8) {
    if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper;
    return;
  }

  w_char u[MAXWORDLEN];
  int len = u8_u16(u, MAXWORDLEN, p);
  // u[0] holds no character when nothing was decoded (0) or the
  // conversion failed (negative), so there is nothing to capitalize
  if (len <= 0) return;
  unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum);
  u[0].h = (unsigned char) (i >> 8);
  u[0].l = (unsigned char) (i & 0x00FF);
  u16_u8(p, MAXWORDUTF8LEN, u, len);
}
// Upper-cases the first character of a word whose decoded form is already
// available.
// UTF-8 with nc > 0: u[0] is upper-cased, p is rewritten from u and the new
// byte length of p is returned.
// Otherwise nc is returned; for 8-bit encodings the first byte of p is
// upper-cased through csconv first.
int Hunspell::mkinitcap2(char * p, w_char * u, int nc)
{
  if (utf8) {
    if (nc <= 0) return nc;
    const unsigned short upper = unicodetoupper((u[0].h << 8) + u[0].l, langnum);
    u[0].h = (unsigned char) (upper >> 8);
    u[0].l = (unsigned char) (upper & 0x00FF);
    u16_u8(p, MAXWORDUTF8LEN, u, nc);
    return strlen(p);
  }

  if (*p != '\0') {
    *p = csconv[((unsigned char)*p)].cupper;
  }
  return nc;
}
// Lower-cases the first character of a word whose decoded form is already
// available.
// UTF-8 with nc > 0: u[0] is lower-cased, p is rewritten from u and the new
// byte length of p is returned.
// Otherwise nc is returned; for 8-bit encodings the first byte of p is
// lower-cased through csconv first.
int Hunspell::mkinitsmall2(char * p, w_char * u, int nc)
{
  if (utf8) {
    if (nc <= 0) return nc;
    const unsigned short lower = unicodetolower((u[0].h << 8) + u[0].l, langnum);
    u[0].h = (unsigned char) (lower >> 8);
    u[0].l = (unsigned char) (lower & 0x00FF);
    u16_u8(p, MAXWORDUTF8LEN, u, nc);
    return strlen(p);
  }

  if (*p != '\0') {
    *p = csconv[((unsigned char)*p)].clower;
  }
  return nc;
}
// Hands word to the add() method of the first dictionary's hash manager
// and returns its result; returns 0 when there is no first dictionary.
int Hunspell::add(const char * word)
{
  return pHMgr[0] ? (pHMgr[0])->add(word) : 0;
}
// Hands word and example to the add_with_affix() method of the first
// dictionary's hash manager and returns its result; returns 0 when there
// is no first dictionary.
int Hunspell::add_with_affix(const char * word, const char * example)
{
  return pHMgr[0] ? (pHMgr[0])->add_with_affix(word, example) : 0;
}
// Hands word to the remove() method of the first dictionary's hash manager
// and returns its result; returns 0 when there is no first dictionary.
int Hunspell::remove(const char * word)
{
  return pHMgr[0] ? (pHMgr[0])->remove(word) : 0;
}
// Returns whatever the affix manager reports from get_version().
const char * Hunspell::get_version()
{
  const char * version = pAMgr->get_version();
  return version;
}
// Returns the character conversion table in use; the constructor only sets
// it for non-UTF-8 dictionaries, so it is NULL otherwise.
struct cs_info * Hunspell::get_csconv()
{
  struct cs_info * table = csconv;
  return table;
}
// Appends the heap-allocated string `st` to `result` (capacity MAXLNLEN),
// putting a "\n" in front of it when `result` is not empty, and then frees
// `st`. Does nothing when `st` is NULL.
void Hunspell::cat_result(char * result, char * st)
{
  if (!st) return;

  if (*result != '\0') mystrcat(result, "\n", MAXLNLEN);
  mystrcat(result, st, MAXLNLEN);
  free(st);
}
// Morphological analysis of `word`.
//
// On return *slst is either NULL or a list produced by line_tok() from the
// collected analyses (split on MSEP_REC); the return value is the number of
// entries in that list, 0 when nothing was found.
//
// The word is cleaned with cleanword2() (after optional input conversion),
// then analysed in the capitalisation variants appropriate to its detected
// capitalisation type. When trailing periods were stripped (abbv != 0) the
// same variants are also tried with a single '.' appended.
// Hungarian (LANG_hu) gets extra handling for numbers and dashed compounds.
int Hunspell::analyze(char*** slst, const char * word)
{
  char cw[MAXWORDUTF8LEN];
  char wspace[MAXWORDUTF8LEN];
  w_char unicw[MAXWORDLEN];
  int wl2 = 0;
  *slst = NULL;
  if (! pSMgr || maxdic == 0) return 0;

  // reject words that would not fit into the fixed-size work buffers
  int nc = strlen(word);
  if (utf8) {
    if (nc >= MAXWORDUTF8LEN) return 0;
  } else {
    if (nc >= MAXWORDLEN) return 0;
  }
  int captype = 0;
  int abbv = 0;
  int wl = 0;

  // input conversion
  RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
  if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
  else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);

  if (wl == 0) {
    if (abbv) {
      // the word consisted of periods only: analyse the periods themselves
      for (wl = 0; wl < abbv; wl++) cw[wl] = '.';
      cw[wl] = '\0';
      abbv = 0;
    } else return 0;
  }

  char result[MAXLNLEN];
  char * st = NULL;
  *result = '\0';
  int n = 0;
  int n2 = 0;
  int n3 = 0;

  // test numbers
  // LANG_hu section: set dash information for suggestions
  if (langnum == LANG_hu) {
    while ((n < wl) &&
      (((cw[n] <= '9') && (cw[n] >= '0')) || (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) {
      n++;
      if ((cw[n] == '.') || (cw[n] == ',')) {
        if (((n2 == 0) && (n > 3)) ||
          ((n2 > 0) && ((cw[n-1] == '.') || (cw[n-1] == ',')))) break;
        n2++;
        n3 = n;
      }
    }
    if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return 0;
    if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xB0')) && checkword(cw+n, NULL, NULL))) {
      mystrcat(result, cw, MAXLNLEN);
      result[n - 1] = '\0';
      if (n == wl) cat_result(result, pSMgr->suggest_morph(cw + n - 1));
      else {
        char sign = cw[n];
        cw[n] = '\0';
        cat_result(result, pSMgr->suggest_morph(cw + n - 1));
        mystrcat(result, "+", MAXLNLEN); // XXX SPEC. MORPHCODE
        cw[n] = sign;
        cat_result(result, pSMgr->suggest_morph(cw + n));
      }
      return line_tok(result, slst, MSEP_REC);
    }
  }
  // END OF LANG_hu section

  switch(captype) {
    case HUHCAP:
    case HUHINITCAP:
    case NOCAP: {
      cat_result(result, pSMgr->suggest_morph(cw));
      if (abbv) {
        memcpy(wspace,cw,wl);
        *(wspace+wl) = '.';
        *(wspace+wl+1) = '\0';
        cat_result(result, pSMgr->suggest_morph(wspace));
      }
      break;
    }
    case INITCAP: {
      // wspace: all-lowercase form, cw: initial-capital form
      wl = mkallsmall2(cw, unicw, nc);
      memcpy(wspace,cw,(wl+1));
      wl2 = mkinitcap2(cw, unicw, nc);
      cat_result(result, pSMgr->suggest_morph(wspace));
      cat_result(result, pSMgr->suggest_morph(cw));
      if (abbv) {
        *(wspace+wl) = '.';
        *(wspace+wl+1) = '\0';
        cat_result(result, pSMgr->suggest_morph(wspace));
        memcpy(wspace, cw, wl2);
        *(wspace+wl2) = '.';
        *(wspace+wl2+1) = '\0';
        cat_result(result, pSMgr->suggest_morph(wspace));
      }
      break;
    }
    case ALLCAP: {
      cat_result(result, pSMgr->suggest_morph(cw));
      if (abbv) {
        memcpy(wspace,cw,wl);
        *(wspace+wl) = '.';
        *(wspace+wl+1) = '\0';
        // analyse the dotted form just built in wspace (previously cw was
        // analysed a second time and the dotted form was never looked up)
        cat_result(result, pSMgr->suggest_morph(wspace));
      }
      // wspace: all-lowercase form, cw: initial-capital form
      wl = mkallsmall2(cw, unicw, nc);
      memcpy(wspace,cw,(wl+1));
      wl2 = mkinitcap2(cw, unicw, nc);
      cat_result(result, pSMgr->suggest_morph(wspace));
      cat_result(result, pSMgr->suggest_morph(cw));
      if (abbv) {
        *(wspace+wl) = '.';
        *(wspace+wl+1) = '\0';
        cat_result(result, pSMgr->suggest_morph(wspace));
        memcpy(wspace, cw, wl2);
        *(wspace+wl2) = '.';
        *(wspace+wl2+1) = '\0';
        cat_result(result, pSMgr->suggest_morph(wspace));
      }
      break;
    }
  }

  if (*result) {
    // word reversing wrapper for complex prefixes
    if (complexprefixes) {
      if (utf8) reverseword_utf(result); else reverseword(result);
    }
    return line_tok(result, slst, MSEP_REC);
  }

  // compound word with dash (HU) I18n
  char * dash = NULL;
  int nresult = 0;
  // LANG_hu section: set dash information for suggestions
  if (langnum == LANG_hu) dash = (char *) strchr(cw,'-');
  if ((langnum == LANG_hu) && dash) {
    *dash='\0';
    // examine 2 sides of the dash
    if (dash[1] == '\0') { // base word ending with dash
      if (spell(cw)) {
        char * p = pSMgr->suggest_morph(cw);
        if (p) {
          int ret = line_tok(p, slst, MSEP_REC);
          free(p);
          return ret;
        }
      }
    } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat.
      if (spell(cw) && (spell("-e"))) {
        st = pSMgr->suggest_morph(cw);
        if (st) {
          mystrcat(result, st, MAXLNLEN);
          free(st);
        }
        mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE
        st = pSMgr->suggest_morph("-e");
        if (st) {
          mystrcat(result, st, MAXLNLEN);
          free(st);
        }
        return line_tok(result, slst, MSEP_REC);
      }
    } else {
      // first word ending with dash: word- XXX ???
      char r2 = *(dash + 1);
      dash[0]='-';
      dash[1]='\0';
      nresult = spell(cw);
      dash[1] = r2;
      dash[0]='\0';
      if (nresult && spell(dash+1) && ((strlen(dash+1) > 1) ||
        ((dash[1] > '0') && (dash[1] < '9')))) {
        st = pSMgr->suggest_morph(cw);
        if (st) {
          mystrcat(result, st, MAXLNLEN);
          free(st);
          mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE
        }
        st = pSMgr->suggest_morph(dash+1);
        if (st) {
          mystrcat(result, st, MAXLNLEN);
          free(st);
        }
        return line_tok(result, slst, MSEP_REC);
      }
    }
    // affixed number in correct word
    if (nresult && (dash > cw) && (((*(dash-1)<='9') &&
      (*(dash-1)>='0')) || (*(dash-1)=='.'))) {
      *dash='-';
      n = 1;
      if (*(dash - n) == '.') n++;
      // search first not a number character to left from dash
      while (((dash - n)>=cw) && ((*(dash - n)=='0') || (n < 3)) && (n < 6)) {
        n++;
      }
      if ((dash - n) < cw) n--;
      // numbers: valami1000000-hoz
      // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz,
      // 56-hoz, 6-hoz
      for(; n >= 1; n--) {
        if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && checkword(dash - n, NULL, NULL)) {
          mystrcat(result, cw, MAXLNLEN);
          result[dash - cw - n] = '\0';
          st = pSMgr->suggest_morph(dash - n);
          if (st) {
            mystrcat(result, st, MAXLNLEN);
            free(st);
          }
          return line_tok(result, slst, MSEP_REC);
        }
      }
    }
  }
  return 0;
}
int Hunspell::generate(char*** slst, const char * word, char ** pl, int pln)
{
*slst = NULL;
if (!pSMgr || !pln) return 0;
char **pl2;
int pl2n = analyze(&pl2, word);
int captype = 0;
int abbv = 0;
char cw[MAXWORDUTF8LEN];
cleanword(cw, word, &captype, &abbv);
char result[MAXLNLEN];
*result = '\0';
for (int i = 0; i < pln; i++) {
cat_result(result, pSMgr->suggest_gen(pl2, pl2n, pl[i]));
}
freelist(&pl2, pl2n);
if (*result) {
// allcap
if (captype == ALLCAP) mkallcap(result);
// line split
int linenum = line_tok(result, slst, MSEP_REC);
// capitalize
if (captype == INITCAP || captype == HUHINITCAP) {
for (int j=0; j < linenum; j++) mkinitcap((*slst)[j]);
}
// temporary filtering of prefix related errors (eg.
// generate("undrinkable", "eats") --> "undrinkables" and "*undrinks")
int r = 0;
for (int j=0; j < linenum; j++) {
if (!spell((*slst)[j])) {
free((*slst)[j]);
(*slst)[j] = NULL;
} else {
if (r < j) (*slst)[r] = (*slst)[j];
r++;
}
}
if (r > 0) return r;
free(*slst);
*slst = NULL;
}
return 0;
}
// Generates forms of `word` using the morphological analyses of the sample
// word `pattern`: analyses `pattern`, delegates to the list-based generate()
// and returns the result of uniqlist() on the generated list.
int Hunspell::generate(char*** slst, const char * word, const char * pattern)
{
  char ** patlist;
  const int patcount = analyze(&patlist, pattern);
  const int generated = generate(slst, word, patlist, patcount);
  freelist(&patlist, patcount);
  return uniqlist(*slst, generated);
}
// minimal XML parser functions
// Copies an XML attribute value or element text into `dest`.
//
// `par` must point at the opening delimiter: a quote character (' or ") for
// an attribute value, or '>' for element text (which then ends at the next
// '<'). At most `max` characters are copied and a terminating NUL is always
// written after them, so `dest` needs room for max + 1 bytes. "&lt;" and
// "&amp;" in the copied text are then replaced by "<" and "&".
//
// Returns the number of characters copied (before entity replacement), or 0
// when `par` is NULL or does not start with a recognised delimiter.
int Hunspell::get_xml_par(char * dest, const char * par, int max)
{
  if (!par) return 0;

  char terminator = *par;
  if (terminator == '>') {
    terminator = '<';
  } else if (terminator != '\'' && terminator != '"') {
    return 0; // bad XML
  }

  char * out = dest;
  char * const limit = dest + max;
  const char * in = par + 1;
  while (out < limit && *in != '\0' && *in != terminator) {
    *out++ = *in++;
  }
  *out = '\0';

  mystrrep(dest, "&lt;", "<");
  mystrrep(dest, "&amp;", "&");
  return (int)(out - dest);
}
int Hunspell::get_langnum() const
{
return langnum;
}
// return the beginning of the element (attr == NULL) or the attribute
// Locates an attribute inside the XML start tag beginning at `s`.
//
//  - attr == NULL: returns the position of the tag's closing '>' (or NULL
//    when there is none).
//  - otherwise: returns the position just after the first occurrence of
//    `attr` that lies before the closing '>' and is preceded by a space or a
//    newline; returns NULL when there is no such occurrence.
const char * Hunspell::get_xml_pos(const char * s, const char * attr)
{
  const char * end = strchr(s, '>');
  if (attr == NULL) return end;
  if (!end) return 0; // unterminated tag: nothing can lie "before the end"

  const size_t attrlen = strlen(attr);
  // Always restart the search one character past a rejected match so the
  // loop makes progress (re-searching from the same position never ends).
  for (const char * p = strstr(s, attr); p && p < end; p = strstr(p + 1, attr)) {
    // p > s: never look at the byte in front of the buffer
    if (p > s && (*(p - 1) == ' ' || *(p - 1) == '\n')) return p + attrlen;
  }
  return 0;
}
// Returns 1 when the tag at `q` carries attribute `attr` with a non-empty
// value equal to `value`, 0 otherwise.
int Hunspell::check_xml_par(const char * q, const char * attr, const char * value) {
  char cw[MAXWORDUTF8LEN];
  const char * pos = get_xml_pos(q, attr);
  if (!get_xml_par(cw, pos, MAXWORDUTF8LEN - 1)) return 0;
  return (strcmp(cw, value) == 0) ? 1 : 0;
}
// Extracts the text of every `tag` element (e.g. "<a>") found in `list`.
//
// On return *slst is a malloc'ed array of malloc'ed strings and the return
// value is the number of valid entries. When nothing is extracted the
// function returns 0 and *slst is NULL, so the caller never sees an
// indeterminate or leaked pointer. Extraction stops at the first element
// that is empty or cannot be allocated.
int Hunspell::get_xml_list(char ***slst, char * list, const char * tag) {
  *slst = NULL;
  if (!list) return 0;

  // first pass: count the occurrences of the tag
  int n = 0;
  char * p;
  for (p = list; ((p = strstr(p, tag)) != NULL); p++) n++;
  if (n == 0) return 0;

  *slst = (char **) malloc(sizeof(char *) * n);
  if (!*slst) return 0;

  // second pass: copy the text that follows each tag
  for (p = list, n = 0; ((p = strstr(p, tag)) != NULL); p++, n++) {
    int l = strlen(p);
    (*slst)[n] = (char *) malloc(l + 1);
    if (!(*slst)[n]) break;
    // p + strlen(tag) - 1 is the last character of the tag (its '>')
    if (!get_xml_par((*slst)[n], p + strlen(tag) - 1, l)) {
      free((*slst)[n]);
      (*slst)[n] = NULL;
      break;
    }
  }

  // nothing usable was extracted: do not leak the pointer array
  if (n == 0) {
    free(*slst);
    *slst = NULL;
  }
  return n;
}
// Handles an XML request of the form
//   <query type="..."><word>...</word>...</query>
// passed in `word`, and returns the number of entries stored in *slst
// (0 on malformed input or when there is no result).
//
// Supported query types:
//   analyze  - analyses the <word>; the analyses are returned as ONE string
//              "<code><a>ana1</a><a>ana2</a></code>" in (*slst)[0]
//   stem     - returns stem() of the <word>
//   generate - generates forms of the first <word> either from a second
//              <word> (sample word) or from a <code><a>..</a></code> list
int Hunspell::spellml(char*** slst, const char * word)
{
  char *q, *q2;
  char cw[MAXWORDUTF8LEN], cw2[MAXWORDUTF8LEN];
  q = (char *) strstr(word, "<query");
  if (!q) return 0; // bad XML input
  q2 = strchr(q, '>');
  if (!q2) return 0; // bad XML input
  q2 = strstr(q2, "<word");
  if (!q2) return 0; // bad XML input

  if (check_xml_par(q, "type=", "analyze")) {
    int n = 0, s = 0;
    if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 10)) n = analyze(slst, cw);
    if (n == 0) return 0;
    // convert the result to <code><a>ana1</a><a>ana2</a></code> format
    for (int i = 0; i < n; i++) s+= strlen((*slst)[i]);
    // 5 * s: worst case every character is '&' and becomes "&amp;"
    char * r = (char *) malloc(6 + 5 * s + 7 * n + 7 + 1);
    if (!r) {
      // do not leak the analyses when the output buffer cannot be allocated
      freelist(slst, n);
      *slst = NULL;
      return 0;
    }
    strcpy(r, "<code>");
    for (int i = 0; i < n; i++) {
      int l = strlen(r);
      strcpy(r + l, "<a>");
      strcpy(r + l + 3, (*slst)[i]);
      mystrrep(r + l + 3, "\t", " ");
      // escape '&' before '<': the other order would re-escape the '&' of
      // the freshly inserted "&lt;" and yield "&amp;lt;"
      mystrrep(r + l + 3, "&", "&amp;");
      mystrrep(r + l + 3, "<", "&lt;");
      strcat(r, "</a>");
      free((*slst)[i]);
    }
    strcat(r, "</code>");
    (*slst)[0] = r;
    return 1;
  } else if (check_xml_par(q, "type=", "stem")) {
    if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1)) return stem(slst, cw);
  } else if (check_xml_par(q, "type=", "generate")) {
    int n = get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1);
    if (n == 0) return 0;
    char * q3 = strstr(q2 + 1, "<word");
    if (q3) {
      // generation by sample word
      if (get_xml_par(cw2, strchr(q3, '>'), MAXWORDUTF8LEN - 1)) {
        return generate(slst, cw, cw2);
      }
    } else {
      // generation by an explicit list of morphological descriptions
      if ((q2 = strstr(q2 + 1, "<code")) != NULL) {
        char ** slst2 = NULL; // stays NULL when no list is extracted
        if ((n = get_xml_list(&slst2, strchr(q2, '>'), "<a>")) != 0) {
          int n2 = generate(slst, cw, slst2, n);
          freelist(&slst2, n);
          return uniqlist(*slst, n2);
        }
        freelist(&slst2, n);
      }
    }
  }
  return 0;
}
#ifdef HUNSPELL_EXPERIMENTAL
// XXX need UTF-8 support
char * Hunspell::morph_with_correction(const char * word)
{
char cw[MAXWORDUTF8LEN];
char wspace[MAXWORDUTF8LEN];
if (! pSMgr || maxdic == 0) return NULL;
int wl = strlen(word);
if (utf8) {
if (wl >= MAXWORDUTF8LEN) return NULL;
} else {
if (wl >= MAXWORDLEN) return NULL;
}
int captype = 0;
int abbv = 0;
wl = cleanword(cw, word, &captype, &abbv);
if (wl == 0) return NULL;
char result[MAXLNLEN];
char * st = NULL;
*result = '\0';
switch(captype) {
case NOCAP: {
st = pSMgr->suggest_morph_for_spelling_error(cw);
if (st) {
mystrcat(result, st, MAXLNLEN);
free(st);
}
if (abbv) {
memcpy(wspace,cw,wl);
*(wspace+wl) = '.';
*(wspace+wl+1) = '\0';
st = pSMgr->suggest_morph_for_spelling_error(wspace);
if (st) {
if (*result) mystrcat(result, "\n", MAXLNLEN);
mystrcat(result, st, MAXLNLEN);
free(st);
}
}
break;
}
case INITCAP: {
memcpy(wspace,cw,(wl+1));
mkallsmall(wspace);
st = pSMgr->suggest_morph_for_spelling_error(wspace);
if (st) {
mystrcat(result, st, MAXLNLEN);
free(st);
}
st = pSMgr->suggest_morph_for_spelling_error(cw);
if (st) {
if (*result) mystrcat(result, "\n", MAXLNLEN);
mystrcat(result, st, MAXLNLEN);
free(st);
}
if (abbv) {
memcpy(wspace,cw,wl);
*(wspace+wl) = '.';
*(wspace+wl+1) = '\0';
mkallsmall(wspace);
st = pSMgr->suggest_morph_for_spelling_error(wspace);
if (st) {
if (*result) mystrcat(result, "\n", MAXLNLEN);
mystrcat(result, st, MAXLNLEN);
free(st);
}
mkinitcap(wspace);
st = pSMgr->suggest_morph_for_spelling_error(wspace);
if (st) {
if (*result) mystrcat(result, "\n", MAXLNLEN);
mystrcat(result, st, MAXLNLEN);
free(st);
}
}
break;
}
case HUHCAP: {
st = pSMgr->suggest_morph_for_spelling_error(cw);
if (st) {
mystrcat(result, st, MAXLNLEN);
free(st);
}
memcpy(wspace,cw,(wl+1));
mkallsmall(wspace);
st = pSMgr->suggest_morph_for_spelling_error(wspace);
if (st) {
if (*result) mystrcat(result, "\n", MAXLNLEN);
mystrcat(result, st, MAXLNLEN);
free(st);
}
break;
}
case ALLCAP: {
memcpy(wspace,cw,(wl+1));
st = pSMgr->suggest_morph_for_spelling_error(wspace);
if (st) {
mystrcat(result, st, MAXLNLEN);
free(st);
}
mkallsmall(wspace);
st = pSMgr->suggest_morph_for_spelling_error(wspace);
if (st) {
if (*result) mystrcat(result, "\n", MAXLNLEN);
mystrcat(result, st, MAXLNLEN);
free(st);
}
mkinitcap(wspace);
st = pSMgr->suggest_morph_for_spelling_error(wspace);
if (st) {
if (*result) mystrcat(result, "\n", MAXLNLEN);
mystrcat(result, st, MAXLNLEN);
free(st);
}
if (abbv) {
memcpy(wspace,cw,(wl+1));
*(wspace+wl) = '.';
*(wspace+wl+1) = '\0';
if (*result) mystrcat(result, "\n", MAXLNLEN);
st = pSMgr->suggest_morph_for_spelling_error(wspace);
if (st) {
mystrcat(result, st, MAXLNLEN);
free(st);
}
mkallsmall(wspace);
st = pSMgr->suggest_morph_for_spelling_error(wspace);
if (st) {
if (*result) mystrcat(result, "\n", MAXLNLEN);
mystrcat(result, st, MAXLNLEN);
free(st);
}
mkinitcap(wspace);
st = pSMgr->suggest_morph_for_spelling_error(wspace);
if (st) {
if (*result) mystrcat(result, "\n", MAXLNLEN);
mystrcat(result, st, MAXLNLEN);
free(st);
}
}
break;
}
}
if (*result) return mystrdup(result);
return NULL;
}
#endif // END OF HUNSPELL_EXPERIMENTAL CODE
/* C API: creates a Hunspell object from an affix file path and a dictionary
 * file path and returns it as an opaque handle.
 * Always returns NULL when built with HUNSPELL_CHROME_CLIENT.
 */
Hunhandle *Hunspell_create(const char * affpath, const char * dpath)
{
#ifdef HUNSPELL_CHROME_CLIENT
  return NULL;
#else
  Hunspell * speller = new Hunspell(affpath, dpath);
  return (Hunhandle*) speller;
#endif
}
/* C API: like Hunspell_create(), additionally passing `key` on to the
 * Hunspell constructor.
 * Always returns NULL when built with HUNSPELL_CHROME_CLIENT.
 */
Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath,
    const char * key)
{
#ifdef HUNSPELL_CHROME_CLIENT
  return NULL;
#else
  Hunspell * speller = new Hunspell(affpath, dpath, key);
  return (Hunhandle*) speller;
#endif
}
/* C API: destroys the Hunspell object behind the handle. */
void Hunspell_destroy(Hunhandle *pHunspell)
{
  Hunspell * speller = (Hunspell*) pHunspell;
  delete speller;
}
/* C API: forwards to Hunspell::spell(word). */
int Hunspell_spell(Hunhandle *pHunspell, const char *word)
{
  Hunspell * speller = (Hunspell*) pHunspell;
  return speller->spell(word);
}
/* C API: forwards to Hunspell::get_dic_encoding(). */
char *Hunspell_get_dic_encoding(Hunhandle *pHunspell)
{
  Hunspell * speller = (Hunspell*) pHunspell;
  return speller->get_dic_encoding();
}
/* C API: forwards to Hunspell::suggest(slst, word). */
int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word)
{
  Hunspell * speller = (Hunspell*) pHunspell;
  return speller->suggest(slst, word);
}
/* C API: forwards to Hunspell::analyze(slst, word). */
int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word)
{
  Hunspell * speller = (Hunspell*) pHunspell;
  return speller->analyze(slst, word);
}
/* C API: forwards to Hunspell::stem(slst, word). */
int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word)
{
  Hunspell * speller = (Hunspell*) pHunspell;
  return speller->stem(slst, word);
}
/* C API: forwards to the list-based Hunspell::stem(slst, desc, n). */
int Hunspell_stem2(Hunhandle *pHunspell, char*** slst, char** desc, int n)
{
  Hunspell * speller = (Hunspell*) pHunspell;
  return speller->stem(slst, desc, n);
}
/* C API: forwards to Hunspell::generate(slst, word, word2), where word2 is
 * the sample word whose analyses drive the generation.
 */
int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word,
    const char * word2)
{
  Hunspell * speller = (Hunspell*) pHunspell;
  return speller->generate(slst, word, word2);
}
/* C API: forwards to the list-based Hunspell::generate(slst, word, desc, n). */
int Hunspell_generate2(Hunhandle *pHunspell, char*** slst, const char * word,
    char** desc, int n)
{
  Hunspell * speller = (Hunspell*) pHunspell;
  return speller->generate(slst, word, desc, n);
}
/* functions for run-time modification of the dictionary */
/* add word to the run-time dictionary */
int Hunspell_add(Hunhandle *pHunspell, const char * word) {
  /* unwrap the opaque handle and forward to Hunspell::add() */
  Hunspell * speller = (Hunspell*) pHunspell;
  return speller->add(word);
}
/* add word to the run-time dictionary with affix flags of
* the example (a dictionary word): Hunspell will recognize
* affixed forms of the new word, too.
*/
int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word,
    const char * example) {
  /* unwrap the opaque handle and forward to Hunspell::add_with_affix() */
  Hunspell * speller = (Hunspell*) pHunspell;
  return speller->add_with_affix(word, example);
}
/* remove word from the run-time dictionary */
int Hunspell_remove(Hunhandle *pHunspell, const char * word) {
  /* unwrap the opaque handle and forward to Hunspell::remove() */
  Hunspell * speller = (Hunspell*) pHunspell;
  return speller->remove(word);
}
/* C API: releases a string list of `n` entries via freelist(). The handle
 * argument is unused.
 */
void Hunspell_free_list(Hunhandle *, char *** slst, int n)
{
  freelist(slst, n);
}