blob: 0134ce144b949ab92435d602a46353d4ebc90907 [file] [log] [blame]
Index: src/hunspell/affixmgr.cxx
===================================================================
RCS file: /cvsroot/hunspell/hunspell/src/hunspell/affixmgr.cxx,v
retrieving revision 1.41
diff -u -r1.41 affixmgr.cxx
--- src/hunspell/affixmgr.cxx 16 Dec 2011 09:15:34 -0000 1.41
+++ src/hunspell/affixmgr.cxx 29 May 2014 01:05:07 -0000
@@ -14,8 +14,14 @@
#include "csutil.hxx"
+#ifdef HUNSPELL_CHROME_CLIENT
+AffixMgr::AffixMgr(hunspell::BDictReader* reader, HashMgr** ptr, int * md)
+{
+ bdict_reader = reader;
+#else
AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * key)
{
+#endif
// register hash manager and load affix data from aff file
pHMgr = ptr[0];
alldic = ptr;
@@ -107,9 +113,17 @@
sFlag[i] = NULL;
}
+#ifdef HUNSPELL_CHROME_CLIENT
+ // Define dummy parameters for parse_file() to avoid changing the parameters
+ // of parse_file(). This may make it easier to merge the changes of the
+ // original hunspell.
+ const char* affpath = NULL;
+ const char* key = NULL;
+#else
for (int j=0; j < CONTSIZE; j++) {
contclasses[j] = 0;
}
+#endif
if (parse_file(affpath, key)) {
HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath);
@@ -269,6 +283,43 @@
char * line; // io buffers
char ft; // affix type
+#ifdef HUNSPELL_CHROME_CLIENT
+ // open the affix file
+ // We're always UTF-8
+ utf8 = 1;
+
+ // A BDICT file stores PFX and SFX lines in a special section and it provides
+ // a special line iterator for reading PFX and SFX lines.
+ // We create a FileMgr object from this iterator and parse PFX and SFX lines
+ // before parsing other lines.
+ hunspell::LineIterator affix_iterator = bdict_reader->GetAffixLineIterator();
+ FileMgr* iterator = new FileMgr(&affix_iterator);
+ if (!iterator) {
+ HUNSPELL_WARNING(stderr,
+ "error: could not create a FileMgr from an affix line iterator.\n");
+ return 1;
+ }
+
+ while ((line = iterator->getline())) {
+ ft = ' ';
+ if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
+ if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';
+ if (ft != ' ')
+ parse_affix(line, ft, iterator, NULL);
+ }
+ delete iterator;
+
+ // Create a FileMgr object for reading lines except PFX and SFX lines.
+ // We don't need to change the loop below since our FileMgr emulates the
+ // original one.
+ hunspell::LineIterator other_iterator = bdict_reader->GetOtherLineIterator();
+ FileMgr * afflst = new FileMgr(&other_iterator);
+ if (!afflst) {
+ HUNSPELL_WARNING(stderr,
+ "error: could not create a FileMgr from an other line iterator.\n");
+ return 1;
+ }
+#else
// checking flag duplication
char dupflags[CONTSIZE];
char dupflags_ini = 1;
@@ -282,6 +333,7 @@
HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath);
return 1;
}
+#endif
// step one is to parse the affix file building up the internal
// affix data structures
@@ -291,6 +343,7 @@
while ((line = afflst->getline()) != NULL) {
mychomp(line);
+#ifndef HUNSPELL_CHROME_CLIENT
/* remove byte order mark */
if (firstline) {
firstline = 0;
@@ -299,6 +352,7 @@
memmove(line, line+3, strlen(line+3)+1);
}
}
+#endif
/* parse in the keyboard string */
if (strncmp(line,"KEY",3) == 0) {
@@ -545,6 +599,7 @@
}
}
+#ifndef HUNSPELL_CHROME_CLIENT
/* parse in the typical fault correcting table */
if (strncmp(line,"REP",3) == 0) {
if (parse_reptable(line, afflst)) {
@@ -552,6 +607,7 @@
return 1;
}
}
+#endif
/* parse in the input conversion table */
if (strncmp(line,"ICONV",5) == 0) {
@@ -699,6 +755,7 @@
checksharps=1;
}
+#ifndef HUNSPELL_CHROME_CLIENT
/* parse this affix: P - prefix, S - suffix */
ft = ' ';
if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
@@ -713,6 +770,7 @@
return 1;
}
}
+#endif
}
finishFileMgr(afflst);
@@ -1307,6 +1365,26 @@
const char * r;
int lenr, lenp;
+#ifdef HUNSPELL_CHROME_CLIENT
+ const char *pattern, *pattern2;
+ hunspell::ReplacementIterator iterator = bdict_reader->GetReplacementIterator();
+ while (iterator.GetNext(&pattern, &pattern2)) {
+ r = word;
+ lenr = strlen(pattern2);
+ lenp = strlen(pattern);
+
+ // search every occurrence of the pattern in the word
+ while ((r=strstr(r, pattern)) != NULL) {
+ strcpy(candidate, word);
+ if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
+ strcpy(candidate+(r-word), pattern2);
+ strcpy(candidate+(r-word)+lenr, r+lenp);
+ if (candidate_check(candidate,strlen(candidate))) return 1;
+ r++; // search for the next letter
+ }
+ }
+
+#else
if ((wl < 2) || !numrep) return 0;
for (int i=0; i < numrep; i++ ) {
@@ -1323,6 +1401,7 @@
r++; // search for the next letter
}
}
+#endif
return 0;
}
@@ -4219,6 +4298,7 @@
case 1: {
np++;
aflag = pHMgr->decode_flag(piece);
+#ifndef HUNSPELL_CHROME_CLIENT // We don't check for duplicates.
if (((at == 'S') && (dupflags[aflag] & dupSFX)) ||
((at == 'P') && (dupflags[aflag] & dupPFX))) {
HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix flag\n",
@@ -4226,6 +4306,7 @@
// return 1; XXX permissive mode for bad dictionaries
}
dupflags[aflag] += (char) ((at == 'S') ? dupSFX : dupPFX);
+#endif
break;
}
// piece 3 - is cross product indicator
Index: src/hunspell/affixmgr.hxx
===================================================================
RCS file: /cvsroot/hunspell/hunspell/src/hunspell/affixmgr.hxx,v
retrieving revision 1.15
diff -u -r1.15 affixmgr.hxx
--- src/hunspell/affixmgr.hxx 13 Oct 2011 13:41:54 -0000 1.15
+++ src/hunspell/affixmgr.hxx 29 May 2014 01:05:07 -0000
@@ -18,6 +18,40 @@
class PfxEntry;
class SfxEntry;
+#ifdef HUNSPELL_CHROME_CLIENT
+
+#include <vector>
+
+// This class provides an implementation of the contclasses array in AffixMgr
+// that is normally a large static array. We should almost never need more than
+// 256 elements, so this class only allocates that much to start off with. If
+// elements higher than that are actually used, we'll automatically expand.
+class ContClasses {
+ public:
+ ContClasses() {
+ // Pre-allocate a buffer so that typically, we'll never have to resize.
+ EnsureSizeIs(256);
+ }
+
+ char& operator[](size_t index) {
+ EnsureSizeIs(index + 1);
+ return data[index];
+ }
+
+ void EnsureSizeIs(size_t new_size) {
+ if (data.size() >= new_size)
+ return; // Nothing to do.
+
+ size_t old_size = data.size();
+ data.resize(new_size);
+ memset(&data[old_size], 0, new_size - old_size);
+ }
+
+ std::vector<char> data;
+};
+
+#endif // HUNSPELL_CHROME_CLIENT
+
class LIBHUNSPELL_DLL_EXPORTED AffixMgr
{
@@ -106,12 +140,20 @@
int fullstrip;
int havecontclass; // boolean variable
+#ifdef HUNSPELL_CHROME_CLIENT
+ ContClasses contclasses;
+#else
char contclasses[CONTSIZE]; // flags of possible continuing classes (twofold affix)
+#endif
public:
+#ifdef HUNSPELL_CHROME_CLIENT
+ AffixMgr(hunspell::BDictReader* reader, HashMgr** ptr, int * md);
+#else
AffixMgr(const char * affpath, HashMgr** ptr, int * md,
const char * key = NULL);
+#endif
~AffixMgr();
struct hentry * affix_check(const char * word, int len,
const unsigned short needflag = (unsigned short) 0,
@@ -218,6 +260,10 @@
int get_fullstrip() const;
private:
+#ifdef HUNSPELL_CHROME_CLIENT
+ // Not owned by us, owned by the Hunspell object.
+ hunspell::BDictReader* bdict_reader;
+#endif
int parse_file(const char * affpath, const char * key);
int parse_flag(char * line, unsigned short * out, FileMgr * af);
int parse_num(char * line, int * out, FileMgr * af);
@@ -249,4 +295,3 @@
};
#endif
-
Index: src/hunspell/filemgr.cxx
===================================================================
RCS file: /cvsroot/hunspell/hunspell/src/hunspell/filemgr.cxx,v
retrieving revision 1.5
diff -u -r1.5 filemgr.cxx
--- src/hunspell/filemgr.cxx 23 Jun 2011 09:21:50 -0000 1.5
+++ src/hunspell/filemgr.cxx 29 May 2014 01:05:07 -0000
@@ -7,6 +7,32 @@
#include "filemgr.hxx"
+#ifdef HUNSPELL_CHROME_CLIENT
+#include "third_party/hunspell/google/bdict_reader.h"
+
+FileMgr::FileMgr(hunspell::LineIterator* iterator) : iterator_(iterator) {
+}
+
+FileMgr::~FileMgr() {
+}
+
+char * FileMgr::getline() {
+ // Read one line from a BDICT file and store the line to our line buffer.
+ // To emulate the original FileMgr::getline(), this function returns
+ // the pointer to our line buffer if we can read a line without errors.
+ // Otherwise, this function returns NULL.
+ bool result = iterator_->AdvanceAndCopy(line_, BUFSIZE - 1);
+ return result ? line_ : NULL;
+}
+
+int FileMgr::getlinenum() {
+ // This function is used only for displaying the line number at which a
+ // parser error occurred. For a BDICT file, a line number does little to
+ // help locate the cause of a parser error because it is a binary file.
+ // So, we just return 0.
+ return 0;
+}
+#else
int FileMgr::fail(const char * err, const char * par) {
fprintf(stderr, err, par);
return -1;
@@ -47,3 +73,4 @@
int FileMgr::getlinenum() {
return linenum;
}
+#endif
Index: src/hunspell/filemgr.hxx
===================================================================
RCS file: /cvsroot/hunspell/hunspell/src/hunspell/filemgr.hxx,v
retrieving revision 1.3
diff -u -r1.3 filemgr.hxx
--- src/hunspell/filemgr.hxx 15 Apr 2010 11:22:08 -0000 1.3
+++ src/hunspell/filemgr.hxx 29 May 2014 01:05:07 -0000
@@ -7,6 +7,30 @@
#include "hunzip.hxx"
#include <stdio.h>
+#ifdef HUNSPELL_CHROME_CLIENT
+namespace hunspell {
+class LineIterator;
+} // namespace hunspell
+
+// A class which encapsulates operations of reading a BDICT file.
+// Chrome uses a BDICT file to compress hunspell dictionaries. A BDICT file is
+// a binary file converted from a DIC file and an AFF file. (See
+// "bdict_reader.h" for its format.)
+// This class encapsulates the operations of reading a BDICT file and emulates
+// the original FileMgr operations for AffixMgr so that it can read a BDICT
+// file without so many changes.
+class FileMgr {
+ public:
+ FileMgr(hunspell::LineIterator* iterator);
+ ~FileMgr();
+ char* getline();
+ int getlinenum();
+
+ protected:
+ hunspell::LineIterator* iterator_;
+ char line_[BUFSIZE + 50]; // input buffer
+};
+#else
class LIBHUNSPELL_DLL_EXPORTED FileMgr
{
protected:
@@ -23,3 +47,4 @@
int getlinenum();
};
#endif
+#endif
Index: src/hunspell/hashmgr.cxx
===================================================================
RCS file: /cvsroot/hunspell/hunspell/src/hunspell/hashmgr.cxx,v
retrieving revision 1.12
diff -u -r1.12 hashmgr.cxx
--- src/hunspell/hashmgr.cxx 23 Jun 2011 09:21:50 -0000 1.12
+++ src/hunspell/hashmgr.cxx 29 May 2014 01:05:07 -0000
@@ -12,8 +12,14 @@
// build a hash table from a munched word list
+#ifdef HUNSPELL_CHROME_CLIENT
+HashMgr::HashMgr(hunspell::BDictReader* reader)
+{
+ bdict_reader = reader;
+#else
HashMgr::HashMgr(const char * tpath, const char * apath, const char * key)
{
+#endif
tablesize = 0;
tableptr = NULL;
flag_mode = FLAG_CHAR;
@@ -31,8 +37,14 @@
numaliasm = 0;
aliasm = NULL;
forbiddenword = FORBIDDENWORD; // forbidden word signing flag
+#ifdef HUNSPELL_CHROME_CLIENT
+ // No tables to load, just the AF lines.
+ load_config(NULL, NULL);
+ int ec = LoadAFLines();
+#else
load_config(apath, key);
int ec = load_tables(tpath, key);
+#endif
if (ec) {
/* error condition - what should we do here */
HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec);
@@ -91,15 +103,58 @@
if (ignorechars) free(ignorechars);
if (ignorechars_utf16) free(ignorechars_utf16);
+#ifdef HUNSPELL_CHROME_CLIENT
+ EmptyHentryCache();
+ for (std::vector<std::string*>::iterator it = pointer_to_strings_.begin();
+ it != pointer_to_strings_.end(); ++it) {
+ delete *it;
+ }
+#endif
#ifdef MOZILLA_CLIENT
delete [] csconv;
#endif
}
+#ifdef HUNSPELL_CHROME_CLIENT
+void HashMgr::EmptyHentryCache() {
+ // We need to delete each cache entry, and each additional one in the linked
+ // list of homonyms.
+ for (HEntryCache::iterator i = hentry_cache.begin();
+ i != hentry_cache.end(); ++i) {
+ hentry* cur = i->second;
+ while (cur) {
+ hentry* next = cur->next_homonym;
+ DeleteHashEntry(cur);
+ cur = next;
+ }
+ }
+ hentry_cache.clear();
+}
+#endif
+
// lookup a root word in the hashtable
struct hentry * HashMgr::lookup(const char *word) const
{
+#ifdef HUNSPELL_CHROME_CLIENT
+ int affix_ids[hunspell::BDict::MAX_AFFIXES_PER_WORD];
+ int affix_count = bdict_reader->FindWord(word, affix_ids);
+ if (affix_count == 0) { // look for custom added word
+ std::map<base::StringPiece, int>::const_iterator iter =
+ custom_word_to_affix_id_map_.find(word);
+ if (iter != custom_word_to_affix_id_map_.end()) {
+ affix_count = 1;
+ affix_ids[0] = iter->second;
+ }
+ }
+
+ static const int kMaxWordLen = 128;
+ static char word_buf[kMaxWordLen];
+ // To leave room for the null terminator, we copy at most 127 bytes.
+ strncpy(word_buf, word, kMaxWordLen - 1);
+
+ return AffixIDsToHentry(word_buf, affix_ids, affix_count);
+#else
struct hentry * dp;
if (tableptr) {
dp = tableptr[hash(word)];
@@ -109,12 +164,14 @@
}
}
return NULL;
+#endif
}
// add a word to the hash table (private)
int HashMgr::add_word(const char * word, int wbl, int wcl, unsigned short * aff,
int al, const char * desc, bool onlyupcase)
{
+#ifndef HUNSPELL_CHROME_CLIENT
bool upcasehomonym = false;
int descl = desc ? (aliasm ? sizeof(short) : strlen(desc) + 1) : 0;
// variable-length hash record with word and optional fields
@@ -206,6 +263,17 @@
if (hp->astr) free(hp->astr);
free(hp);
}
+#else
+ std::map<base::StringPiece, int>::iterator iter =
+ custom_word_to_affix_id_map_.find(word);
+ if(iter == custom_word_to_affix_id_map_.end()) { // word needs to be added
+ std::string* new_string_word = new std::string(word);
+ pointer_to_strings_.push_back(new_string_word);
+ base::StringPiece sp(*(new_string_word));
+ custom_word_to_affix_id_map_[sp] = 0; // no affixes for custom words
+ return 1;
+ }
+#endif
return 0;
}
@@ -256,6 +324,12 @@
// remove word (personal dictionary function for standalone applications)
int HashMgr::remove(const char * word)
{
+#ifdef HUNSPELL_CHROME_CLIENT
+ std::map<base::StringPiece, int>::iterator iter =
+ custom_word_to_affix_id_map_.find(word);
+ if (iter != custom_word_to_affix_id_map_.end())
+ custom_word_to_affix_id_map_.erase(iter);
+#else
struct hentry * dp = lookup(word);
while (dp) {
if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) {
@@ -270,6 +344,7 @@
}
dp = dp->next_homonym;
}
+#endif
return 0;
}
@@ -339,6 +414,44 @@
// initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp);
struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const
{
+#ifdef HUNSPELL_CHROME_CLIENT
+ // Return NULL if dictionary is not valid.
+ if (!bdict_reader->IsValid())
+ return NULL;
+
+ // This function is only ever called by one place and not nested. We can
+ // therefore keep static state between calls and use |col| as a "reset" flag
+ // to avoid changing the API. It is set to -1 for the first call.
+ // Allocate the iterator on the heap to prevent an exit time destructor.
+ static hunspell::WordIterator& word_iterator =
+ *new hunspell::WordIterator(bdict_reader->GetAllWordIterator());
+ if (col < 0) {
+ col = 1;
+ word_iterator = bdict_reader->GetAllWordIterator();
+ }
+
+ int affix_ids[hunspell::BDict::MAX_AFFIXES_PER_WORD];
+ static const int kMaxWordLen = 128;
+ static char word[kMaxWordLen];
+ int affix_count = word_iterator.Advance(word, kMaxWordLen, affix_ids);
+ if (affix_count == 0)
+ return NULL;
+ short word_len = static_cast<short>(strlen(word));
+
+ // Since hunspell 1.2.8, an hentry struct becomes a variable-length struct,
+ // i.e. a struct which uses its array 'word[1]' as a variable-length array.
+ // As noted above, this function is not nested. So, we just use a static
+ // struct which consists of an hentry and a char[kMaxWordLen], and initialize
+ // the static struct and return it for now.
+ // No need to create linked lists for the extra affixes.
+ static struct {
+ hentry entry;
+ char word[kMaxWordLen];
+ } hash_entry;
+
+ return InitHashEntry(&hash_entry.entry, sizeof(hash_entry),
+ &word[0], word_len, affix_ids[0]);
+#else
if (hp && hp->next != NULL) return hp->next;
for (col++; col < tablesize; col++) {
if (tableptr[col]) return tableptr[col];
@@ -346,11 +459,13 @@
// null at end and reset to start
col = -1;
return NULL;
+#endif
}
// load a munched word list and build a hash table on the fly
int HashMgr::load_tables(const char * tpath, const char * key)
{
+#ifndef HUNSPELL_CHROME_CLIENT
int al;
char * ap;
char * dp;
@@ -471,6 +586,7 @@
}
delete dict;
+#endif
return 0;
}
@@ -479,6 +595,9 @@
int HashMgr::hash(const char * word) const
{
+#ifdef HUNSPELL_CHROME_CLIENT
+ return 0;
+#else
long hv = 0;
for (int i=0; i < 4 && *word != 0; i++)
hv = (hv << 8) | (*word++);
@@ -487,6 +606,7 @@
hv ^= (*word++);
}
return (unsigned long) hv % tablesize;
+#endif
}
int HashMgr::decode_flags(unsigned short ** result, char * flags, FileMgr * af) {
@@ -607,7 +727,12 @@
int firstline = 1;
// open the affix file
+#ifdef HUNSPELL_CHROME_CLIENT
+ hunspell::LineIterator iterator = bdict_reader->GetOtherLineIterator();
+ FileMgr * afflst = new FileMgr(&iterator);
+#else
FileMgr * afflst = new FileMgr(affpath, key);
+#endif
if (!afflst) {
HUNSPELL_WARNING(stderr, "Error - could not open affix description file %s\n",affpath);
return 1;
@@ -802,6 +927,121 @@
return 0;
}
+#ifdef HUNSPELL_CHROME_CLIENT
+int HashMgr::LoadAFLines()
+{
+ utf8 = 1; // We always use UTF-8.
+
+ // Read in all the AF lines which tell us the rules for each affix group ID.
+ hunspell::LineIterator iterator = bdict_reader->GetAfLineIterator();
+ FileMgr afflst(&iterator);
+ while (char* line = afflst.getline()) {
+ int rv = parse_aliasf(line, &afflst);
+ if (rv)
+ return rv;
+ }
+
+ return 0;
+}
+
+hentry* HashMgr::InitHashEntry(hentry* entry,
+ size_t item_size,
+ const char* word,
+ int word_length,
+ int affix_index) const {
+ // Return if the given buffer doesn't have enough space for a hentry struct
+ // or the given word is too long.
+ // Our BDICT cannot handle words longer than (128 - 1) bytes. So, it is
+ // better to return an error if the given word is too long and prevent
+ // an unexpected result caused by a long word.
+ const int kMaxWordLen = 128;
+ if (item_size < sizeof(hentry) + word_length + 1 ||
+ word_length >= kMaxWordLen)
+ return NULL;
+
+ // Initialize a hentry struct with the given parameters, and
+ // append the given string at the end of this hentry struct.
+ memset(entry, 0, item_size);
+ FileMgr af(NULL);
+ entry->alen = static_cast<short>(
+ const_cast<HashMgr*>(this)->get_aliasf(affix_index, &entry->astr, &af));
+ entry->blen = static_cast<unsigned char>(word_length);
+ memcpy(&entry->word, word, word_length);
+
+ return entry;
+}
+
+hentry* HashMgr::CreateHashEntry(const char* word,
+ int word_length,
+ int affix_index) const {
+ // Return if the given word is too long.
+ // (See the comment in HashMgr::InitHashEntry().)
+ const int kMaxWordLen = 128;
+ if (word_length >= kMaxWordLen)
+ return NULL;
+
+ const size_t kEntrySize = sizeof(hentry) + word_length + 1;
+ struct hentry* entry = reinterpret_cast<hentry*>(malloc(kEntrySize));
+ if (entry)
+ InitHashEntry(entry, kEntrySize, word, word_length, affix_index);
+
+ return entry;
+}
+
+void HashMgr::DeleteHashEntry(hentry* entry) const {
+ free(entry);
+}
+
+hentry* HashMgr::AffixIDsToHentry(char* word,
+ int* affix_ids,
+ int affix_count) const
+{
+ if (affix_count == 0)
+ return NULL;
+
+ HEntryCache& cache = const_cast<HashMgr*>(this)->hentry_cache;
+ std::string std_word(word);
+ HEntryCache::iterator found = cache.find(std_word);
+ if (found != cache.end()) {
+ // We must return an existing hentry for the same word if we've previously
+ // handed one out. Hunspell will compare pointers in some cases to see if
+ // two words it has found are the same.
+ return found->second;
+ }
+
+ short word_len = static_cast<short>(strlen(word));
+
+ // We can get a number of affix IDs per word. There will normally be only
+ // one, but if not, there will be a linked list of "hentry"s (chained via
+ // next_homonym) for the "homonym"s of the word.
+ struct hentry* first_he = NULL;
+ struct hentry* prev_he = NULL; // For making linked list.
+ for (int i = 0; i < affix_count; i++) {
+ struct hentry* he = CreateHashEntry(word, word_len, affix_ids[i]);
+ if (!he)
+ break;
+ if (i == 0)
+ first_he = he;
+ if (prev_he)
+ prev_he->next_homonym = he;
+ prev_he = he;
+ }
+
+ cache[std_word] = first_he; // Save this word in the cache for later.
+ return first_he;
+}
+
+hentry* HashMgr::GetHentryFromHEntryCache(char* word) {
+ HEntryCache& cache = const_cast<HashMgr*>(this)->hentry_cache;
+ std::string std_word(word);
+ HEntryCache::iterator found = cache.find(std_word);
+ if (found != cache.end())
+ return found->second;
+ else
+ return NULL;
+}
+#endif
+
int HashMgr::is_aliasf() {
return (aliasf != NULL);
}
Index: src/hunspell/hashmgr.hxx
===================================================================
RCS file: /cvsroot/hunspell/hunspell/src/hunspell/hashmgr.hxx,v
retrieving revision 1.3
diff -u -r1.3 hashmgr.hxx
--- src/hunspell/hashmgr.hxx 15 Apr 2010 11:22:08 -0000 1.3
+++ src/hunspell/hashmgr.hxx 29 May 2014 01:05:07 -0000
@@ -8,10 +8,25 @@
#include "htypes.hxx"
#include "filemgr.hxx"
+#ifdef HUNSPELL_CHROME_CLIENT
+#include <string>
+#include <map>
+
+#include "base/stl_util.h"
+#include "base/strings/string_piece.h"
+#include "third_party/hunspell/google/bdict_reader.h"
+#endif
+
enum flag { FLAG_CHAR, FLAG_LONG, FLAG_NUM, FLAG_UNI };
class LIBHUNSPELL_DLL_EXPORTED HashMgr
{
+#ifdef HUNSPELL_CHROME_CLIENT
+ // Not owned by this class, owned by the Hunspell object.
+ hunspell::BDictReader* bdict_reader;
+ std::map<base::StringPiece, int> custom_word_to_affix_id_map_;
+ std::vector<std::string*> pointer_to_strings_;
+#endif
int tablesize;
struct hentry ** tableptr;
int userword;
@@ -34,7 +49,23 @@
public:
+#ifdef HUNSPELL_CHROME_CLIENT
+ HashMgr(hunspell::BDictReader* reader);
+
+ // Return the hentry corresponding to the given word. Returns NULL if the
+ // word is not there in the cache.
+ hentry* GetHentryFromHEntryCache(char* word);
+
+ // Called before we do a new operation. This will empty the cache of pointers
+ // to hentries that we have cached. In Chrome, we make these on-demand, but
+ // they must live as long as the single spellcheck operation that they're part
+ // of since Hunspell will save pointers to various ones as it works.
+ //
+ // This function allows that cache to be emptied and not grow infinitely.
+ void EmptyHentryCache();
+#else
HashMgr(const char * tpath, const char * apath, const char * key = NULL);
+#endif
~HashMgr();
struct hentry * lookup(const char *) const;
@@ -59,6 +90,40 @@
int al, const char * desc, bool onlyupcase);
int load_config(const char * affpath, const char * key);
int parse_aliasf(char * line, FileMgr * af);
+
+#ifdef HUNSPELL_CHROME_CLIENT
+ // Loads the AF lines from a BDICT.
+ // A BDICT file compresses its AF lines to save memory.
+ // This function decompresses each AF line and calls parse_aliasf() on it.
+ int LoadAFLines();
+
+ // Helper functions that create a new hentry struct, initialize it, and
+ // delete it.
+ // These functions encapsulate non-trivial operations in creating and
+ // initializing a hentry struct from BDICT data to avoid changing code so much
+ // even when a hentry struct is changed.
+ hentry* InitHashEntry(hentry* entry,
+ size_t item_size,
+ const char* word,
+ int word_length,
+ int affix_index) const;
+ hentry* CreateHashEntry(const char* word,
+ int word_length,
+ int affix_index) const;
+ void DeleteHashEntry(hentry* entry) const;
+
+ // Converts the list of affix IDs to a linked list of hentry structures, each
+ // holding its own copy of |word|. The entries are heap-allocated and kept in
+ // |hentry_cache|, so the returned pointer stays valid (and is returned again
+ // for the same word) until EmptyHentryCache() is called.
+ hentry* AffixIDsToHentry(char* word, int* affix_ids, int affix_count) const;
+
+ // See EmptyHentryCache above. Note that each cached value is actually the
+ // head of a linked list chained through the next_homonym pointer.
+ typedef std::map<std::string, hentry*> HEntryCache;
+ HEntryCache hentry_cache;
+#endif
+
int add_hidden_capitalized_word(char * word, int wbl, int wcl,
unsigned short * flags, int al, char * dp, int captype);
int parse_aliasm(char * line, FileMgr * af);
Index: src/hunspell/htypes.hxx
===================================================================
RCS file: /cvsroot/hunspell/hunspell/src/hunspell/htypes.hxx,v
retrieving revision 1.3
diff -u -r1.3 htypes.hxx
--- src/hunspell/htypes.hxx 6 Sep 2010 07:58:53 -0000 1.3
+++ src/hunspell/htypes.hxx 29 May 2014 01:05:07 -0000
@@ -1,6 +1,16 @@
#ifndef _HTYPES_HXX_
#define _HTYPES_HXX_
+#ifdef HUNSPELL_CHROME_CLIENT
+// This is a workaround for preventing errors in parsing Turkish BDICs, which
+// contain very long AF lines (~ 12,000 chars).
+// TODO(hbono) change the HashMgr::parse_aliasf() function to be able to parse
+// longer lines than MAXDELEN.
+#define MAXDELEN (8192 * 2)
+#else
+#define MAXDELEN 8192
+#endif // HUNSPELL_CHROME_CLIENT
+
#define ROTATE_LEN 5
#define ROTATE(v,q) \
Index: src/hunspell/hunspell.cxx
===================================================================
RCS file: /cvsroot/hunspell/hunspell/src/hunspell/hunspell.cxx,v
retrieving revision 1.29
diff -u -r1.29 hunspell.cxx
--- src/hunspell/hunspell.cxx 23 Jun 2011 09:21:50 -0000 1.29
+++ src/hunspell/hunspell.cxx 29 May 2014 01:05:07 -0000
@@ -7,20 +7,37 @@
#include "hunspell.hxx"
#include "hunspell.h"
+#ifndef HUNSPELL_CHROME_CLIENT
#ifndef MOZILLA_CLIENT
# include "config.h"
#endif
+#endif
#include "csutil.hxx"
+#ifdef HUNSPELL_CHROME_CLIENT
+Hunspell::Hunspell(const unsigned char* bdict_data, size_t bdict_length)
+#else
Hunspell::Hunspell(const char * affpath, const char * dpath, const char * key)
+#endif
{
encoding = NULL;
csconv = NULL;
utf8 = 0;
complexprefixes = 0;
+#ifndef HUNSPELL_CHROME_CLIENT
affixpath = mystrdup(affpath);
+#endif
maxdic = 0;
+#ifdef HUNSPELL_CHROME_CLIENT
+ bdict_reader = new hunspell::BDictReader;
+ bdict_reader->Init(bdict_data, bdict_length);
+
+ pHMgr[0] = new HashMgr(bdict_reader);
+ if (pHMgr[0]) maxdic = 1;
+
+ pAMgr = new AffixMgr(bdict_reader, pHMgr, &maxdic);
+#else
/* first set up the hash manager */
pHMgr[0] = new HashMgr(dpath, affpath, key);
if (pHMgr[0]) maxdic = 1;
@@ -28,6 +45,7 @@
/* next set up the affix manager */
/* it needs access to the hash manager lookup methods */
pAMgr = new AffixMgr(affpath, pHMgr, &maxdic, key);
+#endif
/* get the preferred try string and the dictionary */
/* encoding from the Affix Manager for that dictionary */
@@ -41,7 +59,11 @@
wordbreak = pAMgr->get_breaktable();
/* and finally set up the suggestion manager */
+#ifdef HUNSPELL_CHROME_CLIENT
+ pSMgr = new SuggestMgr(bdict_reader, try_string, MAXSUGGESTION, pAMgr);
+#else
pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr);
+#endif
if (try_string) free(try_string);
}
@@ -59,10 +81,16 @@
csconv= NULL;
if (encoding) free(encoding);
encoding = NULL;
+#ifdef HUNSPELL_CHROME_CLIENT
+ if (bdict_reader) delete bdict_reader;
+ bdict_reader = NULL;
+#else
if (affixpath) free(affixpath);
affixpath = NULL;
+#endif
}
+#ifndef HUNSPELL_CHROME_CLIENT
// load extra dictionaries
int Hunspell::add_dic(const char * dpath, const char * key) {
if (maxdic == MAXDIC || !affixpath) return 1;
@@ -70,6 +98,7 @@
if (pHMgr[maxdic]) maxdic++; else return 1;
return 0;
}
+#endif
// make a copy of src at destination while removing all leading
// blanks and removing any trailing periods after recording
@@ -322,6 +351,9 @@
int Hunspell::spell(const char * word, int * info, char ** root)
{
+#ifdef HUNSPELL_CHROME_CLIENT
+ if (pHMgr[0]) pHMgr[0]->EmptyHentryCache();
+#endif
struct hentry * rv=NULL;
// need larger vector. For example, Turkish capital letter I converted a
// 2-byte UTF-8 character (dotless i) by mkallsmall.
@@ -586,6 +618,13 @@
if (!len)
return NULL;
+#ifdef HUNSPELL_CHROME_CLIENT
+ // We need to check if the word length is valid to make coverity (Event
+ // fixed_size_dest: Possible overrun of N byte fixed size buffer) happy.
+ if ((utf8 && strlen(word) >= MAXWORDUTF8LEN) || (!utf8 && strlen(word) >= MAXWORDLEN))
+ return NULL;
+#endif
+
// word reversing wrapper for complex prefixes
if (complexprefixes) {
if (word != w2) {
@@ -675,6 +714,9 @@
int Hunspell::suggest(char*** slst, const char * word)
{
+#ifdef HUNSPELL_CHROME_CLIENT
+ if (pHMgr[0]) pHMgr[0]->EmptyHentryCache();
+#endif
int onlycmpdsug = 0;
char cw[MAXWORDUTF8LEN];
char wspace[MAXWORDUTF8LEN];
@@ -1921,13 +1963,21 @@
Hunhandle *Hunspell_create(const char * affpath, const char * dpath)
{
+#ifdef HUNSPELL_CHROME_CLIENT
+ return NULL;
+#else
return (Hunhandle*)(new Hunspell(affpath, dpath));
+#endif
}
Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath,
const char * key)
{
+#ifdef HUNSPELL_CHROME_CLIENT
+ return NULL;
+#else
return (Hunhandle*)(new Hunspell(affpath, dpath, key));
+#endif
}
void Hunspell_destroy(Hunhandle *pHunspell)
Index: src/hunspell/hunspell.hxx
===================================================================
RCS file: /cvsroot/hunspell/hunspell/src/hunspell/hunspell.hxx,v
retrieving revision 1.6
diff -u -r1.6 hunspell.hxx
--- src/hunspell/hunspell.hxx 21 Jan 2011 17:30:41 -0000 1.6
+++ src/hunspell/hunspell.hxx 29 May 2014 01:05:07 -0000
@@ -5,6 +5,10 @@
#include "suggestmgr.hxx"
#include "langnum.hxx"
+#ifdef HUNSPELL_CHROME_CLIENT
+#include "third_party/hunspell/google/bdict_reader.h"
+#endif
+
#define SPELL_XML "<?xml?>"
#define MAXDIC 20
@@ -23,7 +27,9 @@
HashMgr* pHMgr[MAXDIC];
int maxdic;
SuggestMgr* pSMgr;
+#ifndef HUNSPELL_CHROME_CLIENT // We are using BDict instead.
char * affixpath;
+#endif
char * encoding;
struct cs_info * csconv;
int langnum;
@@ -31,17 +37,28 @@
int complexprefixes;
char** wordbreak;
+#ifdef HUNSPELL_CHROME_CLIENT
+ // Owned by this object: created in the constructor, deleted in the destructor.
+ hunspell::BDictReader* bdict_reader;
+#endif
+
public:
/* Hunspell(aff, dic) - constructor of Hunspell class
* input: path of affix file and dictionary file
*/
+#ifdef HUNSPELL_CHROME_CLIENT
+ Hunspell(const unsigned char* bdict_data, size_t bdict_length);
+#else
Hunspell(const char * affpath, const char * dpath, const char * key = NULL);
+#endif
~Hunspell();
+#ifndef HUNSPELL_CHROME_CLIENT
/* load extra dictionaries (only dic files) */
int add_dic(const char * dpath, const char * key = NULL);
+#endif
/* spell(word) - spellcheck word
* output: 0 = bad word, not 0 = good word
Index: src/hunspell/replist.hxx
===================================================================
RCS file: /cvsroot/hunspell/hunspell/src/hunspell/replist.hxx,v
retrieving revision 1.2
diff -u -r1.2 replist.hxx
--- src/hunspell/replist.hxx 15 Apr 2010 11:22:09 -0000 1.2
+++ src/hunspell/replist.hxx 29 May 2014 01:05:07 -0000
@@ -2,6 +2,12 @@
#ifndef _REPLIST_HXX_
#define _REPLIST_HXX_
+#ifdef HUNSPELL_CHROME_CLIENT
+// Compilation issues in spellchecker.cc think near is a macro, therefore
+// removing it here solves that problem.
+#undef near
+#endif
+
#include "hunvisapi.h"
#include "w_char.hxx"
Index: src/hunspell/suggestmgr.cxx
===================================================================
RCS file: /cvsroot/hunspell/hunspell/src/hunspell/suggestmgr.cxx,v
retrieving revision 1.24
diff -u -r1.24 suggestmgr.cxx
--- src/hunspell/suggestmgr.cxx 14 Feb 2011 21:47:24 -0000 1.24
+++ src/hunspell/suggestmgr.cxx 29 May 2014 01:05:07 -0000
@@ -12,9 +12,114 @@
const w_char W_VLINE = { '\0', '|' };
+#ifdef HUNSPELL_CHROME_CLIENT
+namespace {
+// A simple class which creates temporary hentry objects which are available
+// only in a scope. To conceal memory operations from SuggestMgr functions,
+// all hentry objects created through CreateScopedHashEntry() calls live inside
+// this object and go away when it is destroyed. So, the following snippet
+// raises a memory error.
+//
+// hentry* bad_copy = NULL;
+// {
+// ScopedHashEntryFactory factory;
+// hentry* scoped_copy = factory.CreateScopedHashEntry(0, source);
+// ...
+// bad_copy = scoped_copy;
+// }
+// if (bad_copy->word[0]) // memory for scoped_copy has been deleted!
+//
+// As shown in the above snippet, it is simple to use this class.
+// 1. Declare an instance of this ScopedHashEntryFactory, and;
+// 2. Call its CreateScopedHashEntry() member instead of using 'new hentry' or
+// 'operator='.
+//
+class ScopedHashEntryFactory {
+ public:
+ ScopedHashEntryFactory();
+ ~ScopedHashEntryFactory();
+
+ // Creates a temporary copy of the given hentry struct in slot 'index'.
+ // The returned copy is available only while this object is available.
+ // Returns NULL if 'index' >= MAX_ROOTS or source->blen >= kMaxWordLen.
+ // NOTE: only the struct, its word and up to kMaxAffixLen entries of 'astr'
+ // are copied; other pointers still refer to the source's objects.
+ hentry* CreateScopedHashEntry(int index, const hentry* source);
+
+ private:
+ // A struct which encapsulates the new hentry struct introduced in hunspell
+ // 1.2.8. For a pointer to an hentry struct 'h', hunspell 1.2.8 stores a word
+ // (including a NUL character) into 'h->word[0]',...,'h->word[h->blen]' even
+ // though arraysize(h->word[]) is 1. Also, it changed 'astr' to a pointer so
+ // it can store affix flags into 'h->astr[0]',...,'h->astr[alen-1]'. To handle
+ // this new hentry struct, we define a struct which combines three values: an
+ // hentry struct 'hentry'; a char array 'word[kMaxWordLen]', and; an unsigned
+ // short array 'astr' so a hentry struct 'h' returned from
+ // CreateScopedHashEntry() satisfies the following equations:
+ // hentry* h = factory.CreateScopedHashEntry(0, source);
+ // h->word[0] == ((HashEntryItem*)h)->entry.word[0].
+ // h->word[1] == ((HashEntryItem*)h)->word[0].
+ // ...
+ // h->word[h->blen] == ((HashEntryItem*)h)->word[h->blen-1].
+ // h->astr[0] == ((HashEntryItem*)h)->astr[0].
+ // h->astr[1] == ((HashEntryItem*)h)->astr[1].
+ // ...
+ // h->astr[h->alen-1] == ((HashEntryItem*)h)->astr[h->alen-1].
+ enum {
+ kMaxWordLen = 128,
+ kMaxAffixLen = 8,
+ };
+ struct HashEntryItem {
+ hentry entry;
+ char word[kMaxWordLen];
+ unsigned short astr[kMaxAffixLen];
+ };
+
+ HashEntryItem hash_items_[MAX_ROOTS];
+};
+
+ScopedHashEntryFactory::ScopedHashEntryFactory() {
+ memset(&hash_items_[0], 0, sizeof(hash_items_)); // Zero the entire item pool.
+}
+
+ScopedHashEntryFactory::~ScopedHashEntryFactory() {
+} // Nothing to release: the copies live in the hash_items_ member array.
+
+hentry* ScopedHashEntryFactory::CreateScopedHashEntry(int index,
+ const hentry* source) {
+ if (index >= MAX_ROOTS || source->blen >= kMaxWordLen)
+ return NULL;
+ // NOTE(review): a negative 'index' is not rejected by the check above.
+ // Retrieve a HashEntryItem struct from our pool, initialize it, and
+ // return the address of its 'entry' member.
+ size_t source_size = sizeof(hentry) + source->blen + 1;
+ HashEntryItem* hash_item = &hash_items_[index];
+ memcpy(&hash_item->entry, source, source_size);
+ if (source->astr) {
+ hash_item->entry.alen = source->alen;
+ if (hash_item->entry.alen > kMaxAffixLen)
+ hash_item->entry.alen = kMaxAffixLen; // Flags beyond the cap are dropped.
+ memcpy(hash_item->astr, source->astr, hash_item->entry.alen * sizeof(hash_item->astr[0]));
+ hash_item->entry.astr = &hash_item->astr[0]; // Use the item's own flag copy.
+ }
+ return &hash_item->entry;
+}
+
+} // namespace
+#endif
+
+
+#ifdef HUNSPELL_CHROME_CLIENT
+SuggestMgr::SuggestMgr(hunspell::BDictReader* reader,
+ const char * tryme, int maxn,
+ AffixMgr * aptr)
+{
+ bdict_reader = reader;
+#else
SuggestMgr::SuggestMgr(const char * tryme, int maxn,
AffixMgr * aptr)
{
+#endif
// register affix manager and check in string of chars to
// try when building candidate suggestions
@@ -407,6 +512,49 @@
int lenr, lenp;
int wl = strlen(word);
if (wl < 2 || ! pAMgr) return ns;
+
+#ifdef HUNSPELL_CHROME_CLIENT
+ const char *pattern, *pattern2;
+ hunspell::ReplacementIterator iterator = bdict_reader->GetReplacementIterator();
+ while (iterator.GetNext(&pattern, &pattern2)) {
+ r = word;
+ lenr = strlen(pattern2);
+ lenp = strlen(pattern);
+
+ // search every occurrence of the pattern in the word
+ while ((r=strstr(r, pattern)) != NULL) {
+ strcpy(candidate, word);
+ if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
+ strcpy(candidate+(r-word), pattern2);
+ strcpy(candidate+(r-word)+lenr, r+lenp);
+ ns = testsug(wlst, candidate, wl-lenp+lenr, ns, cpdsuggest, NULL, NULL);
+ if (ns == -1) return -1;
+ // check REP suggestions with space
+ char * sp = strchr(candidate, ' ');
+ if (sp) {
+ char * prev = candidate;
+ while (sp) {
+ *sp = '\0';
+ if (checkword(prev, strlen(prev), 0, NULL, NULL)) {
+ int oldns = ns;
+ *sp = ' ';
+ ns = testsug(wlst, sp + 1, strlen(sp + 1), ns, cpdsuggest, NULL, NULL);
+ if (ns == -1) return -1;
+ if (oldns < ns) {
+ free(wlst[ns - 1]);
+ wlst[ns - 1] = mystrdup(candidate);
+ if (!wlst[ns - 1]) return -1;
+ }
+ }
+ *sp = ' ';
+ prev = sp + 1;
+ sp = strchr(prev, ' ');
+ }
+ }
+ r++; // search for the next letter
+ }
+ }
+#else
int numrep = pAMgr->get_numrep();
struct replentry* reptable = pAMgr->get_reptable();
if (reptable==NULL) return ns;
@@ -448,6 +596,7 @@
r++; // search for the next letter
}
}
+#endif
return ns;
}
@@ -678,7 +827,9 @@
// error is missing a letter it needs
int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns, int cpdsuggest)
{
- char candidate[MAXSWUTF8L];
+ // TODO(rouslan): Remove the interim change below when this patch lands:
+ // http://sf.net/tracker/?func=detail&aid=3595024&group_id=143754&atid=756395
+ char candidate[MAXSWUTF8L + 4];
char * p;
clock_t timelimit = clock();
int timer = MINTIMER;
@@ -700,8 +851,10 @@
// error is missing a letter it needs
int SuggestMgr::forgotchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
{
- w_char candidate_utf[MAXSWL];
- char candidate[MAXSWUTF8L];
+ // TODO(rouslan): Remove the interim change below when this patch lands:
+ // http://sf.net/tracker/?func=detail&aid=3595024&group_id=143754&atid=756395
+ w_char candidate_utf[MAXSWL + 1];
+ char candidate[MAXSWUTF8L + 4];
w_char * p;
clock_t timelimit = clock();
int timer = MINTIMER;
@@ -1057,6 +1210,9 @@
struct hentry* hp = NULL;
int col = -1;
+#ifdef HUNSPELL_CHROME_CLIENT
+ ScopedHashEntryFactory hash_entry_factory;
+#endif
phonetable * ph = (pAMgr) ? pAMgr->get_phonetable() : NULL;
char target[MAXSWUTF8L];
char candidate[MAXSWUTF8L];
@@ -1115,7 +1271,11 @@
if (sc > scores[lp]) {
scores[lp] = sc;
+#ifdef HUNSPELL_CHROME_CLIENT
+ roots[lp] = hash_entry_factory.CreateScopedHashEntry(lp, hp);
+#else
roots[lp] = hp;
+#endif
lval = sc;
for (j=0; j < MAX_ROOTS; j++)
if (scores[j] < lval) {
@@ -1948,16 +2108,14 @@
m = strlen(s);
n = strlen(s2);
}
- c = (char *) malloc((m + 1) * (n + 1));
- b = (char *) malloc((m + 1) * (n + 1));
+ c = (char *) calloc(m + 1, n + 1);
+ b = (char *) calloc(m + 1, n + 1);
if (!c || !b) {
if (c) free(c);
if (b) free(b);
*result = NULL;
return;
}
- for (i = 1; i <= m; i++) c[i*(n+1)] = 0;
- for (j = 0; j <= n; j++) c[j] = 0;
for (i = 1; i <= m; i++) {
for (j = 1; j <= n; j++) {
if ( ((utf8) && (*((short *) su+i-1) == *((short *)su2+j-1)))
Index: src/hunspell/suggestmgr.hxx
===================================================================
RCS file: /cvsroot/hunspell/hunspell/src/hunspell/suggestmgr.hxx,v
retrieving revision 1.5
diff -u -r1.5 suggestmgr.hxx
--- src/hunspell/suggestmgr.hxx 21 Jan 2011 22:10:24 -0000 1.5
+++ src/hunspell/suggestmgr.hxx 29 May 2014 01:05:07 -0000
@@ -52,7 +52,11 @@
public:
+#ifdef HUNSPELL_CHROME_CLIENT
+ SuggestMgr(hunspell::BDictReader* reader, const char * tryme, int maxn, AffixMgr *aptr);
+#else
SuggestMgr(const char * tryme, int maxn, AffixMgr *aptr);
+#endif
~SuggestMgr();
int suggest(char*** slst, const char * word, int nsug, int * onlycmpdsug);
@@ -66,6 +70,10 @@
char * suggest_morph_for_spelling_error(const char * word);
private:
+#ifdef HUNSPELL_CHROME_CLIENT
+ // Not owned by us, owned by the Hunspell object.
+ hunspell::BDictReader* bdict_reader;
+#endif
int testsug(char** wlst, const char * candidate, int wl, int ns, int cpdsuggest,
int * timer, clock_t * timelimit);
int checkword(const char *, int, int, int *, clock_t *);