| // Copyright 2013 Google Inc. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // |
| // Author: dsites@google.com (Dick Sites) |
| // |
| |
| #include "tote.h" |
| #include "lang_script.h" // For LanguageCode in Dump |
| |
| #include <stdio.h> |
| #include <string.h> // For memset |
| |
| namespace CLD2 { |
| |
| // Take a set of <key, value> pairs and tote them up. |
| // After explicitly sorting, retrieve top key, value pairs |
| // Normal use is key=per-script language and value = probability score |
| Tote::Tote() { |
| in_use_mask_ = 0; |
| byte_count_ = 0; |
| score_count_ = 0; |
| // No need to initialize values |
| } |
| |
| Tote::~Tote() { |
| } |
| |
| void Tote::Reinit() { |
| in_use_mask_ = 0; |
| byte_count_ = 0; |
| score_count_ = 0; |
| // No need to initialize values |
| } |
| // Increment count of quadgrams/trigrams/unigrams scored |
| void Tote::AddScoreCount() { |
| ++score_count_; |
| } |
| |
| |
| void Tote::Add(uint8 ikey, int idelta) { |
| int key_group = ikey >> 2; |
| uint64 groupmask = (1ULL << key_group); |
| if ((in_use_mask_ & groupmask) == 0) { |
| // Initialize this group |
| gscore_[key_group] = 0; |
| in_use_mask_ |= groupmask; |
| } |
| score_[ikey] += idelta; |
| } |
| |
| |
| // Return current top three keys |
| void Tote::CurrentTopThreeKeys(int* key3) const { |
| key3[0] = -1; |
| key3[1] = -1; |
| key3[2] = -1; |
| int score3[3] = {-1, -1, -1}; |
| uint64 tempmask = in_use_mask_; |
| int base = 0; |
| while (tempmask != 0) { |
| if (tempmask & 1) { |
| // Look at four in-use keys |
| for (int i = 0; i < 4; ++i) { |
| int insert_me = score_[base + i]; |
| // Favor lower numbers on ties |
| if (insert_me > score3[2]) { |
| // Insert |
| int insert_at = 2; |
| if (insert_me > score3[1]) { |
| score3[2] = score3[1]; |
| key3[2] = key3[1]; |
| insert_at = 1; |
| if (insert_me > score3[0]) { |
| score3[1] = score3[0]; |
| key3[1] = key3[0]; |
| insert_at = 0; |
| } |
| } |
| score3[insert_at] = insert_me; |
| key3[insert_at] = base + i; |
| } |
| } |
| } |
| tempmask >>= 1; |
| base += 4; |
| } |
| } |
| |
| |
| // Take a set of <key, value> pairs and tote them up. |
| // After explicitly sorting, retrieve top key, value pairs |
| // 0xFFFF in key signifies unused |
| DocTote::DocTote() { |
| // No need to initialize score_ or value_ |
| incr_count_ = 0; |
| sorted_ = 0; |
| memset(closepair_, 0, sizeof(closepair_)); |
| memset(key_, 0xFF, sizeof(key_)); |
| } |
| |
| DocTote::~DocTote() { |
| } |
| |
| void DocTote::Reinit() { |
| // No need to initialize score_ or value_ |
| incr_count_ = 0; |
| sorted_ = 0; |
| memset(closepair_, 0, sizeof(closepair_)); |
| memset(key_, 0xFF, sizeof(key_)); |
| runningscore_.Reinit(); |
| } |
| |
| // Weight reliability by ibytes |
| // Also see three-way associative comments above for Tote |
| void DocTote::Add(uint16 ikey, int ibytes, |
| int score, int ireliability) { |
| ++incr_count_; |
| |
| // Look for existing entry in top 2 positions of 3, times 8 columns |
| int sub0 = ikey & 15; |
| if (key_[sub0] == ikey) { |
| value_[sub0] += ibytes; |
| score_[sub0] += score; |
| reliability_[sub0] += ireliability * ibytes; |
| return; |
| } |
| // Look for existing entry in other of top 2 positions of 3, times 8 columns |
| int sub1 = sub0 ^ 8; |
| if (key_[sub1] == ikey) { |
| value_[sub1] += ibytes; |
| score_[sub1] += score; |
| reliability_[sub1] += ireliability * ibytes; |
| return; |
| } |
| // Look for existing entry in third position of 3, times 8 columns |
| int sub2 = (ikey & 7) + 16; |
| if (key_[sub2] == ikey) { |
| value_[sub2] += ibytes; |
| score_[sub2] += score; |
| reliability_[sub2] += ireliability * ibytes; |
| return; |
| } |
| |
| // Allocate new entry |
| int alloc = -1; |
| if (key_[sub0] == kUnusedKey) { |
| alloc = sub0; |
| } else if (key_[sub1] == kUnusedKey) { |
| alloc = sub1; |
| } else if (key_[sub2] == kUnusedKey) { |
| alloc = sub2; |
| } else { |
| // All choices allocated, need to replace smallest one |
| alloc = sub0; |
| if (value_[sub1] < value_[alloc]) {alloc = sub1;} |
| if (value_[sub2] < value_[alloc]) {alloc = sub2;} |
| } |
| key_[alloc] = ikey; |
| value_[alloc] = ibytes; |
| score_[alloc] = score; |
| reliability_[alloc] = ireliability * ibytes; |
| return; |
| } |
| |
| // Find subscript of a given packed language, or -1 |
| int DocTote::Find(uint16 ikey) { |
| if (sorted_) { |
| // Linear search if sorted |
| for (int sub = 0; sub < kMaxSize_; ++sub) { |
| if (key_[sub] == ikey) {return sub;} |
| } |
| return -1; |
| } |
| |
| // Look for existing entry |
| int sub0 = ikey & 15; |
| if (key_[sub0] == ikey) { |
| return sub0; |
| } |
| int sub1 = sub0 ^ 8; |
| if (key_[sub1] == ikey) { |
| return sub1; |
| } |
| int sub2 = (ikey & 7) + 16; |
| if (key_[sub2] == ikey) { |
| return sub2; |
| } |
| |
| return -1; |
| } |
| |
| // Return current top key |
| int DocTote::CurrentTopKey() { |
| int top_key = 0; |
| int top_value = -1; |
| for (int sub = 0; sub < kMaxSize_; ++sub) { |
| if (key_[sub] == kUnusedKey) {continue;} |
| if (top_value < value_[sub]) { |
| top_value = value_[sub]; |
| top_key = key_[sub]; |
| } |
| } |
| return top_key; |
| } |
| |
| |
| // Sort first n entries by decreasing order of value |
| // If key==0 other fields are not valid, treat value as -1 |
| void DocTote::Sort(int n) { |
| // This is n**2, but n is small |
| for (int sub = 0; sub < n; ++sub) { |
| if (key_[sub] == kUnusedKey) {value_[sub] = -1;} |
| |
| // Bubble sort key[sub] and entry[sub] |
| for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) { |
| if (key_[sub2] == kUnusedKey) {value_[sub2] = -1;} |
| if (value_[sub] < value_[sub2]) { |
| // swap |
| uint16 tmpk = key_[sub]; |
| key_[sub] = key_[sub2]; |
| key_[sub2] = tmpk; |
| |
| int tmpv = value_[sub]; |
| value_[sub] = value_[sub2]; |
| value_[sub2] = tmpv; |
| |
| double tmps = score_[sub]; |
| score_[sub] = score_[sub2]; |
| score_[sub2] = tmps; |
| |
| int tmpr = reliability_[sub]; |
| reliability_[sub] = reliability_[sub2]; |
| reliability_[sub2] = tmpr; |
| } |
| } |
| } |
| sorted_ = 1; |
| } |
| |
| void DocTote::Dump(FILE* f) { |
| fprintf(f, "DocTote::Dump\n"); |
| for (int sub = 0; sub < kMaxSize_; ++sub) { |
| if (key_[sub] != kUnusedKey) { |
| Language lang = static_cast<Language>(key_[sub]); |
| fprintf(f, "[%2d] %3s %6dB %5dp %4dR,\n", sub, LanguageCode(lang), |
| value_[sub], score_[sub], reliability_[sub]); |
| } |
| } |
| fprintf(f, " %d chunks scored<br>\n", incr_count_); |
| } |
| |
| } // End namespace CLD2 |
| |