internal/getonescriptspan.h - external/cld2 - Git at Google

 // Copyright 2013 Google Inc. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 //
 // Author: dsites@google.com (Dick Sites)
 //


 #ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
 #define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_

 #include "integral_types.h"
 #include "langspan.h"
 #include "offsetmap.h"

 namespace CLD2 {

 static const int kMaxScriptBuffer = 40960;
 static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
 static const int kMaxScriptBytes = kMaxScriptBuffer - 32;   // Leave some room
 static const int kWithinScriptTail = 32;    // Stop at word space in last
                                             // N bytes of script buffer


 static inline bool IsContinuationByte(char c) {
   return static_cast<signed char>(c) < -64;
 }

 // Gets lscript number for letters; always returns
 //   0 (common script) for non-letters
 int GetUTF8LetterScriptNum(const char* src);

 // Update src pointer to point to next quadgram, +2..+5
 // Looks at src[0..4]
 const char* AdvanceQuad(const char* src);


 class ScriptScanner {
  public:
   ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
   ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
                 bool any_text, bool any_script);
   ~ScriptScanner();

   // Copy next run of same-script non-tag letters to buffer [NUL terminated]
   bool GetOneScriptSpan(LangSpan* span);

   // Force Latin and Cyrillic scripts to be lowercase
   void LowerScriptSpan(LangSpan* span);

   // Copy next run of same-script non-tag letters to buffer [NUL terminated]
   // Force Latin and Cyrillic scripts to be lowercase
   bool GetOneScriptSpanLower(LangSpan* span);

   // Copy next run of non-tag characters to buffer [NUL terminated]
   // This just removes tags and removes entities
   // Buffer has leading space
   bool GetOneTextSpan(LangSpan* span);

   // Maps byte offset in most recent GetOneScriptSpan/Lower
   // span->text [0..text_bytes] into an additional byte offset from
   // span->offset, to get back to corresponding text in the original
   // input buffer.
   // text_offset must be the first byte
   // of a UTF-8 character, or just beyond the last character. Normally this
   // routine is called with the first byte of an interesting range and
   // again with the first byte of the following range.
   int MapBack(int text_offset);

   const char* GetBufferStart() {return start_byte_;};

  private:
   // Skip over tags and non-letters
   int SkipToFrontOfSpan(const char* src, int len, int* script);

   const char* start_byte_;        // Starting byte of buffer to scan
   const char* next_byte_;         // First unscanned byte
   const char* next_byte_limit_;   // Last byte + 1
   int byte_length_;               // Bytes left: next_byte_limit_ - next_byte_

   bool is_plain_text_;            // true fo text, false for HTML
   char* script_buffer_;           // Holds text with expanded entities
   char* script_buffer_lower_;     // Holds lowercased text
   bool letters_marks_only_;       // To distinguish scriptspan of one
                                   // letters/marks vs. any mixture of text
   bool one_script_only_;          // To distinguish scriptspan of one
                                   // script vs. any mixture of scripts
   int exit_state_;                // For tag parser kTagParseTbl_0, based
                                   // on letters_marks_only_
  public :
   // Expose for debugging
   OffsetMap map2original_;    // map from script_buffer_ to buffer
   OffsetMap map2uplow_;       // map from script_buffer_lower_ to script_buffer_
 };

 }  // namespace CLD2

 #endif  // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
	// Copyright 2013 Google Inc. All Rights Reserved.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	//
	// Author: dsites@google.com (Dick Sites)
	//


	#ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
	#define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_

	#include "integral_types.h"
	#include "langspan.h"
	#include "offsetmap.h"

	namespace CLD2 {

	static const int kMaxScriptBuffer = 40960;
	static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
	static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
	static const int kWithinScriptTail = 32; // Stop at word space in last
	// N bytes of script buffer


	static inline bool IsContinuationByte(char c) {
	return static_cast<signed char>(c) < -64;
	}

	// Gets lscript number for letters; always returns
	// 0 (common script) for non-letters
	int GetUTF8LetterScriptNum(const char* src);

	// Update src pointer to point to next quadgram, +2..+5
	// Looks at src[0..4]
	const char* AdvanceQuad(const char* src);


	class ScriptScanner {
	public:
	ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
	ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
	bool any_text, bool any_script);
	~ScriptScanner();

	// Copy next run of same-script non-tag letters to buffer [NUL terminated]
	bool GetOneScriptSpan(LangSpan* span);

	// Force Latin and Cyrillic scripts to be lowercase
	void LowerScriptSpan(LangSpan* span);

	// Copy next run of same-script non-tag letters to buffer [NUL terminated]
	// Force Latin and Cyrillic scripts to be lowercase
	bool GetOneScriptSpanLower(LangSpan* span);

	// Copy next run of non-tag characters to buffer [NUL terminated]
	// This just removes tags and removes entities
	// Buffer has leading space
	bool GetOneTextSpan(LangSpan* span);

	// Maps byte offset in most recent GetOneScriptSpan/Lower
	// span->text [0..text_bytes] into an additional byte offset from
	// span->offset, to get back to corresponding text in the original
	// input buffer.
	// text_offset must be the first byte
	// of a UTF-8 character, or just beyond the last character. Normally this
	// routine is called with the first byte of an interesting range and
	// again with the first byte of the following range.
	int MapBack(int text_offset);

	const char* GetBufferStart() {return start_byte_;};

	private:
	// Skip over tags and non-letters
	int SkipToFrontOfSpan(const char* src, int len, int* script);

	const char* start_byte_; // Starting byte of buffer to scan
	const char* next_byte_; // First unscanned byte
	const char* next_byte_limit_; // Last byte + 1
	int byte_length_; // Bytes left: next_byte_limit_ - next_byte_

	bool is_plain_text_; // true fo text, false for HTML
	char* script_buffer_; // Holds text with expanded entities
	char* script_buffer_lower_; // Holds lowercased text
	bool letters_marks_only_; // To distinguish scriptspan of one
	// letters/marks vs. any mixture of text
	bool one_script_only_; // To distinguish scriptspan of one
	// script vs. any mixture of scripts
	int exit_state_; // For tag parser kTagParseTbl_0, based
	// on letters_marks_only_
	public :
	// Expose for debugging
	OffsetMap map2original_; // map from script_buffer_ to buffer
	OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_
	};

	} // namespace CLD2

	#endif // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_