From: Kentaro Hayashi Date: Tue, 20 Jun 2023 08:12:58 +0000 (+0900) Subject: Fixes build test errors in big-endian machines X-Git-Tag: archive/raspbian/0.2.0-1+rpi1^2^2~1 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=52d07893ea10046045b11fbd76fcf32e591ab2bd;p=sentencepiece.git Fixes build test errors in big-endian machines Author: Taku Kudo Origin: https://github.com/google/sentencepiece/commit/827591a0c552f2187aac8b8e0f999e8ff31aad81.patch Forwarded: not-needed Gbp-Pq: Name fix-ftbfs-big-endian.patch --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b3af04..a2f0f77 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,11 @@ option(SPM_NO_THREADLOCAL "Disable thread_local operator" OFF) option(SPM_USE_BUILTIN_PROTOBUF "Use built-in protobuf" ON) option(SPM_USE_EXTERNAL_ABSL "Use external abseil" OFF) option(SPM_ENABLE_MSVC_MT_BUILD, "Use /MT flag in MSVC build" OFF) +option(SPM_CROSS_SYSTEM_PROCESSOR, "Override system processor" "") + +if (SPM_CROSS_SYSTEM_PROCESSOR) + set(CMAKE_SYSTEM_PROCESSOR ${SPM_CROSS_SYSTEM_PROCESSOR}) +endif() # Disable shared build on windows if(WIN32) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 077d37d..09ef57f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -208,6 +208,7 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "mips") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "m68k") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc") OR + (${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "sh4")) find_library(ATOMIC_LIB NAMES atomic libatomic.so libatomic.so.1) @@ -217,6 +218,7 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR endif() endif() + if (SPM_ENABLE_SHARED) add_library(sentencepiece SHARED ${SPM_SRCS}) add_library(sentencepiece_train SHARED ${SPM_TRAIN_SRCS}) diff --git a/src/common.h b/src/common.h index ef5546d..b38b3f7 100644 --- a/src/common.h +++ b/src/common.h @@ 
-79,10 +79,6 @@ char (&ArraySizeHelper(const T (&array)[N]))[N]; #endif #endif -#ifdef IS_BIG_ENDIAN -inline uint32 Swap32(uint32 x) { return __builtin_bswap32(x); } -#endif - namespace sentencepiece { #ifdef OS_WIN namespace win32 { @@ -90,6 +86,12 @@ std::wstring Utf8ToWide(const absl::string_view input); } // namespace win32 #endif +#ifdef IS_BIG_ENDIAN +namespace util { +inline uint32 Swap32(uint32 x) { return __builtin_bswap32(x); } +} // namespace util +#endif + namespace error { void Abort(); diff --git a/src/normalizer.cc b/src/normalizer.cc index 2ab8084..53e43c4 100644 --- a/src/normalizer.cc +++ b/src/normalizer.cc @@ -260,14 +260,14 @@ std::string Normalizer::EncodePrecompiledCharsMap( std::string blob; blob.append(string_util::EncodePOD<uint32>(trie_blob.size())); blob.append(trie_blob.data(), trie_blob.size()); - blob.append(normalized.data(), normalized.size()); #ifdef IS_BIG_ENDIAN uint32 *data = reinterpret_cast<uint32 *>(const_cast<char *>(blob.data())); - for (int i = 0; i <= trie_blob.size() / 4; ++i) - data[i] = util::Swap32(data[i]); + for (int i = 0; i < blob.size() / 4; ++i) data[i] = util::Swap32(data[i]); #endif + blob.append(normalized.data(), normalized.size()); + return blob; } @@ -279,8 +279,7 @@ util::Status Normalizer::DecodePrecompiledCharsMap( if (blob.size() <= sizeof(trie_blob_size) || !string_util::DecodePOD<uint32>( absl::string_view(blob.data(), sizeof(trie_blob_size)), &trie_blob_size) || - trie_blob_size >= blob.size()) { + &trie_blob_size)) { return util::InternalError("Blob for normalization rule is broken."); } @@ -288,15 +287,17 @@ util::Status Normalizer::DecodePrecompiledCharsMap( trie_blob_size = util::Swap32(trie_blob_size); #endif - if (trie_blob_size >= blob.size()) + if (trie_blob_size >= blob.size()) { return util::InternalError("Trie data size exceeds the input blob size."); + } blob.remove_prefix(sizeof(trie_blob_size)); #ifdef IS_BIG_ENDIAN + CHECK_OR_RETURN(buffer); buffer->assign(blob.data(), trie_blob_size); uint32 *data =
reinterpret_cast<uint32 *>(const_cast<char *>(buffer->data())); - for (int i = 0; i < trie_blob_size / 4; ++i) data[i] = util::Swap32(data[i]); + for (int i = 0; i < buffer->size() / 4; ++i) data[i] = util::Swap32(data[i]); *trie_blob = absl::string_view(buffer->data(), trie_blob_size); #else *trie_blob = absl::string_view(blob.data(), trie_blob_size); diff --git a/src/unigram_model_trainer_test.cc b/src/unigram_model_trainer_test.cc index 9d2c526..31da90b 100644 --- a/src/unigram_model_trainer_test.cc +++ b/src/unigram_model_trainer_test.cc @@ -106,6 +106,7 @@ TrainerResult RunTrainer(const std::vector<std::string>& input, int size, TrainerResult res; res.seed_pieces_and_probs = seed_pieces; + std::sort(pieces.begin(), pieces.end()); res.sentence_pieces = absl::StrJoin(pieces, " "); return res; } @@ -119,10 +120,8 @@ TEST(UnigramTrainerTest, BasicTest) { // Check seed pieces. EXPECT_EQ(27, res.seed_pieces_and_probs.size()); - LOG(INFO) << "[" << res.sentence_pieces << "]"; - // Check final pieces. - EXPECT_EQ("i a n y m l e apple ve O P r g t an v ▁ b A le ▁an p d h", + EXPECT_EQ("A O P a an apple b d e g h i l le m n p r t v ve y ▁ ▁an", res.sentence_pieces); }