From: Alexandre Mestiashvili Date: Thu, 11 Apr 2013 12:53:20 +0000 (+0200) Subject: Imported Upstream version 0.35 X-Git-Tag: archive/raspbian/4.017+ds-1+rpi1~1^2~3^2~35 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=a2284d2ed24a26a60448f6a82421289138563709;p=libsereal-encoder-perl.git Imported Upstream version 0.35 --- diff --git a/Changes b/Changes index 8c9ac99..27eee8e 100644 --- a/Changes +++ b/Changes @@ -1,5 +1,19 @@ Revision history for Perl extension Sereal-Encoder +0.35 Mon Apr 1 11:50 2013 (AMS time) + - Add new no_bless_objects option from Simon Bertrang. + +0.34 Sat Mar 23 18:59:18 2013 (AMS time) + - Fixup Manifest + +0.33 Sun Feb 17 17:26 2013 (AMS time) + - Fix problem with hv_backrefs (Issue #27) + +0.32 Sun Feb 17 15:06 2013 (AMS time) + - Add "dedupe_strings" option, which will make + the encoder do extra work to dedupe string values + in the serialized output. + 0.31 Sun Feb 17 15:06 2013 (AMS time) - Daniel Dragan spent a bunch of time digging into the weird problems we were having with Snappy diff --git a/MANIFEST b/MANIFEST index ed9d0ad..bb55adb 100644 --- a/MANIFEST +++ b/MANIFEST @@ -17,6 +17,7 @@ MANIFEST This list of files ppport.h ptable.h snappy/csnappy.h +snappy/csnappy_compat.h snappy/csnappy_compress.c snappy/csnappy_decompress.c snappy/csnappy_internal.h @@ -32,7 +33,9 @@ t/002_constants.t t/003_ptable.t t/010_desperate.t t/020_sort_keys.t +t/021_sort_keys_option.t t/100_roundtrip.t +t/110_nobless.t t/160_recursion.t t/200_bulk.t t/300_fail.t diff --git a/META.yml b/META.yml index 21a9750..5b46ef7 100644 --- a/META.yml +++ b/META.yml @@ -1,6 +1,6 @@ --- #YAML:1.0 name: Sereal-Encoder -version: 0.31 +version: 0.35 abstract: Fast, compact, powerful binary serialization author: - Steffen Mueller , Yves Orton @@ -22,6 +22,7 @@ requires: perl: 5.008 XSLoader: 0 resources: + bugtracker: https://github.com/Sereal/Sereal/issues repository: git://github.com/Sereal/Sereal.git no_index: directory: diff --git a/Makefile.PL 
b/Makefile.PL index e2b0db0..ff2f5e7 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -50,7 +50,8 @@ WriteMakefile1( MIN_PERL_VERSION => '5.008', META_MERGE => { resources => { - repository => 'git://github.com/Sereal/Sereal.git' + repository => 'git://github.com/Sereal/Sereal.git', + bugtracker => 'https://github.com/Sereal/Sereal/issues', }, }, BUILD_REQUIRES => { diff --git a/lib/Sereal/Encoder.pm b/lib/Sereal/Encoder.pm index b931af4..54fd2f3 100644 --- a/lib/Sereal/Encoder.pm +++ b/lib/Sereal/Encoder.pm @@ -5,10 +5,10 @@ use warnings; use Carp qw/croak/; use XSLoader; -our $VERSION = '0.31'; # Don't forget to update the TestCompat set for testing against installed decoders! +our $VERSION = '0.35'; # Don't forget to update the TestCompat set for testing against installed decoders! # not for public consumption, just for testing. -my $TestCompat = [map sprintf("%.2f", $_/100), reverse(23..31)]; # compat with 0.23 to ... +my $TestCompat = [ map sprintf("%.2f", $_/100), reverse( 23 .. int($VERSION * 100) ) ]; # compat with 0.23 to ... sub _test_compat {return(@$TestCompat, $VERSION)} use Exporter 'import'; @@ -115,6 +115,17 @@ This can be important because blessed references can mean executing a destructor on a remote system or generally executing code based on data. +See also C<no_bless_objects> to skip the blessing of objects. +When both flags are set, C<croak_on_bless> has a higher precedence than +C<no_bless_objects>. + +=head3 no_bless_objects + +If this option is set, then the encoder will serialize blessed references +without the bless information and provide plain data structures instead. + +See also the C<croak_on_bless> option above for more details. + =head3 undef_unknown If set, unknown/unsupported data structures will be encoded as C<undef> @@ -181,6 +192,23 @@ the memory. See L for why you might want to use this, and for the various caveats involved. +=head3 dedupe_strings + +If true Sereal will use a hash to dedupe strings during serialization.
This +has a performance and memory penalty so it defaults to off, but data structures +with many duplicated strings will see a significant reduction in the size of +the encoded form. Currently only strings longer than 3 characters will be +deduped, however this may change in the future. + +Note that Sereal will perform certain types of deduping automatically even +without this option. In particular class names and hash keys are deduped +regardless of this option. Only enable this if you have good reason to +believe that there are many duplicated strings as values in your data +structure. + +Use of this option does not require an upgraded decoder. The deduping +is performed in such a way that older decoders should handle it just fine. + =head1 INSTANCE METHODS =head2 encode diff --git a/ptable.h b/ptable.h index f015fb2..a51ffcd 100644 --- a/ptable.h +++ b/ptable.h @@ -9,15 +9,16 @@ #ifndef PTABLE_H_ #define PTABLE_H_ -#include "ppport.h" +#include #include +#include "ppport.h" #if PTRSIZE == 8 /* * This is one of Thomas Wang's hash functions for 64-bit integers from: * http://www.concentric.net/~Ttwang/tech/inthash.htm */ - STATIC U32 ptr_hash(PTRV u) { + SRL_STATIC_INLINE U32 ptr_hash(PTRV u) { u = (~u) + (u << 18); u = u ^ (u >> 31); u = u * 21; @@ -31,7 +32,7 @@ * This is one of Bob Jenkins' hash functions for 32-bit integers * from: http://burtleburtle.net/bob/hash/integer.html */ - STATIC U32 ptr_hash(PTRV u) { + SRL_STATIC_INLINE U32 ptr_hash(PTRV u) { u = (u + 0x7ed55d16) + (u << 12); u = (u ^ 0xc761c23c) ^ (u >> 19); u = (u + 0x165667b1) + (u << 5); diff --git a/snappy/csnappy_compat.h b/snappy/csnappy_compat.h new file mode 100644 index 0000000..3c9e068 --- /dev/null +++ b/snappy/csnappy_compat.h @@ -0,0 +1,16 @@ +#ifndef CSNAPPY_COMPAT_H + +/* This file was added to Sereal to attempt some MSVC compatibility, + * but is at best a band-aid. And done without a lot of experience + * in whatever subset of C99 MSVC supports.
+ */ + +#ifndef INLINE +# if defined(_MSC_VER) +# define INLINE __inline +# else +# define INLINE inline +# endif +#endif + +#endif diff --git a/snappy/csnappy_compress.c b/snappy/csnappy_compress.c index 7204043..4f9c07a 100644 --- a/snappy/csnappy_compress.c +++ b/snappy/csnappy_compress.c @@ -30,6 +30,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. File modified for the Linux Kernel by Zeev Tarantov + +File modified for Sereal by +Steffen Mueller */ #include "csnappy_internal.h" @@ -40,7 +43,7 @@ Zeev Tarantov #include "csnappy.h" -static inline char* +static INLINE char* encode_varint32(char *sptr, uint32_t v) { uint8_t* ptr = (uint8_t *)sptr; @@ -222,12 +225,12 @@ the_end: * input. Of course, it doesn't hurt if the hash function is reasonably fast * either, as it gets called a lot. */ -static inline uint32_t HashBytes(uint32_t bytes, int shift) +static INLINE uint32_t HashBytes(uint32_t bytes, int shift) { uint32_t kMul = 0x1e35a7bd; return (bytes * kMul) >> shift; } -static inline uint32_t Hash(const char *p, int shift) +static INLINE uint32_t Hash(const char *p, int shift) { return HashBytes(UNALIGNED_LOAD32(p), shift); } @@ -247,7 +250,7 @@ static inline uint32_t Hash(const char *p, int shift) * x86_64 is little endian. */ #if defined(__x86_64__) -static inline int +static INLINE int FindMatchLength(const char *s1, const char *s2, const char *s2_limit) { uint64_t x; @@ -291,7 +294,7 @@ FindMatchLength(const char *s1, const char *s2, const char *s2_limit) return matched; } #else /* !defined(__x86_64__) */ -static inline int +static INLINE int FindMatchLength(const char *s1, const char *s2, const char *s2_limit) { /* Implementation based on the x86-64 version, above. 
*/ @@ -326,7 +329,7 @@ FindMatchLength(const char *s1, const char *s2, const char *s2_limit) #endif /* !defined(__x86_64__) */ -static inline char* +static INLINE char* EmitLiteral(char *op, const char *literal, int len, int allow_fast_path) { int n = len - 1; /* Zero-length literals are disallowed */ @@ -367,7 +370,7 @@ EmitLiteral(char *op, const char *literal, int len, int allow_fast_path) return op + len; } -static inline char* +static INLINE char* EmitCopyLessThan64(char *op, int offset, int len) { DCHECK_LE(len, 64); @@ -389,7 +392,7 @@ EmitCopyLessThan64(char *op, int offset, int len) return op; } -static inline char* +static INLINE char* EmitCopy(char *op, int offset, int len) { /* Emit 64 byte copies but make sure to keep at least four bytes @@ -420,7 +423,7 @@ empirically found that overlapping loads such as are slower than UNALIGNED_LOAD64(p) followed by shifts and casts to uint32. We have different versions for 64- and 32-bit; ideally we would avoid the -two functions and just inline the UNALIGNED_LOAD64 call into +two functions and just INLINE the UNALIGNED_LOAD64 call into GetUint32AtOffset, but GCC (at least not as of 4.6) is seemingly not clever enough to avoid loading the value multiple times then. For 64-bit, the load is done when GetEightBytesAt() is called, whereas for 32-bit, the load is @@ -431,11 +434,11 @@ done at GetUint32AtOffset() time. 
typedef uint64_t EightBytesReference; -static inline EightBytesReference GetEightBytesAt(const char* ptr) { +static INLINE EightBytesReference GetEightBytesAt(const char* ptr) { return UNALIGNED_LOAD64(ptr); } -static inline uint32_t GetUint32AtOffset(uint64_t v, int offset) { +static INLINE uint32_t GetUint32AtOffset(uint64_t v, int offset) { DCHECK_GE(offset, 0); DCHECK_LE(offset, 4); #ifdef __LITTLE_ENDIAN @@ -449,11 +452,11 @@ static inline uint32_t GetUint32AtOffset(uint64_t v, int offset) { typedef const char* EightBytesReference; -static inline EightBytesReference GetEightBytesAt(const char* ptr) { +static INLINE EightBytesReference GetEightBytesAt(const char* ptr) { return ptr; } -static inline uint32_t GetUint32AtOffset(const char* v, int offset) { +static INLINE uint32_t GetUint32AtOffset(const char* v, int offset) { DCHECK_GE(offset, 0); DCHECK_LE(offset, 4); return UNALIGNED_LOAD32(v + offset); diff --git a/snappy/csnappy_decompress.c b/snappy/csnappy_decompress.c index 5bb411d..ac6cb10 100644 --- a/snappy/csnappy_decompress.c +++ b/snappy/csnappy_decompress.c @@ -30,6 +30,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. File modified for the Linux Kernel by Zeev Tarantov + +File modified for Sereal by +Steffen Mueller */ #include "csnappy_internal.h" @@ -194,7 +197,7 @@ static const uint16_t char_table[256] = { * Note that this does not match the semantics of either memcpy() * or memmove(). */ -static inline void IncrementalCopy(const char *src, char *op, int len) +static INLINE void IncrementalCopy(const char *src, char *op, int len) { DCHECK_GT(len, 0); do { @@ -235,7 +238,7 @@ static inline void IncrementalCopy(const char *src, char *op, int len) * position 1. Thus, ten excess bytes. 
*/ static const int kMaxIncrementCopyOverflow = 10; -static inline void IncrementalCopyFastPath(const char *src, char *op, int len) +static INLINE void IncrementalCopyFastPath(const char *src, char *op, int len) { while (op - src < 8) { UnalignedCopy64(src, op); @@ -258,7 +261,7 @@ struct SnappyArrayWriter { char *op_limit; }; -static inline int +static INLINE int SAW__AppendFastPath(struct SnappyArrayWriter *this, const char *ip, uint32_t len) { @@ -276,7 +279,7 @@ SAW__AppendFastPath(struct SnappyArrayWriter *this, return CSNAPPY_E_OK; } -static inline int +static INLINE int SAW__Append(struct SnappyArrayWriter *this, const char *ip, uint32_t len) { @@ -289,7 +292,7 @@ SAW__Append(struct SnappyArrayWriter *this, return CSNAPPY_E_OK; } -static inline int +static INLINE int SAW__AppendFromSelf(struct SnappyArrayWriter *this, uint32_t offset, uint32_t len) { diff --git a/snappy/csnappy_internal.h b/snappy/csnappy_internal.h index 5c73da1..dd24910 100644 --- a/snappy/csnappy_internal.h +++ b/snappy/csnappy_internal.h @@ -31,11 +31,16 @@ Various stubs for the open-source version of Snappy. 
File modified for the Linux Kernel by Zeev Tarantov + +File modified for Sereal by +Steffen Mueller */ #ifndef CSNAPPY_INTERNAL_H_ #define CSNAPPY_INTERNAL_H_ +#include "csnappy_compat.h" + #ifndef __KERNEL__ #include "csnappy_internal_userspace.h" #include @@ -85,7 +90,7 @@ Zeev Tarantov defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) || defined(__ARMV6__) || \ defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) -static inline void UnalignedCopy64(const void *src, void *dst) { +static INLINE void UnalignedCopy64(const void *src, void *dst) { #if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || ARCH_ARM_HAVE_UNALIGNED if ((sizeof(void *) == 8) || (sizeof(long) == 8)) { UNALIGNED_STORE64(dst, UNALIGNED_LOAD64(src)); @@ -114,7 +119,7 @@ static inline void UnalignedCopy64(const void *src, void *dst) { #if defined(__arm__) #if ARCH_ARM_HAVE_UNALIGNED - static inline uint32_t get_unaligned_le(const void *p, uint32_t n) + static INLINE uint32_t get_unaligned_le(const void *p, uint32_t n) { uint32_t wordmask = (1U << (8 * n)) - 1; return get_unaligned_le32(p) & wordmask; @@ -124,7 +129,7 @@ static inline void UnalignedCopy64(const void *src, void *dst) { #define get_unaligned_le get_unaligned_le_armv5 #endif #else - static inline uint32_t get_unaligned_le(const void *p, uint32_t n) + static INLINE uint32_t get_unaligned_le(const void *p, uint32_t n) { /* Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits */ static const uint32_t wordmask[] = { diff --git a/snappy/csnappy_internal_userspace.h b/snappy/csnappy_internal_userspace.h index 81c9468..4d06d86 100644 --- a/snappy/csnappy_internal_userspace.h +++ b/snappy/csnappy_internal_userspace.h @@ -31,6 +31,9 @@ Various stubs for the open-source version of Snappy. 
File modified by Zeev Tarantov + +File modified for Sereal by +Steffen Mueller */ #ifndef CSNAPPY_INTERNAL_USERSPACE_H_ @@ -94,6 +97,8 @@ typedef __int32 int32_t; /* Sereal specific change, see csnappy_decompress.c(271 #define DCHECK(cond) #endif +#include "csnappy_compat.h" + /* Uses code from http://code.google.com/p/exfat/source/browse/trunk/libexfat/byteorder.h with 3-clause BSD license instead of GPL, with permission from: @@ -202,13 +207,13 @@ Albert Lee struct una_u64 { uint64_t x; }; #pragma pack() -static inline uint64_t UNALIGNED_LOAD64(const void *p) +static INLINE uint64_t UNALIGNED_LOAD64(const void *p) { const struct una_u64 *ptr = (const struct una_u64 *)p; return ptr->x; } -static inline void UNALIGNED_STORE64(void *p, uint64_t v) +static INLINE void UNALIGNED_STORE64(void *p, uint64_t v) { struct una_u64 *ptr = (struct una_u64 *)p; ptr->x = v; @@ -222,37 +227,37 @@ struct una_u32 { uint32_t x; }; struct una_u64 { uint64_t x; }; #pragma pack() -static inline uint16_t UNALIGNED_LOAD16(const void *p) +static INLINE uint16_t UNALIGNED_LOAD16(const void *p) { const struct una_u16 *ptr = (const struct una_u16 *)p; return ptr->x; } -static inline uint32_t UNALIGNED_LOAD32(const void *p) +static INLINE uint32_t UNALIGNED_LOAD32(const void *p) { const struct una_u32 *ptr = (const struct una_u32 *)p; return ptr->x; } -static inline uint64_t UNALIGNED_LOAD64(const void *p) +static INLINE uint64_t UNALIGNED_LOAD64(const void *p) { const struct una_u64 *ptr = (const struct una_u64 *)p; return ptr->x; } -static inline void UNALIGNED_STORE16(void *p, uint16_t v) +static INLINE void UNALIGNED_STORE16(void *p, uint16_t v) { struct una_u16 *ptr = (struct una_u16 *)p; ptr->x = v; } -static inline void UNALIGNED_STORE32(void *p, uint32_t v) +static INLINE void UNALIGNED_STORE32(void *p, uint32_t v) { struct una_u32 *ptr = (struct una_u32 *)p; ptr->x = v; } -static inline void UNALIGNED_STORE64(void *p, uint64_t v) +static INLINE void UNALIGNED_STORE64(void *p, 
uint64_t v) { struct una_u64 *ptr = (struct una_u64 *)p; ptr->x = v; @@ -265,21 +270,21 @@ static inline void UNALIGNED_STORE64(void *p, uint64_t v) #define get_unaligned_le32(p) UNALIGNED_LOAD32(p) #define put_unaligned_le16(v, p) UNALIGNED_STORE16(p, v) #elif __BYTE_ORDER == __BIG_ENDIAN -static inline uint32_t get_unaligned_le32(const void *p) +static INLINE uint32_t get_unaligned_le32(const void *p) { return bswap_32(UNALIGNED_LOAD32(p)); } -static inline void put_unaligned_le16(uint16_t val, void *p) +static INLINE void put_unaligned_le16(uint16_t val, void *p) { UNALIGNED_STORE16(p, bswap_16(val)); } #else -static inline uint32_t get_unaligned_le32(const void *p) +static INLINE uint32_t get_unaligned_le32(const void *p) { const uint8_t *b = (const uint8_t *)p; return b[0] | (b[1] << 8) | (b[2] << 16) | (b[3] << 24); } -static inline void put_unaligned_le16(uint16_t val, void *p) +static INLINE void put_unaligned_le16(uint16_t val, void *p) { uint8_t *b = (uint8_t *)p; b[0] = val & 255; @@ -290,19 +295,19 @@ static inline void put_unaligned_le16(uint16_t val, void *p) #if defined(HAVE_BUILTIN_CTZ) -static inline int FindLSBSetNonZero(uint32_t n) +static INLINE int FindLSBSetNonZero(uint32_t n) { return __builtin_ctz(n); } -static inline int FindLSBSetNonZero64(uint64_t n) +static INLINE int FindLSBSetNonZero64(uint64_t n) { return __builtin_ctzll(n); } #else /* Portable versions. */ -static inline int FindLSBSetNonZero(uint32_t n) +static INLINE int FindLSBSetNonZero(uint32_t n) { int rc = 31, i, shift; uint32_t x; @@ -318,7 +323,7 @@ static inline int FindLSBSetNonZero(uint32_t n) } /* FindLSBSetNonZero64() is defined in terms of FindLSBSetNonZero(). 
*/ -static inline int FindLSBSetNonZero64(uint64_t n) +static INLINE int FindLSBSetNonZero64(uint64_t n) { const uint32_t bottombits = (uint32_t)n; if (bottombits == 0) { diff --git a/srl_encoder.c b/srl_encoder.c index c150016..3ec6520 100644 --- a/srl_encoder.c +++ b/srl_encoder.c @@ -40,6 +40,9 @@ extern "C" { #define MY_CAN_FIND_PLACEHOLDERS #define HAS_SV2OBJ #endif +#if (PERL_VERSION >= 10) +#define HAS_HV_BACKREFS +#endif #include "srl_protocol.h" #include "srl_encoder.h" @@ -78,7 +81,8 @@ extern "C" { /* some static function declarations */ static void srl_dump_sv(pTHX_ srl_encoder_t *enc, SV *src); -static void srl_dump_pv(pTHX_ srl_encoder_t *enc, const char* src, STRLEN src_len, int is_utf8); +SRL_STATIC_INLINE void srl_dump_svpv(pTHX_ srl_encoder_t *enc, SV *src); +SRL_STATIC_INLINE void srl_dump_pv(pTHX_ srl_encoder_t *enc, const char* src, STRLEN src_len, int is_utf8); SRL_STATIC_INLINE void srl_fixup_weakrefs(pTHX_ srl_encoder_t *enc); SRL_STATIC_INLINE void srl_dump_av(pTHX_ srl_encoder_t *enc, AV *src, U32 refcnt); SRL_STATIC_INLINE void srl_dump_hv(pTHX_ srl_encoder_t *enc, HV *src, U32 refcnt); @@ -89,8 +93,13 @@ SRL_STATIC_INLINE void srl_dump_classname(pTHX_ srl_encoder_t *enc, SV *src); SRL_STATIC_INLINE PTABLE_t *srl_init_string_hash(srl_encoder_t *enc); SRL_STATIC_INLINE PTABLE_t *srl_init_ref_hash(srl_encoder_t *enc); SRL_STATIC_INLINE PTABLE_t *srl_init_weak_hash(srl_encoder_t *enc); +SRL_STATIC_INLINE HV *srl_init_string_deduper_hv(pTHX_ srl_encoder_t *enc); + +#define SRL_GET_STR_DEDUPER_HV(enc) ( (enc)->string_deduper_hv == NULL \ + ? srl_init_string_deduper_hv(aTHX_ enc) \ + : (enc)->string_deduper_hv ) -#define SRL_GET_STR_SEENHASH(enc) ( (enc)->str_seenhash == NULL \ +#define SRL_GET_STR_PTR_SEENHASH(enc) ( (enc)->str_seenhash == NULL \ ? 
srl_init_string_hash(enc) \ : (enc)->str_seenhash ) @@ -112,9 +121,7 @@ SRL_STATIC_INLINE PTABLE_t *srl_init_weak_hash(srl_encoder_t *enc); !SvROK((src)) \ ) { \ if (SvPOKp((src))) { \ - STRLEN len; \ - char *str = SvPV((src), len); \ - srl_dump_pv(aTHX_ (enc), str, len, SvUTF8((src))); \ + srl_dump_svpv(aTHX_ (enc), (src)); \ } \ else \ if (SvNOKp((src))) { \ @@ -170,6 +177,8 @@ srl_clear_encoder(pTHX_ srl_encoder_t *enc) PTABLE_clear(enc->str_seenhash); if (enc->weak_seenhash != NULL) PTABLE_clear(enc->weak_seenhash); + if (enc->string_deduper_hv != NULL) + hv_clear(enc->string_deduper_hv); enc->pos = enc->buf_start; SRL_ENC_RESET_OPER_FLAG(enc, SRL_OF_ENCODER_DIRTY); @@ -186,6 +195,8 @@ srl_destroy_encoder(pTHX_ srl_encoder_t *enc) PTABLE_free(enc->str_seenhash); if (enc->weak_seenhash != NULL) PTABLE_free(enc->weak_seenhash); + if (enc->string_deduper_hv != NULL) + SvREFCNT_dec(enc->string_deduper_hv); Safefree(enc); } @@ -215,6 +226,7 @@ srl_empty_encoder_struct(pTHX) enc->str_seenhash = NULL; enc->ref_seenhash = NULL; enc->snappy_workmem = NULL; + enc->string_deduper_hv = NULL; return enc; } @@ -241,6 +253,10 @@ srl_build_encoder_struct(pTHX_ HV *opt) if ( svp && SvTRUE(*svp) ) enc->flags |= SRL_F_CROAK_ON_BLESS; + svp = hv_fetchs(opt, "no_bless_objects", 0); + if ( svp && SvTRUE(*svp) ) + enc->flags |= SRL_F_NO_BLESS_OBJECTS; + svp = hv_fetchs(opt, "snappy", 0); if ( svp && SvTRUE(*svp) ) enc->flags |= SRL_F_COMPRESS_SNAPPY; @@ -257,10 +273,14 @@ srl_build_encoder_struct(pTHX_ HV *opt) svp = hv_fetchs(opt, "sort_keys", 0); if ( svp && SvTRUE(*svp) ) { - undef_unknown = 1; enc->flags |= SRL_F_SORT_KEYS; } + svp = hv_fetchs(opt, "dedupe_strings", 0); + if ( svp && SvTRUE(*svp) ) { + enc->flags |= SRL_F_DEDUPE_STRINGS; + } + svp = hv_fetchs(opt, "stringify_unknown", 0); if ( svp && SvTRUE(*svp) ) { if (expect_false( undef_unknown )) { @@ -328,6 +348,12 @@ srl_init_weak_hash(srl_encoder_t *enc) return enc->weak_seenhash; } +SRL_STATIC_INLINE HV * 
+srl_init_string_deduper_hv(pTHX_ srl_encoder_t *enc) +{ + enc->string_deduper_hv = newHV(); + return enc->string_deduper_hv; +} void srl_write_header(pTHX_ srl_encoder_t *enc) @@ -447,7 +473,7 @@ SRL_STATIC_INLINE void srl_dump_classname(pTHX_ srl_encoder_t *enc, SV *src) { const HV *stash = SvSTASH(src); - PTABLE_t *string_seenhash = SRL_GET_STR_SEENHASH(enc); + PTABLE_t *string_seenhash = SRL_GET_STR_PTR_SEENHASH(enc); const ptrdiff_t oldoffset = (ptrdiff_t)PTABLE_fetch(string_seenhash, (SV *)stash); if (oldoffset != 0) { @@ -946,7 +972,7 @@ srl_dump_hk(pTHX_ srl_encoder_t *enc, HE *src, const int share_keys) #endif ) { - PTABLE_t *string_seenhash = SRL_GET_STR_SEENHASH(enc); + PTABLE_t *string_seenhash = SRL_GET_STR_PTR_SEENHASH(enc); const ptrdiff_t oldoffset = (ptrdiff_t)PTABLE_fetch(string_seenhash, str); if (oldoffset != 0) { /* Issue COPY instead of literal hash key string */ @@ -971,8 +997,34 @@ srl_dump_hk(pTHX_ srl_encoder_t *enc, HE *src, const int share_keys) } } +SRL_STATIC_INLINE void +srl_dump_svpv(pTHX_ srl_encoder_t *enc, SV *src) +{ + STRLEN len; + const char const *str= SvPV(src, len); + if ( SRL_ENC_HAVE_OPTION(enc, SRL_F_DEDUPE_STRINGS) && len > 3 ) { + HV *string_deduper_hv= SRL_GET_STR_DEDUPER_HV(enc); + HE *dupe_offset_he= hv_fetch_ent(string_deduper_hv, src, 1, 0); + if (!dupe_offset_he) { + croak("out of memory (hv_fetch_ent returned NULL)"); + } else { + SV *ofs_sv= HeVAL(dupe_offset_he); + if (SvIOK(ofs_sv)) { + /* emit copy */ + srl_buf_cat_varint(aTHX_ enc, SRL_HDR_COPY, SvIV(ofs_sv)); + return; + } else if (SvUOK(ofs_sv)) { + srl_buf_cat_varint(aTHX_ enc, SRL_HDR_COPY, SvUV(ofs_sv)); + return; + } else { + sv_setuv(ofs_sv, (UV)BUF_POS_OFS(enc)); + } + } + } + srl_dump_pv(aTHX_ enc, str, len, SvUTF8(src)); +} -static void +SRL_STATIC_INLINE void srl_dump_pv(pTHX_ srl_encoder_t *enc, const char* src, STRLEN src_len, int is_utf8) { BUF_SIZE_ASSERT(enc, 1 + SRL_MAX_VARINT_LENGTH + src_len); /* overallocate a bit sometimes */ @@ -987,9 
+1039,6 @@ srl_dump_pv(pTHX_ srl_encoder_t *enc, const char* src, STRLEN src_len, int is_ut enc->pos += src_len; } - - - /* Dumps generic SVs and delegates * to more specialized functions for RVs, etc. */ /* TODO decide when to use the IV, when to use the PV, and when @@ -1010,6 +1059,7 @@ srl_dump_sv(pTHX_ srl_encoder_t *enc, SV *src) UV weakref_ofs= 0; /* preserved between loops */ SSize_t ref_rewrite_pos= 0; /* preserved between loops - note SSize_t is a perl define */ assert(src); + int nobless = SRL_ENC_HAVE_OPTION(enc, SRL_F_NO_BLESS_OBJECTS); if (++enc->recursion_depth == enc->max_recursion_depth) { croak("Hit maximum recursion depth (%lu), aborting serialization", @@ -1024,14 +1074,16 @@ redo_dump: DEBUG_ASSERT_BUF_SANE(enc); if ( SvMAGICAL(src) ) { SvGETMAGIC(src); -#ifdef Perl_hv_backreferences_p +#ifdef HAS_HV_BACKREFS if (svt != SVt_PVHV) #endif mg = mg_find(src, PERL_MAGIC_backref); } -#ifdef Perl_hv_backreferences_p - if (svt == SVt_PVHV) +#ifdef HAS_HV_BACKREFS + if (svt == SVt_PVHV) { backrefs= *Perl_hv_backreferences_p(aTHX_ (HV *)src); + if (DEBUGHACK) warn("backreferences %p", src); + } #endif if ( mg || backrefs ) { PTABLE_t *weak_seenhash= SRL_GET_WEAK_SEENHASH(enc); @@ -1088,7 +1140,7 @@ redo_dump: } if (weakref_ofs != 0) { sv_dump(src); - assert(weakref_ofs == 0); + croak("Corrupted weakref? weakref_ofs=0 (this should not happen)"); } if (SvPOKp(src)) { #if defined(MODERN_REGEXP) && !defined(REGEXP_NO_LONGER_POK) @@ -1099,11 +1151,7 @@ redo_dump: } else #endif - { - STRLEN len; - char *str = SvPV(src, len); - srl_dump_pv(aTHX_ enc, str, len, SvUTF8(src)); - } + srl_dump_svpv(aTHX_ enc, src); } else #if defined(MODERN_REGEXP) && defined(REGEXP_NO_LONGER_POK) @@ -1135,6 +1183,7 @@ redo_dump: } #endif if (SvWEAKREF(src)) { + if (DEBUGHACK) warn("Is weakref %p", src); weakref_ofs= BUF_POS_OFS(enc); srl_buf_cat_char(enc, SRL_HDR_WEAKEN); } @@ -1148,11 +1197,13 @@ redo_dump: } /* FIXME reuse/ref/... should INCLUDE the bless stuff. 
*/ /* Write bless operator with class name */ - srl_dump_classname(aTHX_ enc, referent); + if (!nobless) + srl_dump_classname(aTHX_ enc, referent); } srl_buf_cat_char(enc, SRL_HDR_REFN); refsv= src; src= referent; + if (DEBUGHACK) warn("Going to redo %p", src); goto redo_dump; } else diff --git a/srl_encoder.h b/srl_encoder.h index b8fad00..a1703d3 100644 --- a/srl_encoder.h +++ b/srl_encoder.h @@ -22,9 +22,10 @@ typedef struct { UV max_recursion_depth; /* Configurable limit on the number of recursive calls we're willing to make */ UV recursion_depth; /* current Perl-ref recursion depth */ - ptable_ptr ref_seenhash; /* ptr table for avoiding circular refs */ + ptable_ptr ref_seenhash; /* ptr table for avoiding circular refs */ ptable_ptr weak_seenhash; /* ptr table for avoiding dangling weakrefs */ - ptable_ptr str_seenhash; /* ptr table for issuing COPY commands */ + ptable_ptr str_seenhash; /* ptr table for issuing COPY commands based on PTRS (used for classnames and keys) */ + HV *string_deduper_hv; /* track strings we have seen before, by content */ void *snappy_workmem; /* lazily allocated if and only if using Snappy */ IV snappy_threshold; /* do not compress things smaller than this even if Snappy enabled */ @@ -50,36 +51,42 @@ void srl_dump_data_structure(pTHX_ srl_encoder_t *enc, SV *src); /* Will default to "on". If set, hash keys will be shared using COPY. * Corresponds to the inverse of constructor option "no_shared_hashkeys" */ -#define SRL_F_SHARED_HASHKEYS 1UL +#define SRL_F_SHARED_HASHKEYS 0x00001UL /* If set, then we're using the OO interface and we shouldn't destroy the * encoder struct during SAVEDESTRUCTOR_X time */ -#define SRL_F_REUSE_ENCODER 2UL +#define SRL_F_REUSE_ENCODER 0x00002UL /* If set in flags, then we rather croak than serialize an object. * Corresponds to the 'croak_on_bless' option to the Perl constructor. 
*/ -#define SRL_F_CROAK_ON_BLESS 4UL +#define SRL_F_CROAK_ON_BLESS 0x00004UL /* If set in flags, then we will emit for all data types * that aren't supported. Corresponds to the 'undef_unknown' option. */ -#define SRL_F_UNDEF_UNKNOWN 8UL +#define SRL_F_UNDEF_UNKNOWN 0x00008UL /* If set in flags, then we will stringify (SvPV) all data types * that aren't supported. Corresponds to the 'stringify_unknown' option. */ -#define SRL_F_STRINGIFY_UNKNOWN 16UL +#define SRL_F_STRINGIFY_UNKNOWN 0x00010UL /* If set in flags, then we warn() when trying to serialize an unsupported * data structure. Applies only if stringify_unknown or undef_unknown are * set since we otherwise croak. Corresponds to the 'warn_unknown' option. */ -#define SRL_F_WARN_UNKNOWN 32UL +#define SRL_F_WARN_UNKNOWN 0x00020UL /* WARNING: This is different from the protocol bit SRL_PROTOCOL_ENCODING_SNAPPY in that it's * a flag on the encoder struct indicating that we want to use Snappy. */ -#define SRL_F_COMPRESS_SNAPPY 64UL -#define SRL_F_COMPRESS_SNAPPY_INCREMENTAL 128UL +#define SRL_F_COMPRESS_SNAPPY 0x00040UL +#define SRL_F_COMPRESS_SNAPPY_INCREMENTAL 0x00080UL /* Only meaningful if SRL_F_WARN_UNKNOWN also set. If this one is set, then we don't warn * if the unsupported item has string overloading. */ -#define SRL_F_NOWARN_UNKNOWN_OVERLOAD 256UL +#define SRL_F_NOWARN_UNKNOWN_OVERLOAD 0x00100UL /* Only meaningful if SRL_F_WARN_UNKNOWN also set. If this one is set, then we don't warn * if the unsupported item has string overloading. */ -#define SRL_F_SORT_KEYS 512UL +#define SRL_F_SORT_KEYS 0x00200UL + +#define SRL_F_DEDUPE_STRINGS 0x00400UL + +/* If set in flags, then we serialize objects without class information. + * Corresponds to the 'no_bless_objects' flag found in the Decoder. 
*/ +#define SRL_F_NO_BLESS_OBJECTS 0x00800UL /* Set while the encoder is in active use / dirty */ #define SRL_OF_ENCODER_DIRTY 1UL diff --git a/t/010_desperate.t b/t/010_desperate.t index 0202fec..ea3251c 100644 --- a/t/010_desperate.t +++ b/t/010_desperate.t @@ -23,6 +23,7 @@ use Test::More; run_tests("plain"); run_tests("no_shared_hk", {no_shared_hashkeys => 1}); +run_tests("dedupe_strings", {dedupe_strings => 1}); done_testing(); sub run_tests { diff --git a/t/021_sort_keys_option.t b/t/021_sort_keys_option.t new file mode 100644 index 0000000..7860c8e --- /dev/null +++ b/t/021_sort_keys_option.t @@ -0,0 +1,10 @@ +#!perl +use strict; +use warnings; +use Test::More tests => 1; +use Sereal::Encoder qw(encode_sereal); + +eval { encode_sereal(\1, { sort_keys => 1, stringify_unknown => 1 }); }; +ok !$@, "We shouldn't die on sort_keys combined with stringify_unknown"; + + diff --git a/t/110_nobless.t b/t/110_nobless.t new file mode 100644 index 0000000..5ac712d --- /dev/null +++ b/t/110_nobless.t @@ -0,0 +1,50 @@ +#!perl +use strict; +use warnings; +use Sereal::Encoder; +use File::Spec; +use Scalar::Util qw( blessed ); +use lib File::Spec->catdir(qw(t lib)); +BEGIN { + lib->import('lib') + if !-d 't'; +} + +use Sereal::TestSet qw(:all); +use Test::More; + +my $ok = have_encoder_and_decoder(); +if (not $ok) { + plan skip_all => 'Did not find right version of encoder'; +} +else { + my $class = 'MyFoo'; + my %hash = ( x => 1 ); + my $object = bless( \%hash, $class ); + my $dec = Sereal::Decoder->new(); + + # do not bless anything + { + my $enc = Sereal::Encoder->new({ no_bless_objects => 1 }); + my $blob = $enc->encode( $object ); + + my $data = $dec->decode( $blob ); + + ok( ref( $data ) && !blessed( $data ), 'reference without class' ); + is_deeply( $data, \%hash, 'same structure' ); + } + + # normally do the blessing + { + my $enc = Sereal::Encoder->new(); + my $blob = $enc->encode( $object ); + + my $data = $dec->decode( $blob ); + + is_deeply( $data, $object, 'same 
structure' ); + isa_ok( $data, $class, 'same class' ); + } +} + +done_testing(); + diff --git a/t/400_evil.t b/t/400_evil.t index edb7278..d2adf05 100644 --- a/t/400_evil.t +++ b/t/400_evil.t @@ -135,7 +135,7 @@ SCOPE: { }; $enc->encode(["foo", sub{}]); }; - ok($die_run == 2, "__DIE__ called, encode 2 did not die"); + ok($die_run == 2, "__DIE__ called, encode 2 did not die ($die_run)"); } # github Sereal/Sereal issue 7 regression test: diff --git a/t/lib/Sereal/TestSet.pm b/t/lib/Sereal/TestSet.pm index f866938..d8212ec 100644 --- a/t/lib/Sereal/TestSet.pm +++ b/t/lib/Sereal/TestSet.pm @@ -595,6 +595,7 @@ sub run_roundtrip_tests { ['snappy', { snappy => 1 } ], ['snappy_incr', { snappy_incr => 1 } ], ['sort_keys', { sort_keys => 1 } ], + ['dedupe_strings', { dedupe_strings => 1 } ], ) { run_roundtrip_tests_internal(@$opt); }