diff -urNp zlib-1.2.5-orig/deflate.c zlib-1.2.5/deflate.c
--- zlib-1.2.5-orig/deflate.c 2010-04-20 12:12:21.000000000 +0800
+++ zlib-1.2.5/deflate.c 2010-07-26 03:53:34.000000000 +0800
@@ -49,6 +49,17 @@
 
 /* @(#) $Id$ */
 
+/* We can use 2-byte chunks only if 'unsigned short' has been defined
+ * appropriately and MAX_MATCH has the default value.
+ */
+#ifdef UNALIGNED_OK
+# include <limits.h>
+# include "zutil.h"
+# if (MAX_MATCH != 258) || (USHRT_MAX != 0xffff)
+# undef UNALIGNED_OK
+# endif
+#endif
+
 #include "deflate.h"
 
 const char deflate_copyright[] =
@@ -1119,7 +1130,8 @@ local uInt longest_match(s, cur_match)
      * However the length of the match is limited to the lookahead, so
      * the output of deflate is not affected by the uninitialized values.
      */
-#if (defined(UNALIGNED_OK) && MAX_MATCH == 258)
+#ifdef UNALIGNED_OK
+
     /* This code assumes sizeof(unsigned short) == 2. Do not use
      * UNALIGNED_OK if your compiler uses a different size.
      */
diff -urNp zlib-1.2.5-orig/deflate.h zlib-1.2.5/deflate.h
--- zlib-1.2.5-orig/deflate.h 2010-04-19 12:00:46.000000000 +0800
+++ zlib-1.2.5/deflate.h 2010-07-26 03:53:34.000000000 +0800
@@ -251,9 +251,12 @@ typedef struct internal_state {
     ulg bits_sent;      /* bit length of compressed data sent mod 2^32 */
 #endif
 
-    ush bi_buf;
+    ulg bi_buf;
     /* Output buffer. bits are inserted starting at the bottom (least
-     * significant bits).
+     * significant bits). Room for at least two short values to allow
+     * for a simpler overflow handling. However, if more than 16 bits
+     * have been buffered, it will be flushed and no more than 16
+     * bits will be in use afterwards.
      */
     int bi_valid;
     /* Number of valid bits in bi_buf.  All bits above the last valid bit
@@ -274,6 +277,20 @@ typedef struct internal_state {
  */
 #define put_byte(s, c) {s->pending_buf[s->pending++] = (c);}
+/* Output a short LSB first on the stream.
+ * IN assertion: there is enough room in pendingBuf. 
+ */
+#if defined(LITTLE_ENDIAN) && defined(UNALIGNED_OK)
+# define put_short(s, w) { \
+    *(ush*)(s->pending_buf + s->pending) = (ush)(w);\
+    s->pending += 2; \
+}
+#else
+# define put_short(s, w) { \
+    put_byte(s, (uch)((w) & 0xff)); \
+    put_byte(s, (uch)((ush)(w) >> 8)); \
+}
+#endif
 
 
 #define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
 /* Minimum amount of lookahead, except at the end of the input file.
diff -urNp zlib-1.2.5-orig/inffast.c zlib-1.2.5/inffast.c
--- zlib-1.2.5-orig/inffast.c 2010-04-19 12:16:23.000000000 +0800
+++ zlib-1.2.5/inffast.c 2010-07-26 03:53:34.000000000 +0800
@@ -1,5 +1,6 @@
 /* inffast.c -- fast decoding
- * Copyright (C) 1995-2008, 2010 Mark Adler
+ * Copyright (C) 1995-2004, 2010 Mark Adler
+ * 2010 Optimizations by Stefan Fuhrmann
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
@@ -10,16 +11,35 @@
 
 #ifndef ASMINF
 
+/* This is a highly optimized implementation of the decoder function for
+ * large code blocks. It cannot be used to decode close to the end of
+ * input nor output buffers (see below).
+ *
+ * Before trying to hand-tune assembly code for your target, you should
+ * make sure that alignment, endianness, word size optimizations etc. have
+ * already been enabled for the respective target platform.
+
+ * For MS VC++ 2008, the performance gain of specialized code against
+ * DISABLE_INFLATE_FAST_OPTIMIZATIONS (base line) is as follows:
+ *
+ * x86 (32 bit): +60% throughput
+ * x64 (64 bit): +70% throughput
+ *
+ * Measurements were taken on a Core i7 CPU with a mix of small and large
+ * buffers (110MB total) of varying content and an average compression rate
+ * of 2.2.
+ */
+
 /* Allow machine dependent optimization for post-increment or pre-increment.
-   Based on testing to date,
-   Pre-increment preferred for:
-   - PowerPC G3 (Adler)
-   - MIPS R5000 (Randers-Pehrson)
-   Post-increment preferred for:
-   - none
-   No measurable difference:
-   - Pentium III (Anderson)
-   - M68060 (Nikl)
+ * Based on testing to date,
+ * Pre-increment preferred for:
+ * - PowerPC G3 (Adler)
+ * - MIPS R5000 (Randers-Pehrson)
+ * Post-increment preferred for:
+ * - none
+ * No measurable difference:
+ * - Pentium III (Anderson)
+ * - M68060 (Nikl)
  */
 #ifdef POSTINC
 #  define OFF 0
@@ -29,6 +49,212 @@
 #  define PUP(a) *++(a)
 #endif
 
+/* On a number of architectures, it is more efficient to
+ * read 64 bits from the input stream at once than only
+ * a 32 bit chunk. That allows for fewer memory accesses
+ * and calculations as well as for more aggressive loop
+ * unrolling.
+ */
+#if defined(_M_X64) || defined(__x86_64)
+# define HOLD_64BIT_CHUNKS
+#endif
+
+/* For debugging purposes, we may want to disable code
+ * optimizations as we won't be otherwise able to access
+ * alternative code paths.
+ * Please note that undefining these features does affect
+ * this file only.
+ */
+#ifdef DISABLE_INFLATE_FAST_OPTIMIZATIONS
+# ifdef UNALIGNED_OK
+#  undef UNALIGNED_OK
+# endif
+# ifdef HOLD_64BIT_CHUNKS
+#  undef HOLD_64BIT_CHUNKS
+# endif
+# ifdef LITTLE_ENDIAN
+#  undef LITTLE_ENDIAN
+# endif
+# ifdef USE_SSE2
+#  undef USE_SSE2
+# endif
+#endif
+
+/* A reusable code-snippet. It copies 'len' bytes from 'from'
+ * to 'out'. 'len' must be 3 or larger. This code will be used
+ * when no optimization is available.
+ */
+#define STANDARD_MIN3_COPY\
+    while (len > 2) {\
+        PUP(out) = PUP(from);\
+        PUP(out) = PUP(from);\
+        PUP(out) = PUP(from);\
+        len -= 3;\
+    }\
+    if (len) { \
+        PUP(out) = PUP(from);\
+        if (len > 1)\
+            PUP(out) = PUP(from);\
+    }
+
+/* A reusable code-snippet. It copies data from 'from' to 'out'.
+ * up to 'last' with the last chunk possibly exceeding 'last'
+ * by up to 15 bytes. 
+ */
+#ifdef USE_SSE2
+# include <emmintrin.h>
+# define TRY_CHUNKY_COPY\
+    if ((dist >= sizeof (__m128i)) || (last <= out)) { \
+        do {\
+            _mm_storeu_si128 ((__m128i*)(out+OFF), \
+                _mm_loadu_si128((const __m128i*)(from+OFF)));\
+            out += sizeof (__m128i);\
+            from += sizeof (__m128i);\
+        } while (out < last); \
+    }
+#else
+# define TRY_CHUNKY_COPY\
+    if (dist >= sizeof(long) || (last <= out)) { \
+        do {\
+            *(long*)(out+OFF) = *(long*)(from+OFF);\
+            out += sizeof (long);\
+            from += sizeof (long);\
+        } while (out < last); \
+    }
+#endif
+
+/* The 'copy / repeat an existing sequence' is at the core of LZ-
+ * style encoding. Therefore, whenever the CPU allows, we use few,
+ * unaligned 4-byte copies instead of many single-byte accesses.
+ *
+ * The local variable definition actually leads to better code
+ * being generated by the MS compiler.
+ */
+#ifdef UNALIGNED_OK
+# define QUICK_COPY\
+    {\
+        unsigned char FAR *from = out - dist;\
+        unsigned char FAR *last = out + len;\
+        TRY_CHUNKY_COPY\
+        else {\
+            do { \
+                *(out+OFF+0) = *(from+OFF+0);\
+                *(out+OFF+1) = *(from+OFF+1);\
+                *(out+OFF+2) = *(from+OFF+2);\
+                from += 3;\
+                out += 3;\
+            } while (out < last);\
+        }\
+        out = last;\
+    }
+#else
+# define QUICK_COPY\
+    from = out - dist;\
+    STANDARD_MIN3_COPY
+#endif
+
+/* Whenever we don't copy / repeat existing sequences, we add new
+ * literals. This is the code snippet that will be used in an
+ * unrolled loop for extracting literals one-by-one.
+ * We bail out if a non-literal has been found. We also assume that
+ * the loop head already made sure we don't read / write beyond
+ * buffer boundaries.
+ */
+#define EXTRACT_NEXT_IF_LITERAL\
+    here = lcode[hold & lmask];\
+    if (here.op != 0)\
+        goto dolen;\
+\
+    op = (unsigned)(here.bits);\
+    hold >>= op;\
+    bits -= op;\
+    Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ?\
+            "inflate: literal '%c'\n" :\
+            "inflate: literal 0x%02x\n", here.val));\
+    PUP(out) = (unsigned char)(here.val);
+
+/* Unrolled loop content. 
Using 32 bit chunks, we can unroll it
+ * only once because every step consumes up to 9 bits of the
+ * input stream. We got 25/57 bits (using 32/64 bit chunks)
+ * entering the loop but must leave with at least 9 bits left
+ * for the top of the main loop.
+ */
+#if defined(HOLD_64BIT_CHUNKS)
+# define LITERAL_UNROLL_SIZE 5
+# define UNROLLED_LITERAL_LOOP {\
+    EXTRACT_NEXT_IF_LITERAL \
+    EXTRACT_NEXT_IF_LITERAL \
+    EXTRACT_NEXT_IF_LITERAL \
+    EXTRACT_NEXT_IF_LITERAL \
+    EXTRACT_NEXT_IF_LITERAL \
+}
+#else
+# define LITERAL_UNROLL_SIZE 1
+# define UNROLLED_LITERAL_LOOP { EXTRACT_NEXT_IF_LITERAL }
+#endif
+
+/* Chunk that can be prefetched from the input stream.
+ */
+#if defined(HOLD_64BIT_CHUNKS)
+# define HOLD_TYPE unsigned long long
+#else
+# define HOLD_TYPE unsigned long
+#endif
+
+/* Code snippet that reads a single byte from 'in' and
+ * adds it to the prefetched ('hold') data.
+ */
+#define PREFETCH_BYTE \
+    hold += (HOLD_TYPE)(PUP(in)) << bits;\
+    bits += 8;
+
+/* Code snippet completely filling the prefetch variable.
+ */
+#if defined(LITTLE_ENDIAN) && defined(UNALIGNED_OK)
+# define TOP_UP_BITS \
+    {\
+        hold |= (*(HOLD_TYPE*)(in + OFF)) << bits;\
+        added = (sizeof (HOLD_TYPE) * 8 - bits) / 8;\
+        in += added;\
+        bits += added * 8; \
+    }
+#else
+# if defined(HOLD_64BIT_CHUNKS)
+# define TOP_UP_BITS\
+    if (bits < 33) {\
+        PREFETCH_BYTE\
+        PREFETCH_BYTE\
+        PREFETCH_BYTE\
+        PREFETCH_BYTE\
+    }\
+    if (bits < 49) {\
+        PREFETCH_BYTE\
+        PREFETCH_BYTE\
+    }\
+    if (bits < 57) {\
+        PREFETCH_BYTE\
+    }
+# else
+# define TOP_UP_BITS\
+    if (bits < 17) {\
+        PREFETCH_BYTE\
+        PREFETCH_BYTE\
+    }\
+    if (bits < 25) {\
+        PREFETCH_BYTE\
+    }
+# endif
+#endif
+
+/* For 64 bit chunks, we don't need to prefetch a second
+ * time inside the main loop when decoding the distance. 
+ */ +#if defined(HOLD_64BIT_CHUNKS) +# define TOP_UP_BITS_32 +#else +# define TOP_UP_BITS_32 TOP_UP_BITS +#endif + /* Decode literal, length, and distance codes and write out the resulting literal and match bytes until either not enough input or output is @@ -40,8 +266,8 @@ Entry assumptions: state->mode == LEN - strm->avail_in >= 6 - strm->avail_out >= 258 + strm->avail_in >= 8 + strm->avail_out >= 273 start >= strm->avail_out state->bits < 8 @@ -56,13 +282,15 @@ - The maximum input bits used by a length/distance pair is 15 bits for the length code, 5 bits for the length extra, 15 bits for the distance code, and 13 bits for the distance extra. This totals 48 bits, or six bytes. - Therefore if strm->avail_in >= 6, then there is enough input to avoid - checking for available input while decoding. + However, we prefetch 1x8 or 2x4 bytes. Therefore if strm->avail_in >= 8 + is always true, then there is enough input to avoid checking for available + input while decoding. - The maximum bytes that a single length/distance pair can output is 258 - bytes, which is the maximum length that can be coded. inflate_fast() - requires strm->avail_out >= 258 for each loop to avoid checking for - output space. + bytes, which is the maximum length that can be coded. Another 15 bytes + padding are required to simplify copying in chunks of up to 16 bytes. + inflate_fast() requires strm->avail_out >= 273 for each loop to avoid + checking for output space. 
*/ void ZLIB_INTERNAL inflate_fast(strm, start) z_streamp strm; @@ -81,8 +309,9 @@ unsigned start; /* inflate()'s s unsigned whave; /* valid bytes in the window */ unsigned wnext; /* window write index */ unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */ - unsigned long hold; /* local strm->hold */ + HOLD_TYPE hold; /* local strm->hold */ unsigned bits; /* local strm->bits */ + unsigned added; /* number of bytes fetched in TOP_UP_BITS */ code const FAR *lcode; /* local strm->lencode */ code const FAR *dcode; /* local strm->distcode */ unsigned lmask; /* mask for first level of length codes */ @@ -97,10 +326,10 @@ unsigned start; /* inflate()'s s /* copy state to local variables */ state = (struct inflate_state FAR *)strm->state; in = strm->next_in - OFF; - last = in + (strm->avail_in - 5); + last = in + (strm->avail_in - 7); out = strm->next_out - OFF; beg = out - (start - strm->avail_out); - end = out + (strm->avail_out - 257); + end = out + (strm->avail_out - 272); #ifdef INFLATE_STRICT dmax = state->dmax; #endif @@ -117,61 +346,47 @@ unsigned start; /* inflate()'s s /* decode literals and length/distances until end-of-block or not enough input data or output space */ + TOP_UP_BITS /* bits = 32/64 */ do { - if (bits < 15) { - hold += (unsigned long)(PUP(in)) << bits; - bits += 8; - hold += (unsigned long)(PUP(in)) << bits; - bits += 8; - } + /* bits >= 10/10 */ here = lcode[hold & lmask]; dolen: op = (unsigned)(here.bits); hold >>= op; bits -= op; + TOP_UP_BITS /* bits >= 25/57 */ + op = (unsigned)(here.op); if (op == 0) { /* literal */ Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ? 
"inflate: literal '%c'\n" : "inflate: literal 0x%02x\n", here.val)); PUP(out) = (unsigned char)(here.val); + /* bits >= 25/57 */ + if (out + LITERAL_UNROLL_SIZE-1 < end && in < last) + UNROLLED_LITERAL_LOOP + /* bits >= 16/12 */ } else if (op & 16) { /* length base */ len = (unsigned)(here.val); op &= 15; /* number of extra bits */ if (op) { - if (bits < op) { - hold += (unsigned long)(PUP(in)) << bits; - bits += 8; - } len += (unsigned)hold & ((1U << op) - 1); hold >>= op; bits -= op; } Tracevv((stderr, "inflate: length %u\n", len)); - if (bits < 15) { - hold += (unsigned long)(PUP(in)) << bits; - bits += 8; - hold += (unsigned long)(PUP(in)) << bits; - bits += 8; - } + /* bits >= 10/42 */ here = dcode[hold & dmask]; dodist: op = (unsigned)(here.bits); hold >>= op; bits -= op; + TOP_UP_BITS_32 /* bits >= 25/36 */ op = (unsigned)(here.op); if (op & 16) { /* distance base */ dist = (unsigned)(here.val); op &= 15; /* number of extra bits */ - if (bits < op) { - hold += (unsigned long)(PUP(in)) << bits; - bits += 8; - if (bits < op) { - hold += (unsigned long)(PUP(in)) << bits; - bits += 8; - } - } dist += (unsigned)hold & ((1U << op) - 1); #ifdef INFLATE_STRICT if (dist > dmax) { @@ -182,6 +397,7 @@ unsigned start; /* inflate()'s s #endif hold >>= op; bits -= op; + /* bits >= 10/21 */ Tracevv((stderr, "inflate: distance %u\n", dist)); op = (unsigned)(out - beg); /* max distance in output */ if (dist > op) { /* see if copy from window */ @@ -190,9 +406,9 @@ unsigned start; /* inflate()'s s if (state->sane) { strm->msg = (char *)"invalid distance too far back"; - state->mode = BAD; - break; - } + state->mode = BAD; + break; + } #ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR if (len <= op - whave) { do { @@ -253,31 +469,10 @@ unsigned start; /* inflate()'s s from = out - dist; /* rest from output */ } } - while (len > 2) { - PUP(out) = PUP(from); - PUP(out) = PUP(from); - PUP(out) = PUP(from); - len -= 3; - } - if (len) { - PUP(out) = PUP(from); - if (len > 1) - 
PUP(out) = PUP(from); - } + STANDARD_MIN3_COPY } - else { - from = out - dist; /* copy direct from output */ - do { /* minimum length is three */ - PUP(out) = PUP(from); - PUP(out) = PUP(from); - PUP(out) = PUP(from); - len -= 3; - } while (len > 2); - if (len) { - PUP(out) = PUP(from); - if (len > 1) - PUP(out) = PUP(from); - } + else { /* copy direct from output */ + QUICK_COPY } } else if ((op & 64) == 0) { /* 2nd level distance code */ @@ -304,7 +499,7 @@ unsigned start; /* inflate()'s s state->mode = BAD; break; } - } while (in < last && out < end); + } while (out < end && in < last); /* return unused bytes (on entry, bits < 8, so in won't go too far back) */ len = bits >> 3; @@ -315,9 +510,9 @@ unsigned start; /* inflate()'s s /* update state and return */ strm->next_in = in + OFF; strm->next_out = out + OFF; - strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last)); + strm->avail_in = (unsigned)(in < last ? 7 + (last - in) : 7 - (in - last)); strm->avail_out = (unsigned)(out < end ? - 257 + (end - out) : 257 - (out - end)); + 272 + (end - out) : 272 - (out - end)); state->hold = hold; state->bits = bits; return; @@ -335,6 +530,26 @@ unsigned start; /* inflate()'s s - Swapping window/direct else - Larger unrolled copy loops (three is about right) - Moving len -= 3 statement into middle of loop + + The critical code path is the following: + here = lcode[hold & lmask]; + op = (unsigned)(here.bits); + + It requires + 2 accesses to hold and lmask (0 ticks if in register, + otherwise: 4 ticks typ = 1 + L1 latency) + +1 ALU latency (usually 1 tick) + +1 L1 latency (2..4 ticks, typ. 3 ticks) + +1 member access latency (0 ticks on some arch if 'bits' is the MSB, + 2 ALU ops / 2 ticks otherwise) + -> 4 .. 12 ticks latency + + Therefore, we "splice" the data prefetch code (hold) into the critical + path (a good compiler will interleave the data load from TOP_UP_BITS + with the lcode access). 
All calculations can be parallelized very well on
+   most architectures so that TOP_UP_BITS becomes relatively cheap at 4 or
+   fewer ticks overhead with no branch mispredictions possible. Also, 'hold'
+   will be readily available the next iteration.
  */
 
 #endif /* !ASMINF */
diff -urNp zlib-1.2.5-orig/inftrees.c zlib-1.2.5/inftrees.c
--- zlib-1.2.5-orig/inftrees.c 2010-04-20 12:12:21.000000000 +0800
+++ zlib-1.2.5/inftrees.c 2010-07-26 03:53:34.000000000 +0800
@@ -138,13 +138,20 @@ unsigned short FAR *work;
         return -1;                      /* incomplete set */
 
     /* generate offsets into symbol table for each length for sorting */
-    offs[1] = 0;
-    for (len = 1; len < MAXBITS; len++)
-        offs[len + 1] = offs[len] + count[len];
+    {
+        unsigned short offset = 0;
+        offs[1] = 0;
+        for (len = 1; len < MAXBITS; len++) {
+            offset += count[len];
+            offs[len + 1] = offset;
+        }
+    }
 
     /* sort symbols by length, by symbol order within each length */
-    for (sym = 0; sym < codes; sym++)
-        if (lens[sym] != 0) work[offs[lens[sym]]++] = (unsigned short)sym;
+    for (sym = 0; sym < codes; sym++) {
+        unsigned len = lens[sym];
+        if (len != 0) work[offs[len]++] = (unsigned short)sym;
+    }
 
     /* Create and fill in decoding tables. 
In this loop, the table being @@ -215,14 +222,15 @@ unsigned short FAR *work; /* process all codes and make table entries */ for (;;) { /* create table entry */ + unsigned work_sym = work[sym]; here.bits = (unsigned char)(len - drop); - if ((int)(work[sym]) < end) { + if ((int)(work_sym) < end) { here.op = (unsigned char)0; - here.val = work[sym]; + here.val = work_sym; } - else if ((int)(work[sym]) > end) { - here.op = (unsigned char)(extra[work[sym]]); - here.val = base[work[sym]]; + else if ((int)(work_sym) > end) { + here.op = (unsigned char)(extra[work_sym]); + here.val = base[work_sym]; } else { here.op = (unsigned char)(32 + 64); /* end of block */ diff -urNp zlib-1.2.5-orig/trees.c zlib-1.2.5/trees.c --- zlib-1.2.5-orig/trees.c 2010-04-19 12:03:44.000000000 +0800 +++ zlib-1.2.5/trees.c 2010-07-26 03:53:34.000000000 +0800 @@ -175,15 +175,6 @@ local void gen_trees_header OF((void)); #endif /* =========================================================================== - * Output a short LSB first on the stream. - * IN assertion: there is enough room in pendingBuf. - */ -#define put_short(s, w) { \ - put_byte(s, (uch)((w) & 0xff)); \ - put_byte(s, (uch)((ush)(w) >> 8)); \ -} - -/* =========================================================================== * Send a value on a given number of bits. * IN assertion: length <= 16 and value fits in length bits. */ @@ -203,29 +194,23 @@ local void send_bits(s, value, length) * (16 - bi_valid) bits from value, leaving (width - (16-bi_valid)) * unused bits in value. 
*/ - if (s->bi_valid > (int)Buf_size - length) { - s->bi_buf |= (ush)value << s->bi_valid; + s->bi_buf |= (ulg)value << s->bi_valid; + s->bi_valid += (ulg)length; + if (s->bi_valid > Buf_size) { put_short(s, s->bi_buf); - s->bi_buf = (ush)value >> (Buf_size - s->bi_valid); - s->bi_valid += length - Buf_size; - } else { - s->bi_buf |= (ush)value << s->bi_valid; - s->bi_valid += length; - } + s->bi_buf >>= Buf_size; + s->bi_valid -= Buf_size; + } } #else /* !DEBUG */ #define send_bits(s, value, length) \ -{ int len = length;\ - if (s->bi_valid > (int)Buf_size - len) {\ - int val = value;\ - s->bi_buf |= (ush)val << s->bi_valid;\ +{ s->bi_buf |= (ulg)(value) << s->bi_valid;\ + s->bi_valid += (ulg)(length);\ + if (s->bi_valid > Buf_size) {\ put_short(s, s->bi_buf);\ - s->bi_buf = (ush)val >> (Buf_size - s->bi_valid);\ - s->bi_valid += len - Buf_size;\ - } else {\ - s->bi_buf |= (ush)(value) << s->bi_valid;\ - s->bi_valid += len;\ + s->bi_buf >>= Buf_size;\ + s->bi_valid -= Buf_size;\ }\ } #endif /* DEBUG */ @@ -1154,7 +1139,7 @@ local int detect_data_type(s) || s->dyn_ltree[13].Freq != 0) return Z_TEXT; for (n = 32; n < LITERALS; n++) - if (s->dyn_ltree[n].Freq != 0) + if (s->dyn_ltree[n].Freq != 0) return Z_TEXT; /* There are no "black-listed" or "white-listed" bytes: diff -urNp zlib-1.2.5-orig/zconf.h zlib-1.2.5/zconf.h --- zlib-1.2.5-orig/zconf.h 2010-04-19 01:58:06.000000000 +0800 +++ zlib-1.2.5/zconf.h 2010-07-26 03:53:34.000000000 +0800 @@ -160,10 +160,52 @@ #ifdef SYS16BIT # define MAXSEG_64K #endif -#ifdef MSDOS + +/* + * Many machines allow efficient access to unaligned data, that is + * reading 2 or more bytes at once from a random and possibly unaligned + * memory address is *on average* more efficient than reading the data + * one byte at a time and then combining it. 
+ */ +#if !defined(UNALIGNED_OK) && defined(MSDOS) +# define UNALIGNED_OK +#endif +#if !defined(UNALIGNED_OK) && (defined(_M_IX86) || defined(_M_X64)) # define UNALIGNED_OK #endif +#if !defined(UNALIGNED_OK) && (defined(i386) || defined(__x86_64)) +# define UNALIGNED_OK +#endif + +/* + * Most information in compressed data streams is stored in LSB first + * (little endian) order. If that matches the machine byte order, we may + * apply certain optimizations. + */ +#if !defined(LITTLE_ENDIAN) && (defined(_M_IX86) || defined(_M_X64)) +# define LITTLE_ENDIAN +#endif +#if !defined(LITTLE_ENDIAN) && (defined(i386) || defined(__x86_64)) +# define LITTLE_ENDIAN +#endif +#if !defined(LITTLE_ENDIAN) && defined(__LITTLE_ENDIAN__) +# define LITTLE_ENDIAN +#endif + +/* + * With the availability of SSE2, we can optimize certain functions + * by operating on large chunks of data at once. + */ +#if !defined(USE_SSE2) && defined(__GNUC__) && defined(__SSE2__) +# define USE_SSE2 +#endif +#if !defined(USE_SSE2) && (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP>=2))) +# define USE_SSE2 +#endif +/* + * C standard level. + */ #ifdef __STDC_VERSION__ # ifndef STDC # define STDC