diff --git a/zfp/LICENSE b/zfp/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..7945102e870621acb36ddfddec709168f9db5c52 --- /dev/null +++ b/zfp/LICENSE @@ -0,0 +1,57 @@ +Copyright (c) 2014-2018, Lawrence Livermore National Security, LLC. +Produced at the Lawrence Livermore National Laboratory. +Written by Peter Lindstrom, Markus Salasoo, and Matt Larsen. +LLNL-CODE-663824. +All rights reserved. + +This file is part of the zfp library. +For details, see http://computation.llnl.gov/casc/zfp/. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the disclaimer below. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the disclaimer (as noted below) in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the LLNS/LLNL nor the names of its contributors may +be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, +LLC, THE U.S. 
DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +Additional BSD Notice + +1. This notice is required to be provided under our contract with the U.S. +Department of Energy (DOE). This work was produced at Lawrence Livermore +National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. + +2. Neither the United States Government nor Lawrence Livermore National +Security, LLC nor any of their employees, makes any warranty, express or +implied, or assumes any liability or responsibility for the accuracy, +completeness, or usefulness of any information, apparatus, product, or +process disclosed, or represents that its use would not infringe +privately-owned rights. + +3. Also, reference herein to any specific commercial products, process, or +services by trade name, trademark, manufacturer or otherwise does not +necessarily constitute or imply its endorsement, recommendation, or +favoring by the United States Government or Lawrence Livermore National +Security, LLC. The views and opinions of authors expressed herein do not +necessarily state or reflect those of the United States Government or +Lawrence Livermore National Security, LLC, and shall not be used for +advertising or product endorsement purposes. 
diff --git a/zfp/Makefile b/zfp/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..cc696640144c360aa1126111ed47430a17795dbf --- /dev/null +++ b/zfp/Makefile @@ -0,0 +1,69 @@ +# see Config file for compile-time settings +#include Config + +include ../Make_include + +CFLAGS += -std=c99 -I../include + +MAKEFLAGS += --no-print-directory + +# macOS compiler options (uncomment on macOS) --------------------------------- + +# SOFLAGS += -undefined dynamic_lookup + +# default targets +BUILD_CFP = 0 +BUILD_ZFORP = 0 +BUILD_UTILITIES = 0 +BUILD_EXAMPLES = 0 +BUILD_TESTING = 0 +BUILD_SHARED_LIBS = 0 + +LIBRARY = static +LIBZFP = libzfp.a + +# compiler options ------------------------------------------------------------ + +# default: build all targets enabled in Config +all: + @echo $(LIBRARY) + @cd src; $(MAKE) $(LIBRARY) +ifneq ($(BUILD_CFP),0) + @cd cfp/src; $(MAKE) clean $(LIBRARY) +endif +ifneq ($(BUILD_ZFORP),0) + @cd fortran; $(MAKE) clean $(LIBRARY) +endif +ifneq ($(BUILD_UTILITIES),0) + @cd utils; $(MAKE) clean all +endif +ifneq ($(BUILD_TESTING),0) + @cd tests; $(MAKE) clean all +endif +ifneq ($(BUILD_EXAMPLES),0) + @cd examples; $(MAKE) clean all +endif + + +# run basic regression tests +test: + @cd tests; $(MAKE) test + +# clean all +clean: + @cd src; $(MAKE) clean +ifneq ($(BUILD_CFP),0) + @cd cfp/src; $(MAKE) clean +endif +ifneq ($(BUILD_ZFORP),0) + @cd fortran; $(MAKE) clean +endif +ifneq ($(BUILD_UTILITIES),0) + @cd utils; $(MAKE) clean +endif +ifneq ($(BUILD_TESTING),0) + @cd tests; $(MAKE) clean +endif +ifneq ($(BUILD_EXAMPLES),0) + @cd examples; $(MAKE) clean +endif diff --git a/zfp/README.md b/zfp/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c2aa80dece7d9d759347ee2dff3b6589f6e0ca5f --- /dev/null +++ b/zfp/README.md @@ -0,0 +1,134 @@ +ZFP +=== + +INTRODUCTION +------------ + +zfp is an open source C/C++ library for compressed numerical arrays that +support high throughput read and write random 
access. zfp also supports +streaming compression of integer and floating-point data, e.g., for +applications that read and write large data sets to and from disk. + +zfp was developed at Lawrence Livermore National Laboratory and is loosely +based on the algorithm described in the following paper: + + Peter Lindstrom + "Fixed-Rate Compressed Floating-Point Arrays" + IEEE Transactions on Visualization and Computer Graphics + 20(12):2674-2683, December 2014 + doi:10.1109/TVCG.2014.2346458 + +zfp was originally designed for floating-point arrays only, but has been +extended to also support integer data, and could for instance be used to +compress images and quantized volumetric data. To achieve high compression +ratios, zfp uses lossy but optionally error-bounded compression. Although +bit-for-bit lossless compression of floating-point data is not always +possible, zfp is usually accurate to within machine epsilon in near-lossless +mode. + +zfp works best for 2D and 3D arrays that exhibit spatial correlation, such as +continuous fields from physics simulations, images, regularly sampled terrain +surfaces, etc. Although zfp also provides a 1D array class that can be used +for 1D signals such as audio, or even unstructured floating-point streams, +the compression scheme has not been well optimized for this use case, and +rate and quality may not be competitive with floating-point compressors +designed specifically for 1D streams. + +zfp is freely available as open source under a BSD license, as outlined in +the file 'LICENSE'. For more information on zfp and comparisons with other +compressors, please see the zfp +[website](https://computation.llnl.gov/projects/floating-point-compression). +For questions, comments, requests, and bug reports, please contact +[Peter Lindstrom](mailto:pl@llnl.gov). + + +DOCUMENTATION +------------- + +Full +[documentation](http://zfp.readthedocs.io/en/release0.5.4/) +is available online via Read the Docs. 
A +[PDF](http://readthedocs.org/projects/zfp/downloads/pdf/release0.5.4/) +version is also available. + + +INSTALLATION +------------ + +zfp consists of three distinct parts: a compression library written in C; +a set of C++ header files with C wrappers that implement compressed arrays; +and a set of C and C++ examples. The main compression codec is written in +C and should conform to both the ISO C89 and C99 standards. The C++ array +classes are implemented entirely in header files and can be included as is, +but since they call the compression library, applications must link with +libzfp. + +On Linux, macOS, and MinGW, zfp is easiest compiled using gcc and gmake. +CMake support is also available, e.g., for Windows builds. See below for +instructions on GNU and CMake builds. + +zfp has successfully been built and tested using these compilers: + + gcc versions 4.4.7, 4.9.4, 5.5.0, 6.1.0, 6.4.0, 7.1.0, 7.3.0, 8.1.0 + icc versions 15.0.6, 16.0.4, 17.0.2, 18.0.2, 19.0.0 + clang versions 3.9.1, 4.0.0, 5.0.0, 6.0.0 + MinGW version 5.3.0 + Visual Studio versions 14 (2015), 15 (2017) + +zfp conforms to various language standards, including C89, C99, C++98, +C++11, and C++14. + +NOTE: zfp requires 64-bit compiler and operating system support. + +## GNU builds + +To compile zfp using gcc, type + + make + +from this directory. This builds libzfp as a static library as well as +utilities and example programs. See documentation for complete build +instructions. + +## CMake builds + +To build zfp using CMake on Linux or macOS, start a Unix shell and type + + mkdir build + cd build + cmake .. + make + +To also build the examples, replace the cmake line with + + cmake -DBUILD_EXAMPLES=ON .. + +To build zfp using Visual Studio on Windows, start a DOS shell, cd to the +top-level zfp directory, and type + + mkdir build + cd build + cmake .. + cmake --build . --config Release + +This builds zfp in release mode. Replace 'Release' with 'Debug' to build +zfp in debug mode. 
See the instructions for Linux on how to change the +cmake line to also build the example programs. + +## Testing + +To test that zfp is working properly, type + + make test + +or using CMake + + ctest + +If the compilation or regression tests fail, it is possible that some of the +macros in the file 'Config' have to be adjusted. Also, the tests may fail +due to minute differences in the computed floating-point fields being +compressed, which will be indicated by checksum errors. If most tests +succeed and the failures result in byte sizes and error values reasonably +close to the expected values, then it is likely that the compressor is +working correctly. diff --git a/zfp/VERSIONS.md b/zfp/VERSIONS.md new file mode 100644 index 0000000000000000000000000000000000000000..1a33ff9aafdc906c46dbf4bd0a6cd79d2d22ce06 --- /dev/null +++ b/zfp/VERSIONS.md @@ -0,0 +1,273 @@ +# zfp Release Notes + +## 0.5.4 (October 1, 2018) + +- Added support for CUDA fixed-rate compression and decompression. + +- Added views into compressed arrays for thread safety, nested array + indexing, slicing, and array subsetting. + +- Added C language bindings for compressed arrays. + +- Added support for compressing and decompressing 4D data. + +- Changes: + - Execution policy now applies to both compression and decompression. + - Compressed array accessors now return Scalar type instead of + const Scalar& to avoid stale references to evicted cache lines. + +- Bug fixes: + - Handling of negative strides. + - Command line tool handling of arrays with more than 2^32 elements. + - bitstream C++ compatibility. + - Respect minimum cache size request. + + +## 0.5.3 (March 28, 2018) + +- Added support for OpenMP multithreaded compression (but not decompression). + +- Added options for OpenMP execution to zfp command-line tool. + +- Changed return value of zfp\_decompress to indicate the number of compressed + bytes processed so far (now returns same value as zfp\_compress on success). 
+ +- Added compressed array support for copy construction and assignment via + deep copies. + +- Added virtual destructors to enable inheritance from zfp arrays. + + +## 0.5.2 (September 28, 2017) + +- Added iterators and proxy objects for pointers and references. + +- Added example illustrating how to use iterators and pointers. + +- Modified diffusion example to optionally use iterators. + +- Moved internal headers under array to array/zfp. + +- Modified 64-bit integer typedefs to avoid the C89 non-compliant long long + and allow for user-supplied types and literal suffixes. + +- Renamed compile-time macros that did not have a ZFP prefix. + +- Fixed issue with setting stream word type via CMake. + +- Rewrote documentation in reStructuredText and added complete + documentation of all public functions, classes, types, and macros. + Removed ASCII documentation. + + +## 0.5.1 (March 28, 2017) + +- This release primarily fixes a few minor issues but also includes + changes in anticipation of a large number of planned future additions + to the library. No changes have been made to the compressed format, + which is backwards compatible with version 0.5.0. + +- Added high-level API support for integer types. + +- Separated library version from CODEC version and added version string. + +- Added example that illustrates in-place compression. + +- Added support for CMake builds. + +- Corrected inconsistent naming of BIT\_STREAM macros in code and + documentation. + +- Renamed some of the header bit mask macros. + +- Added return values to stream\_skip and stream\_flush to indicate the + number of bits skipped or output. + +- Renamed stream\_block and stream\_delta to make it clear that they refer + to strided streams. Added missing definition of stream\_stride\_block. + +- Changed int/uint types in places to use ptrdiff\_t/size\_t where + appropriate. + +- Changed API for zfp\_set\_precision and zfp\_set\_accuracy to not require + the scalar type. 
+ +- Added missing static keyword in decode\_block. + +- Changed testzfp to allow specifying which tests to perform on the + command line. + +- Fixed bug that prevented defining uninitialized arrays. + +- Fixed incorrect computation of array sizes in zfp\_field\_size. + +- Fixed minor issues that prevented code from compiling on Windows. + +- Fixed issue with fixed-accuracy headers that caused unnecessary storage. + +- Modified directory structure. + +- Added documentation that discusses common issues with using zfp. + + +## 0.5.0 (February 29, 2016) + +- Modified CODEC to more efficiently encode blocks whose values are all + zero or are smaller in magnitude than the absolute error tolerance. + This allows representing "empty" blocks using only one bit each. This + version is not backwards compatible with prior zfp versions. + +- Changed behavior of zfp\_compress and zfp\_decompress to not automatically + rewind the bit stream. This makes it easier to concatenate multiple + compressed bit streams, e.g. when compressing vector fields or multiple + scalars together. + +- Added functions for compactly encoding the compression parameters + and field meta data, e.g. for producing self-contained compressed + streams. Also added functions for reading and writing a header + containing these parameters. + +- Changed the zfp example program interface to allow reading and writing + compressed streams, optionally with a header. The zfp tool can now be + used to compress and decompress files as a stand alone utility. + + +## 0.4.1 (December 28, 2015) + +- Fixed bug that caused segmentation fault when compressing 3D arrays + whose dimensions are not multiples of four. Specifically, arrays of + dimensions nx * ny * nz, with ny not a multiple of four, were not + handled correctly. + +- Modified examples/fields.h to ensure standard compliance. Previously, + C99 support was needed to handle the hex float constants, which are + not supported in C++98. 
+ +- Added simple.c as a minimal example of how to call the compressor. + +- Changed compilation of diffusion example to output two executables: + one with and one without compression. + + +## 0.4.0 (December 5, 2015) + +- Substantial changes to the compression algorithm that improve PSNR + by about 6 dB and speed by a factor of 2-3. These changes are not + backward compatible with previous versions of zfp. + +- Added support for 31-bit and 63-bit integer data, as well as shorter + integer types. + +- Rewrote compression codec entirely in C to make linking and calling + easier from other programming languages, and to expose the low-level + interface through C instead of C++. This necessitated significant + changes to the API as well. + +- Minor changes to the C++ compressed array API, as well as major + implementation changes to support the C library. The namespace and + public types are now all in lower case. + +- Deprecated support for general fixed-point decorrelating transforms + and slimmed down implementation. + +- Added new examples for evaluating the throughput of the (de)compressor + and for compressing grayscale images in the pgm format. + +- Added FAQ. + + +## 0.3.2 (December 3, 2015) + +- Fixed bug in Array::get() that caused the wrong cached block to be + looked up, thus occasionally copying incorrect values back to parts + of the array. + + +## 0.3.1 (May 6, 2015) + +- Fixed rare bug caused by exponent underflow in blocks with no normal + and some denormal numbers. + + +## 0.3.0 (March 3, 2015) + +- Modified the default decorrelating transform to one that uses only + additions and bit shifts. This new transform, in addition to being + faster, also has some theoretical optimality properties and tends to + improve rate distortion. + +- Added compile-time support for parameterized transforms, e.g. to + support other popular transforms like DCT, HCT, and Walsh-Hadamard. + +- Made forward transform range preserving: (-1, 1) is mapped to (-1, 1). 
+ Consequently Q1.62 fixed point can be used throughout. + +- Changed the order in which bits are emitted within each bit plane + to be more intelligent. Group tests are now deferred until they + are needed, i.e. just before the value bits for the group being + tested. This improves the quality of fixed-rate encodings, but + has no impact on compressed size. + +- Made several optimizations to improve performance. + +- Added floating-point traits to reduce the number of template + parameters. It is now possible to declare a 3D array as + Array3<float>, for example. + +- Added functions for setting the array scalar type and dimensions. + +- Consolidated several header files. + +- Added testzfp for regression testing. + + +## 0.2.1 (December 12, 2014) + +- Added Win64 support via Microsoft Visual Studio compiler. + +- Fixed broken support for IBM's xlc compiler. + +- Made several minor changes to suppress compiler warnings. + +- Documented expected output for the diffusion example. + + +## 0.2.0 (December 2, 2014) + +- The compression interface from zfpcompress was relocated to a + separate library, called libzfp, and modified to be callable from C. + This API now uses a parameter object (zfp\_params) to specify array + type and dimensions as well as compression parameters. + +- Several utility functions were added to simplify libzfp usage: + + * Functions for setting the rate, precision, and accuracy. + Corresponding functions were also added to the Codec class. + + * A function for estimating the buffer size needed for compression. + +- The Array class functionality was expanded: + + * Support for accessing the compressed bit stream stored with an + array, e.g. for offline compressed storage and for initializing + an already compressed array. + + * Functions for dynamically specifying the cache size. + + * The default cache is now direct-mapped instead of two-way + associative. 
+ +- Minor bug fixes: + + * Corrected the value of the lowest possible bit plane to account for + both the smallest exponent and the number of bits in the significand. + + * Corrected inconsistent use of rate and precision. The rate refers + to the number of compressed bits per floating-point value, while + the precision refers to the number of uncompressed bits. The Array + API was changed accordingly. + + +## 0.1.0 (November 12, 2014) + +- Initial beta release. diff --git a/zfp/appveyor.yml b/zfp/appveyor.yml new file mode 100644 index 0000000000000000000000000000000000000000..c808bfaa25f0681e14a753cf9137a1b3503dac86 --- /dev/null +++ b/zfp/appveyor.yml @@ -0,0 +1,96 @@ +version: 0.5.4-{build} + +environment: + matrix: + - COMPILER: mingw + GENERATOR: MinGW Makefiles + PLATFORM: Win32 + BUILD_TYPE: Debug + + - COMPILER: mingw + GENERATOR: MinGW Makefiles + PLATFORM: Win32 + BUILD_TYPE: Release + + - COMPILER: mingw-w64 + GENERATOR: MinGW Makefiles + PLATFORM: x64 + BUILD_TYPE: Debug + + - COMPILER: mingw-w64 + GENERATOR: MinGW Makefiles + PLATFORM: x64 + BUILD_TYPE: Release + + - COMPILER: msvc + GENERATOR: Visual Studio 15 2017 Win64 + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 + PLATFORM: x64 + BUILD_TYPE: Debug + + - COMPILER: msvc + GENERATOR: Visual Studio 15 2017 Win64 + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 + PLATFORM: x64 + BUILD_TYPE: Release + + - COMPILER: msvc + GENERATOR: Visual Studio 15 2017 + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 + PLATFORM: Win32 + BUILD_TYPE: Debug + + - COMPILER: msvc + GENERATOR: Visual Studio 15 2017 + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 + PLATFORM: Win32 + BUILD_TYPE: Release + + - COMPILER: msvc + GENERATOR: Visual Studio 14 2015 Win64 + PLATFORM: x64 + BUILD_TYPE: Debug + + - COMPILER: msvc + GENERATOR: Visual Studio 14 2015 Win64 + PLATFORM: x64 + BUILD_TYPE: Release + + - COMPILER: msvc + GENERATOR: Visual Studio 14 2015 + PLATFORM: Win32 + BUILD_TYPE: Debug + + - COMPILER: msvc + 
GENERATOR: Visual Studio 14 2015 + PLATFORM: Win32 + BUILD_TYPE: Release + +install: + - if "%COMPILER%"=="mingw" set PATH=C:\MinGW\bin;%PATH% + - if "%COMPILER%"=="mingw-w64" set PATH=C:\MinGW\bin;%PATH% + +build_script: + - mkdir build + - cd build + + # build/test without OpenMP, with CFP + - if "%COMPILER%"=="msvc" cmake -G "%GENERATOR%" -DCMAKE_BUILD_TYPE="%BUILD_TYPE%" -DZFP_WITH_OPENMP=OFF -DBUILD_CFP=ON .. + - if not "%COMPILER%"=="msvc" cmake -G "%GENERATOR%" -DCMAKE_BUILD_TYPE="%BUILD_TYPE%" -DCMAKE_SH=CMAKE_SH-NOTFOUND -DZFP_WITH_OPENMP=OFF -DBUILD_CFP=ON .. + + - if "%COMPILER%"=="msvc" cmake --build . --config "%BUILD_TYPE%" + - if not "%COMPILER%"=="msvc" cmake --build . + + - ctest -V -C "%BUILD_TYPE%" + + - rm -rf ./* + + # build/test with OpenMP, with CFP custom namespace + - if "%COMPILER%"=="msvc" cmake -G "%GENERATOR%" -DCMAKE_BUILD_TYPE="%BUILD_TYPE%" -DBUILD_CFP=ON -DCFP_NAMESPACE=cfp2 .. + - if not "%COMPILER%"=="msvc" cmake -G "%GENERATOR%" -DCMAKE_BUILD_TYPE="%BUILD_TYPE%" -DCMAKE_SH=CMAKE_SH-NOTFOUND -DBUILD_CFP=ON -DCFP_NAMESPACE=cfp2 .. + + - if "%COMPILER%"=="msvc" cmake --build . --config "%BUILD_TYPE%" + - if not "%COMPILER%"=="msvc" cmake --build . 
+ + - ctest -V -C "%BUILD_TYPE%" + diff --git a/zfp/array/zfp/cache.h b/zfp/array/zfp/cache.h new file mode 100644 index 0000000000000000000000000000000000000000..280ac70812dd1f722fe141f4bc8d7304aad75990 --- /dev/null +++ b/zfp/array/zfp/cache.h @@ -0,0 +1,257 @@ +#ifndef ZFP_CACHE_H +#define ZFP_CACHE_H + +#include "memory.h" + +#ifdef ZFP_WITH_CACHE_PROFILE + // maintain stats on hit and miss rates + #include <iostream> +#endif + +// direct-mapped or two-way skew-associative write-back cache +template <class Line> +class Cache { +public: + // cache line index (zero is reserved for unused lines) + typedef uint Index; + + // cache tag containing line meta data + class Tag { + public: + Tag() : x(0) {} + + Tag(Index x, bool d) : x(2 * x + d) {} + + // cache line index + Index index() const { return x >> 1; } + + // is line dirty? + bool dirty() const { return x & 1; } + + // is line used? + bool used() const { return x != 0; } + + // mark line as dirty + void mark() { x |= 1u; } + + // mark line as unused + void clear() { x = 0; } + + protected: + Index x; + }; + + // sequential iterator for looping over cache lines + class const_iterator { + public: + friend class Cache; + class Pair { + public: + Pair(Line* l, Tag t) : line(l), tag(t) {} + Line* line; + Tag tag; + }; + const_iterator& operator++() + { + advance(); + return *this; + } + const_iterator operator++(int) + { + const_iterator iter = *this; + advance(); + return iter; + } + const Pair& operator*() const { return pair; } + const Pair* operator->() const { return &pair; } + operator const void*() const { return pair.line ? this : 0; } + + protected: + const_iterator(Cache* cache) : c(cache), pair(cache->line, cache->tag[0]) + { + if (!pair.tag.used()) + advance(); + } + void advance() + { + if (pair.line) { + uint i; + for (i = uint(pair.line - c->line) + 1; i <= c->mask && !c->tag[i].used(); i++); + pair = (i <= c->mask ? 
Pair(c->line + i, c->tag[i]) : Pair(0, Tag())); + } + } + Cache* c; + Pair pair; + }; + + // allocate cache with at least minsize lines + Cache(uint minsize = 0) : tag(0), line(0) + { + resize(minsize); +#ifdef ZFP_WITH_CACHE_PROFILE + std::cerr << "cache lines=" << mask + 1 << std::endl; + hit[0][0] = hit[1][0] = miss[0] = back[0] = 0; + hit[0][1] = hit[1][1] = miss[1] = back[1] = 0; +#endif + } + + // copy constructor--performs a deep copy + Cache(const Cache& c) : tag(0), line(0) + { + deep_copy(c); + } + + // destructor + ~Cache() + { + deallocate(tag); + deallocate(line); +#ifdef ZFP_WITH_CACHE_PROFILE + std::cerr << "cache R1=" << hit[0][0] << " R2=" << hit[1][0] << " RM=" << miss[0] << " RB=" << back[0] + << " W1=" << hit[0][1] << " W2=" << hit[1][1] << " WM=" << miss[1] << " WB=" << back[1] << std::endl; +#endif + } + + // assignment operator--performs a deep copy + Cache& operator=(const Cache& c) + { + if (this != &c) + deep_copy(c); + return *this; + } + + // cache size in number of lines + uint size() const { return mask + 1; } + + // change cache size to at least minsize lines (all contents will be lost) + void resize(uint minsize) + { + for (mask = minsize ? 
minsize - 1 : 1; mask & (mask + 1); mask |= mask + 1); + reallocate(tag, ((size_t)mask + 1) * sizeof(Tag), 0x100); + reallocate(line, ((size_t)mask + 1) * sizeof(Line), 0x100); + clear(); + } + + // look up cache line #x and return pointer to it if in the cache; + // otherwise return null + const Line* lookup(Index x) const + { + uint i = primary(x); + if (tag[i].index() == x) + return line + i; +#ifdef ZFP_WITH_CACHE_TWOWAY + uint j = secondary(x); + if (tag[j].index() == x) + return line + j; +#endif + return 0; + } + + // look up cache line #x and set ptr to where x is or should be stored; + // if the returned tag does not match x, then the caller must implement + // write-back (if the line is in use) and then fetch the requested line + Tag access(Line*& ptr, Index x, bool write) + { + uint i = primary(x); + if (tag[i].index() == x) { + ptr = line + i; + if (write) + tag[i].mark(); +#ifdef ZFP_WITH_CACHE_PROFILE + hit[0][write]++; +#endif + return tag[i]; + } +#ifdef ZFP_WITH_CACHE_TWOWAY + uint j = secondary(x); + if (tag[j].index() == x) { + ptr = line + j; + if (write) + tag[j].mark(); +#ifdef ZFP_WITH_CACHE_PROFILE + hit[1][write]++; +#endif + return tag[j]; + } + // cache line not found; prefer primary and not dirty slots + i = tag[j].used() && (!tag[i].dirty() || tag[j].dirty()) ? 
i : j; +#endif + ptr = line + i; + Tag t = tag[i]; + tag[i] = Tag(x, write); +#ifdef ZFP_WITH_CACHE_PROFILE + miss[write]++; + if (tag[i].dirty()) + back[write]++; +#endif + return t; + } + + // clear cache without writing back + void clear() + { + for (uint i = 0; i <= mask; i++) + tag[i].clear(); + } + + // flush cache line + void flush(const Line* l) + { + uint i = uint(l - line); + tag[i].clear(); + } + + // return iterator to first cache line + const_iterator first() { return const_iterator(this); } + +protected: + // perform a deep copy + void deep_copy(const Cache& c) + { + mask = c.mask; + clone(tag, c.tag, mask + 1, 0x100u); + clone(line, c.line, mask + 1, 0x100u); +#ifdef ZFP_WITH_CACHE_PROFILE + hit[0][0] = c.hit[0][0]; + hit[0][1] = c.hit[0][1]; + hit[1][0] = c.hit[1][0]; + hit[1][1] = c.hit[1][1]; + miss[0] = c.miss[0]; + miss[1] = c.miss[1]; + back[0] = c.back[0]; + back[1] = c.back[1]; +#endif + } + + uint primary(Index x) const { return x & mask; } + uint secondary(Index x) const + { +#ifdef ZFP_WITH_CACHE_FAST_HASH + // max entropy hash for 26- to 16-bit mapping (not full avalanche) + x -= x << 7; + x ^= x >> 16; + x -= x << 3; +#else + // Jenkins hash; see http://burtleburtle.net/bob/hash/integer.html + x -= x << 6; + x ^= x >> 17; + x -= x << 9; + x ^= x << 4; + x -= x << 3; + x ^= x << 10; + x ^= x >> 15; +#endif + return x & mask; + } + + Index mask; // cache line mask + Tag* tag; // cache line tags + Line* line; // actual decompressed cache lines +#ifdef ZFP_WITH_CACHE_PROFILE + uint64 hit[2][2]; // number of primary/secondary read/write hits + uint64 miss[2]; // number of read/write misses + uint64 back[2]; // number of write-backs due to read/writes +#endif +}; + +#endif diff --git a/zfp/array/zfp/iterator1.h b/zfp/array/zfp/iterator1.h new file mode 100644 index 0000000000000000000000000000000000000000..310e8e2dd397072fa2f49a5c7519c23cb49ef625 --- /dev/null +++ b/zfp/array/zfp/iterator1.h @@ -0,0 +1,38 @@ +// random access iterator that 
visits 1D array block by block; this class is nested within zfp::array1 +class iterator { +public: + // typedefs for STL compatibility + typedef Scalar value_type; + typedef ptrdiff_t difference_type; + typedef typename array1::reference reference; + typedef typename array1::pointer pointer; + typedef std::random_access_iterator_tag iterator_category; + + iterator() : ref(0, 0) {} + iterator operator=(const iterator& it) { ref.array = it.ref.array; ref.i = it.ref.i; return *this; } + reference operator*() const { return ref; } + reference operator[](difference_type d) const { return *operator+(d); } + iterator& operator++() { increment(); return *this; } + iterator& operator--() { decrement(); return *this; } + iterator operator++(int) { iterator it = *this; increment(); return it; } + iterator operator--(int) { iterator it = *this; decrement(); return it; } + iterator operator+=(difference_type d) { ref.i += d; return *this; } + iterator operator-=(difference_type d) { ref.i -= d; return *this; } + iterator operator+(difference_type d) const { return iterator(ref.array, ref.i + d); } + iterator operator-(difference_type d) const { return iterator(ref.array, ref.i - d); } + difference_type operator-(const iterator& it) const { return static_cast<difference_type>(ref.i) - static_cast<difference_type>(it.ref.i); } + bool operator==(const iterator& it) const { return ref.array == it.ref.array && ref.i == it.ref.i; } + bool operator!=(const iterator& it) const { return !operator==(it); } + bool operator<=(const iterator& it) const { return ref.array == it.ref.array && ref.i <= it.ref.i; } + bool operator>=(const iterator& it) const { return ref.array == it.ref.array && ref.i >= it.ref.i; } + bool operator<(const iterator& it) const { return !operator>=(it); } + bool operator>(const iterator& it) const { return !operator<=(it); } + uint i() const { return ref.i; } + +protected: + friend class array1; + explicit iterator(array1* array, uint i) : ref(array, i) {} + void 
increment() { ref.i++; } + void decrement() { ref.i--; } + reference ref; +}; diff --git a/zfp/array/zfp/iterator2.h b/zfp/array/zfp/iterator2.h new file mode 100644 index 0000000000000000000000000000000000000000..03052c4e6806a8b2dd09d04dd346f9504a9aeddd --- /dev/null +++ b/zfp/array/zfp/iterator2.h @@ -0,0 +1,42 @@ +// forward iterator that visits 2D array block by block; this class is nested within zfp::array2 +class iterator { +public: + // typedefs for STL compatibility + typedef Scalar value_type; + typedef ptrdiff_t difference_type; + typedef typename array2::reference reference; + typedef typename array2::pointer pointer; + typedef std::forward_iterator_tag iterator_category; + + iterator() : ref(0, 0, 0) {} + iterator operator=(const iterator& it) { ref.array = it.ref.array; ref.i = it.ref.i; ref.j = it.ref.j; return *this; } + reference operator*() const { return ref; } + iterator& operator++() { increment(); return *this; } + iterator operator++(int) { iterator it = *this; increment(); return it; } + bool operator==(const iterator& it) const { return ref.array == it.ref.array && ref.i == it.ref.i && ref.j == it.ref.j; } + bool operator!=(const iterator& it) const { return !operator==(it); } + uint i() const { return ref.i; } + uint j() const { return ref.j; } + +protected: + friend class array2; + explicit iterator(array2* array, uint i, uint j) : ref(array, i, j) {} + void increment() + { + ref.i++; + if (!(ref.i & 3u) || ref.i == ref.array->nx) { + ref.i = (ref.i - 1) & ~3u; + ref.j++; + if (!(ref.j & 3u) || ref.j == ref.array->ny) { + ref.j = (ref.j - 1) & ~3u; + // done with block; advance to next + if ((ref.i += 4) >= ref.array->nx) { + ref.i = 0; + if ((ref.j += 4) >= ref.array->ny) + ref.j = ref.array->ny; + } + } + } + } + reference ref; +}; diff --git a/zfp/array/zfp/iterator3.h b/zfp/array/zfp/iterator3.h new file mode 100644 index 0000000000000000000000000000000000000000..3889fc1cacdcd4c87d07930e391bc60ba94baa1b --- /dev/null +++ 
b/zfp/array/zfp/iterator3.h @@ -0,0 +1,50 @@ +// forward iterator that visits 3D array block by block; this class is nested within zfp::array3 +class iterator { +public: + // typedefs for STL compatibility + typedef Scalar value_type; + typedef ptrdiff_t difference_type; + typedef typename array3::reference reference; + typedef typename array3::pointer pointer; + typedef std::forward_iterator_tag iterator_category; + + iterator() : ref(0, 0, 0, 0) {} + iterator operator=(const iterator& it) { ref.array = it.ref.array; ref.i = it.ref.i; ref.j = it.ref.j; ref.k = it.ref.k; return *this; } + reference operator*() const { return ref; } + iterator& operator++() { increment(); return *this; } + iterator operator++(int) { iterator it = *this; increment(); return it; } + bool operator==(const iterator& it) const { return ref.array == it.ref.array && ref.i == it.ref.i && ref.j == it.ref.j && ref.k == it.ref.k; } + bool operator!=(const iterator& it) const { return !operator==(it); } + uint i() const { return ref.i; } + uint j() const { return ref.j; } + uint k() const { return ref.k; } + +protected: + friend class array3; + explicit iterator(array3* array, uint i, uint j, uint k) : ref(array, i, j, k) {} + void increment() + { + ref.i++; + if (!(ref.i & 3u) || ref.i == ref.array->nx) { + ref.i = (ref.i - 1) & ~3u; + ref.j++; + if (!(ref.j & 3u) || ref.j == ref.array->ny) { + ref.j = (ref.j - 1) & ~3u; + ref.k++; + if (!(ref.k & 3u) || ref.k == ref.array->nz) { + ref.k = (ref.k - 1) & ~3u; + // done with block; advance to next + if ((ref.i += 4) >= ref.array->nx) { + ref.i = 0; + if ((ref.j += 4) >= ref.array->ny) { + ref.j = 0; + if ((ref.k += 4) >= ref.array->nz) + ref.k = ref.array->nz; + } + } + } + } + } + } + reference ref; +}; diff --git a/zfp/array/zfp/memory.h b/zfp/array/zfp/memory.h new file mode 100644 index 0000000000000000000000000000000000000000..ea20b77f594781d002b52e3c6bfd219c3c781fd1 --- /dev/null +++ b/zfp/array/zfp/memory.h @@ -0,0 +1,60 @@ +#ifndef 
ZFP_MEMORY_H +#define ZFP_MEMORY_H + +#include <algorithm> +#include <cstdlib> +#include "zfp/types.h" + +// allocate size bytes with optional alignment +inline void* +allocate(size_t size, size_t alignment = 0) +{ +#if defined(__USE_XOPEN2K) && defined(ZFP_WITH_ALIGNED_ALLOC) + void* ptr; + if (alignment > 1) + posix_memalign(&ptr, alignment, size); + else + ptr = malloc(size); + return ptr; +#else + return new uchar[size]; +#endif +} + +// deallocate memory pointed to by ptr +template <typename T> +inline void +deallocate(T* ptr) +{ +#if defined(__USE_XOPEN2K) && defined(ZFP_WITH_ALIGNED_ALLOC) + if (ptr) + free(ptr); +#else + delete[] ptr; +#endif +} + +// reallocate size bytes with optional alignment +template <typename T> +inline void +reallocate(T*& ptr, size_t size, size_t alignment = 0) +{ + deallocate(ptr); + ptr = static_cast<T*>(allocate(size, alignment)); +} + +// clone array 'T src[count]' with optional alignment +template <typename T> +inline void +clone(T*& dst, const T* src, size_t count, size_t alignment = 0) +{ + deallocate(dst); + if (src) { + dst = static_cast<T*>(allocate(count * sizeof(T), alignment)); + std::copy(src, src + count, dst); + } + else + dst = 0; +} + +#endif diff --git a/zfp/array/zfp/pointer1.h b/zfp/array/zfp/pointer1.h new file mode 100644 index 0000000000000000000000000000000000000000..f58557c0df368b5672164ad61c268b6bcdc02b21 --- /dev/null +++ b/zfp/array/zfp/pointer1.h @@ -0,0 +1,30 @@ +// pointer to a 1D array element; this class is nested within zfp::array1 +class pointer { +public: + pointer() : ref(0, 0) {} + pointer operator=(const pointer& p) { ref.array = p.ref.array; ref.i = p.ref.i; return *this; } + reference operator*() const { return ref; } + reference operator[](ptrdiff_t d) const { return *operator+(d); } + pointer& operator++() { increment(); return *this; } + pointer& operator--() { decrement(); return *this; } + pointer operator++(int) { pointer p = *this; increment(); return p; } + pointer operator--(int) { 
pointer p = *this; decrement(); return p; } + pointer operator+=(ptrdiff_t d) { ref.i += d; return *this; } + pointer operator-=(ptrdiff_t d) { ref.i -= d; return *this; } + pointer operator+(ptrdiff_t d) const { pointer p = *this; p += d; return p; } + pointer operator-(ptrdiff_t d) const { pointer p = *this; p -= d; return p; } + ptrdiff_t operator-(const pointer& p) const { return index() - p.index(); } + bool operator==(const pointer& p) const { return ref.array == p.ref.array && ref.i == p.ref.i; } + bool operator!=(const pointer& p) const { return !operator==(p); } + +protected: + friend class array1; + friend class reference; + explicit pointer(reference r) : ref(r) {} + explicit pointer(array1* array, uint i) : ref(array, i) {} + ptrdiff_t index() const { return ref.i; } + void set(ptrdiff_t index) { ref.i = index; } + void increment() { ref.i++; } + void decrement() { ref.i--; } + reference ref; +}; diff --git a/zfp/array/zfp/pointer2.h b/zfp/array/zfp/pointer2.h new file mode 100644 index 0000000000000000000000000000000000000000..dcdb518fb90072a282dccb4f61a36d5d289667b2 --- /dev/null +++ b/zfp/array/zfp/pointer2.h @@ -0,0 +1,42 @@ +// pointer to a 2D array element; this class is nested within zfp::array2 +class pointer { +public: + pointer() : ref(0, 0, 0) {} + pointer operator=(const pointer& p) { ref.array = p.ref.array; ref.i = p.ref.i; ref.j = p.ref.j; return *this; } + reference operator*() const { return ref; } + reference operator[](ptrdiff_t d) const { return *operator+(d); } + pointer& operator++() { increment(); return *this; } + pointer& operator--() { decrement(); return *this; } + pointer operator++(int) { pointer p = *this; increment(); return p; } + pointer operator--(int) { pointer p = *this; decrement(); return p; } + pointer operator+=(ptrdiff_t d) { set(index() + d); return *this; } + pointer operator-=(ptrdiff_t d) { set(index() - d); return *this; } + pointer operator+(ptrdiff_t d) const { pointer p = *this; p += d; return p; } + 
pointer operator-(ptrdiff_t d) const { pointer p = *this; p -= d; return p; } + ptrdiff_t operator-(const pointer& p) const { return index() - p.index(); } + bool operator==(const pointer& p) const { return ref.array == p.ref.array && ref.i == p.ref.i && ref.j == p.ref.j; } + bool operator!=(const pointer& p) const { return !operator==(p); } + +protected: + friend class array2; + friend class reference; + explicit pointer(reference r) : ref(r) {} + explicit pointer(array2* array, uint i, uint j) : ref(array, i, j) {} + ptrdiff_t index() const { return ref.i + ref.array->nx * ref.j; } + void set(ptrdiff_t index) { ref.array->ij(ref.i, ref.j, index); } + void increment() + { + if (++ref.i == ref.array->nx) { + ref.i = 0; + ref.j++; + } + } + void decrement() + { + if (!ref.i--) { + ref.i = ref.array->nx - 1; + ref.j--; + } + } + reference ref; +}; diff --git a/zfp/array/zfp/pointer3.h b/zfp/array/zfp/pointer3.h new file mode 100644 index 0000000000000000000000000000000000000000..091af6044b45cf96db396edc44a3ee7bd5b2c5c6 --- /dev/null +++ b/zfp/array/zfp/pointer3.h @@ -0,0 +1,48 @@ +// pointer to a 3D array element; this class is nested within zfp::array3 +class pointer { +public: + pointer() : ref(0, 0, 0, 0) {} + pointer operator=(const pointer& p) { ref.array = p.ref.array; ref.i = p.ref.i; ref.j = p.ref.j; ref.k = p.ref.k; return *this; } + reference operator*() const { return ref; } + reference operator[](ptrdiff_t d) const { return *operator+(d); } + pointer& operator++() { increment(); return *this; } + pointer& operator--() { decrement(); return *this; } + pointer operator++(int) { pointer p = *this; increment(); return p; } + pointer operator--(int) { pointer p = *this; decrement(); return p; } + pointer operator+=(ptrdiff_t d) { set(index() + d); return *this; } + pointer operator-=(ptrdiff_t d) { set(index() - d); return *this; } + pointer operator+(ptrdiff_t d) const { pointer p = *this; p += d; return p; } + pointer operator-(ptrdiff_t d) const { pointer p 
= *this; p -= d; return p; } + ptrdiff_t operator-(const pointer& p) const { return index() - p.index(); } + bool operator==(const pointer& p) const { return ref.array == p.ref.array && ref.i == p.ref.i && ref.j == p.ref.j && ref.k == p.ref.k; } + bool operator!=(const pointer& p) const { return !operator==(p); } + +protected: + friend class array3; + friend class reference; + explicit pointer(reference r) : ref(r) {} + explicit pointer(array3* array, uint i, uint j, uint k) : ref(array, i, j, k) {} + ptrdiff_t index() const { return ref.i + ref.array->nx * (ref.j + ref.array->ny * ref.k); } + void set(ptrdiff_t index) { ref.array->ijk(ref.i, ref.j, ref.k, index); } + void increment() + { + if (++ref.i == ref.array->nx) { + ref.i = 0; + if (++ref.j == ref.array->ny) { + ref.j = 0; + ref.k++; + } + } + } + void decrement() + { + if (!ref.i--) { + ref.i = ref.array->nx - 1; + if (!ref.j--) { + ref.j = ref.array->ny - 1; + ref.k--; + } + } + } + reference ref; +}; diff --git a/zfp/array/zfp/reference1.h b/zfp/array/zfp/reference1.h new file mode 100644 index 0000000000000000000000000000000000000000..99f2e6a67643d0fb47db27ccaccdd418c71cf301 --- /dev/null +++ b/zfp/array/zfp/reference1.h @@ -0,0 +1,27 @@ +// reference to a 1D array element; this class is nested within zfp::array1 +class reference { +public: + operator Scalar() const { return array->get(i); } + reference operator=(const reference& r) { array->set(i, r.operator Scalar()); return *this; } + reference operator=(Scalar val) { array->set(i, val); return *this; } + reference operator+=(Scalar val) { array->add(i, val); return *this; } + reference operator-=(Scalar val) { array->sub(i, val); return *this; } + reference operator*=(Scalar val) { array->mul(i, val); return *this; } + reference operator/=(Scalar val) { array->div(i, val); return *this; } + pointer operator&() const { return pointer(*this); } + // swap two array elements via proxy references + friend void swap(reference a, reference b) + { + Scalar 
x = a.operator Scalar(); + Scalar y = b.operator Scalar(); + b.operator=(x); + a.operator=(y); + } + +protected: + friend class array1; + friend class iterator; + explicit reference(array1* array, uint i) : array(array), i(i) {} + array1* array; + uint i; +}; diff --git a/zfp/array/zfp/reference2.h b/zfp/array/zfp/reference2.h new file mode 100644 index 0000000000000000000000000000000000000000..76a0bd3b10158e015203e644a74070159703eed9 --- /dev/null +++ b/zfp/array/zfp/reference2.h @@ -0,0 +1,27 @@ +// reference to a 2D array element; this class is nested within zfp::array2 +class reference { +public: + operator Scalar() const { return array->get(i, j); } + reference operator=(const reference& r) { array->set(i, j, r.operator Scalar()); return *this; } + reference operator=(Scalar val) { array->set(i, j, val); return *this; } + reference operator+=(Scalar val) { array->add(i, j, val); return *this; } + reference operator-=(Scalar val) { array->sub(i, j, val); return *this; } + reference operator*=(Scalar val) { array->mul(i, j, val); return *this; } + reference operator/=(Scalar val) { array->div(i, j, val); return *this; } + pointer operator&() const { return pointer(*this); } + // swap two array elements via proxy references + friend void swap(reference a, reference b) + { + Scalar x = a.operator Scalar(); + Scalar y = b.operator Scalar(); + b.operator=(x); + a.operator=(y); + } + +protected: + friend class array2; + friend class iterator; + explicit reference(array2* array, uint i, uint j) : array(array), i(i), j(j) {} + array2* array; + uint i, j; +}; diff --git a/zfp/array/zfp/reference3.h b/zfp/array/zfp/reference3.h new file mode 100644 index 0000000000000000000000000000000000000000..91175e18033c1c1a578346bc0c5a0342e646853b --- /dev/null +++ b/zfp/array/zfp/reference3.h @@ -0,0 +1,27 @@ +// reference to a 3D array element; this class is nested within zfp::array3 +class reference { +public: + operator Scalar() const { return array->get(i, j, k); } + reference 
operator=(const reference& r) { array->set(i, j, k, r.operator Scalar()); return *this; } + reference operator=(Scalar val) { array->set(i, j, k, val); return *this; } + reference operator+=(Scalar val) { array->add(i, j, k, val); return *this; } + reference operator-=(Scalar val) { array->sub(i, j, k, val); return *this; } + reference operator*=(Scalar val) { array->mul(i, j, k, val); return *this; } + reference operator/=(Scalar val) { array->div(i, j, k, val); return *this; } + pointer operator&() const { return pointer(*this); } + // swap two array elements via proxy references + friend void swap(reference a, reference b) + { + Scalar x = a.operator Scalar(); + Scalar y = b.operator Scalar(); + b.operator=(x); + a.operator=(y); + } + +protected: + friend class array3; + friend class iterator; + explicit reference(array3* array, uint i, uint j, uint k) : array(array), i(i), j(j), k(k) {} + array3* array; + uint i, j, k; +}; diff --git a/zfp/array/zfp/view1.h b/zfp/array/zfp/view1.h new file mode 100644 index 0000000000000000000000000000000000000000..6129ae5ee06431720931c27bbd0d7d6726b5dee3 --- /dev/null +++ b/zfp/array/zfp/view1.h @@ -0,0 +1,291 @@ +// 1D array views; these classes are nested within zfp::array1 + +// abstract view of 1D array (base class) +class preview { +public: + // rate in bits per value + double rate() const { return array->rate(); } + + // dimensions of (sub)array + size_t size() const { return size_t(nx); } + + // local to global array index + uint global_x(uint i) const { return x + i; } + +protected: + // construction and assignment--perform shallow copy of (sub)array + explicit preview(array1* array) : array(array), x(0), nx(array->nx) {} + explicit preview(array1* array, uint x, uint nx) : array(array), x(x), nx(nx) {} + preview& operator=(array1* a) + { + array = a; + x = 0; + nx = a->nx; + return *this; + } + + array1* array; // underlying container + uint x; // offset into array + uint nx; // dimensions of subarray +}; + +// 
generic read-only view into a rectangular subset of a 1D array +class const_view : public preview { +protected: + using preview::array; + using preview::x; + using preview::nx; +public: + // construction--perform shallow copy of (sub)array + const_view(array1* array) : preview(array) {} + const_view(array1* array, uint x, uint nx) : preview(array, x, nx) {} + + // dimensions of (sub)array + uint size_x() const { return nx; } + + // [i] accessor + Scalar operator[](uint index) const { return array->get(x + index); } + + // (i) accessor + Scalar operator()(uint i) const { return array->get(x + i); } +}; + +// generic read-write view into a rectangular subset of a 1D array +class view : public const_view { +protected: + using preview::array; + using preview::x; + using preview::nx; +public: + // construction--perform shallow copy of (sub)array + view(array1* array) : const_view(array) {} + view(array1* array, uint x, uint nx) : const_view(array, x, nx) {} + + // [i] accessor from base class + using const_view::operator[]; + + // (i) accessor from base class + using const_view::operator(); + + // [i] mutator + reference operator[](uint index) { return reference(array, x + index); } + + // (i) mutator + reference operator()(uint i) { return reference(array, x + i); } +}; + +// thread-safe read-only view of 1D (sub)array with private cache +class private_const_view : public preview { +protected: + using preview::array; + using preview::x; + using preview::nx; +public: + // construction--perform shallow copy of (sub)array + private_const_view(array1* array) : + preview(array), + cache(array->cache.size()) + { + init(); + } + private_const_view(array1* array, uint x, uint nx) : + preview(array, x, nx), + cache(array->cache.size()) + { + init(); + } + + // destructor + ~private_const_view() + { + stream_close(zfp->stream); + zfp_stream_close(zfp); + } + + // dimensions of (sub)array + uint size_x() const { return nx; } + + // cache size in number of bytes + size_t 
cache_size() const { return cache.size() * sizeof(CacheLine); } + + // set minimum cache size in bytes (array dimensions must be known) + void set_cache_size(size_t csize) + { + cache.resize(array->lines(csize, nx)); + } + + // empty cache without compressing modified cached blocks + void clear_cache() const { cache.clear(); } + + // (i) accessor + Scalar operator()(uint i) const { return get(x + i); } + +protected: + // cache line representing one block of decompressed values + class CacheLine { + public: + const Scalar& operator()(uint i) const { return a[index(i)]; } + Scalar& operator()(uint i) { return a[index(i)]; } + const Scalar* data() const { return a; } + Scalar* data() { return a; } + protected: + static uint index(uint i) { return i & 3u; } + Scalar a[4]; + }; + + // copy private data + void init() + { + // copy compressed stream + zfp = zfp_stream_open(0); + *zfp = *array->zfp; + // copy bit stream + zfp->stream = stream_clone(array->zfp->stream); + } + + // inspector + const Scalar& get(uint i) const + { + const CacheLine* p = line(i); + return (*p)(i); + } + + // return cache line for i; may require write-back and fetch + CacheLine* line(uint i) const + { + CacheLine* p = 0; + uint b = array->block(i); + typename Cache<CacheLine>::Tag t = cache.access(p, b + 1, false); + uint c = t.index() - 1; + // fetch cache line; no writeback possible since view is read-only + if (c != b) + decode(b, p->data()); + return p; + } + + // decode block with given index + void decode(uint index, Scalar* block) const + { + stream_rseek(zfp->stream, index * array->blkbits); + Codec::decode_block_1(zfp, block, array->shape ? 
array->shape[index] : 0); + } + + zfp_stream* zfp; // stream of compressed blocks + mutable Cache<CacheLine> cache; // cache of decompressed blocks +}; + +// thread-safe read-write view of private 1D (sub)array +class private_view : public private_const_view { +protected: + using preview::array; + using preview::x; + using preview::nx; + using private_const_view::zfp; + using private_const_view::cache; + using private_const_view::init; + using private_const_view::decode; + class view_reference; + typedef typename private_const_view::CacheLine CacheLine; +public: + // construction--perform shallow copy of (sub)array + private_view(array1* array) : private_const_view(array) {} + private_view(array1* array, uint x, uint nx) : private_const_view(array, x, nx) {} + + // partition view into count block-aligned pieces, with 0 <= index < count + void partition(uint index, uint count) + { + partition(x, nx, index, count); + } + + // flush cache by compressing all modified cached blocks + void flush_cache() const + { + for (typename Cache<CacheLine>::const_iterator p = cache.first(); p; p++) { + if (p->tag.dirty()) { + uint b = p->tag.index() - 1; + encode(b, p->line->data()); + } + cache.flush(p->line); + } + } + + // (i) accessor from base class + using private_const_view::operator(); + + // (i) mutator + view_reference operator()(uint i) { return view_reference(this, x + i); } + +protected: + class view_reference { + public: + operator Scalar() const { return view->get(i); } + view_reference operator=(const view_reference& r) { view->set(i, r.operator Scalar()); return *this; } + view_reference operator=(Scalar val) { view->set(i, val); return *this; } + view_reference operator+=(Scalar val) { view->add(i, val); return *this; } + view_reference operator-=(Scalar val) { view->sub(i, val); return *this; } + view_reference operator*=(Scalar val) { view->mul(i, val); return *this; } + view_reference operator/=(Scalar val) { view->div(i, val); return *this; } + // swap two 
array elements via proxy references + friend void swap(view_reference a, view_reference b) + { + Scalar x = a.operator Scalar(); + Scalar y = b.operator Scalar(); + b.operator=(x); + a.operator=(y); + } + + protected: + friend class private_view; + explicit view_reference(private_view* view, uint i) : view(view), i(i) {} + private_view* view; + uint i; + }; + + // block-aligned partition of [offset, offset + size): index out of count + static void partition(uint& offset, uint& size, uint index, uint count) + { + uint bmin = offset / 4; + uint bmax = (offset + size + 3) / 4; + uint xmin = std::max(offset + 0, 4 * (bmin + (bmax - bmin) * (index + 0) / count)); + uint xmax = std::min(offset + size, 4 * (bmin + (bmax - bmin) * (index + 1) / count)); + offset = xmin; + size = xmax - xmin; + } + + // mutator + void set(uint i, Scalar val) + { + CacheLine* p = line(i, true); + (*p)(i) = val; + } + + // in-place updates + void add(uint i, Scalar val) { (*line(i, true))(i) += val; } + void sub(uint i, Scalar val) { (*line(i, true))(i) -= val; } + void mul(uint i, Scalar val) { (*line(i, true))(i) *= val; } + void div(uint i, Scalar val) { (*line(i, true))(i) /= val; } + + // return cache line for i; may require write-back and fetch + CacheLine* line(uint i, bool write) const + { + CacheLine* p = 0; + uint b = array->block(i); + typename Cache<CacheLine>::Tag t = cache.access(p, b + 1, write); + uint c = t.index() - 1; + if (c != b) { + // write back occupied cache line if it is dirty + if (t.dirty()) + encode(c, p->data()); + decode(b, p->data()); + } + return p; + } + + // encode block with given index + void encode(uint index, const Scalar* block) const + { + stream_wseek(zfp->stream, index * array->blkbits); + Codec::encode_block_1(zfp, block, array->shape ? 
array->shape[index] : 0); + stream_flush(zfp->stream); + } +}; diff --git a/zfp/array/zfp/view2.h b/zfp/array/zfp/view2.h new file mode 100644 index 0000000000000000000000000000000000000000..fcfdf8cad8f05594ca3963541590eec81f305888 --- /dev/null +++ b/zfp/array/zfp/view2.h @@ -0,0 +1,393 @@ +// 2D array views; these classes are nested within zfp::array2 + +// abstract view of 2D array (base class) +class preview { +public: + // rate in bits per value + double rate() const { return array->rate(); } + + // dimensions of (sub)array + size_t size() const { return size_t(nx) * size_t(ny); } + + // local to global array indices + uint global_x(uint i) const { return x + i; } + uint global_y(uint j) const { return y + j; } + +protected: + // construction and assignment--perform shallow copy of (sub)array + explicit preview(array2* array) : array(array), x(0), y(0), nx(array->nx), ny(array->ny) {} + explicit preview(array2* array, uint x, uint y, uint nx, uint ny) : array(array), x(x), y(y), nx(nx), ny(ny) {} + preview& operator=(array2* a) + { + array = a; + x = y = 0; + nx = a->nx; + ny = a->ny; + return *this; + } + + array2* array; // underlying container + uint x, y; // offset into array + uint nx, ny; // dimensions of subarray +}; + +// generic read-only view into a rectangular subset of a 2D array +class const_view : public preview { +protected: + using preview::array; + using preview::x; + using preview::y; + using preview::nx; + using preview::ny; +public: + // construction--perform shallow copy of (sub)array + const_view(array2* array) : preview(array) {} + const_view(array2* array, uint x, uint y, uint nx, uint ny) : preview(array, x, y, nx, ny) {} + + // dimensions of (sub)array + uint size_x() const { return nx; } + uint size_y() const { return ny; } + + // (i, j) accessor + Scalar operator()(uint i, uint j) const { return array->get(x + i, y + j); } +}; + +// generic read-write view into a rectangular subset of a 2D array +class view : public const_view { 
+protected: + using preview::array; + using preview::x; + using preview::y; + using preview::nx; + using preview::ny; +public: + // construction--perform shallow copy of (sub)array + view(array2* array) : const_view(array) {} + view(array2* array, uint x, uint y, uint nx, uint ny) : const_view(array, x, y, nx, ny) {} + + // (i, j) accessor from base class + using const_view::operator(); + + // (i, j) mutator + reference operator()(uint i, uint j) { return reference(array, x + i, y + j); } +}; + +// flat view of 2D array (operator[] returns scalar) +class flat_view : public view { +protected: + using preview::array; + using preview::x; + using preview::y; + using preview::nx; + using preview::ny; +public: + // construction--perform shallow copy of (sub)array + flat_view(array2* array) : view(array) {} + flat_view(array2* array, uint x, uint y, uint nx, uint ny) : view(array, x, y, nx, ny) {} + + // convert (i, j) index to flat index + uint index(uint i, uint j) const { return i + nx * j; } + + // convert flat index to (i, j) index + void ij(uint& i, uint& j, uint index) const + { + i = index % nx; index /= nx; + j = index; + } + + // flat index accessors + Scalar operator[](uint index) const + { + uint i, j; + ij(i, j, index); + return array->get(x + i, y + j); + } + reference operator[](uint index) + { + uint i, j; + ij(i, j, index); + return reference(array, x + i, y + j); + } +}; + +// forward declaration of friends +class nested_view1; +class nested_view2; + +// nested view into a 1D rectangular subset of a 2D array +class nested_view1 : public preview { +protected: + using preview::array; + using preview::x; + using preview::y; + using preview::nx; + using preview::ny; +public: + // dimensions of (sub)array + uint size_x() const { return nx; } + + // [i] accessor and mutator + Scalar operator[](uint index) const { return array->get(x + index, y); } + reference operator[](uint index) { return reference(array, x + index, y); } + + // (i) accessor and mutator + 
Scalar operator()(uint i) const { return array->get(x + i, y); } + reference operator()(uint i) { return reference(array, x + i, y); } + +protected: + // construction--perform shallow copy of (sub)array + friend class nested_view2; + explicit nested_view1(array2* array) : preview(array) {} + explicit nested_view1(array2* array, uint x, uint y, uint nx, uint ny) : preview(array, x, y, nx, ny) {} +}; + +// nested view into a 2D rectangular subset of a 2D array +class nested_view2 : public preview { +protected: + using preview::array; + using preview::x; + using preview::y; + using preview::nx; + using preview::ny; +public: + // construction--perform shallow copy of (sub)array + nested_view2(array2* array) : preview(array) {} + nested_view2(array2* array, uint x, uint y, uint nx, uint ny) : preview(array, x, y, nx, ny) {} + + // dimensions of (sub)array + uint size_x() const { return nx; } + uint size_y() const { return ny; } + + // 1D view + nested_view1 operator[](uint index) const { return nested_view1(array, x, y + index, nx, 1); } + + // (i, j) accessor and mutator + Scalar operator()(uint i, uint j) const { return array->get(x + i, y + j); } + reference operator()(uint i, uint j) { return reference(array, x + i, y + j); } +}; + +typedef nested_view2 nested_view; + +// thread-safe read-only view of 2D (sub)array with private cache +class private_const_view : public preview { +protected: + using preview::array; + using preview::x; + using preview::y; + using preview::nx; + using preview::ny; +public: + // construction--perform shallow copy of (sub)array + private_const_view(array2* array) : + preview(array), + cache(array->cache.size()) + { + init(); + } + private_const_view(array2* array, uint x, uint y, uint nx, uint ny) : + preview(array, x, y, nx, ny), + cache(array->cache.size()) + { + init(); + } + + // destructor + ~private_const_view() + { + stream_close(zfp->stream); + zfp_stream_close(zfp); + } + + // dimensions of (sub)array + uint size_x() const { 
return nx; } + uint size_y() const { return ny; } + + // cache size in number of bytes + size_t cache_size() const { return cache.size() * sizeof(CacheLine); } + + // set minimum cache size in bytes (array dimensions must be known) + void set_cache_size(size_t csize) + { + cache.resize(array->lines(csize, nx, ny)); + } + + // empty cache without compressing modified cached blocks + void clear_cache() const { cache.clear(); } + + // (i, j) accessor + Scalar operator()(uint i, uint j) const { return get(x + i, y + j); } + +protected: + // cache line representing one block of decompressed values + class CacheLine { + public: + const Scalar& operator()(uint i, uint j) const { return a[index(i, j)]; } + Scalar& operator()(uint i, uint j) { return a[index(i, j)]; } + const Scalar* data() const { return a; } + Scalar* data() { return a; } + protected: + static uint index(uint i, uint j) { return (i & 3u) + 4 * (j & 3u); } + Scalar a[16]; + }; + + // copy private data + void init() + { + // copy compressed stream + zfp = zfp_stream_open(0); + *zfp = *array->zfp; + // copy bit stream + zfp->stream = stream_clone(array->zfp->stream); + } + + // inspector + const Scalar& get(uint i, uint j) const + { + const CacheLine* p = line(i, j); + return (*p)(i, j); + } + + // return cache line for (i, j); may require write-back and fetch + CacheLine* line(uint i, uint j) const + { + CacheLine* p = 0; + uint b = array->block(i, j); + typename Cache<CacheLine>::Tag t = cache.access(p, b + 1, false); + uint c = t.index() - 1; + // fetch cache line; no writeback possible since view is read-only + if (c != b) + decode(b, p->data()); + return p; + } + + // decode block with given index + void decode(uint index, Scalar* block) const + { + stream_rseek(zfp->stream, index * array->blkbits); + Codec::decode_block_2(zfp, block, array->shape ? 
array->shape[index] : 0); + } + + zfp_stream* zfp; // stream of compressed blocks + mutable Cache<CacheLine> cache; // cache of decompressed blocks +}; + +// thread-safe read-write view of private 2D (sub)array +class private_view : public private_const_view { +protected: + using preview::array; + using preview::x; + using preview::y; + using preview::nx; + using preview::ny; + using private_const_view::zfp; + using private_const_view::cache; + using private_const_view::init; + using private_const_view::decode; + class view_reference; + typedef typename private_const_view::CacheLine CacheLine; +public: + // construction--perform shallow copy of (sub)array + private_view(array2* array) : private_const_view(array) {} + private_view(array2* array, uint x, uint y, uint nx, uint ny) : private_const_view(array, x, y, nx, ny) {} + + // partition view into count block-aligned pieces, with 0 <= index < count + void partition(uint index, uint count) + { + if (nx > ny) + partition(x, nx, index, count); + else + partition(y, ny, index, count); + } + + // flush cache by compressing all modified cached blocks + void flush_cache() const + { + for (typename Cache<CacheLine>::const_iterator p = cache.first(); p; p++) { + if (p->tag.dirty()) { + uint b = p->tag.index() - 1; + encode(b, p->line->data()); + } + cache.flush(p->line); + } + } + + // (i, j) accessor from base class + using private_const_view::operator(); + + // (i, j) mutator + view_reference operator()(uint i, uint j) { return view_reference(this, x + i, y + j); } + +protected: + class view_reference { + public: + operator Scalar() const { return view->get(i, j); } + view_reference operator=(const view_reference& r) { view->set(i, j, r.operator Scalar()); return *this; } + view_reference operator=(Scalar val) { view->set(i, j, val); return *this; } + view_reference operator+=(Scalar val) { view->add(i, j, val); return *this; } + view_reference operator-=(Scalar val) { view->sub(i, j, val); return *this; } + 
view_reference operator*=(Scalar val) { view->mul(i, j, val); return *this; } + view_reference operator/=(Scalar val) { view->div(i, j, val); return *this; } + // swap two array elements via proxy references + friend void swap(view_reference a, view_reference b) + { + Scalar x = a.operator Scalar(); + Scalar y = b.operator Scalar(); + b.operator=(x); + a.operator=(y); + } + + protected: + friend class private_view; + explicit view_reference(private_view* view, uint i, uint j) : view(view), i(i), j(j) {} + private_view* view; + uint i, j; + }; + + // block-aligned partition of [offset, offset + size): index out of count + static void partition(uint& offset, uint& size, uint index, uint count) + { + uint bmin = offset / 4; + uint bmax = (offset + size + 3) / 4; + uint xmin = std::max(offset + 0, 4 * (bmin + (bmax - bmin) * (index + 0) / count)); + uint xmax = std::min(offset + size, 4 * (bmin + (bmax - bmin) * (index + 1) / count)); + offset = xmin; + size = xmax - xmin; + } + + // mutator + void set(uint i, uint j, Scalar val) + { + CacheLine* p = line(i, j, true); + (*p)(i, j) = val; + } + + // in-place updates + void add(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) += val; } + void sub(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) -= val; } + void mul(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) *= val; } + void div(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) /= val; } + + // return cache line for (i, j); may require write-back and fetch + CacheLine* line(uint i, uint j, bool write) const + { + CacheLine* p = 0; + uint b = array->block(i, j); + typename Cache<CacheLine>::Tag t = cache.access(p, b + 1, write); + uint c = t.index() - 1; + if (c != b) { + // write back occupied cache line if it is dirty + if (t.dirty()) + encode(c, p->data()); + decode(b, p->data()); + } + return p; + } + + // encode block with given index + void encode(uint index, const Scalar* block) const + { + stream_wseek(zfp->stream, index * 
array->blkbits); + Codec::encode_block_2(zfp, block, array->shape ? array->shape[index] : 0); + stream_flush(zfp->stream); + } +}; diff --git a/zfp/array/zfp/view3.h b/zfp/array/zfp/view3.h new file mode 100644 index 0000000000000000000000000000000000000000..b1bf457fc8a1f144ab67e4fc9f0caf1221eb494a --- /dev/null +++ b/zfp/array/zfp/view3.h @@ -0,0 +1,445 @@ +// 3D array views; these classes are nested within zfp::array3 + +// abstract view of 3D array (base class) +class preview { +public: + // rate in bits per value + double rate() const { return array->rate(); } + + // dimensions of (sub)array + size_t size() const { return size_t(nx) * size_t(ny) * size_t(nz); } + + // local to global array indices + uint global_x(uint i) const { return x + i; } + uint global_y(uint j) const { return y + j; } + uint global_z(uint k) const { return z + k; } + +protected: + // construction and assignment--perform shallow copy of (sub)array + explicit preview(array3* array) : array(array), x(0), y(0), z(0), nx(array->nx), ny(array->ny), nz(array->nz) {} + explicit preview(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : array(array), x(x), y(y), z(z), nx(nx), ny(ny), nz(nz) {} + preview& operator=(array3* a) + { + array = a; + x = y = z = 0; + nx = a->nx; + ny = a->ny; + nz = a->nz; + return *this; + } + + array3* array; // underlying container + uint x, y, z; // offset into array + uint nx, ny, nz; // dimensions of subarray +}; + +// generic read-only view into a rectangular subset of a 3D array +class const_view : public preview { +protected: + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::nx; + using preview::ny; + using preview::nz; +public: + // construction--perform shallow copy of (sub)array + const_view(array3* array) : preview(array) {} + const_view(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : preview(array, x, y, z, nx, ny, nz) {} + + // dimensions of (sub)array + uint size_x() 
const { return nx; } + uint size_y() const { return ny; } + uint size_z() const { return nz; } + + // (i, j, k) accessor + Scalar operator()(uint i, uint j, uint k) const { return array->get(x + i, y + j, z + k); } +}; + +// generic read-write view into a rectangular subset of a 3D array +class view : public const_view { +protected: + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::nx; + using preview::ny; + using preview::nz; +public: + // construction--perform shallow copy of (sub)array + view(array3* array) : const_view(array) {} + view(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : const_view(array, x, y, z, nx, ny, nz) {} + + // (i, j, k) accessor from base class + using const_view::operator(); + + // (i, j, k) mutator + reference operator()(uint i, uint j, uint k) { return reference(array, x + i, y + j, z + k); } +}; + +// flat view of 3D array (operator[] returns scalar) +class flat_view : public view { +protected: + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::nx; + using preview::ny; + using preview::nz; +public: + // construction--perform shallow copy of (sub)array + flat_view(array3* array) : view(array) {} + flat_view(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : view(array, x, y, z, nx, ny, nz) {} + + // convert (i, j, k) index to flat index + uint index(uint i, uint j, uint k) const { return i + nx * (j + ny * k); } + + // convert flat index to (i, j, k) index + void ijk(uint& i, uint& j, uint& k, uint index) const + { + i = index % nx; index /= nx; + j = index % ny; index /= ny; + k = index; + } + + // flat index accessors + Scalar operator[](uint index) const + { + uint i, j, k; + ijk(i, j, k, index); + return array->get(x + i, y + j, z + k); + } + reference operator[](uint index) + { + uint i, j, k; + ijk(i, j, k, index); + return reference(array, x + i, y + j, z + k); + } +}; + +// forward declaration 
of friends +class nested_view1; +class nested_view2; +class nested_view3; + +// nested view into a 1D rectangular subset of a 3D array +class nested_view1 : public preview { +protected: + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::nx; + using preview::ny; + using preview::nz; +public: + // dimensions of (sub)array + uint size_x() const { return nx; } + + // [i] accessor and mutator + Scalar operator[](uint index) const { return array->get(x + index, y, z); } + reference operator[](uint index) { return reference(array, x + index, y, z); } + + // (i) accessor and mutator + Scalar operator()(uint i) const { return array->get(x + i, y, z); } + reference operator()(uint i) { return reference(array, x + i, y, z); } + +protected: + // construction--perform shallow copy of (sub)array + friend class nested_view2; + explicit nested_view1(array3* array) : preview(array) {} + explicit nested_view1(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : preview(array, x, y, z, nx, ny, nz) {} +}; + +// nested view into a 2D rectangular subset of a 3D array +class nested_view2 : public preview { +protected: + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::nx; + using preview::ny; + using preview::nz; +public: + // dimensions of (sub)array + uint size_x() const { return nx; } + uint size_y() const { return ny; } + + // 1D view + nested_view1 operator[](uint index) const { return nested_view1(array, x, y + index, z, nx, 1, 1); } + + // (i, j) accessor and mutator + Scalar operator()(uint i, uint j) const { return array->get(x + i, y + j, z); } + reference operator()(uint i, uint j) { return reference(array, x + i, y + j, z); } + +protected: + // construction--perform shallow copy of (sub)array + friend class nested_view3; + explicit nested_view2(array3* array) : preview(array) {} + explicit nested_view2(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint 
nz) : preview(array, x, y, z, nx, ny, nz) {} +}; + +// nested view into a 3D rectangular subset of a 3D array +class nested_view3 : public preview { +protected: + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::nx; + using preview::ny; + using preview::nz; +public: + // construction--perform shallow copy of (sub)array + nested_view3(array3* array) : preview(array) {} + nested_view3(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : preview(array, x, y, z, nx, ny, nz) {} + + // dimensions of (sub)array + uint size_x() const { return nx; } + uint size_y() const { return ny; } + uint size_z() const { return nz; } + + // 2D view + nested_view2 operator[](uint index) const { return nested_view2(array, x, y, z + index, nx, ny, 1); } + + // (i, j, k) accessor and mutator + Scalar operator()(uint i, uint j, uint k) const { return array->get(x + i, y + j, z + k); } + reference operator()(uint i, uint j, uint k) { return reference(array, x + i, y + j, z + k); } +}; + +typedef nested_view3 nested_view; + +// thread-safe read-only view of 3D (sub)array with private cache +class private_const_view : public preview { +protected: + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::nx; + using preview::ny; + using preview::nz; +public: + // construction--perform shallow copy of (sub)array + private_const_view(array3* array) : + preview(array), + cache(array->cache.size()) + { + init(); + } + private_const_view(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : + preview(array, x, y, z, nx, ny, nz), + cache(array->cache.size()) + { + init(); + } + + // destructor + ~private_const_view() + { + stream_close(zfp->stream); + zfp_stream_close(zfp); + } + + // dimensions of (sub)array + uint size_x() const { return nx; } + uint size_y() const { return ny; } + uint size_z() const { return nz; } + + // cache size in number of bytes + size_t cache_size() 
const { return cache.size() * sizeof(CacheLine); } + + // set minimum cache size in bytes (array dimensions must be known) + void set_cache_size(size_t csize) + { + cache.resize(array->lines(csize, nx, ny, nz)); + } + + // empty cache without compressing modified cached blocks + void clear_cache() const { cache.clear(); } + + // (i, j, k) accessor + Scalar operator()(uint i, uint j, uint k) const { return get(x + i, y + j, z + k); } + +protected: + // cache line representing one block of decompressed values + class CacheLine { + public: + const Scalar& operator()(uint i, uint j, uint k) const { return a[index(i, j, k)]; } + Scalar& operator()(uint i, uint j, uint k) { return a[index(i, j, k)]; } + const Scalar* data() const { return a; } + Scalar* data() { return a; } + protected: + static uint index(uint i, uint j, uint k) { return (i & 3u) + 4 * ((j & 3u) + 4 * (k & 3u)); } + Scalar a[64]; + }; + + // copy private data + void init() + { + // copy compressed stream + zfp = zfp_stream_open(0); + *zfp = *array->zfp; + // copy bit stream + zfp->stream = stream_clone(array->zfp->stream); + } + + // inspector + const Scalar& get(uint i, uint j, uint k) const + { + const CacheLine* p = line(i, j, k); + return (*p)(i, j, k); + } + + // return cache line for (i, j, k); may require write-back and fetch + CacheLine* line(uint i, uint j, uint k) const + { + CacheLine* p = 0; + uint b = array->block(i, j, k); + typename Cache<CacheLine>::Tag t = cache.access(p, b + 1, false); + uint c = t.index() - 1; + // fetch cache line; no writeback possible since view is read-only + if (c != b) + decode(b, p->data()); + return p; + } + + // decode block with given index + void decode(uint index, Scalar* block) const + { + stream_rseek(zfp->stream, index * array->blkbits); + Codec::decode_block_3(zfp, block, array->shape ? 
array->shape[index] : 0); + } + + zfp_stream* zfp; // stream of compressed blocks + mutable Cache<CacheLine> cache; // cache of decompressed blocks +}; + +// thread-safe read-write view of private 3D (sub)array +class private_view : public private_const_view { +protected: + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::nx; + using preview::ny; + using preview::nz; + using private_const_view::zfp; + using private_const_view::cache; + using private_const_view::init; + using private_const_view::decode; + class view_reference; + typedef typename private_const_view::CacheLine CacheLine; +public: + // construction--perform shallow copy of (sub)array + private_view(array3* array) : private_const_view(array) {} + private_view(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : private_const_view(array, x, y, z, nx, ny, nz) {} + + // partition view into count block-aligned pieces, with 0 <= index < count + void partition(uint index, uint count) + { + if (nx > std::max(ny, nz)) + partition(x, nx, index, count); + else if (ny > std::max(nx, nz)) + partition(y, ny, index, count); + else + partition(z, nz, index, count); + } + + // flush cache by compressing all modified cached blocks + void flush_cache() const + { + for (typename Cache<CacheLine>::const_iterator p = cache.first(); p; p++) { + if (p->tag.dirty()) { + uint b = p->tag.index() - 1; + encode(b, p->line->data()); + } + cache.flush(p->line); + } + } + + // (i, j, k) accessor from base class + using private_const_view::operator(); + + // (i, j, k) mutator + view_reference operator()(uint i, uint j, uint k) { return view_reference(this, x + i, y + j, z + k); } + +protected: + class view_reference { + public: + operator Scalar() const { return view->get(i, j, k); } + view_reference operator=(const view_reference& r) { view->set(i, j, k, r.operator Scalar()); return *this; } + view_reference operator=(Scalar val) { view->set(i, j, k, val); return *this; 
} + view_reference operator+=(Scalar val) { view->add(i, j, k, val); return *this; } + view_reference operator-=(Scalar val) { view->sub(i, j, k, val); return *this; } + view_reference operator*=(Scalar val) { view->mul(i, j, k, val); return *this; } + view_reference operator/=(Scalar val) { view->div(i, j, k, val); return *this; } + // swap two array elements via proxy references + friend void swap(view_reference a, view_reference b) + { + Scalar x = a.operator Scalar(); + Scalar y = b.operator Scalar(); + b.operator=(x); + a.operator=(y); + } + + protected: + friend class private_view; + explicit view_reference(private_view* view, uint i, uint j, uint k) : view(view), i(i), j(j), k(k) {} + private_view* view; + uint i, j, k; + }; + + // block-aligned partition of [offset, offset + size): index out of count + static void partition(uint& offset, uint& size, uint index, uint count) + { + uint bmin = offset / 4; + uint bmax = (offset + size + 3) / 4; + uint xmin = std::max(offset + 0, 4 * (bmin + (bmax - bmin) * (index + 0) / count)); + uint xmax = std::min(offset + size, 4 * (bmin + (bmax - bmin) * (index + 1) / count)); + offset = xmin; + size = xmax - xmin; + } + + // mutator + void set(uint i, uint j, uint k, Scalar val) + { + CacheLine* p = line(i, j, k, true); + (*p)(i, j, k) = val; + } + + // in-place updates + void add(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) += val; } + void sub(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) -= val; } + void mul(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) *= val; } + void div(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) /= val; } + + // return cache line for (i, j, k); may require write-back and fetch + CacheLine* line(uint i, uint j, uint k, bool write) const + { + CacheLine* p = 0; + uint b = array->block(i, j, k); + typename Cache<CacheLine>::Tag t = cache.access(p, b + 1, write); + uint c = t.index() - 1; + if (c != b) 
{ + // write back occupied cache line if it is dirty + if (t.dirty()) + encode(c, p->data()); + decode(b, p->data()); + } + return p; + } + + // encode block with given index + void encode(uint index, const Scalar* block) const + { + stream_wseek(zfp->stream, index * array->blkbits); + Codec::encode_block_3(zfp, block, array->shape ? array->shape[index] : 0); + stream_flush(zfp->stream); + } +}; diff --git a/zfp/array/zfparray.h b/zfp/array/zfparray.h new file mode 100644 index 0000000000000000000000000000000000000000..5d5e65f6d133b5a96700849583a9d41051003376 --- /dev/null +++ b/zfp/array/zfparray.h @@ -0,0 +1,163 @@ +#ifndef ZFP_ARRAY_H +#define ZFP_ARRAY_H + +#include <algorithm> +#include <climits> +#include "zfp.h" +#include "zfp/memory.h" + +namespace zfp { + +// abstract base class for compressed array of scalars +class array { +protected: + // default constructor + array() : + dims(0), type(zfp_type_none), + nx(0), ny(0), nz(0), + bx(0), by(0), bz(0), + blocks(0), blkbits(0), + bytes(0), data(0), + zfp(0), + shape(0) + {} + + // generic array with 'dims' dimensions and scalar type 'type' + array(uint dims, zfp_type type) : + dims(dims), type(type), + nx(0), ny(0), nz(0), + bx(0), by(0), bz(0), + blocks(0), blkbits(0), + bytes(0), data(0), + zfp(zfp_stream_open(0)), + shape(0) + {} + + // copy constructor--performs a deep copy + array(const array& a) : + data(0), + zfp(0), + shape(0) + { + deep_copy(a); + } + + // protected destructor (cannot delete array through base class pointer) + ~array() + { + free(); + zfp_stream_close(zfp); + } + + // assignment operator--performs a deep copy + array& operator=(const array& a) + { + deep_copy(a); + return *this; + } + +public: + // rate in bits per value + double rate() const { return double(blkbits) / block_size(); } + + // set compression rate in bits per value + double set_rate(double rate) + { + rate = zfp_stream_set_rate(zfp, rate, type, dims, 1); + blkbits = zfp->maxbits; + alloc(); + return rate; + } + + // 
empty cache without compressing modified cached blocks + virtual void clear_cache() const = 0; + + // flush cache by compressing all modified cached blocks + virtual void flush_cache() const = 0; + + // number of bytes of compressed data + size_t compressed_size() const { return bytes; } + + // pointer to compressed data for read or write access + uchar* compressed_data() const + { + // first write back any modified cached data + flush_cache(); + return data; + } + +protected: + // number of values per block + uint block_size() const { return 1u << (2 * dims); } + + // allocate memory for compressed data + void alloc(bool clear = true) + { + bytes = blocks * blkbits / CHAR_BIT; + reallocate(data, bytes, 0x100u); + if (clear) + std::fill(data, data + bytes, 0); + stream_close(zfp->stream); + zfp_stream_set_bit_stream(zfp, stream_open(data, bytes)); + clear_cache(); + } + + // free memory associated with compressed data + void free() + { + nx = ny = nz = 0; + bx = by = bz = 0; + blocks = 0; + stream_close(zfp->stream); + zfp_stream_set_bit_stream(zfp, 0); + bytes = 0; + deallocate(data); + data = 0; + deallocate(shape); + shape = 0; + } + + // perform a deep copy + void deep_copy(const array& a) + { + // copy metadata + dims = a.dims; + type = a.type; + nx = a.nx; + ny = a.ny; + nz = a.nz; + bx = a.bx; + by = a.by; + bz = a.bz; + blocks = a.blocks; + blkbits = a.blkbits; + bytes = a.bytes; + + // copy dynamically allocated data + clone(data, a.data, bytes, 0x100u); + if (zfp) { + if (zfp->stream) + stream_close(zfp->stream); + zfp_stream_close(zfp); + } + zfp = zfp_stream_open(0); + *zfp = *a.zfp; + zfp_stream_set_bit_stream(zfp, stream_open(data, bytes)); + clone(shape, a.shape, blocks); + } + + uint dims; // array dimensionality (1, 2, or 3) + zfp_type type; // scalar type + uint nx, ny, nz; // array dimensions + uint bx, by, bz; // array dimensions in number of blocks + uint blocks; // number of blocks + size_t blkbits; // number of bits per compressed block + 
size_t bytes; // total bytes of compressed data + mutable uchar* data; // pointer to compressed data + zfp_stream* zfp; // compressed stream of blocks + uchar* shape; // precomputed block dimensions (or null if uniform) +}; + +} + +#endif diff --git a/zfp/array/zfparray1.h b/zfp/array/zfparray1.h new file mode 100644 index 0000000000000000000000000000000000000000..7949d83bcb3db6d560a0d177e97f3750d1ce35fb --- /dev/null +++ b/zfp/array/zfparray1.h @@ -0,0 +1,286 @@ +#ifndef ZFP_ARRAY1_H +#define ZFP_ARRAY1_H + +#include <cstddef> +#include <iterator> +#include "zfparray.h" +#include "zfpcodec.h" +#include "zfp/cache.h" + +namespace zfp { + +// compressed 1D array of scalars +template < typename Scalar, class Codec = zfp::codec<Scalar> > +class array1 : public array { +public: + // forward declarations + class reference; + class pointer; + class iterator; + class view; + #include "zfp/reference1.h" + #include "zfp/pointer1.h" + #include "zfp/iterator1.h" + #include "zfp/view1.h" + + // default constructor + array1() : array(1, Codec::type) {} + + // constructor of n-sample array using rate bits per value, at least + // csize bytes of cache, and optionally initialized from flat array p + array1(uint n, double rate, const Scalar* p = 0, size_t csize = 0) : + array(1, Codec::type), + cache(lines(csize, n)) + { + set_rate(rate); + resize(n, p == 0); + if (p) + set(p); + } + + // copy constructor--performs a deep copy + array1(const array1& a) + { + deep_copy(a); + } + + // construction from view--perform deep copy of (sub)array + template <class View> + array1(const View& v) : + array(1, Codec::type), + cache(lines(0, v.size_x())) + { + set_rate(v.rate()); + resize(v.size_x(), true); + // initialize array in its preferred order + for (iterator it = begin(); it != end(); ++it) + *it = v(it.i()); + } + + // virtual destructor + virtual ~array1() {} + + // assignment operator--performs a deep copy + array1& operator=(const array1& a) + { + if (this != &a) + deep_copy(a); + 
return *this; + } + + // total number of elements in array + size_t size() const { return size_t(nx); } + + // array dimensions + uint size_x() const { return nx; } + + // resize the array (all previously stored data will be lost) + void resize(uint n, bool clear = true) + { + if (n == 0) + free(); + else { + nx = n; + bx = (nx + 3) / 4; + blocks = bx; + alloc(clear); + + // precompute block dimensions + deallocate(shape); + if (nx & 3u) { + shape = (uchar*)allocate(blocks); + uchar* p = shape; + for (uint i = 0; i < bx; i++) + *p++ = (i == bx - 1 ? -nx & 3u : 0); + } + else + shape = 0; + } + } + + // cache size in number of bytes + size_t cache_size() const { return cache.size() * sizeof(CacheLine); } + + // set minimum cache size in bytes (array dimensions must be known) + void set_cache_size(size_t csize) + { + flush_cache(); + cache.resize(lines(csize, nx)); + } + + // empty cache without compressing modified cached blocks + void clear_cache() const { cache.clear(); } + + // flush cache by compressing all modified cached blocks + void flush_cache() const + { + for (typename Cache<CacheLine>::const_iterator p = cache.first(); p; p++) { + if (p->tag.dirty()) { + uint b = p->tag.index() - 1; + encode(b, p->line->data()); + } + cache.flush(p->line); + } + } + + // decompress array and store at p + void get(Scalar* p) const + { + uint b = 0; + for (uint i = 0; i < bx; i++, p += 4, b++) { + const CacheLine* line = cache.lookup(b + 1); + if (line) + line->get(p, 1, shape ? 
shape[b] : 0); + else + decode(b, p, 1); + } + } + + // initialize array by copying and compressing data stored at p + void set(const Scalar* p) + { + uint b = 0; + for (uint i = 0; i < bx; i++, b++, p += 4) + encode(b, p, 1); + cache.clear(); + } + + // (i) accessors + Scalar operator()(uint i) const { return get(i); } + reference operator()(uint i) { return reference(this, i); } + + // flat index accessors + Scalar operator[](uint index) const { return get(index); } + reference operator[](uint index) { return reference(this, index); } + + // random access iterators + iterator begin() { return iterator(this, 0); } + iterator end() { return iterator(this, nx); } + +protected: + // cache line representing one block of decompressed values + class CacheLine { + public: + Scalar operator()(uint i) const { return a[index(i)]; } + Scalar& operator()(uint i) { return a[index(i)]; } + const Scalar* data() const { return a; } + Scalar* data() { return a; } + // copy cache line + void get(Scalar* p, int sx) const + { + const Scalar* q = a; + for (uint x = 0; x < 4; x++, p += sx, q++) + *p = *q; + } + void get(Scalar* p, int sx, uint shape) const + { + if (!shape) + get(p, sx); + else { + // determine block dimensions + uint nx = 4 - (shape & 3u); shape >>= 2; + const Scalar* q = a; + for (uint x = 0; x < nx; x++, p += sx, q++) + *p = *q; + } + } + protected: + static uint index(uint i) { return i & 3u; } + Scalar a[4]; + }; + + // perform a deep copy + void deep_copy(const array1& a) + { + // copy base class members + array::deep_copy(a); + // copy cache + cache = a.cache; + } + + // inspector + Scalar get(uint i) const + { + const CacheLine* p = line(i, false); + return (*p)(i); + } + + // mutator + void set(uint i, Scalar val) + { + CacheLine* p = line(i, true); + (*p)(i) = val; + } + + // in-place updates + void add(uint i, Scalar val) { (*line(i, true))(i) += val; } + void sub(uint i, Scalar val) { (*line(i, true))(i) -= val; } + void mul(uint i, Scalar val) { (*line(i, 
true))(i) *= val; } + void div(uint i, Scalar val) { (*line(i, true))(i) /= val; } + + // return cache line for i; may require write-back and fetch + CacheLine* line(uint i, bool write) const + { + CacheLine* p = 0; + uint b = block(i); + typename Cache<CacheLine>::Tag t = cache.access(p, b + 1, write); + uint c = t.index() - 1; + if (c != b) { + // write back occupied cache line if it is dirty + if (t.dirty()) + encode(c, p->data()); + // fetch cache line + decode(b, p->data()); + } + return p; + } + + // encode block with given index + void encode(uint index, const Scalar* block) const + { + stream_wseek(zfp->stream, index * blkbits); + Codec::encode_block_1(zfp, block, shape ? shape[index] : 0); + stream_flush(zfp->stream); + } + + // encode block with given index from strided array + void encode(uint index, const Scalar* p, int sx) const + { + stream_wseek(zfp->stream, index * blkbits); + Codec::encode_block_strided_1(zfp, p, shape ? shape[index] : 0, sx); + stream_flush(zfp->stream); + } + + // decode block with given index + void decode(uint index, Scalar* block) const + { + stream_rseek(zfp->stream, index * blkbits); + Codec::decode_block_1(zfp, block, shape ? shape[index] : 0); + } + + // decode block with given index to strided array + void decode(uint index, Scalar* p, int sx) const + { + stream_rseek(zfp->stream, index * blkbits); + Codec::decode_block_strided_1(zfp, p, shape ? shape[index] : 0, sx); + } + + // block index for i + static uint block(uint i) { return i / 4; } + + // number of cache lines corresponding to size (or suggested size if zero) + static uint lines(size_t size, uint n) + { + n = uint(((size ? 
size : 8 * sizeof(Scalar)) + sizeof(CacheLine) - 1) / sizeof(CacheLine)); + return std::max(n, 1u); + } + + mutable Cache<CacheLine> cache; // cache of decompressed blocks +}; + +typedef array1<float> array1f; +typedef array1<double> array1d; + +} + +#endif diff --git a/zfp/array/zfparray2.h b/zfp/array/zfparray2.h new file mode 100644 index 0000000000000000000000000000000000000000..152b06698a1380c7f3d67fc2e7a249f01e32aecd --- /dev/null +++ b/zfp/array/zfparray2.h @@ -0,0 +1,313 @@ +#ifndef ZFP_ARRAY2_H +#define ZFP_ARRAY2_H + +#include <cstddef> +#include <iterator> +#include "zfparray.h" +#include "zfpcodec.h" +#include "zfp/cache.h" + +namespace zfp { + +// compressed 2D array of scalars +template < typename Scalar, class Codec = zfp::codec<Scalar> > +class array2 : public array { +public: + // forward declarations + class reference; + class pointer; + class iterator; + class view; + #include "zfp/reference2.h" + #include "zfp/pointer2.h" + #include "zfp/iterator2.h" + #include "zfp/view2.h" + + // default constructor + array2() : array(2, Codec::type) {} + + // constructor of nx * ny array using rate bits per value, at least + // csize bytes of cache, and optionally initialized from flat array p + array2(uint nx, uint ny, double rate, const Scalar* p = 0, size_t csize = 0) : + array(2, Codec::type), + cache(lines(csize, nx, ny)) + { + set_rate(rate); + resize(nx, ny, p == 0); + if (p) + set(p); + } + + // copy constructor--performs a deep copy + array2(const array2& a) + { + deep_copy(a); + } + + // construction from view--perform deep copy of (sub)array + template <class View> + array2(const View& v) : + array(2, Codec::type), + cache(lines(0, v.size_x(), v.size_y())) + { + set_rate(v.rate()); + resize(v.size_x(), v.size_y(), true); + // initialize array in its preferred order + for (iterator it = begin(); it != end(); ++it) + *it = v(it.i(), it.j()); + } + + // virtual destructor + virtual ~array2() {} + + // assignment operator--performs a deep copy + 
array2& operator=(const array2& a) + { + if (this != &a) + deep_copy(a); + return *this; + } + + // total number of elements in array + size_t size() const { return size_t(nx) * size_t(ny); } + + // array dimensions + uint size_x() const { return nx; } + uint size_y() const { return ny; } + + // resize the array (all previously stored data will be lost) + void resize(uint nx, uint ny, bool clear = true) + { + if (nx == 0 || ny == 0) + free(); + else { + this->nx = nx; + this->ny = ny; + bx = (nx + 3) / 4; + by = (ny + 3) / 4; + blocks = bx * by; + alloc(clear); + + // precompute block dimensions + deallocate(shape); + if ((nx | ny) & 3u) { + shape = (uchar*)allocate(blocks); + uchar* p = shape; + for (uint j = 0; j < by; j++) + for (uint i = 0; i < bx; i++) + *p++ = (i == bx - 1 ? -nx & 3u : 0) + 4 * (j == by - 1 ? -ny & 3u : 0); + } + else + shape = 0; + } + } + + // cache size in number of bytes + size_t cache_size() const { return cache.size() * sizeof(CacheLine); } + + // set minimum cache size in bytes (array dimensions must be known) + void set_cache_size(size_t csize) + { + flush_cache(); + cache.resize(lines(csize, nx, ny)); + } + + // empty cache without compressing modified cached blocks + void clear_cache() const { cache.clear(); } + + // flush cache by compressing all modified cached blocks + void flush_cache() const + { + for (typename Cache<CacheLine>::const_iterator p = cache.first(); p; p++) { + if (p->tag.dirty()) { + uint b = p->tag.index() - 1; + encode(b, p->line->data()); + } + cache.flush(p->line); + } + } + + // decompress array and store at p + void get(Scalar* p) const + { + uint b = 0; + for (uint j = 0; j < by; j++, p += 4 * (nx - bx)) + for (uint i = 0; i < bx; i++, p += 4, b++) { + const CacheLine* line = cache.lookup(b + 1); + if (line) + line->get(p, 1, nx, shape ? 
shape[b] : 0); + else + decode(b, p, 1, nx); + } + } + + // initialize array by copying and compressing data stored at p + void set(const Scalar* p) + { + uint b = 0; + for (uint j = 0; j < by; j++, p += 4 * (nx - bx)) + for (uint i = 0; i < bx; i++, p += 4, b++) + encode(b, p, 1, nx); + cache.clear(); + } + + // (i, j) accessors + Scalar operator()(uint i, uint j) const { return get(i, j); } + reference operator()(uint i, uint j) { return reference(this, i, j); } + + // flat index accessors + Scalar operator[](uint index) const + { + uint i, j; + ij(i, j, index); + return get(i, j); + } + reference operator[](uint index) + { + uint i, j; + ij(i, j, index); + return reference(this, i, j); + } + + // sequential iterators + iterator begin() { return iterator(this, 0, 0); } + iterator end() { return iterator(this, 0, ny); } + +protected: + // cache line representing one block of decompressed values + class CacheLine { + public: + Scalar operator()(uint i, uint j) const { return a[index(i, j)]; } + Scalar& operator()(uint i, uint j) { return a[index(i, j)]; } + const Scalar* data() const { return a; } + Scalar* data() { return a; } + // copy cache line + void get(Scalar* p, int sx, int sy) const + { + const Scalar* q = a; + for (uint y = 0; y < 4; y++, p += sy - 4 * sx) + for (uint x = 0; x < 4; x++, p += sx, q++) + *p = *q; + } + void get(Scalar* p, int sx, int sy, uint shape) const + { + if (!shape) + get(p, sx, sy); + else { + // determine block dimensions + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + const Scalar* q = a; + for (uint y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx) + for (uint x = 0; x < nx; x++, p += sx, q++) + *p = *q; + } + } + protected: + static uint index(uint i, uint j) { return (i & 3u) + 4 * (j & 3u); } + Scalar a[16]; + }; + + // perform a deep copy + void deep_copy(const array2& a) + { + // copy base class members + array::deep_copy(a); + // copy cache + cache = a.cache; + } + + // 
inspector + Scalar get(uint i, uint j) const + { + const CacheLine* p = line(i, j, false); + return (*p)(i, j); + } + + // mutator + void set(uint i, uint j, Scalar val) + { + CacheLine* p = line(i, j, true); + (*p)(i, j) = val; + } + + // in-place updates + void add(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) += val; } + void sub(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) -= val; } + void mul(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) *= val; } + void div(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) /= val; } + + // return cache line for (i, j); may require write-back and fetch + CacheLine* line(uint i, uint j, bool write) const + { + CacheLine* p = 0; + uint b = block(i, j); + typename Cache<CacheLine>::Tag t = cache.access(p, b + 1, write); + uint c = t.index() - 1; + if (c != b) { + // write back occupied cache line if it is dirty + if (t.dirty()) + encode(c, p->data()); + // fetch cache line + decode(b, p->data()); + } + return p; + } + + // encode block with given index + void encode(uint index, const Scalar* block) const + { + stream_wseek(zfp->stream, index * blkbits); + Codec::encode_block_2(zfp, block, shape ? shape[index] : 0); + stream_flush(zfp->stream); + } + + // encode block with given index from strided array + void encode(uint index, const Scalar* p, int sx, int sy) const + { + stream_wseek(zfp->stream, index * blkbits); + Codec::encode_block_strided_2(zfp, p, shape ? shape[index] : 0, sx, sy); + stream_flush(zfp->stream); + } + + // decode block with given index + void decode(uint index, Scalar* block) const + { + stream_rseek(zfp->stream, index * blkbits); + Codec::decode_block_2(zfp, block, shape ? shape[index] : 0); + } + + // decode block with given index to strided array + void decode(uint index, Scalar* p, int sx, int sy) const + { + stream_rseek(zfp->stream, index * blkbits); + Codec::decode_block_strided_2(zfp, p, shape ? 
shape[index] : 0, sx, sy); + } + + // block index for (i, j) + uint block(uint i, uint j) const { return (i / 4) + bx * (j / 4); } + + // convert flat index to (i, j) + void ij(uint& i, uint& j, uint index) const + { + i = index % nx; + index /= nx; + j = index; + } + + // number of cache lines corresponding to size (or suggested size if zero) + static uint lines(size_t size, uint nx, uint ny) + { + uint n = uint(((size ? size : 8 * nx * sizeof(Scalar)) + sizeof(CacheLine) - 1) / sizeof(CacheLine)); + return std::max(n, 1u); + } + + mutable Cache<CacheLine> cache; // cache of decompressed blocks +}; + +typedef array2<float> array2f; +typedef array2<double> array2d; + +} + +#endif diff --git a/zfp/array/zfparray3.h b/zfp/array/zfparray3.h new file mode 100644 index 0000000000000000000000000000000000000000..c4fd7614a6fb24b884b4f4dff9b47b78c548b556 --- /dev/null +++ b/zfp/array/zfparray3.h @@ -0,0 +1,327 @@ +#ifndef ZFP_ARRAY3_H +#define ZFP_ARRAY3_H + +#include <cstddef> +#include <iterator> +#include "zfparray.h" +#include "zfpcodec.h" +#include "zfp/cache.h" + +namespace zfp { + +// compressed 3D array of scalars +template < typename Scalar, class Codec = zfp::codec<Scalar> > +class array3 : public array { +public: + // forward declarations + class reference; + class pointer; + class iterator; + class view; + #include "zfp/reference3.h" + #include "zfp/pointer3.h" + #include "zfp/iterator3.h" + #include "zfp/view3.h" + + // default constructor + array3() : array(3, Codec::type) {} + + // constructor of nx * ny * nz array using rate bits per value, at least + // csize bytes of cache, and optionally initialized from flat array p + array3(uint nx, uint ny, uint nz, double rate, const Scalar* p = 0, size_t csize = 0) : + array(3, Codec::type), + cache(lines(csize, nx, ny, nz)) + { + set_rate(rate); + resize(nx, ny, nz, p == 0); + if (p) + set(p); + } + + // copy constructor--performs a deep copy + array3(const array3& a) + { + deep_copy(a); + } + + // construction from 
view--perform deep copy of (sub)array + template <class View> + array3(const View& v) : + array(3, Codec::type), + cache(lines(0, v.size_x(), v.size_y(), v.size_z())) + { + set_rate(v.rate()); + resize(v.size_x(), v.size_y(), v.size_z(), true); + // initialize array in its preferred order + for (iterator it = begin(); it != end(); ++it) + *it = v(it.i(), it.j(), it.k()); + } + + // virtual destructor + virtual ~array3() {} + + // assignment operator--performs a deep copy + array3& operator=(const array3& a) + { + if (this != &a) + deep_copy(a); + return *this; + } + + // total number of elements in array + size_t size() const { return size_t(nx) * size_t(ny) * size_t(nz); } + + // array dimensions + uint size_x() const { return nx; } + uint size_y() const { return ny; } + uint size_z() const { return nz; } + + // resize the array (all previously stored data will be lost) + void resize(uint nx, uint ny, uint nz, bool clear = true) + { + if (nx == 0 || ny == 0 || nz == 0) + free(); + else { + this->nx = nx; + this->ny = ny; + this->nz = nz; + bx = (nx + 3) / 4; + by = (ny + 3) / 4; + bz = (nz + 3) / 4; + blocks = bx * by * bz; + alloc(clear); + + // precompute block dimensions + deallocate(shape); + if ((nx | ny | nz) & 3u) { + shape = (uchar*)allocate(blocks); + uchar* p = shape; + for (uint k = 0; k < bz; k++) + for (uint j = 0; j < by; j++) + for (uint i = 0; i < bx; i++) + *p++ = (i == bx - 1 ? -nx & 3u : 0) + 4 * ((j == by - 1 ? -ny & 3u : 0) + 4 * (k == bz - 1 ? 
-nz & 3u : 0)); + } + else + shape = 0; + } + } + + // cache size in number of bytes + size_t cache_size() const { return cache.size() * sizeof(CacheLine); } + + // set minimum cache size in bytes (array dimensions must be known) + void set_cache_size(size_t csize) + { + flush_cache(); + cache.resize(lines(csize, nx, ny, nz)); + } + + // empty cache without compressing modified cached blocks + void clear_cache() const { cache.clear(); } + + // flush cache by compressing all modified cached blocks + void flush_cache() const + { + for (typename Cache<CacheLine>::const_iterator p = cache.first(); p; p++) { + if (p->tag.dirty()) { + uint b = p->tag.index() - 1; + encode(b, p->line->data()); + } + cache.flush(p->line); + } + } + + // decompress array and store at p + void get(Scalar* p) const + { + uint b = 0; + for (uint k = 0; k < bz; k++, p += 4 * nx * (ny - by)) + for (uint j = 0; j < by; j++, p += 4 * (nx - bx)) + for (uint i = 0; i < bx; i++, p += 4, b++) { + const CacheLine* line = cache.lookup(b + 1); + if (line) + line->get(p, 1, nx, nx * ny, shape ? 
shape[b] : 0); + else + decode(b, p, 1, nx, nx * ny); + } + } + + // initialize array by copying and compressing data stored at p + void set(const Scalar* p) + { + uint b = 0; + for (uint k = 0; k < bz; k++, p += 4 * nx * (ny - by)) + for (uint j = 0; j < by; j++, p += 4 * (nx - bx)) + for (uint i = 0; i < bx; i++, p += 4, b++) + encode(b, p, 1, nx, nx * ny); + cache.clear(); + } + + // (i, j, k) accessors + Scalar operator()(uint i, uint j, uint k) const { return get(i, j, k); } + reference operator()(uint i, uint j, uint k) { return reference(this, i, j, k); } + + // flat index corresponding to (i, j, k) + uint index(uint i, uint j, uint k) const { return i + nx * (j + ny * k); } + + // flat index accessors + Scalar operator[](uint index) const + { + uint i, j, k; + ijk(i, j, k, index); + return get(i, j, k); + } + reference operator[](uint index) + { + uint i, j, k; + ijk(i, j, k, index); + return reference(this, i, j, k); + } + + // sequential iterators + iterator begin() { return iterator(this, 0, 0, 0); } + iterator end() { return iterator(this, 0, 0, nz); } + +protected: + // cache line representing one block of decompressed values + class CacheLine { + public: + Scalar operator()(uint i, uint j, uint k) const { return a[index(i, j, k)]; } + Scalar& operator()(uint i, uint j, uint k) { return a[index(i, j, k)]; } + const Scalar* data() const { return a; } + Scalar* data() { return a; } + // copy cache line + void get(Scalar* p, int sx, int sy, int sz) const + { + const Scalar* q = a; + for (uint z = 0; z < 4; z++, p += sz - 4 * sy) + for (uint y = 0; y < 4; y++, p += sy - 4 * sx) + for (uint x = 0; x < 4; x++, p += sx, q++) + *p = *q; + } + void get(Scalar* p, int sx, int sy, int sz, uint shape) const + { + if (!shape) + get(p, sx, sy, sz); + else { + // determine block dimensions + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + uint nz = 4 - (shape & 3u); shape >>= 2; + const Scalar* q = a; + for (uint z = 0; z < nz; 
z++, p += sz - (ptrdiff_t)ny * sy, q += 16 - 4 * ny) + for (uint y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx) + for (uint x = 0; x < nx; x++, p += sx, q++) + *p = *q; + } + } + protected: + static uint index(uint i, uint j, uint k) { return (i & 3u) + 4 * ((j & 3u) + 4 * (k & 3u)); } + Scalar a[64]; + }; + + // perform a deep copy + void deep_copy(const array3& a) + { + // copy base class members + array::deep_copy(a); + // copy cache + cache = a.cache; + } + + // inspector + Scalar get(uint i, uint j, uint k) const + { + const CacheLine* p = line(i, j, k, false); + return (*p)(i, j, k); + } + + // mutator + void set(uint i, uint j, uint k, Scalar val) + { + CacheLine* p = line(i, j, k, true); + (*p)(i, j, k) = val; + } + + // in-place updates + void add(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) += val; } + void sub(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) -= val; } + void mul(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) *= val; } + void div(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) /= val; } + + // return cache line for (i, j, k); may require write-back and fetch + CacheLine* line(uint i, uint j, uint k, bool write) const + { + CacheLine* p = 0; + uint b = block(i, j, k); + typename Cache<CacheLine>::Tag t = cache.access(p, b + 1, write); + uint c = t.index() - 1; + if (c != b) { + // write back occupied cache line if it is dirty + if (t.dirty()) + encode(c, p->data()); + // fetch cache line + decode(b, p->data()); + } + return p; + } + + // encode block with given index + void encode(uint index, const Scalar* block) const + { + stream_wseek(zfp->stream, index * blkbits); + Codec::encode_block_3(zfp, block, shape ? 
shape[index] : 0); + stream_flush(zfp->stream); + } + + // encode block with given index from strided array + void encode(uint index, const Scalar* p, int sx, int sy, int sz) const + { + stream_wseek(zfp->stream, index * blkbits); + Codec::encode_block_strided_3(zfp, p, shape ? shape[index] : 0, sx, sy, sz); + stream_flush(zfp->stream); + } + + // decode block with given index + void decode(uint index, Scalar* block) const + { + stream_rseek(zfp->stream, index * blkbits); + Codec::decode_block_3(zfp, block, shape ? shape[index] : 0); + } + + // decode block with given index to strided array + void decode(uint index, Scalar* p, int sx, int sy, int sz) const + { + stream_rseek(zfp->stream, index * blkbits); + Codec::decode_block_strided_3(zfp, p, shape ? shape[index] : 0, sx, sy, sz); + } + + // block index for (i, j, k) + uint block(uint i, uint j, uint k) const { return (i / 4) + bx * ((j / 4) + by * (k / 4)); } + + // convert flat index to (i, j, k) + void ijk(uint& i, uint& j, uint& k, uint index) const + { + i = index % nx; + index /= nx; + j = index % ny; + index /= ny; + k = index; + } + + // number of cache lines corresponding to size (or suggested size if zero) + static uint lines(size_t size, uint nx, uint ny, uint nz) + { + uint n = uint(((size ? 
size : 8 * nx * ny * sizeof(Scalar)) + sizeof(CacheLine) - 1) / sizeof(CacheLine)); + return std::max(n, 1u); + } + + mutable Cache<CacheLine> cache; // cache of decompressed blocks +}; + +typedef array3<float> array3f; +typedef array3<double> array3d; + +} + +#endif diff --git a/zfp/array/zfpcodec.h b/zfp/array/zfpcodec.h new file mode 100644 index 0000000000000000000000000000000000000000..2d4674444e07f3262943df3c7e543ce14e3d66e0 --- /dev/null +++ b/zfp/array/zfpcodec.h @@ -0,0 +1,17 @@ +#ifndef ZFP_CODEC_H +#define ZFP_CODEC_H + +#include "zfp.h" + +namespace zfp { + +// C++ wrappers around libzfp C functions +template <typename Scalar> +struct codec {}; + +#include "zfpcodecf.h" +#include "zfpcodecd.h" + +} + +#endif diff --git a/zfp/array/zfpcodecd.h b/zfp/array/zfpcodecd.h new file mode 100644 index 0000000000000000000000000000000000000000..9e7d893234062afb0801573c01f2100c73848183 --- /dev/null +++ b/zfp/array/zfpcodecd.h @@ -0,0 +1,149 @@ +// double-precision codec +template <> +struct codec<double> { + // encode contiguous 1D block + static void encode_block_1(zfp_stream* zfp, const double* block, uint shape) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + zfp_encode_partial_block_strided_double_1(zfp, block, nx, 1); + } + else + zfp_encode_block_double_1(zfp, block); + } + + // encode 1D block from strided storage + static void encode_block_strided_1(zfp_stream* zfp, const double* p, uint shape, int sx) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + zfp_encode_partial_block_strided_double_1(zfp, p, nx, sx); + } + else + zfp_encode_block_strided_double_1(zfp, p, sx); + } + + // encode contiguous 2D block + static void encode_block_2(zfp_stream* zfp, const double* block, uint shape) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + zfp_encode_partial_block_strided_double_2(zfp, block, nx, ny, 1, 4); + } + else + zfp_encode_block_double_2(zfp, block); + } + + // encode 2D 
block from strided storage + static void encode_block_strided_2(zfp_stream* zfp, const double* p, uint shape, int sx, int sy) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + zfp_encode_partial_block_strided_double_2(zfp, p, nx, ny, sx, sy); + } + else + zfp_encode_block_strided_double_2(zfp, p, sx, sy); + } + + // encode contiguous 3D block + static void encode_block_3(zfp_stream* zfp, const double* block, uint shape) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + uint nz = 4 - (shape & 3u); shape >>= 2; + zfp_encode_partial_block_strided_double_3(zfp, block, nx, ny, nz, 1, 4, 16); + } + else + zfp_encode_block_double_3(zfp, block); + } + + // encode 3D block from strided storage + static void encode_block_strided_3(zfp_stream* zfp, const double* p, uint shape, int sx, int sy, int sz) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + uint nz = 4 - (shape & 3u); shape >>= 2; + zfp_encode_partial_block_strided_double_3(zfp, p, nx, ny, nz, sx, sy, sz); + } + else + zfp_encode_block_strided_double_3(zfp, p, sx, sy, sz); + } + + // decode contiguous 1D block + static void decode_block_1(zfp_stream* zfp, double* block, uint shape) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + zfp_decode_partial_block_strided_double_1(zfp, block, nx, 1); + } + else + zfp_decode_block_double_1(zfp, block); + } + + // decode 1D block to strided storage + static void decode_block_strided_1(zfp_stream* zfp, double* p, uint shape, int sx) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + zfp_decode_partial_block_strided_double_1(zfp, p, nx, sx); + } + else + zfp_decode_block_strided_double_1(zfp, p, sx); + } + + // decode contiguous 2D block + static void decode_block_2(zfp_stream* zfp, double* block, uint shape) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - 
(shape & 3u); shape >>= 2; + zfp_decode_partial_block_strided_double_2(zfp, block, nx, ny, 1, 4); + } + else + zfp_decode_block_double_2(zfp, block); + } + + // decode 2D block to strided storage + static void decode_block_strided_2(zfp_stream* zfp, double* p, uint shape, int sx, int sy) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + zfp_decode_partial_block_strided_double_2(zfp, p, nx, ny, sx, sy); + } + else + zfp_decode_block_strided_double_2(zfp, p, sx, sy); + } + + // decode contiguous 3D block + static void decode_block_3(zfp_stream* zfp, double* block, uint shape) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + uint nz = 4 - (shape & 3u); shape >>= 2; + zfp_decode_partial_block_strided_double_3(zfp, block, nx, ny, nz, 1, 4, 16); + } + else + zfp_decode_block_double_3(zfp, block); + } + + // decode 3D block to strided storage + static void decode_block_strided_3(zfp_stream* zfp, double* p, uint shape, int sx, int sy, int sz) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + uint nz = 4 - (shape & 3u); shape >>= 2; + zfp_decode_partial_block_strided_double_3(zfp, p, nx, ny, nz, sx, sy, sz); + } + else + zfp_decode_block_strided_double_3(zfp, p, sx, sy, sz); + } + + static const zfp_type type = zfp_type_double; +}; diff --git a/zfp/array/zfpcodecf.h b/zfp/array/zfpcodecf.h new file mode 100644 index 0000000000000000000000000000000000000000..1ec74a60990281d545886ad5f49a699d18c14a10 --- /dev/null +++ b/zfp/array/zfpcodecf.h @@ -0,0 +1,149 @@ +// single-precision codec +template <> +struct codec<float> { + // encode contiguous 1D block + static void encode_block_1(zfp_stream* zfp, const float* block, uint shape) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + zfp_encode_partial_block_strided_float_1(zfp, block, nx, 1); + } + else + zfp_encode_block_float_1(zfp, block); + } + 
+ // encode 1D block from strided storage + static void encode_block_strided_1(zfp_stream* zfp, const float* p, uint shape, int sx) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + zfp_encode_partial_block_strided_float_1(zfp, p, nx, sx); + } + else + zfp_encode_block_strided_float_1(zfp, p, sx); + } + + // encode contiguous 2D block + static void encode_block_2(zfp_stream* zfp, const float* block, uint shape) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + zfp_encode_partial_block_strided_float_2(zfp, block, nx, ny, 1, 4); + } + else + zfp_encode_block_float_2(zfp, block); + } + + // encode 2D block from strided storage + static void encode_block_strided_2(zfp_stream* zfp, const float* p, uint shape, int sx, int sy) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + zfp_encode_partial_block_strided_float_2(zfp, p, nx, ny, sx, sy); + } + else + zfp_encode_block_strided_float_2(zfp, p, sx, sy); + } + + // encode contiguous 3D block + static void encode_block_3(zfp_stream* zfp, const float* block, uint shape) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + uint nz = 4 - (shape & 3u); shape >>= 2; + zfp_encode_partial_block_strided_float_3(zfp, block, nx, ny, nz, 1, 4, 16); + } + else + zfp_encode_block_float_3(zfp, block); + } + + // encode 3D block from strided storage + static void encode_block_strided_3(zfp_stream* zfp, const float* p, uint shape, int sx, int sy, int sz) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + uint nz = 4 - (shape & 3u); shape >>= 2; + zfp_encode_partial_block_strided_float_3(zfp, p, nx, ny, nz, sx, sy, sz); + } + else + zfp_encode_block_strided_float_3(zfp, p, sx, sy, sz); + } + + // decode contiguous 1D block + static void decode_block_1(zfp_stream* zfp, float* block, uint shape) + { + if 
(shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + zfp_decode_partial_block_strided_float_1(zfp, block, nx, 1); + } + else + zfp_decode_block_float_1(zfp, block); + } + + // decode 1D block to strided storage + static void decode_block_strided_1(zfp_stream* zfp, float* p, uint shape, int sx) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + zfp_decode_partial_block_strided_float_1(zfp, p, nx, sx); + } + else + zfp_decode_block_strided_float_1(zfp, p, sx); + } + + // decode contiguous 2D block + static void decode_block_2(zfp_stream* zfp, float* block, uint shape) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + zfp_decode_partial_block_strided_float_2(zfp, block, nx, ny, 1, 4); + } + else + zfp_decode_block_float_2(zfp, block); + } + + // decode 2D block to strided storage + static void decode_block_strided_2(zfp_stream* zfp, float* p, uint shape, int sx, int sy) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + zfp_decode_partial_block_strided_float_2(zfp, p, nx, ny, sx, sy); + } + else + zfp_decode_block_strided_float_2(zfp, p, sx, sy); + } + + // decode contiguous 3D block + static void decode_block_3(zfp_stream* zfp, float* block, uint shape) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + uint nz = 4 - (shape & 3u); shape >>= 2; + zfp_decode_partial_block_strided_float_3(zfp, block, nx, ny, nz, 1, 4, 16); + } + else + zfp_decode_block_float_3(zfp, block); + } + + // decode 3D block to strided storage + static void decode_block_strided_3(zfp_stream* zfp, float* p, uint shape, int sx, int sy, int sz) + { + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + uint nz = 4 - (shape & 3u); shape >>= 2; + zfp_decode_partial_block_strided_float_3(zfp, p, nx, ny, nz, sx, sy, sz); + } + else + zfp_decode_block_strided_float_3(zfp, p, 
sx, sy, sz); + } + + static const zfp_type type = zfp_type_float; +}; diff --git a/zfp/cfp/CMakeLists.txt b/zfp/cfp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..febd4f0ab6f826fc669a9047b2c86fd7dc8c351d --- /dev/null +++ b/zfp/cfp/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(src) diff --git a/zfp/cfp/include/cfparray1d.h b/zfp/cfp/include/cfparray1d.h new file mode 100644 index 0000000000000000000000000000000000000000..1be2729566a3f867018fe3f2861b0267f7a47ce8 --- /dev/null +++ b/zfp/cfp/include/cfparray1d.h @@ -0,0 +1,37 @@ +#ifndef CFP_ARRAY_1D +#define CFP_ARRAY_1D + +#include <stddef.h> +#include "zfp/types.h" + +struct cfp_array1d; +typedef struct cfp_array1d cfp_array1d; + +typedef struct { + cfp_array1d* (*ctor_default)(); + cfp_array1d* (*ctor)(uint n, double rate, const double* p, size_t csize); + cfp_array1d* (*ctor_copy)(const cfp_array1d* src); + void (*dtor)(cfp_array1d* self); + + void (*deep_copy)(cfp_array1d* self, const cfp_array1d* src); + + double (*rate)(const cfp_array1d* self); + double (*set_rate)(cfp_array1d* self, double rate); + size_t (*cache_size)(const cfp_array1d* self); + void (*set_cache_size)(cfp_array1d* self, size_t csize); + void (*clear_cache)(const cfp_array1d* self); + void (*flush_cache)(const cfp_array1d* self); + size_t (*compressed_size)(const cfp_array1d* self); + uchar* (*compressed_data)(const cfp_array1d* self); + size_t (*size)(const cfp_array1d* self); + void (*resize)(cfp_array1d* self, uint n, int clear); + + void (*get_array)(const cfp_array1d* self, double* p); + void (*set_array)(cfp_array1d* self, const double* p); + double (*get_flat)(const cfp_array1d* self, uint i); + void (*set_flat)(cfp_array1d* self, uint i, double val); + double (*get)(const cfp_array1d* self, uint i); + void (*set)(cfp_array1d* self, uint i, double val); +} cfp_array1d_api; + +#endif diff --git a/zfp/cfp/include/cfparray1f.h b/zfp/cfp/include/cfparray1f.h new file mode 100644 index 
0000000000000000000000000000000000000000..90d52391e8925803f27a67d1a68928589e4abe8f --- /dev/null +++ b/zfp/cfp/include/cfparray1f.h @@ -0,0 +1,37 @@ +#ifndef CFP_ARRAY_1F +#define CFP_ARRAY_1F + +#include <stddef.h> +#include "zfp/types.h" + +struct cfp_array1f; +typedef struct cfp_array1f cfp_array1f; + +typedef struct { + cfp_array1f* (*ctor_default)(); + cfp_array1f* (*ctor)(uint n, double rate, const float* p, size_t csize); + cfp_array1f* (*ctor_copy)(const cfp_array1f* src); + void (*dtor)(cfp_array1f* self); + + void (*deep_copy)(cfp_array1f* self, const cfp_array1f* src); + + double (*rate)(const cfp_array1f* self); + double (*set_rate)(cfp_array1f* self, double rate); + size_t (*cache_size)(const cfp_array1f* self); + void (*set_cache_size)(cfp_array1f* self, size_t csize); + void (*clear_cache)(const cfp_array1f* self); + void (*flush_cache)(const cfp_array1f* self); + size_t (*compressed_size)(const cfp_array1f* self); + uchar* (*compressed_data)(const cfp_array1f* self); + size_t (*size)(const cfp_array1f* self); + void (*resize)(cfp_array1f* self, uint n, int clear); + + void (*get_array)(const cfp_array1f* self, float* p); + void (*set_array)(cfp_array1f* self, const float* p); + float (*get_flat)(const cfp_array1f* self, uint i); + void (*set_flat)(cfp_array1f* self, uint i, float val); + float (*get)(const cfp_array1f* self, uint i); + void (*set)(cfp_array1f* self, uint i, float val); +} cfp_array1f_api; + +#endif diff --git a/zfp/cfp/include/cfparray2d.h b/zfp/cfp/include/cfparray2d.h new file mode 100644 index 0000000000000000000000000000000000000000..b8d4c2a849d47078b971f00d31333ca23e4f1af1 --- /dev/null +++ b/zfp/cfp/include/cfparray2d.h @@ -0,0 +1,39 @@ +#ifndef CFP_ARRAY_2D +#define CFP_ARRAY_2D + +#include <stddef.h> +#include "zfp/types.h" + +struct cfp_array2d; +typedef struct cfp_array2d cfp_array2d; + +typedef struct { + cfp_array2d* (*ctor_default)(); + cfp_array2d* (*ctor)(uint nx, uint ny, double rate, const double* p, size_t csize); 
+ cfp_array2d* (*ctor_copy)(const cfp_array2d* src); + void (*dtor)(cfp_array2d* self); + + void (*deep_copy)(cfp_array2d* self, const cfp_array2d* src); + + double (*rate)(const cfp_array2d* self); + double (*set_rate)(cfp_array2d* self, double rate); + size_t (*cache_size)(const cfp_array2d* self); + void (*set_cache_size)(cfp_array2d* self, size_t csize); + void (*clear_cache)(const cfp_array2d* self); + void (*flush_cache)(const cfp_array2d* self); + size_t (*compressed_size)(const cfp_array2d* self); + uchar* (*compressed_data)(const cfp_array2d* self); + size_t (*size)(const cfp_array2d* self); + uint (*size_x)(const cfp_array2d* self); + uint (*size_y)(const cfp_array2d* self); + void (*resize)(cfp_array2d* self, uint nx, uint ny, int clear); + + void (*get_array)(const cfp_array2d* self, double* p); + void (*set_array)(cfp_array2d* self, const double* p); + double (*get_flat)(const cfp_array2d* self, uint i); + void (*set_flat)(cfp_array2d* self, uint i, double val); + double (*get)(const cfp_array2d* self, uint i, uint j); + void (*set)(cfp_array2d* self, uint i, uint j, double val); +} cfp_array2d_api; + +#endif diff --git a/zfp/cfp/include/cfparray2f.h b/zfp/cfp/include/cfparray2f.h new file mode 100644 index 0000000000000000000000000000000000000000..a531ac2403e9347bd6ff200a6e614bf3da325dce --- /dev/null +++ b/zfp/cfp/include/cfparray2f.h @@ -0,0 +1,39 @@ +#ifndef CFP_ARRAY_2F +#define CFP_ARRAY_2F + +#include <stddef.h> +#include "zfp/types.h" + +struct cfp_array2f; +typedef struct cfp_array2f cfp_array2f; + +typedef struct { + cfp_array2f* (*ctor_default)(); + cfp_array2f* (*ctor)(uint nx, uint ny, double rate, const float* p, size_t csize); + cfp_array2f* (*ctor_copy)(const cfp_array2f* src); + void (*dtor)(cfp_array2f* self); + + void (*deep_copy)(cfp_array2f* self, const cfp_array2f* src); + + double (*rate)(const cfp_array2f* self); + double (*set_rate)(cfp_array2f* self, double rate); + size_t (*cache_size)(const cfp_array2f* self); + void 
(*set_cache_size)(cfp_array2f* self, size_t csize); + void (*clear_cache)(const cfp_array2f* self); + void (*flush_cache)(const cfp_array2f* self); + size_t (*compressed_size)(const cfp_array2f* self); + uchar* (*compressed_data)(const cfp_array2f* self); + size_t (*size)(const cfp_array2f* self); + uint (*size_x)(const cfp_array2f* self); + uint (*size_y)(const cfp_array2f* self); + void (*resize)(cfp_array2f* self, uint nx, uint ny, int clear); + + void (*get_array)(const cfp_array2f* self, float* p); + void (*set_array)(cfp_array2f* self, const float* p); + float (*get_flat)(const cfp_array2f* self, uint i); + void (*set_flat)(cfp_array2f* self, uint i, float val); + float (*get)(const cfp_array2f* self, uint i, uint j); + void (*set)(cfp_array2f* self, uint i, uint j, float val); +} cfp_array2f_api; + +#endif diff --git a/zfp/cfp/include/cfparray3d.h b/zfp/cfp/include/cfparray3d.h new file mode 100644 index 0000000000000000000000000000000000000000..8390a61949d2bb4840fe85b63637ee731b6f1b7a --- /dev/null +++ b/zfp/cfp/include/cfparray3d.h @@ -0,0 +1,40 @@ +#ifndef CFP_ARRAY_3D +#define CFP_ARRAY_3D + +#include <stddef.h> +#include "zfp/types.h" + +struct cfp_array3d; +typedef struct cfp_array3d cfp_array3d; + +typedef struct { + cfp_array3d* (*ctor_default)(); + cfp_array3d* (*ctor)(uint nx, uint ny, uint nz, double rate, const double* p, size_t csize); + cfp_array3d* (*ctor_copy)(const cfp_array3d* src); + void (*dtor)(cfp_array3d* self); + + void (*deep_copy)(cfp_array3d* self, const cfp_array3d* src); + + double (*rate)(const cfp_array3d* self); + double (*set_rate)(cfp_array3d* self, double rate); + size_t (*cache_size)(const cfp_array3d* self); + void (*set_cache_size)(cfp_array3d* self, size_t csize); + void (*clear_cache)(const cfp_array3d* self); + void (*flush_cache)(const cfp_array3d* self); + size_t (*compressed_size)(const cfp_array3d* self); + uchar* (*compressed_data)(const cfp_array3d* self); + size_t (*size)(const cfp_array3d* self); + uint 
(*size_x)(const cfp_array3d* self); + uint (*size_y)(const cfp_array3d* self); + uint (*size_z)(const cfp_array3d* self); + void (*resize)(cfp_array3d* self, uint nx, uint ny, uint nz, int clear); + + void (*get_array)(const cfp_array3d* self, double* p); + void (*set_array)(cfp_array3d* self, const double* p); + double (*get_flat)(const cfp_array3d* self, uint i); + void (*set_flat)(cfp_array3d* self, uint i, double val); + double (*get)(const cfp_array3d* self, uint i, uint j, uint k); + void (*set)(cfp_array3d* self, uint i, uint j, uint k, double val); +} cfp_array3d_api; + +#endif diff --git a/zfp/cfp/include/cfparray3f.h b/zfp/cfp/include/cfparray3f.h new file mode 100644 index 0000000000000000000000000000000000000000..0261df3132814c9e2645153ee47388341e3b01c6 --- /dev/null +++ b/zfp/cfp/include/cfparray3f.h @@ -0,0 +1,40 @@ +#ifndef CFP_ARRAY_3F +#define CFP_ARRAY_3F + +#include <stddef.h> +#include "zfp/types.h" + +struct cfp_array3f; +typedef struct cfp_array3f cfp_array3f; + +typedef struct { + cfp_array3f* (*ctor_default)(); + cfp_array3f* (*ctor)(uint nx, uint ny, uint nz, double rate, const float* p, size_t csize); + cfp_array3f* (*ctor_copy)(const cfp_array3f* src); + void (*dtor)(cfp_array3f* self); + + void (*deep_copy)(cfp_array3f* self, const cfp_array3f* src); + + double (*rate)(const cfp_array3f* self); + double (*set_rate)(cfp_array3f* self, double rate); + size_t (*cache_size)(const cfp_array3f* self); + void (*set_cache_size)(cfp_array3f* self, size_t csize); + void (*clear_cache)(const cfp_array3f* self); + void (*flush_cache)(const cfp_array3f* self); + size_t (*compressed_size)(const cfp_array3f* self); + uchar* (*compressed_data)(const cfp_array3f* self); + size_t (*size)(const cfp_array3f* self); + uint (*size_x)(const cfp_array3f* self); + uint (*size_y)(const cfp_array3f* self); + uint (*size_z)(const cfp_array3f* self); + void (*resize)(cfp_array3f* self, uint nx, uint ny, uint nz, int clear); + + void (*get_array)(const cfp_array3f* 
self, float* p); + void (*set_array)(cfp_array3f* self, const float* p); + float (*get_flat)(const cfp_array3f* self, uint i); + void (*set_flat)(cfp_array3f* self, uint i, float val); + float (*get)(const cfp_array3f* self, uint i, uint j, uint k); + void (*set)(cfp_array3f* self, uint i, uint j, uint k, float val); +} cfp_array3f_api; + +#endif diff --git a/zfp/cfp/include/cfparrays.h b/zfp/cfp/include/cfparrays.h new file mode 100644 index 0000000000000000000000000000000000000000..f716d8283eca427e58f34cd72bc178a85cd1e198 --- /dev/null +++ b/zfp/cfp/include/cfparrays.h @@ -0,0 +1,28 @@ +#ifndef CFP_ARRAYS +#define CFP_ARRAYS + +#include "cfparray1f.h" +#include "cfparray1d.h" +#include "cfparray2f.h" +#include "cfparray2d.h" +#include "cfparray3f.h" +#include "cfparray3d.h" + +#include "zfp/system.h" + +typedef struct { + cfp_array1f_api array1f; + cfp_array1d_api array1d; + cfp_array2f_api array2f; + cfp_array2d_api array2d; + cfp_array3f_api array3f; + cfp_array3d_api array3d; +} cfp_api; + +#ifndef CFP_NAMESPACE + #define CFP_NAMESPACE cfp +#endif + +extern_ const cfp_api CFP_NAMESPACE; + +#endif diff --git a/zfp/cfp/src/CMakeLists.txt b/zfp/cfp/src/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2e8c680dfb4f658689663799469e0299f63c0be --- /dev/null +++ b/zfp/cfp/src/CMakeLists.txt @@ -0,0 +1,25 @@ +add_library(cfp cfparrays.cpp) + +if(DEFINED CFP_NAMESPACE) + list(APPEND cfp_public_defs "CFP_NAMESPACE=${CFP_NAMESPACE}") +endif() + +if(WIN32) + # define ZFP_SOURCE when compiling libcfp to export symbols to Windows DLL + list(APPEND cfp_private_defs ZFP_SOURCE) +endif() + +target_compile_definitions(cfp + PUBLIC ${cfp_public_defs} + PRIVATE ${cfp_private_defs}) + +target_include_directories(cfp + PUBLIC + ${ZFP_SOURCE_DIR}/include + ${ZFP_SOURCE_DIR}/cfp/include + PRIVATE + ${ZFP_SOURCE_DIR}/array + ${ZFP_SOURCE_DIR}/src +) + +target_link_libraries(cfp zfp) diff --git a/zfp/cfp/src/Makefile b/zfp/cfp/src/Makefile new file 
mode 100644 index 0000000000000000000000000000000000000000..eef12ffc96e4deea4e43de88be54dcaeed012aa7 --- /dev/null +++ b/zfp/cfp/src/Makefile @@ -0,0 +1,25 @@ +include ../../Config + +CXXFLAGS += -I../../include -I../../src -I../../array +LIBDIR = ../../lib +TARGETS = $(LIBDIR)/libcfp.a $(LIBDIR)/libcfp.so +OBJECTS = cfparrays.o + +static: $(LIBDIR)/libcfp.a + +shared: $(LIBDIR)/libcfp.so + +clean: + rm -f $(TARGETS) $(OBJECTS) + +$(LIBDIR)/libcfp.a: $(OBJECTS) + mkdir -p $(LIBDIR) + rm -f $@ + ar rc $@ $^ + +$(LIBDIR)/libcfp.so: $(OBJECTS) + mkdir -p $(LIBDIR) + $(CXX) $(CXXLAGS) -shared $(SOFLAGS) $^ -o $@ + +.cpp.o: + $(CXX) $(CXXFLAGS) -c $< diff --git a/zfp/cfp/src/cfparray1_source.cpp b/zfp/cfp/src/cfparray1_source.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bdab414d4b9e7ba61349d577194af47ab57c2487 --- /dev/null +++ b/zfp/cfp/src/cfparray1_source.cpp @@ -0,0 +1,23 @@ +static CFP_ARRAY_TYPE * +_t1(CFP_ARRAY_TYPE, ctor)(uint n, double rate, const ZFP_SCALAR_TYPE * p, size_t csize) +{ + return reinterpret_cast<CFP_ARRAY_TYPE *>(new ZFP_ARRAY_TYPE(n, rate, p, csize)); +} + +static void +_t1(CFP_ARRAY_TYPE, resize)(CFP_ARRAY_TYPE * self, uint n, int clear) +{ + reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->resize(n, clear); +} + +static ZFP_SCALAR_TYPE +_t1(CFP_ARRAY_TYPE, get)(const CFP_ARRAY_TYPE * self, uint i) +{ + return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->operator()(i); +} + +static void +_t1(CFP_ARRAY_TYPE, set)(CFP_ARRAY_TYPE * self, uint i, ZFP_SCALAR_TYPE val) +{ + reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->operator()(i) = val; +} diff --git a/zfp/cfp/src/cfparray1d.cpp b/zfp/cfp/src/cfparray1d.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1e71b0d2d745ecb7da11ddc95e71d5f1576de4f5 --- /dev/null +++ b/zfp/cfp/src/cfparray1d.cpp @@ -0,0 +1,15 @@ +#include "cfparray1d.h" +#include "zfparray1.h" + +#include "template/template.h" + +#define CFP_ARRAY_TYPE cfp_array1d +#define ZFP_ARRAY_TYPE 
zfp::array1d +#define ZFP_SCALAR_TYPE double + +#include "cfparray_source.cpp" +#include "cfparray1_source.cpp" + +#undef CFP_ARRAY_TYPE +#undef ZFP_ARRAY_TYPE +#undef ZFP_SCALAR_TYPE diff --git a/zfp/cfp/src/cfparray1f.cpp b/zfp/cfp/src/cfparray1f.cpp new file mode 100644 index 0000000000000000000000000000000000000000..56ecda58030d1fe528e8dce4a817df0208e1f318 --- /dev/null +++ b/zfp/cfp/src/cfparray1f.cpp @@ -0,0 +1,15 @@ +#include "cfparray1f.h" +#include "zfparray1.h" + +#include "template/template.h" + +#define CFP_ARRAY_TYPE cfp_array1f +#define ZFP_ARRAY_TYPE zfp::array1f +#define ZFP_SCALAR_TYPE float + +#include "cfparray_source.cpp" +#include "cfparray1_source.cpp" + +#undef CFP_ARRAY_TYPE +#undef ZFP_ARRAY_TYPE +#undef ZFP_SCALAR_TYPE diff --git a/zfp/cfp/src/cfparray2_source.cpp b/zfp/cfp/src/cfparray2_source.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6135ae40eb70b31521a4a60633fea7f2faaf5800 --- /dev/null +++ b/zfp/cfp/src/cfparray2_source.cpp @@ -0,0 +1,35 @@ +static CFP_ARRAY_TYPE * +_t1(CFP_ARRAY_TYPE, ctor)(uint nx, uint ny, double rate, const ZFP_SCALAR_TYPE * p, size_t csize) +{ + return reinterpret_cast<CFP_ARRAY_TYPE *>(new ZFP_ARRAY_TYPE(nx, ny, rate, p, csize)); +} + +static uint +_t1(CFP_ARRAY_TYPE, size_x)(const CFP_ARRAY_TYPE * self) +{ + return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->size_x(); +} + +static uint +_t1(CFP_ARRAY_TYPE, size_y)(const CFP_ARRAY_TYPE * self) +{ + return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->size_y(); +} + +static void +_t1(CFP_ARRAY_TYPE, resize)(CFP_ARRAY_TYPE * self, uint nx, uint ny, int clear) +{ + reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->resize(nx, ny, clear); +} + +static ZFP_SCALAR_TYPE +_t1(CFP_ARRAY_TYPE, get)(const CFP_ARRAY_TYPE * self, uint i, uint j) +{ + return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->operator()(i, j); +} + +static void +_t1(CFP_ARRAY_TYPE, set)(CFP_ARRAY_TYPE * self, uint i, uint j, ZFP_SCALAR_TYPE val) +{ + 
reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->operator()(i, j) = val; +} diff --git a/zfp/cfp/src/cfparray2d.cpp b/zfp/cfp/src/cfparray2d.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3debb2b8044dd8bd9bbce4213fcf0fd30c380121 --- /dev/null +++ b/zfp/cfp/src/cfparray2d.cpp @@ -0,0 +1,15 @@ +#include "cfparray2d.h" +#include "zfparray2.h" + +#include "template/template.h" + +#define CFP_ARRAY_TYPE cfp_array2d +#define ZFP_ARRAY_TYPE zfp::array2d +#define ZFP_SCALAR_TYPE double + +#include "cfparray_source.cpp" +#include "cfparray2_source.cpp" + +#undef CFP_ARRAY_TYPE +#undef ZFP_ARRAY_TYPE +#undef ZFP_SCALAR_TYPE diff --git a/zfp/cfp/src/cfparray2f.cpp b/zfp/cfp/src/cfparray2f.cpp new file mode 100644 index 0000000000000000000000000000000000000000..37407cc8a75277b0129c50b71ef36ede82fbc557 --- /dev/null +++ b/zfp/cfp/src/cfparray2f.cpp @@ -0,0 +1,15 @@ +#include "cfparray2f.h" +#include "zfparray2.h" + +#include "template/template.h" + +#define CFP_ARRAY_TYPE cfp_array2f +#define ZFP_ARRAY_TYPE zfp::array2f +#define ZFP_SCALAR_TYPE float + +#include "cfparray_source.cpp" +#include "cfparray2_source.cpp" + +#undef CFP_ARRAY_TYPE +#undef ZFP_ARRAY_TYPE +#undef ZFP_SCALAR_TYPE diff --git a/zfp/cfp/src/cfparray3_source.cpp b/zfp/cfp/src/cfparray3_source.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ae2ebf6da29835cd2a348ac4de392ab722bec44f --- /dev/null +++ b/zfp/cfp/src/cfparray3_source.cpp @@ -0,0 +1,41 @@ +static CFP_ARRAY_TYPE * +_t1(CFP_ARRAY_TYPE, ctor)(uint nx, uint ny, uint nz, double rate, const ZFP_SCALAR_TYPE * p, size_t csize) +{ + return reinterpret_cast<CFP_ARRAY_TYPE *>(new ZFP_ARRAY_TYPE(nx, ny, nz, rate, p, csize)); +} + +static uint +_t1(CFP_ARRAY_TYPE, size_x)(const CFP_ARRAY_TYPE * self) +{ + return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->size_x(); +} + +static uint +_t1(CFP_ARRAY_TYPE, size_y)(const CFP_ARRAY_TYPE * self) +{ + return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->size_y(); +} + 
+static uint +_t1(CFP_ARRAY_TYPE, size_z)(const CFP_ARRAY_TYPE * self) +{ + return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->size_z(); +} + +static void +_t1(CFP_ARRAY_TYPE, resize)(CFP_ARRAY_TYPE * self, uint nx, uint ny, uint nz, int clear) +{ + reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->resize(nx, ny, nz, clear); +} + +static ZFP_SCALAR_TYPE +_t1(CFP_ARRAY_TYPE, get)(const CFP_ARRAY_TYPE * self, uint i, uint j, uint k) +{ + return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->operator()(i, j, k); +} + +static void +_t1(CFP_ARRAY_TYPE, set)(CFP_ARRAY_TYPE * self, uint i, uint j, uint k, ZFP_SCALAR_TYPE val) +{ + reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->operator()(i, j, k) = val; +} diff --git a/zfp/cfp/src/cfparray3d.cpp b/zfp/cfp/src/cfparray3d.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fb5cc2e248148639d07186ed65778d62c1e696ef --- /dev/null +++ b/zfp/cfp/src/cfparray3d.cpp @@ -0,0 +1,15 @@ +#include "cfparray3d.h" +#include "zfparray3.h" + +#include "template/template.h" + +#define CFP_ARRAY_TYPE cfp_array3d +#define ZFP_ARRAY_TYPE zfp::array3d +#define ZFP_SCALAR_TYPE double + +#include "cfparray_source.cpp" +#include "cfparray3_source.cpp" + +#undef CFP_ARRAY_TYPE +#undef ZFP_ARRAY_TYPE +#undef ZFP_SCALAR_TYPE diff --git a/zfp/cfp/src/cfparray3f.cpp b/zfp/cfp/src/cfparray3f.cpp new file mode 100644 index 0000000000000000000000000000000000000000..69331b1c1a0463ff9b2db941ac1a938a6ade5ce8 --- /dev/null +++ b/zfp/cfp/src/cfparray3f.cpp @@ -0,0 +1,15 @@ +#include "cfparray3f.h" +#include "zfparray3.h" + +#include "template/template.h" + +#define CFP_ARRAY_TYPE cfp_array3f +#define ZFP_ARRAY_TYPE zfp::array3f +#define ZFP_SCALAR_TYPE float + +#include "cfparray_source.cpp" +#include "cfparray3_source.cpp" + +#undef CFP_ARRAY_TYPE +#undef ZFP_ARRAY_TYPE +#undef ZFP_SCALAR_TYPE diff --git a/zfp/cfp/src/cfparray_source.cpp b/zfp/cfp/src/cfparray_source.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..d94e1a4979534f03b0ef278d4b0e2671bc642766 --- /dev/null +++ b/zfp/cfp/src/cfparray_source.cpp @@ -0,0 +1,106 @@ +// common constructor, destructor +static CFP_ARRAY_TYPE * +_t1(CFP_ARRAY_TYPE, ctor_default)() +{ + return reinterpret_cast<CFP_ARRAY_TYPE *>(new ZFP_ARRAY_TYPE()); +} + +static CFP_ARRAY_TYPE * +_t1(CFP_ARRAY_TYPE, ctor_copy)(const CFP_ARRAY_TYPE * src) +{ + return reinterpret_cast<CFP_ARRAY_TYPE *>( + new ZFP_ARRAY_TYPE(*reinterpret_cast<const ZFP_ARRAY_TYPE *>(src)) + ); +} + +static void +_t1(CFP_ARRAY_TYPE, dtor)(CFP_ARRAY_TYPE * self) +{ + delete reinterpret_cast<ZFP_ARRAY_TYPE *>(self); +} + +// functions defined in zfparray.h (base class) +static double +_t1(CFP_ARRAY_TYPE, rate)(const CFP_ARRAY_TYPE * self) +{ + return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->rate(); +} + +static double +_t1(CFP_ARRAY_TYPE, set_rate)(CFP_ARRAY_TYPE * self, double rate) +{ + return reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->set_rate(rate); +} + +static size_t +_t1(CFP_ARRAY_TYPE, compressed_size)(const CFP_ARRAY_TYPE * self) +{ + return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->compressed_size(); +} + +static uchar* +_t1(CFP_ARRAY_TYPE, compressed_data)(const CFP_ARRAY_TYPE * self) +{ + return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->compressed_data(); +} + +static void +_t1(CFP_ARRAY_TYPE, deep_copy)(CFP_ARRAY_TYPE * self, const CFP_ARRAY_TYPE * src) +{ + *reinterpret_cast<ZFP_ARRAY_TYPE *>(self) = *reinterpret_cast<const ZFP_ARRAY_TYPE *>(src); +} + +// functions defined in subclasses +static size_t +_t1(CFP_ARRAY_TYPE, size)(const CFP_ARRAY_TYPE * self) +{ + return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->size(); +} + +static size_t +_t1(CFP_ARRAY_TYPE, cache_size)(const CFP_ARRAY_TYPE * self) +{ + return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->cache_size(); +} + +static void +_t1(CFP_ARRAY_TYPE, set_cache_size)(CFP_ARRAY_TYPE * self, size_t csize) +{ + reinterpret_cast<ZFP_ARRAY_TYPE 
*>(self)->set_cache_size(csize); +} + +static void +_t1(CFP_ARRAY_TYPE, clear_cache)(const CFP_ARRAY_TYPE * self) +{ + reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->clear_cache(); +} + +static void +_t1(CFP_ARRAY_TYPE, flush_cache)(const CFP_ARRAY_TYPE * self) +{ + reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->flush_cache(); +} + +static void +_t1(CFP_ARRAY_TYPE, get_array)(const CFP_ARRAY_TYPE * self, ZFP_SCALAR_TYPE * p) +{ + reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->get(p); +} + +static void +_t1(CFP_ARRAY_TYPE, set_array)(CFP_ARRAY_TYPE * self, const ZFP_SCALAR_TYPE * p) +{ + reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->set(p); +} + +static ZFP_SCALAR_TYPE +_t1(CFP_ARRAY_TYPE, get_flat)(const CFP_ARRAY_TYPE * self, uint i) +{ + return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->operator[](i); +} + +static void +_t1(CFP_ARRAY_TYPE, set_flat)(CFP_ARRAY_TYPE * self, uint i, ZFP_SCALAR_TYPE val) +{ + reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->operator[](i) = val; +} diff --git a/zfp/cfp/src/cfparrays.cpp b/zfp/cfp/src/cfparrays.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bcd886868a2cd7510664183b3587eeca44bd1591 --- /dev/null +++ b/zfp/cfp/src/cfparrays.cpp @@ -0,0 +1,183 @@ +#include "cfparrays.h" + +#include "cfparray1f.cpp" +#include "cfparray1d.cpp" +#include "cfparray2f.cpp" +#include "cfparray2d.cpp" +#include "cfparray3f.cpp" +#include "cfparray3d.cpp" + +export_ const cfp_api CFP_NAMESPACE = { + // array1f + { + cfp_array1f_ctor_default, + cfp_array1f_ctor, + cfp_array1f_ctor_copy, + cfp_array1f_dtor, + + cfp_array1f_deep_copy, + + cfp_array1f_rate, + cfp_array1f_set_rate, + cfp_array1f_cache_size, + cfp_array1f_set_cache_size, + cfp_array1f_clear_cache, + cfp_array1f_flush_cache, + cfp_array1f_compressed_size, + cfp_array1f_compressed_data, + cfp_array1f_size, + cfp_array1f_resize, + + cfp_array1f_get_array, + cfp_array1f_set_array, + cfp_array1f_get_flat, + cfp_array1f_set_flat, + cfp_array1f_get, + cfp_array1f_set, + 
}, + // array1d + { + cfp_array1d_ctor_default, + cfp_array1d_ctor, + cfp_array1d_ctor_copy, + cfp_array1d_dtor, + + cfp_array1d_deep_copy, + + cfp_array1d_rate, + cfp_array1d_set_rate, + cfp_array1d_cache_size, + cfp_array1d_set_cache_size, + cfp_array1d_clear_cache, + cfp_array1d_flush_cache, + cfp_array1d_compressed_size, + cfp_array1d_compressed_data, + cfp_array1d_size, + cfp_array1d_resize, + + cfp_array1d_get_array, + cfp_array1d_set_array, + cfp_array1d_get_flat, + cfp_array1d_set_flat, + cfp_array1d_get, + cfp_array1d_set, + }, + // array2f + { + cfp_array2f_ctor_default, + cfp_array2f_ctor, + cfp_array2f_ctor_copy, + cfp_array2f_dtor, + + cfp_array2f_deep_copy, + + cfp_array2f_rate, + cfp_array2f_set_rate, + cfp_array2f_cache_size, + cfp_array2f_set_cache_size, + cfp_array2f_clear_cache, + cfp_array2f_flush_cache, + cfp_array2f_compressed_size, + cfp_array2f_compressed_data, + cfp_array2f_size, + cfp_array2f_size_x, + cfp_array2f_size_y, + cfp_array2f_resize, + + cfp_array2f_get_array, + cfp_array2f_set_array, + cfp_array2f_get_flat, + cfp_array2f_set_flat, + cfp_array2f_get, + cfp_array2f_set, + }, + // array2d + { + cfp_array2d_ctor_default, + cfp_array2d_ctor, + cfp_array2d_ctor_copy, + cfp_array2d_dtor, + + cfp_array2d_deep_copy, + + cfp_array2d_rate, + cfp_array2d_set_rate, + cfp_array2d_cache_size, + cfp_array2d_set_cache_size, + cfp_array2d_clear_cache, + cfp_array2d_flush_cache, + cfp_array2d_compressed_size, + cfp_array2d_compressed_data, + cfp_array2d_size, + cfp_array2d_size_x, + cfp_array2d_size_y, + cfp_array2d_resize, + + cfp_array2d_get_array, + cfp_array2d_set_array, + cfp_array2d_get_flat, + cfp_array2d_set_flat, + cfp_array2d_get, + cfp_array2d_set, + }, + // array3f + { + cfp_array3f_ctor_default, + cfp_array3f_ctor, + cfp_array3f_ctor_copy, + cfp_array3f_dtor, + + cfp_array3f_deep_copy, + + cfp_array3f_rate, + cfp_array3f_set_rate, + cfp_array3f_cache_size, + cfp_array3f_set_cache_size, + cfp_array3f_clear_cache, + 
cfp_array3f_flush_cache, + cfp_array3f_compressed_size, + cfp_array3f_compressed_data, + cfp_array3f_size, + cfp_array3f_size_x, + cfp_array3f_size_y, + cfp_array3f_size_z, + cfp_array3f_resize, + + cfp_array3f_get_array, + cfp_array3f_set_array, + cfp_array3f_get_flat, + cfp_array3f_set_flat, + cfp_array3f_get, + cfp_array3f_set, + }, + // array3d + { + cfp_array3d_ctor_default, + cfp_array3d_ctor, + cfp_array3d_ctor_copy, + cfp_array3d_dtor, + + cfp_array3d_deep_copy, + + cfp_array3d_rate, + cfp_array3d_set_rate, + cfp_array3d_cache_size, + cfp_array3d_set_cache_size, + cfp_array3d_clear_cache, + cfp_array3d_flush_cache, + cfp_array3d_compressed_size, + cfp_array3d_compressed_data, + cfp_array3d_size, + cfp_array3d_size_x, + cfp_array3d_size_y, + cfp_array3d_size_z, + cfp_array3d_resize, + + cfp_array3d_get_array, + cfp_array3d_set_array, + cfp_array3d_get_flat, + cfp_array3d_set_flat, + cfp_array3d_get, + cfp_array3d_set, + }, +}; diff --git a/zfp/examples/CMakeLists.txt b/zfp/examples/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..c879beab60b10aeb05e085a3f7a8d33333725c2d --- /dev/null +++ b/zfp/examples/CMakeLists.txt @@ -0,0 +1,35 @@ +add_executable(diffusion diffusion.cpp) +target_link_libraries(diffusion zfp) +target_compile_definitions(diffusion PRIVATE ${zfp_defs}) + +add_executable(diffusionC diffusionC.c) +target_link_libraries(diffusionC cfp) +target_compile_definitions(diffusionC PRIVATE ${zfp_defs}) + +add_executable(inplace inplace.c) +target_link_libraries(inplace zfp) +target_compile_definitions(inplace PRIVATE ${zfp_defs}) + +add_executable(iterator iterator.cpp) +target_link_libraries(iterator zfp) +target_compile_definitions(iterator PRIVATE ${zfp_defs}) + +add_executable(pgm pgm.c) +target_link_libraries(pgm zfp) +target_compile_definitions(pgm PRIVATE ${zfp_defs}) + +add_executable(simple simple.c) +target_link_libraries(simple zfp) +target_compile_definitions(simple PRIVATE ${zfp_defs}) + 
+add_executable(speed speed.c) +target_link_libraries(speed zfp) +target_compile_definitions(speed PRIVATE ${zfp_defs}) + +if(HAVE_LIBM_MATH) + target_link_libraries(diffusion m) + target_link_libraries(diffusionC m) + target_link_libraries(inplace m) + target_link_libraries(pgm m) + target_link_libraries(simple m) +endif() diff --git a/zfp/examples/Makefile b/zfp/examples/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..bb44b1e17ac76820d7f39b0e12ea1c5aaa11afc0 --- /dev/null +++ b/zfp/examples/Makefile @@ -0,0 +1,35 @@ +include ../Config + +BINDIR = ../bin +TARGETS = $(BINDIR)/diffusion\ + $(BINDIR)/inplace\ + $(BINDIR)/iterator\ + $(BINDIR)/pgm\ + $(BINDIR)/simple\ + $(BINDIR)/speed +LIBS = -L../lib -lzfp +CLIBS = $(LIBS) -lm +CXXLIBS = $(LIBS) + +all: $(TARGETS) + +$(BINDIR)/diffusion: diffusion.cpp ../lib/$(LIBZFP) + $(CXX) $(CXXFLAGS) -I../array diffusion.cpp $(CXXLIBS) -o $@ + +$(BINDIR)/inplace: inplace.c ../lib/$(LIBZFP) + $(CC) $(CFLAGS) inplace.c $(CLIBS) -o $@ + +$(BINDIR)/iterator: iterator.cpp ../lib/$(LIBZFP) + $(CXX) $(CXXFLAGS) -I../array iterator.cpp $(CXXLIBS) -o $@ + +$(BINDIR)/pgm: pgm.c ../lib/$(LIBZFP) + $(CC) $(CFLAGS) pgm.c $(CLIBS) -o $@ + +$(BINDIR)/simple: simple.c ../lib/$(LIBZFP) + $(CC) $(CFLAGS) simple.c $(CLIBS) -o $@ + +$(BINDIR)/speed: speed.c ../lib/$(LIBZFP) + $(CC) $(CFLAGS) speed.c $(CLIBS) -o $@ + +clean: + rm -f $(TARGETS) diff --git a/zfp/examples/array2d.h b/zfp/examples/array2d.h new file mode 100644 index 0000000000000000000000000000000000000000..861fa25a9a27d72864b5a763c20a975b8cb104a3 --- /dev/null +++ b/zfp/examples/array2d.h @@ -0,0 +1,49 @@ +#ifndef ARRAY2D_H +#define ARRAY2D_H + +#include <climits> +#include <vector> + +typedef unsigned int uint; + +// uncompressed 2D double-precision array (for comparison) +namespace raw { +class array2d { +public: + array2d() : nx(0), ny(0) {} + array2d(uint nx, uint ny, double rate = 0.0, const double* p = 0, size_t csize = 0) : nx(nx), ny(ny), data(nx 
* ny, 0.0) {} + void resize(uint nx, uint ny) { this->nx = nx; this->ny = ny; data.resize(nx * ny, 0.0); } + size_t size() const { return data.size(); } + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + double rate() const { return CHAR_BIT * sizeof(double); } + size_t cache_size() const { return 0; } + double& operator()(uint x, uint y) { return data[x + nx * y]; } + const double& operator()(uint x, uint y) const { return data[x + nx * y]; } + double& operator[](uint i) { return data[i]; } + const double& operator[](uint i) const { return data[i]; } + class iterator { + public: + double& operator*() const { return array->operator[](index); } + iterator& operator++() { index++; return *this; } + iterator operator++(int) { iterator p = *this; index++; return p; } + bool operator==(const iterator& it) const { return array == it.array && index == it.index; } + bool operator!=(const iterator& it) const { return !operator==(it); } + uint i() const { return index % array->nx; } + uint j() const { return index / array->nx; } + protected: + friend class array2d; + iterator(array2d* array, uint index) : array(array), index(index) {} + array2d* array; + uint index; + }; + iterator begin() { return iterator(this, 0); } + iterator end() { return iterator(this, nx * ny); } +protected: + uint nx; + uint ny; + std::vector<double> data; +}; +} + +#endif diff --git a/zfp/examples/diffusion.cpp b/zfp/examples/diffusion.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3512bff428c34891d155cd9f7768369e4e57f137 --- /dev/null +++ b/zfp/examples/diffusion.cpp @@ -0,0 +1,281 @@ +// forward Euler finite difference solution to the heat equation on a 2D grid + +#include <algorithm> +#include <cmath> +#include <cstdio> +#include <cstdlib> +#include <iomanip> +#include <iostream> +#include "zfparray2.h" +#include "array2d.h" + +#ifdef _OPENMP +#include <omp.h> +#endif + +// constants used in the solution +class Constants { +public: + 
Constants(int nx, int ny, int nt) : + nx(nx), + ny(ny), + nt(nt), + x0((nx - 1) / 2), + y0((ny - 1) / 2), + k(0.04), + dx(2.0 / (std::max(nx, ny) - 1)), + dy(2.0 / (std::max(nx, ny) - 1)), + dt(0.5 * (dx * dx + dy * dy) / (8 * k)), + tfinal(nt ? nt * dt : 1.0), + pi(3.14159265358979323846) + {} + + int nx; // grid points in x + int ny; // grid points in y + int nt; // number of time steps (0 for default) + int x0; // x location of heat source + int y0; // y location of heat source + double k; // diffusion constant + double dx; // grid spacing in x + double dy; // grid spacing in y + double dt; // time step + double tfinal; // minimum time to run solution to + double pi; // 3.141... +}; + +template <class array2d> +inline void +time_step_parallel(array2d& u, const Constants& c); + +// advance solution in parallel via thread-safe views +template <> +inline void +time_step_parallel(zfp::array2d& u, const Constants& c) +{ +#ifdef _OPENMP + // flush shared cache to ensure cache consistency across threads + u.flush_cache(); + // compute du/dt in parallel + zfp::array2d du(c.nx, c.ny, u.rate(), 0, u.cache_size()); + #pragma omp parallel + { + // create read-only private view of entire array u + zfp::array2d::private_const_view myu(&u); + // create read-write private view into rectangular subset of du + zfp::array2d::private_view mydu(&du); + mydu.partition(omp_get_thread_num(), omp_get_num_threads()); + // process rectangular region owned by this thread + for (uint j = 0; j < mydu.size_y(); j++) { + int y = mydu.global_y(j); + if (1 <= y && y <= c.ny - 2) + for (uint i = 0; i < mydu.size_x(); i++) { + int x = mydu.global_x(i); + if (1 <= x && x <= c.nx - 2) { + double uxx = (myu(x - 1, y) - 2 * myu(x, y) + myu(x + 1, y)) / (c.dx * c.dx); + double uyy = (myu(x, y - 1) - 2 * myu(x, y) + myu(x, y + 1)) / (c.dy * c.dy); + mydu(i, j) = c.dt * c.k * (uxx + uyy); + } + } + } + // compress all private cached blocks to shared storage + mydu.flush_cache(); + } + // take forward 
Euler step in serial + for (uint i = 0; i < u.size(); i++) + u[i] += du[i]; +#endif +} + +// dummy template instantiation; never executed +template <> +inline void +time_step_parallel(raw::array2d& u, const Constants& c) +{ +} + +// advance solution using integer array indices +template <class array2d> +inline void +time_step_indexed(array2d& u, const Constants& c) +{ + // compute du/dt + array2d du(c.nx, c.ny, u.rate(), 0, u.cache_size()); + for (int y = 1; y < c.ny - 1; y++) { + for (int x = 1; x < c.nx - 1; x++) { + double uxx = (u(x - 1, y) - 2 * u(x, y) + u(x + 1, y)) / (c.dx * c.dx); + double uyy = (u(x, y - 1) - 2 * u(x, y) + u(x, y + 1)) / (c.dy * c.dy); + du(x, y) = c.dt * c.k * (uxx + uyy); + } + } + // take forward Euler step + for (uint i = 0; i < u.size(); i++) + u[i] += du[i]; +} + +// advance solution using array iterators +template <class array2d> +inline void +time_step_iterated(array2d& u, const Constants& c) +{ + // compute du/dt + array2d du(c.nx, c.ny, u.rate(), 0, u.cache_size()); + for (typename array2d::iterator p = du.begin(); p != du.end(); p++) { + int x = p.i(); + int y = p.j(); + if (1 <= x && x <= c.nx - 2 && + 1 <= y && y <= c.ny - 2) { + double uxx = (u(x - 1, y) - 2 * u(x, y) + u(x + 1, y)) / (c.dx * c.dx); + double uyy = (u(x, y - 1) - 2 * u(x, y) + u(x, y + 1)) / (c.dy * c.dy); + *p = c.dt * c.k * (uxx + uyy); + } + } + // take forward Euler step + for (typename array2d::iterator p = u.begin(), q = du.begin(); p != u.end(); p++, q++) + *p += *q; +} + +// solve heat equation using +template <class array2d> +inline double +solve(array2d& u, const Constants& c, bool iterator, bool parallel) +{ + // initialize u with point heat source (u is assumed to be zero initialized) + u(c.x0, c.y0) = 1; + + // iterate until final time + double t; + for (t = 0; t < c.tfinal; t += c.dt) { + std::cerr << "t=" << std::setprecision(6) << std::fixed << t << std::endl; + if (parallel) + time_step_parallel(u, c); + else if (iterator) + 
time_step_iterated(u, c); + else + time_step_indexed(u, c); + } + + return t; +} + +// compute sum of array values +template <class array2d> +inline double +total(const array2d& u) +{ + double s = 0; + const int nx = u.size_x(); + const int ny = u.size_y(); + for (int y = 1; y < ny - 1; y++) + for (int x = 1; x < nx - 1; x++) + s += u(x, y); + return s; +} + +// compute root mean square error with respect to exact solution +template <class array2d> +inline double +error(const array2d& u, const Constants& c, double t) +{ + double e = 0; + for (int y = 1; y < c.ny - 1; y++) { + double py = c.dy * (y - c.y0); + for (int x = 1; x < c.nx - 1; x++) { + double px = c.dx * (x - c.x0); + double f = u(x, y); + double g = c.dx * c.dy * std::exp(-(px * px + py * py) / (4 * c.k * t)) / (4 * c.pi * c.k * t); + e += (f - g) * (f - g); + } + } + return std::sqrt(e / ((c.nx - 2) * (c.ny - 2))); +} + +inline int +usage() +{ + std::cerr << "Usage: diffusion [options]" << std::endl; + std::cerr << "Options:" << std::endl; + std::cerr << "-i : traverse arrays using iterators" << std::endl; + std::cerr << "-n <nx> <ny> : number of grid points" << std::endl; +#ifdef _OPENMP + std::cerr << "-p : use multithreading (only with compressed arrays)" << std::endl; +#endif + std::cerr << "-t <nt> : number of time steps" << std::endl; + std::cerr << "-r <rate> : use compressed arrays with 'rate' bits/value" << std::endl; + std::cerr << "-c <blocks> : use 'blocks' 4x4 blocks of cache" << std::endl; + return EXIT_FAILURE; +} + +int main(int argc, char* argv[]) +{ + int nx = 100; + int ny = 100; + int nt = 0; + double rate = 64; + bool iterator = false; + bool compression = false; + bool parallel = false; + int cache = 0; + + // parse command-line options + for (int i = 1; i < argc; i++) + if (std::string(argv[i]) == "-i") + iterator = true; + else if (std::string(argv[i]) == "-n") { + if (++i == argc || sscanf(argv[i], "%i", &nx) != 1 || + ++i == argc || sscanf(argv[i], "%i", &ny) != 1) + return 
usage(); + } +#ifdef _OPENMP + else if (std::string(argv[i]) == "-p") + parallel = true; +#endif + else if (std::string(argv[i]) == "-t") { + if (++i == argc || sscanf(argv[i], "%i", &nt) != 1) + return usage(); + } + else if (std::string(argv[i]) == "-r") { + if (++i == argc || sscanf(argv[i], "%lf", &rate) != 1) + return usage(); + compression = true; + } + else if (std::string(argv[i]) == "-c") { + if (++i == argc || sscanf(argv[i], "%i", &cache) != 1) + return usage(); + } + else + return usage(); + + if (parallel && !compression) { + fprintf(stderr, "multithreading requires compressed arrays\n"); + return EXIT_FAILURE; + } + if (parallel && iterator) { + fprintf(stderr, "multithreading does not support iterators\n"); + return EXIT_FAILURE; + } + + Constants c(nx, ny, nt); + + double sum; + double err; + if (compression) { + // solve problem using compressed arrays + zfp::array2d u(nx, ny, rate, 0, cache * 4 * 4 * sizeof(double)); + rate = u.rate(); + double t = solve(u, c, iterator, parallel); + sum = total(u); + err = error(u, c, t); + } + else { + // solve problem using uncompressed arrays + raw::array2d u(nx, ny); + double t = solve(u, c, iterator, parallel); + sum = total(u); + err = error(u, c, t); + } + + std::cerr.unsetf(std::ios::fixed); + std::cerr << "rate=" << rate << " sum=" << std::fixed << sum << " error=" << std::setprecision(6) << std::scientific << err << std::endl; + + return 0; +} diff --git a/zfp/examples/diffusionC.c b/zfp/examples/diffusionC.c new file mode 100644 index 0000000000000000000000000000000000000000..99a5c3db0de3bfee6726b41c7e8cd769d483aa80 --- /dev/null +++ b/zfp/examples/diffusionC.c @@ -0,0 +1,267 @@ +// forward Euler finite difference solution to the heat equation on a 2D grid +// (ported to C, from diffusion.cpp) + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> + +#include "cfparrays.h" +#define _ (CFP_NAMESPACE.array2d) + +#define MAX(x, y) (((nx) > (ny)) ? 
(nx) : (ny)) + +// constants used in the solution +typedef struct { + int nx; // grid points in x + int ny; // grid points in y + int nt; // number of time steps (0 for default) + int x0; // x location of heat source + int y0; // y location of heat source + double k; // diffusion constant + double dx; // grid spacing in x + double dy; // grid spacing in y + double dt; // time step + double tfinal; // minimum time to run solution to + double pi; // 3.141... +} constants; + +void +init_constants(constants* c, int nx, int ny, int nt) +{ + c->nx = nx; + c->ny = ny; + c->nt = nt; + c->x0 = (nx - 1) / 2; + c->y0 = (ny - 1) / 2; + c->k = 0.04; + c->dx = 2.0 / (MAX(nx, ny) - 1); + c->dy = 2.0 / (MAX(nx, ny) - 1); + c->dt = 0.5 * (c->dx * c->dx + c->dy * c->dy) / (8 * c->k); + c->tfinal = nt ? nt * c->dt : 1.0; + c->pi = 3.14159265358979323846; +} + +// advance solution using integer array indices +static void +time_step_indexed_compressed(cfp_array2d* u, const constants* c) +{ + // compute du/dt + cfp_array2d* du = _.ctor(c->nx, c->ny, _.rate(u), 0, _.cache_size(u)); + int x, y; + for (y = 1; y < c->ny - 1; y++) { + for (x = 1; x < c->nx - 1; x++) { + double uxx = (_.get(u, x - 1, y) - 2 * _.get(u, x, y) + _.get(u, x + 1, y)) / (c->dx * c->dx); + double uyy = (_.get(u, x, y - 1) - 2 * _.get(u, x, y) + _.get(u, x, y + 1)) / (c->dy * c->dy); + _.set(du, x, y, c->dt * c->k * (uxx + uyy)); + } + } + // take forward Euler step + uint i; + for (i = 0; i < _.size(u); i++) { + // u[i] += du[i] + double val = _.get_flat(u, i) + _.get_flat(du, i); + _.set_flat(u, i, val); + } + + _.dtor(du); +} + +// advance solution using integer array indices +static void +time_step_indexed(double* u, const constants* c) +{ + // compute du/dt + double* du = calloc(c->nx * c->ny, sizeof(double)); + int x, y; + for (y = 1; y < c->ny - 1; y++) { + for (x = 1; x < c->nx - 1; x++) { + double uxx = (u[y*c->nx + (x - 1)] - 2 * u[y*c->nx + x] + u[y*c->nx + (x + 1)]) / (c->dx * c->dx); + double uyy = (u[(y 
- 1)*c->nx + x] - 2 * u[y*c->nx + x] + u[(y + 1)*c->nx + x]) / (c->dy * c->dy); + du[y*c->nx + x] = c->dt * c->k * (uxx + uyy); + } + } + // take forward Euler step + uint i; + for (i = 0; i < (c->nx * c->ny); i++) { + // u[i] += du[i] + u[i] += du[i]; + } + + free(du); +} + +// solve heat equation using +static double +solve_compressed(cfp_array2d* u, const constants* c) +{ + // initialize u with point heat source (u is assumed to be zero initialized) + _.set(u, c->x0, c->y0, 1); + + // iterate until final time + double t; + for (t = 0; t < c->tfinal; t += c->dt) { + fprintf(stderr, "t=%lf\n", t); + time_step_indexed_compressed(u, c); + } + + return t; +} + +static double +solve(double* u, const constants* c) +{ + // initialize u with point heat source (u is assumed to be zero initialized) + u[c->y0*c->nx + c->x0] = 1; + + // iterate until final time + double t; + for (t = 0; t < c->tfinal; t += c->dt) { + fprintf(stderr, "t=%lf\n", t); + time_step_indexed(u, c); + } + + return t; +} + +// compute sum of array values +static double +total_compressed(const cfp_array2d* u) +{ + double s = 0; + const int nx = _.size_x(u); + const int ny = _.size_y(u); + int x, y; + for (y = 1; y < ny - 1; y++) + for (x = 1; x < nx - 1; x++) + s += _.get(u, x, y); + return s; +} + +// compute sum of array values +static double +total(const double* u, const int nx, const int ny) +{ + double s = 0; + int x, y; + for (y = 1; y < ny - 1; y++) + for (x = 1; x < nx - 1; x++) + s += u[y*nx + x]; + return s; +} + +// compute root mean square error with respect to exact solution +static double +error_compressed(const cfp_array2d* u, const constants* c, double t) +{ + double e = 0; + int x, y; + for (y = 1; y < c->ny - 1; y++) { + double py = c->dy * (y - c->y0); + for (x = 1; x < c->nx - 1; x++) { + double px = c->dx * (x - c->x0); + double f = _.get(u, x, y); + double g = c->dx * c->dy * exp(-(px * px + py * py) / (4 * c->k * t)) / (4 * c->pi * c->k * t); + e += (f - g) * (f - g); + } + } + 
return sqrt(e / ((c->nx - 2) * (c->ny - 2))); +} + +// compute root mean square error with respect to exact solution +static double +error(const double* u, const constants* c, double t) +{ + double e = 0; + int x, y; + for (y = 1; y < c->ny - 1; y++) { + double py = c->dy * (y - c->y0); + for (x = 1; x < c->nx - 1; x++) { + double px = c->dx * (x - c->x0); + double f = u[y*c->nx + x]; + double g = c->dx * c->dy * exp(-(px * px + py * py) / (4 * c->k * t)) / (4 * c->pi * c->k * t); + e += (f - g) * (f - g); + } + } + return sqrt(e / ((c->nx - 2) * (c->ny - 2))); +} + +static int +usage() +{ + fprintf(stderr, "Usage: diffusionC [options]\n"); + fprintf(stderr, "Options:\n"); + fprintf(stderr, "-n <nx> <ny> : number of grid points\n"); + fprintf(stderr, "-t <nt> : number of time steps\n"); + fprintf(stderr, "-r <rate> : use compressed arrays with 'rate' bits/value\n"); + fprintf(stderr, "-c <blocks> : use 'blocks' 4x4 blocks of cache\n"); + return EXIT_FAILURE; +} + +int main(int argc, char* argv[]) +{ + int nx = 100; + int ny = 100; + int nt = 0; + double rate = 64; + int compression = 0; + int cache = 0; + + // parse command-line options + int i; + for (i = 1; i < argc; i++) { + if (argv[i][0] != '-' || argv[i][2]) + return usage(); + switch(argv[i][1]) { + case 'n': + if (++i == argc || sscanf(argv[i], "%d", &nx) != 1 || + ++i == argc || sscanf(argv[i], "%d", &ny) != 1) + return usage(); + break; + case 't': + if (++i == argc || sscanf(argv[i], "%d", &nt) != 1) + return usage(); + break; + case 'r': + if (++i == argc || sscanf(argv[i], "%lf", &rate) != 1) + return usage(); + compression = 1; + break; + case 'c': + if (++i == argc || sscanf(argv[i], "%d", &cache) != 1) + return usage(); + } + } + + constants* c = malloc(sizeof(constants)); + init_constants(c, nx, ny, nt); + + double sum; + double err; + if (compression) { + // solve problem using compressed arrays + cfp_array2d* u = _.ctor(nx, ny, rate, 0, cache * 4 * 4 * sizeof(double)); + + rate = _.rate(u); + 
double t = solve_compressed(u, c); + sum = total_compressed(u); + err = error_compressed(u, c, t); + + _.dtor(u); + } + else { + // solve problem using primitive arrays + double* u = calloc(nx * ny, sizeof(double)); + + double t = solve(u, c); + sum = total(u, nx, ny); + err = error(u, c, t); + + free(u); + } + + fprintf(stderr, "rate=%g sum=%g error=%.6e\n", rate, sum, err); + + free(c); + + return 0; +} diff --git a/zfp/examples/inplace.c b/zfp/examples/inplace.c new file mode 100644 index 0000000000000000000000000000000000000000..3764166b58c6653cfa4f0f087da29d7c08ef8088 --- /dev/null +++ b/zfp/examples/inplace.c @@ -0,0 +1,156 @@ +/* example illustrating in-place compression and decompression */ + +#include <limits.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "zfp.h" + +/* compress and decompress contiguous blocks */ +static int +process(double* buffer, uint blocks, double tolerance) +{ + zfp_stream* zfp; /* compressed stream */ + bitstream* stream; /* bit stream to write to or read from */ + size_t* offset; /* per-block bit offset in compressed stream */ + double* ptr; /* pointer to block being processed */ + size_t bufsize; /* byte size of uncompressed storage */ + size_t zfpsize; /* byte size of compressed stream */ + uint minbits; /* min bits per block */ + uint maxbits; /* max bits per block */ + uint maxprec; /* max precision */ + int minexp; /* min bit plane encoded */ + uint bits; /* size of compressed block */ + uint i; + + /* maintain offset to beginning of each variable-length block */ + offset = malloc(blocks * sizeof(size_t)); + + /* associate bit stream with same storage as input */ + bufsize = blocks * 4 * 4 * sizeof(*buffer); + stream = stream_open(buffer, bufsize); + + /* allocate meta data for a compressed stream */ + zfp = zfp_stream_open(stream); + + /* set tolerance for fixed-accuracy mode */ + zfp_stream_set_accuracy(zfp, tolerance); + + /* set maxbits to guard against prematurely overwriting 
the input */ + zfp_stream_params(zfp, &minbits, &maxbits, &maxprec, &minexp); + maxbits = 4 * 4 * sizeof(*buffer) * CHAR_BIT; + zfp_stream_set_params(zfp, minbits, maxbits, maxprec, minexp); + + /* compress one block at a time in sequential order */ + ptr = buffer; + for (i = 0; i < blocks; i++) { + offset[i] = stream_wtell(stream); + bits = zfp_encode_block_double_2(zfp, ptr); + if (!bits) { + fprintf(stderr, "compression failed\n"); + return 0; + } + printf("block #%u offset=%4u size=%4u\n", i, (uint)offset[i], bits); + ptr += 4 * 4; + } + /* important: flush any buffered compressed bits */ + stream_flush(stream); + + /* print out size */ + zfpsize = stream_size(stream); + printf("compressed %u bytes to %u bytes\n", (uint)bufsize, (uint)zfpsize); + + /* decompress one block at a time in reverse order */ + for (i = blocks; i--;) { + ptr -= 4 * 4; + stream_rseek(stream, offset[i]); + if (!zfp_decode_block_double_2(zfp, ptr)) { + fprintf(stderr, "decompression failed\n"); + return 0; + } + } + + /* clean up */ + zfp_stream_close(zfp); + stream_close(stream); + free(offset); + + return 1; +} + +int main(int argc, char* argv[]) +{ + double tolerance = 1e-6; + double* array; + double* buffer; + uint bx = 2; + uint by = 4; + uint nx = 4 * bx; + uint ny = 4 * by; + uint blocks = bx * by; + uint x, y; + uint i, j, k; + int status; + + switch (argc) { + case 2: + if (sscanf(argv[1], "%lf", &tolerance) != 1) + goto usage; + /* FALLTHROUGH */ + case 1: + break; + default: + usage: + fprintf(stderr, "Usage: inline [tolerance]\n"); + return EXIT_FAILURE; + } + + printf("tolerance=%g\n", tolerance); + + /* initialize array to be compressed */ + printf("original %ux%u array:\n", nx, ny); + array = malloc(nx * ny * sizeof(double)); + for (y = 0; y < ny; y++) { + for (x = 0; x < nx; x++) { + double u = 2 * (x + 0.5) / nx; + double v = asin(1.0) * (y + 0.5); + double f = exp(-u * u) * sin(v) / v; + printf("%9.6f%c", f, x == nx - 1 ? 
'\n' : ' '); + array[x + nx * y] = f; + } + } + + /* reorganize array into 4x4 blocks */ + buffer = malloc(blocks * 4 * 4 * sizeof(double)); + for (k = 0; k < blocks; k++) + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) { + uint x = 4 * (k & 1) + i; + uint y = 4 * (k / 2) + j; + buffer[i + 4 * (j + 4 * k)] = array[x + nx * y]; + } + + status = process(buffer, blocks, tolerance); + if (status) { + /* reorganize blocks into array */ + for (k = 0; k < blocks; k++) + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) { + uint x = 4 * (k & 1) + i; + uint y = 4 * (k / 2) + j; + array[x + nx * y] = buffer[i + 4 * (j + 4 * k)]; + } + + /* print out modified array*/ + printf("decompressed %ux%u array:\n", nx, ny); + for (y = 0; y < ny; y++) + for (x = 0; x < nx; x++) + printf("%9.6f%c", array[x + nx * y], x == nx - 1 ? '\n' : ' '); + } + + free(buffer); + free(array); + + return status ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/zfp/examples/iterator.cpp b/zfp/examples/iterator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..698692ff2a13b82265a1ebb93c9523bb4537e30b --- /dev/null +++ b/zfp/examples/iterator.cpp @@ -0,0 +1,74 @@ +#include <algorithm> +#include <cstdlib> +#include <iostream> +#include "zfparray1.h" +#include "zfparray2.h" +#include "zfparray3.h" + +void print1(zfp::array1<double>::pointer p, size_t n) +{ + for (size_t i = 0; i < n; i++) + std::cout << p[i] << std::endl; +} + +void print2(zfp::array2<double>::pointer p, size_t n) +{ + while (n--) + std::cout << *p++ << std::endl; +} + +void print3(zfp::array1<double>::iterator begin, zfp::array1<double>::iterator end) +{ + for (zfp::array1<double>::iterator p = begin; p != end; p++) + std::cout << *p << std::endl; +} + +int main() +{ + // some fun with 1D arrays + zfp::array1<double> v(10, 64.0); + // initialize and print array of random values + for (zfp::array1<double>::iterator p = v.begin(); p != v.end(); p++) + *p = rand(); + std::cout << "random array" << std::endl; + 
print1(&v[0], v.size()); + std::cout << std::endl; + // sorting is possible via random access iterators (1D arrays only) + std::sort(v.begin(), v.end()); + // print array using iteration + std::cout << "sorted array" << std::endl; + print3(v.begin(), v.end()); + std::cout << std::endl; + + // some fun with 2D arrays + zfp::array2<double> a(5, 7, 64.0); + // print array indices visited in block-order traversal + std::cout << "block order (x, y) indices" << std::endl; + for (zfp::array2<double>::iterator p = a.begin(); p != a.end(); p++) { + std::cout << "(" << p.i() << ", " << p.j() << ")" << std::endl; + *p = p.i() + 10 * p.j(); + } + std::cout << std::endl; + // print array contents in row-major order + std::cout << "row-major order yx indices" << std::endl; + print2(&a[0], a.size()); + std::cout << std::endl; + // pointer arithmetic + std::cout << a.size_x() << " * " << a.size_y() << " = " << (&*a.end() - &*a.begin()) << std::endl; + // min and max values + std::cout << "min = " << *std::min_element(a.begin(), a.end()) << std::endl; + std::cout << "max = " << *std::max_element(a.begin(), a.end()) << std::endl; + std::cout << std::endl; + + // some fun with 3D arrays + zfp::array3<double> b(7, 2, 5, 64.0); + // print array indices visited in block-order traversal + std::cout << "block order (x, y, z) indices" << std::endl; + for (zfp::array3<double>::iterator p = b.begin(); p != b.end(); p++) + std::cout << "(" << p.i() << ", " << p.j() << ", " << p.k() << ")" << std::endl; + std::cout << std::endl; + // pointer arithmetic + std::cout << b.size_x() << " * " << b.size_y() << " * " << b.size_z() << " = " << (&*b.end() - &*b.begin()) << std::endl; + + return 0; +} diff --git a/zfp/examples/pgm.c b/zfp/examples/pgm.c new file mode 100644 index 0000000000000000000000000000000000000000..c23ecb2d5ef714fc8e8342754e738e80e6e30291 --- /dev/null +++ b/zfp/examples/pgm.c @@ -0,0 +1,112 @@ +/* simple example that shows how zfp can be used to compress pgm images */ + +#include 
<limits.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "zfp.h" + +int main(int argc, char* argv[]) +{ + double rate = 0; + uint nx, ny; + uint x, y; + char line[0x100]; + uchar* image; + zfp_field* field; + zfp_stream* zfp; + bitstream* stream; + void* buffer; + size_t bytes; + size_t size; + + switch (argc) { + case 2: + if (sscanf(argv[1], "%lf", &rate) != 1) + goto usage; + break; + default: + usage: + fprintf(stderr, "Usage: pgm <rate|-precision> <input.pgm >output.pgm\n"); + return EXIT_FAILURE; + } + + /* read pgm header */ + if (!fgets(line, sizeof(line), stdin) || strcmp(line, "P5\n") || + !fgets(line, sizeof(line), stdin) || sscanf(line, "%u%u", &nx, &ny) != 2 || + !fgets(line, sizeof(line), stdin) || strcmp(line, "255\n")) { + fprintf(stderr, "error opening image\n"); + return EXIT_FAILURE; + } + + if ((nx & 3u) || (ny & 3u)) { + fprintf(stderr, "image dimensions must be multiples of four\n"); + return EXIT_FAILURE; + } + + /* read image data */ + image = malloc(nx * ny); + if (fread(image, sizeof(*image), nx * ny, stdin) != nx * ny) { + fprintf(stderr, "error reading image\n"); + return EXIT_FAILURE; + } + + /* create input array */ + field = zfp_field_2d(image, zfp_type_int32, nx, ny); + + /* initialize compressed stream */ + zfp = zfp_stream_open(NULL); + if (rate < 0) + zfp_stream_set_precision(zfp, (uint)floor(0.5 - rate)); + else + zfp_stream_set_rate(zfp, rate, zfp_type_int32, 2, 0); + bytes = zfp_stream_maximum_size(zfp, field); + buffer = malloc(bytes); + stream = stream_open(buffer, bytes); + zfp_stream_set_bit_stream(zfp, stream); + zfp_field_free(field); + + /* compress */ + for (y = 0; y < ny; y += 4) + for (x = 0; x < nx; x += 4) { + uchar ublock[16]; + int32 iblock[16]; + uint i, j; + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) + ublock[i + 4 * j] = image[x + i + nx * (y + j)]; + zfp_promote_uint8_to_int32(iblock, ublock, 2); + zfp_encode_block_int32_2(zfp, iblock); + } + + 
zfp_stream_flush(zfp); + size = zfp_stream_compressed_size(zfp); + fprintf(stderr, "%u compressed bytes (%.2f bps)\n", (uint)size, (double)size * CHAR_BIT / (nx * ny)); + + /* decompress */ + zfp_stream_rewind(zfp); + for (y = 0; y < ny; y += 4) + for (x = 0; x < nx; x += 4) { + int32 iblock[16]; + uchar ublock[16]; + uint i, j; + zfp_decode_block_int32_2(zfp, iblock); + zfp_demote_int32_to_uint8(ublock, iblock, 2); + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) + image[x + i + nx * (y + j)] = ublock[i + 4 * j]; + } + zfp_stream_close(zfp); + stream_close(stream); + free(buffer); + + /* output reconstructed image */ + printf("P5\n"); + printf("%u %u\n", nx, ny); + printf("255\n"); + fwrite(image, sizeof(*image), nx * ny, stdout); + free(image); + + return 0; +} diff --git a/zfp/examples/simple.c b/zfp/examples/simple.c new file mode 100644 index 0000000000000000000000000000000000000000..2ccb597756a9031848e37b9bbe49582364a23456 --- /dev/null +++ b/zfp/examples/simple.c @@ -0,0 +1,99 @@ +/* minimal code example showing how to call the zfp (de)compressor */ + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "zfp.h" + +/* compress or decompress array */ +static int +compress(double* array, int nx, int ny, int nz, double tolerance, int decompress) +{ + int status = 0; /* return value: 0 = success */ + zfp_type type; /* array scalar type */ + zfp_field* field; /* array meta data */ + zfp_stream* zfp; /* compressed stream */ + void* buffer; /* storage for compressed stream */ + size_t bufsize; /* byte size of compressed buffer */ + bitstream* stream; /* bit stream to write to or read from */ + size_t zfpsize; /* byte size of compressed stream */ + + /* allocate meta data for the 3D array a[nz][ny][nx] */ + type = zfp_type_double; + field = zfp_field_3d(array, type, nx, ny, nz); + + /* allocate meta data for a compressed stream */ + zfp = zfp_stream_open(NULL); + + /* set compression mode and parameters via one of three 
functions */ +/* zfp_stream_set_rate(zfp, rate, type, 3, 0); */ +/* zfp_stream_set_precision(zfp, precision); */ + zfp_stream_set_accuracy(zfp, tolerance); + + /* allocate buffer for compressed data */ + bufsize = zfp_stream_maximum_size(zfp, field); + buffer = malloc(bufsize); + + /* associate bit stream with allocated buffer */ + stream = stream_open(buffer, bufsize); + zfp_stream_set_bit_stream(zfp, stream); + zfp_stream_rewind(zfp); + + /* compress or decompress entire array */ + if (decompress) { + /* read compressed stream and decompress array */ + zfpsize = fread(buffer, 1, bufsize, stdin); + if (!zfp_decompress(zfp, field)) { + fprintf(stderr, "decompression failed\n"); + status = 1; + } + } + else { + /* compress array and output compressed stream */ + zfpsize = zfp_compress(zfp, field); + if (!zfpsize) { + fprintf(stderr, "compression failed\n"); + status = 1; + } + else + fwrite(buffer, 1, zfpsize, stdout); + } + + /* clean up */ + zfp_field_free(field); + zfp_stream_close(zfp); + stream_close(stream); + free(buffer); + free(array); + + return status; +} + +int main(int argc, char* argv[]) +{ + /* use -d to decompress rather than compress data */ + int decompress = (argc == 2 && !strcmp(argv[1], "-d")); + + /* allocate 100x100x100 array of doubles */ + int nx = 100; + int ny = 100; + int nz = 100; + double* array = malloc(nx * ny * nz * sizeof(double)); + + if (!decompress) { + /* initialize array to be compressed */ + int i, j, k; + for (k = 0; k < nz; k++) + for (j = 0; j < ny; j++) + for (i = 0; i < nx; i++) { + double x = 2.0 * i / nx; + double y = 2.0 * j / ny; + double z = 2.0 * k / nz; + array[i + nx * (j + ny * k)] = exp(-(x * x + y * y + z * z)); + } + } + + /* compress or decompress array */ + return compress(array, nx, ny, nz, 1e-3, decompress); +} diff --git a/zfp/examples/speed.c b/zfp/examples/speed.c new file mode 100644 index 0000000000000000000000000000000000000000..9332605d58eacfbc0709a51bfdbc2cd8a6de9226 --- /dev/null +++ 
b/zfp/examples/speed.c @@ -0,0 +1,136 @@ +/* measure the throughput of encoding and decoding 3D blocks of doubles */ + +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include "zfp.h" + +/* example 3D block of (reinterpreted) doubles */ +static const uint64 block[] = { +UINT64C(0xbf7c3a7bb8495ca9), +UINT64C(0xbf79f9d9058ffdaf), +UINT64C(0xbf77c7abd0b61999), +UINT64C(0xbf75a42c806bd1da), +UINT64C(0xbf738f8f740b8ea8), +UINT64C(0xbf718a050399fef8), +UINT64C(0xbf6f2772ff8c30fe), +UINT64C(0xbf6b59aa63d22f68), +UINT64C(0xbf67aaf8b80cff9e), +UINT64C(0xbf641b9e71983592), +UINT64C(0xbf60abd3f723f2b7), +UINT64C(0xbf5ab7934169cc04), +UINT64C(0xbf54574f6f4897d3), +UINT64C(0xbf4c6e39da7fb99b), +UINT64C(0xbf40ae5826a893d1), +UINT64C(0xbf25bce8e19d48e1), +UINT64C(0x3f253bfed65904d7), +UINT64C(0x3f3f18ab46a04cf3), +UINT64C(0x3f4948e7cb74278b), +UINT64C(0x3f51427b51aeec2e), +UINT64C(0x3f55a0716d8b4b6b), +UINT64C(0x3f59be96aeaac56f), +UINT64C(0x3f5d9d3ba7bfd327), +UINT64C(0x3f609e608469e93e), +UINT64C(0x3f624ecbcfa3832c), +UINT64C(0x3f63e0202ae84b4d), +UINT64C(0x3f6552a61a3f4812), +UINT64C(0x3f66a6ae305af268), +UINT64C(0x3f67dc910e9935bc), +UINT64C(0x3f68f4af65036ff7), +UINT64C(0x3f69ef71f24e7182), +UINT64C(0x3f6acd4983da7d43), +UINT64C(0x3f6b8eaef5b348a0), +UINT64C(0x3f6c3423328ffb7a), +UINT64C(0x3f6cbe2f33d33034), +UINT64C(0x3f6d2d64018af3ac), +UINT64C(0x3f6d825ab270c540), +UINT64C(0x3f6dbdb46be996cc), +UINT64C(0x3f6de01a6205cca9), +UINT64C(0x3f6dea3dd7813daf), +UINT64C(0x3f6ddcd81dc33335), +UINT64C(0x3f6db8aa94de690f), +UINT64C(0x3f6d7e7eab910d8f), +UINT64C(0x3f6d2f25df44c187), +UINT64C(0x3f6ccb79bc0e9844), +UINT64C(0x3f6c545bdcaf1795), +UINT64C(0x3f6bcab5ea9237c4), +UINT64C(0x3f6b2f799dcf639b), +UINT64C(0x3f6a83a0bd297862), +UINT64C(0x3f69c82d1e0ec5de), +UINT64C(0x3f68fe28a4990e53), +UINT64C(0x3f6826a5438d8685), +UINT64C(0x3f6742bcfc5cd5b2), +UINT64C(0x3f665391df231599), +UINT64C(0x3f655a4e0aa7d278), +UINT64C(0x3f645823ac5e0b09), 
+UINT64C(0x3f634e4d00643085), +UINT64C(0x3f623e0c518426a3), +UINT64C(0x3f6128abf933439a), +UINT64C(0x3f600f7e5f92501c), +UINT64C(0x3f5de7bbf6db0eb7), +UINT64C(0x3f5bae5aa4792e11), +UINT64C(0x3f5975adf0453ea2), +UINT64C(0x3f57409b1fdc65c4), +}; + +int main(int argc, char* argv[]) +{ + uint blocks = 0x200000; + double rate = 1; + zfp_field* field; + uint insize; + zfp_stream* zfp; + bitstream* stream; + void* buffer; + size_t bytes; + clock_t c; + double time; + uint i; + + switch (argc) { + case 3: + sscanf(argv[2], "%u", &blocks); + /* FALLTHROUGH */ + case 2: + sscanf(argv[1], "%lf", &rate); + break; + } + + /* declare array to compress */ + field = zfp_field_3d(NULL, zfp_type_double, 4, 4, 4 * blocks); + insize = blocks * sizeof(block); + + /* allocate storage for compressed bit stream */ + zfp = zfp_stream_open(NULL); + zfp_stream_set_rate(zfp, rate, zfp_field_type(field), zfp_field_dimensionality(field), 0); + bytes = zfp_stream_maximum_size(zfp, field); + buffer = malloc(bytes); + stream = stream_open(buffer, bytes); + zfp_stream_set_bit_stream(zfp, stream); + zfp_field_free(field); + + /* compress */ + c = clock(); + for (i = 0; i < blocks; i++) + zfp_encode_block_double_3(zfp, (const double*)block); + zfp_stream_flush(zfp); + time = (double)(clock() - c) / CLOCKS_PER_SEC; + printf("encode in=%u out=%u %.0f MB/s\n", insize, (uint)stream_size(stream), insize / (1024 * 1024 * time)); + + /* decompress */ + zfp_stream_rewind(zfp); + c = clock(); + for (i = 0; i < blocks; i++) { + double a[64]; + zfp_decode_block_double_3(zfp, a); + } + time = (double)(clock() - c) / CLOCKS_PER_SEC; + printf("decode in=%u out=%u %.0f MB/s\n", (uint)stream_size(stream), insize, insize / (1024 * 1024 * time)); + + zfp_stream_close(zfp); + stream_close(stream); + free(buffer); + + return 0; +} diff --git a/zfp/include/bitstream.h b/zfp/include/bitstream.h new file mode 100644 index 0000000000000000000000000000000000000000..ad5475fe6a719b24def0f2bc6dfe0a54bfba5108 --- /dev/null +++ 
b/zfp/include/bitstream.h @@ -0,0 +1,94 @@ +#ifndef ZFP_BITSTREAM_H +#define ZFP_BITSTREAM_H + +#include <stddef.h> +#include "zfp/types.h" +#include "zfp/system.h" + +/* forward declaration of opaque type */ +typedef struct bitstream bitstream; + +extern_ const size_t stream_word_bits; /* bit stream granularity */ + +#ifndef inline_ +#ifdef __cplusplus +extern "C" { +#endif + +/* allocate and initialize bit stream */ +bitstream* stream_open(void* buffer, size_t bytes); + +/* close and deallocate bit stream */ +void stream_close(bitstream* stream); + +/* make a copy of bit stream to shared memory buffer */ +bitstream* stream_clone(const bitstream* stream); + +/* pointer to beginning of stream */ +void* stream_data(const bitstream* stream); + +/* current byte size of stream (if flushed) */ +size_t stream_size(const bitstream* stream); + +/* byte capacity of stream */ +size_t stream_capacity(const bitstream* stream); + +/* number of words per block */ +size_t stream_stride_block(const bitstream* stream); + +/* number of blocks between consecutive blocks */ +ptrdiff_t stream_stride_delta(const bitstream* stream); + +/* read single bit (0 or 1) */ +uint stream_read_bit(bitstream* stream); + +/* write single bit */ +uint stream_write_bit(bitstream* stream, uint bit); + +/* read 0 <= n <= 64 bits */ +uint64 stream_read_bits(bitstream* stream, uint n); + +/* write 0 <= n <= 64 low bits of value and return remaining bits */ +uint64 stream_write_bits(bitstream* stream, uint64 value, uint n); + +/* return bit offset to next bit to be read */ +size_t stream_rtell(const bitstream* stream); + +/* return bit offset to next bit to be written */ +size_t stream_wtell(const bitstream* stream); + +/* rewind stream to beginning */ +void stream_rewind(bitstream* stream); + +/* position stream for reading at given bit offset */ +void stream_rseek(bitstream* stream, size_t offset); + +/* position stream for writing at given bit offset */ +void stream_wseek(bitstream* stream, size_t 
offset); + +/* skip over the next n bits */ +void stream_skip(bitstream* stream, uint n); + +/* append n zero-bits to stream */ +void stream_pad(bitstream* stream, uint n); + +/* align stream on next word boundary */ +size_t stream_align(bitstream* stream); + +/* flush out any remaining buffered bits */ +size_t stream_flush(bitstream* stream); + +/* copy n bits from one bit stream to another */ +void stream_copy(bitstream* dst, bitstream* src, size_t n); + +#ifdef BIT_STREAM_STRIDED +/* set block size in number of words and spacing in number of blocks */ +int stream_set_stride(bitstream* stream, size_t block, ptrdiff_t delta); +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !inline_ */ + +#endif diff --git a/zfp/include/zfp.h b/zfp/include/zfp.h new file mode 100644 index 0000000000000000000000000000000000000000..2faca1c291e3c631b5d100a4abe64999334555a2 --- /dev/null +++ b/zfp/include/zfp.h @@ -0,0 +1,747 @@ +/* +** Copyright (c) 2014-2018, Lawrence Livermore National Security, LLC. +** Produced at the Lawrence Livermore National Laboratory. +** Authors: Peter Lindstrom, Markus Salasoo, Matt Larsen. +** LLNL-CODE-663824. +** All rights reserved. +** +** This file is part of the zfp library. +** For details, see http://computation.llnl.gov/casc/zfp/. +** +** Redistribution and use in source and binary forms, with or without +** modification, are permitted provided that the following conditions are met: +** +** 1. Redistributions of source code must retain the above copyright notice, +** this list of conditions and the disclaimer below. +** +** 2. Redistributions in binary form must reproduce the above copyright notice, +** this list of conditions and the disclaimer (as noted below) in the +** documentation and/or other materials provided with the distribution. +** +** 3. Neither the name of the LLNS/LLNL nor the names of its contributors may +** be used to endorse or promote products derived from this software without +** specific prior written permission. 
+** +** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +** AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +** ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, +** LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +** INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +** (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +** LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +** ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +** THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +** +** +** Additional BSD Notice +** +** 1. This notice is required to be provided under our contract with the U.S. +** Department of Energy (DOE). This work was produced at Lawrence Livermore +** National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. + +** 2. Neither the United States Government nor Lawrence Livermore National +** Security, LLC nor any of their employees, makes any warranty, express or +** implied, or assumes any liability or responsibility for the accuracy, +** completeness, or usefulness of any information, apparatus, product, or +** process disclosed, or represents that its use would not infringe +** privately-owned rights. +** +** 3. Also, reference herein to any specific commercial products, process, or +** services by trade name, trademark, manufacturer or otherwise does not +** necessarily constitute or imply its endorsement, recommendation, or +** favoring by the United States Government or Lawrence Livermore National +** Security, LLC. 
The views and opinions of authors expressed herein do not +** necessarily state or reflect those of the United States Government or +** Lawrence Livermore National Security, LLC, and shall not be used for +** advertising or product endorsement purposes. +*/ + +#ifndef ZFP_H +#define ZFP_H + +#include "zfp/types.h" +#include "zfp/system.h" +#include "bitstream.h" + +/* macros ------------------------------------------------------------------ */ + +/* stringification */ +#define _zfp_str_(x) # x +#define _zfp_str(x) _zfp_str_(x) + +/* library version information */ +#define ZFP_VERSION_MAJOR 0 /* library major version number */ +#define ZFP_VERSION_MINOR 5 /* library minor version number */ +#define ZFP_VERSION_PATCH 4 /* library patch version number */ +#define ZFP_VERSION_RELEASE ZFP_VERSION_PATCH + +/* codec version number (see also zfp_codec_version) */ +#define ZFP_CODEC 5 + +/* library version number (see also zfp_library_version) */ +#define ZFP_VERSION \ + ((ZFP_VERSION_MAJOR << 8) + \ + (ZFP_VERSION_MINOR << 4) + \ + (ZFP_VERSION_PATCH << 0)) + +/* library version string (see also zfp_version_string) */ +#define ZFP_VERSION_STRING \ + _zfp_str(ZFP_VERSION_MAJOR) "." \ + _zfp_str(ZFP_VERSION_MINOR) "." 
\ + _zfp_str(ZFP_VERSION_PATCH) + +/* default compression parameters */ +#define ZFP_MIN_BITS 1 /* minimum number of bits per block */ +#define ZFP_MAX_BITS 16651 /* maximum number of bits per block */ +#define ZFP_MAX_PREC 64 /* maximum precision supported */ +#define ZFP_MIN_EXP -1074 /* minimum floating-point base-2 exponent */ + +/* header masks (enable via bitwise or; reader must use same mask) */ +#define ZFP_HEADER_MAGIC 0x1u /* embed 64-bit magic */ +#define ZFP_HEADER_META 0x2u /* embed 52-bit field metadata */ +#define ZFP_HEADER_MODE 0x4u /* embed 12- or 64-bit compression mode */ +#define ZFP_HEADER_FULL 0x7u /* embed all of the above */ + +/* number of bits per header entry */ +#define ZFP_MAGIC_BITS 32 /* number of magic word bits */ +#define ZFP_META_BITS 52 /* number of field metadata bits */ +#define ZFP_MODE_SHORT_BITS 12 /* number of mode bits in short format */ +#define ZFP_MODE_LONG_BITS 64 /* number of mode bits in long format */ +#define ZFP_HEADER_MAX_BITS 148 /* max number of header bits */ +#define ZFP_MODE_SHORT_MAX ((1u << ZFP_MODE_SHORT_BITS) - 2) + +/* types ------------------------------------------------------------------- */ + +/* execution policy */ +typedef enum { + zfp_exec_serial = 0, /* serial execution (default) */ + zfp_exec_omp = 1, /* OpenMP multi-threaded execution */ + zfp_exec_cuda = 2 /* CUDA parallel execution */ +} zfp_exec_policy; + +/* OpenMP execution parameters */ +typedef struct { + uint threads; /* number of requested threads */ + uint chunk_size; /* number of blocks per chunk (1D only) */ +} zfp_exec_params_omp; + +/* execution parameters */ +typedef union { + zfp_exec_params_omp omp; /* OpenMP parameters */ +} zfp_exec_params; + +typedef struct { + zfp_exec_policy policy; /* execution policy (serial, omp, ...) 
*/ + zfp_exec_params params; /* execution parameters */ +} zfp_execution; + +/* compressed stream; use accessors to get/set members */ +typedef struct { + uint minbits; /* minimum number of bits to store per block */ + uint maxbits; /* maximum number of bits to store per block */ + uint maxprec; /* maximum number of bit planes to store */ + int minexp; /* minimum floating point bit plane number to store */ + bitstream* stream; /* compressed bit stream */ + zfp_execution exec; /* execution policy and parameters */ +} zfp_stream; + +/* compression mode */ +typedef enum { + zfp_mode_null = 0, /* an invalid configuration of the 4 params */ + zfp_mode_expert = 1, /* expert mode (4 params set manually) */ + zfp_mode_fixed_rate = 2, /* fixed rate mode */ + zfp_mode_fixed_precision = 3, /* fixed precision mode */ + zfp_mode_fixed_accuracy = 4 /* fixed accuracy mode */ +} zfp_mode; + +/* scalar type */ +typedef enum { + zfp_type_none = 0, /* unspecified type */ + zfp_type_int32 = 1, /* 32-bit signed integer */ + zfp_type_int64 = 2, /* 64-bit signed integer */ + zfp_type_float = 3, /* single precision floating point */ + zfp_type_double = 4 /* double precision floating point */ +} zfp_type; + +/* uncompressed array; use accessors to get/set members */ +typedef struct { + zfp_type type; /* scalar type (e.g. 
int32, double) */ + uint nx, ny, nz, nw; /* sizes (zero for unused dimensions) */ + int sx, sy, sz, sw; /* strides (zero for contiguous array a[nw][nz][ny][nx]) */ + void* data; /* pointer to array data */ +} zfp_field; + +#ifdef __cplusplus +extern "C" { +#endif + +/* public data ------------------------------------------------------------- */ + +extern_ const uint zfp_codec_version; /* codec version ZFP_CODEC */ +extern_ const uint zfp_library_version; /* library version ZFP_VERSION */ +extern_ const char* const zfp_version_string; /* verbose version string */ + +/* high-level API: utility functions --------------------------------------- */ + +size_t /* byte size of scalar type */ +zfp_type_size( + zfp_type type /* scalar type */ +); + +/* high-level API: compressed stream construction/destruction -------------- */ + +/* open compressed stream and associate with bit stream */ +zfp_stream* /* allocated compressed stream */ +zfp_stream_open( + bitstream* stream /* bit stream to read from and write to (may be NULL) */ +); + +/* close and deallocate compressed stream (does not affect bit stream) */ +void +zfp_stream_close( + zfp_stream* stream /* compressed stream */ +); + +/* high-level API: compressed stream inspectors ---------------------------- */ + +/* bit stream associated with compressed stream */ +bitstream* /* bit stream associated with compressed stream */ +zfp_stream_bit_stream( + const zfp_stream* stream /* compressed stream */ +); + +/* returns enum of compression mode */ +zfp_mode /* enum for compression mode */ +zfp_stream_compression_mode( + const zfp_stream* zfp /* compressed stream */ +); + +/* get all compression parameters in a compact representation */ +uint64 /* 12- or 64-bit encoding of parameters */ +zfp_stream_mode( + const zfp_stream* zfp /* compressed stream */ +); + +/* get all compression parameters (pointers may be NULL) */ +void +zfp_stream_params( + const zfp_stream* stream, /* compressed stream */ + uint* minbits, /* minimum number 
of bits per 4^d block */ + uint* maxbits, /* maximum number of bits per 4^d block */ + uint* maxprec, /* maximum precision (# bit planes coded) */ + int* minexp /* minimum base-2 exponent; error <= 2^minexp */ +); + +/* byte size of sequentially compressed stream (call after compression) */ +size_t /* actual number of bytes of compressed storage */ +zfp_stream_compressed_size( + const zfp_stream* stream /* compressed stream */ +); + +/* conservative estimate of compressed size in bytes */ +size_t /* maximum number of bytes of compressed storage */ +zfp_stream_maximum_size( + const zfp_stream* stream, /* compressed stream */ + const zfp_field* field /* array to compress */ +); + +/* high-level API: initialization of compressed stream parameters ---------- */ + +/* associate bit stream with compressed stream */ +void +zfp_stream_set_bit_stream( + zfp_stream* stream, /* compressed stream */ + bitstream* bs /* bit stream to read from and write to */ +); + +/* set size in compressed bits/scalar (fixed-rate mode) */ +double /* actual rate in compressed bits/scalar */ +zfp_stream_set_rate( + zfp_stream* stream, /* compressed stream */ + double rate, /* desired rate in compressed bits/scalar */ + zfp_type type, /* scalar type to compress */ + uint dims, /* array dimensionality (1, 2, or 3) */ + int wra /* nonzero if write random access is needed */ +); + +/* set precision in uncompressed bits/scalar (fixed-precision mode) */ +uint /* actual precision */ +zfp_stream_set_precision( + zfp_stream* stream, /* compressed stream */ + uint precision /* desired precision in uncompressed bits/scalar */ +); + +/* set accuracy as absolute error tolerance (fixed-accuracy mode) */ +double /* actual error tolerance */ +zfp_stream_set_accuracy( + zfp_stream* stream, /* compressed stream */ + double tolerance /* desired error tolerance */ +); + +/* set all compression parameters from compact representation */ +/* compression params are only set on stream upon success */ +zfp_mode /* non 
(zfp_mode_null) upon success */ +zfp_stream_set_mode( + zfp_stream* stream, /* compressed stream */ + uint64 mode /* 12- or 64-bit encoding of parameters */ +); + +/* set all compression parameters (expert mode) */ +int /* nonzero upon success */ +zfp_stream_set_params( + zfp_stream* stream, /* compressed stream */ + uint minbits, /* minimum number of bits per 4^d block */ + uint maxbits, /* maximum number of bits per 4^d block */ + uint maxprec, /* maximum precision (# bit planes coded) */ + int minexp /* minimum base-2 exponent; error <= 2^minexp */ +); + +/* high-level API: execution policy ---------------------------------------- */ + +/* current execution policy */ +zfp_exec_policy +zfp_stream_execution( + const zfp_stream* stream /* compressed stream */ +); + +/* number of OpenMP threads to use */ +uint /* number of threads (0 for default) */ +zfp_stream_omp_threads( + const zfp_stream* stream /* compressed stream */ +); + +/* number of blocks per OpenMP chunk (1D only) */ +uint /* number of blocks per chunk (0 for default) */ +zfp_stream_omp_chunk_size( + const zfp_stream* stream /* compressed stream */ +); + +/* set execution policy */ +int /* nonzero upon success */ +zfp_stream_set_execution( + zfp_stream* stream, /* compressed stream */ + zfp_exec_policy policy /* execution policy */ +); + +/* set OpenMP execution policy and number of threads */ +int /* nonzero upon success */ +zfp_stream_set_omp_threads( + zfp_stream* stream, /* compressed stream */ + uint threads /* number of OpenMP threads to use (0 for default) */ +); + +/* set OpenMP execution policy and number of blocks per chunk (1D only) */ +int /* nonzero upon success */ +zfp_stream_set_omp_chunk_size( + zfp_stream* stream, /* compressed stream */ + uint chunk_size /* number of blocks per chunk (0 for default) */ +); + +/* high-level API: uncompressed array construction/destruction ------------- */ + +/* allocate field struct */ +zfp_field* /* pointer to default initialized field */ 
+zfp_field_alloc(); + +/* allocate metadata for 1D field f[nx] */ +zfp_field* /* allocated field metadata */ +zfp_field_1d( + void* pointer, /* pointer to uncompressed scalars (may be NULL) */ + zfp_type type, /* scalar type */ + uint nx /* number of scalars */ +); + +/* allocate metadata for 2D field f[ny][nx] */ +zfp_field* /* allocated field metadata */ +zfp_field_2d( + void* pointer, /* pointer to uncompressed scalars (may be NULL) */ + zfp_type type, /* scalar type */ + uint nx, /* number of scalars in x dimension */ + uint ny /* number of scalars in y dimension */ +); + +/* allocate metadata for 3D field f[nz][ny][nx] */ +zfp_field* /* allocated field metadata */ +zfp_field_3d( + void* pointer, /* pointer to uncompressed scalars (may be NULL) */ + zfp_type type, /* scalar type */ + uint nx, /* number of scalars in x dimension */ + uint ny, /* number of scalars in y dimension */ + uint nz /* number of scalars in z dimension */ +); + +/* allocate metadata for 4D field f[nw][nz][ny][nx] */ +zfp_field* /* allocated field metadata */ +zfp_field_4d( + void* pointer, /* pointer to uncompressed scalars (may be NULL) */ + zfp_type type, /* scalar type */ + uint nx, /* number of scalars in x dimension */ + uint ny, /* number of scalars in y dimension */ + uint nz, /* number of scalars in z dimension */ + uint nw /* number of scalars in w dimension */ +); + +/* deallocate field metadata */ +void +zfp_field_free( + zfp_field* field /* field metadata */ +); + +/* high-level API: uncompressed array inspectors --------------------------- */ + +/* pointer to first scalar in field */ +void* /* array pointer */ +zfp_field_pointer( + const zfp_field* field /* field metadata */ +); + +/* field scalar type */ +zfp_type /* scalar type */ +zfp_field_type( + const zfp_field* field /* field metadata */ +); + +/* precision of field scalar type */ +uint /* scalar type precision in number of bits */ +zfp_field_precision( + const zfp_field* field /* field metadata */ +); + +/* field 
dimensionality (1, 2, or 3) */ +uint /* number of dimensions */ +zfp_field_dimensionality( + const zfp_field* field /* field metadata */ +); + +/* field size in number of scalars */ +size_t /* total number of scalars */ +zfp_field_size( + const zfp_field* field, /* field metadata */ + uint* size /* number of scalars per dimension (may be NULL) */ +); + +/* field strides per dimension */ +int /* zero if array is contiguous */ +zfp_field_stride( + const zfp_field* field, /* field metadata */ + int* stride /* stride in scalars per dimension (may be NULL) */ +); + +/* field scalar type and dimensions */ +uint64 /* compact 52-bit encoding of metadata */ +zfp_field_metadata( + const zfp_field* field /* field metadata */ +); + +/* high-level API: uncompressed array specification ------------------------ */ + +/* set pointer to first scalar in field */ +void +zfp_field_set_pointer( + zfp_field* field, /* field metadata */ + void* pointer /* pointer to first scalar */ +); + +/* set field scalar type */ +zfp_type /* actual scalar type */ +zfp_field_set_type( + zfp_field* field, /* field metadata */ + zfp_type type /* desired scalar type */ +); + +/* set 1D field size */ +void +zfp_field_set_size_1d( + zfp_field* field, /* field metadata */ + uint nx /* number of scalars */ +); + +/* set 2D field size */ +void +zfp_field_set_size_2d( + zfp_field* field, /* field metadata */ + uint nx, /* number of scalars in x dimension */ + uint ny /* number of scalars in y dimension */ +); + +/* set 3D field size */ +void +zfp_field_set_size_3d( + zfp_field* field, /* field metadata */ + uint nx, /* number of scalars in x dimension */ + uint ny, /* number of scalars in y dimension */ + uint nz /* number of scalars in z dimension */ +); + +/* set 4D field size */ +void +zfp_field_set_size_4d( + zfp_field* field, /* field metadata */ + uint nx, /* number of scalars in x dimension */ + uint ny, /* number of scalars in y dimension */ + uint nz, /* number of scalars in z dimension */ + uint nw 
/* number of scalars in w dimension */ +); + +/* set 1D field stride in number of scalars */ +void +zfp_field_set_stride_1d( + zfp_field* field, /* field metadata */ + int sx /* stride in number of scalars: &f[1] - &f[0] */ +); + +/* set 2D field strides in number of scalars */ +void +zfp_field_set_stride_2d( + zfp_field* field, /* field metadata */ + int sx, /* stride in x dimension: &f[0][1] - &f[0][0] */ + int sy /* stride in y dimension: &f[1][0] - &f[0][0] */ +); + +/* set 3D field strides in number of scalars */ +void +zfp_field_set_stride_3d( + zfp_field* field, /* field metadata */ + int sx, /* stride in x dimension: &f[0][0][1] - &f[0][0][0] */ + int sy, /* stride in y dimension: &f[0][1][0] - &f[0][0][0] */ + int sz /* stride in z dimension: &f[1][0][0] - &f[0][0][0] */ +); + +/* set 4D field strides in number of scalars */ +void +zfp_field_set_stride_4d( + zfp_field* field, /* field metadata */ + int sx, /* stride in x dimension: &f[0][0][0][1] - &f[0][0][0][0] */ + int sy, /* stride in y dimension: &f[0][0][1][0] - &f[0][0][0][0] */ + int sz, /* stride in z dimension: &f[0][1][0][0] - &f[0][0][0][0] */ + int sw /* stride in w dimension: &f[1][0][0][0] - &f[0][0][0][0] */ +); + +/* set field scalar type and dimensions */ +int /* nonzero upon success */ +zfp_field_set_metadata( + zfp_field* field, /* field metadata */ + uint64 meta /* compact 52-bit encoding of metadata */ +); + +/* high-level API: compression and decompression --------------------------- */ + +/* compress entire field (nonzero return value upon success) */ +size_t /* cumulative number of bytes of compressed storage */ +zfp_compress( + zfp_stream* stream, /* compressed stream */ + const zfp_field* field /* field metadata */ +); + +/* decompress entire field (nonzero return value upon success) */ +size_t /* cumulative number of bytes of compressed storage */ +zfp_decompress( + zfp_stream* stream, /* compressed stream */ + zfp_field* field /* field metadata */ +); + +/* write compression 
parameters and field metadata (optional) */ +size_t /* number of bits written or zero upon failure */ +zfp_write_header( + zfp_stream* stream, /* compressed stream */ + const zfp_field* field, /* field metadata */ + uint mask /* information to write */ +); + +/* read compression parameters and field metadata when previously written */ +size_t /* number of bits read or zero upon failure */ +zfp_read_header( + zfp_stream* stream, /* compressed stream */ + zfp_field* field, /* field metadata */ + uint mask /* information to read */ +); + +/* low-level API: stream manipulation -------------------------------------- */ + +/* flush bit stream--must be called after last encode call or between seeks */ +size_t +zfp_stream_flush( + zfp_stream* stream /* compressed bit stream */ +); + +/* align bit stream on next word boundary (decoding analogy to flush) */ +size_t +zfp_stream_align( + zfp_stream* stream /* compressed bit stream */ +); + +/* rewind bit stream to beginning for compression or decompression */ +void +zfp_stream_rewind( + zfp_stream* stream /* compressed bit stream */ +); + +/* low-level API: encoder -------------------------------------------------- */ + +/* +The functions below all compress either a complete contiguous d-dimensional +block of 4^d scalars or a complete or partial block assembled from a strided +array. In the latter case, p points to the first scalar; (nx, ny, nz) specify +the size of the block, with 1 <= nx, ny, nz <= 4; and (sx, sy, sz) specify the +strides, i.e. the number of scalars to advance to get to the next scalar along +each dimension. The functions return the number of bits of compressed storage +needed for the compressed block. 
+*/ + +/* encode 1D contiguous block of 4 values */ +uint zfp_encode_block_int32_1(zfp_stream* stream, const int32* block); +uint zfp_encode_block_int64_1(zfp_stream* stream, const int64* block); +uint zfp_encode_block_float_1(zfp_stream* stream, const float* block); +uint zfp_encode_block_double_1(zfp_stream* stream, const double* block); + +/* encode 1D complete or partial block from strided array */ +uint zfp_encode_block_strided_int32_1(zfp_stream* stream, const int32* p, int sx); +uint zfp_encode_block_strided_int64_1(zfp_stream* stream, const int64* p, int sx); +uint zfp_encode_block_strided_float_1(zfp_stream* stream, const float* p, int sx); +uint zfp_encode_block_strided_double_1(zfp_stream* stream, const double* p, int sx); +uint zfp_encode_partial_block_strided_int32_1(zfp_stream* stream, const int32* p, uint nx, int sx); +uint zfp_encode_partial_block_strided_int64_1(zfp_stream* stream, const int64* p, uint nx, int sx); +uint zfp_encode_partial_block_strided_float_1(zfp_stream* stream, const float* p, uint nx, int sx); +uint zfp_encode_partial_block_strided_double_1(zfp_stream* stream, const double* p, uint nx, int sx); + +/* encode 2D contiguous block of 4x4 values */ +uint zfp_encode_block_int32_2(zfp_stream* stream, const int32* block); +uint zfp_encode_block_int64_2(zfp_stream* stream, const int64* block); +uint zfp_encode_block_float_2(zfp_stream* stream, const float* block); +uint zfp_encode_block_double_2(zfp_stream* stream, const double* block); + +/* encode 2D complete or partial block from strided array */ +uint zfp_encode_partial_block_strided_int32_2(zfp_stream* stream, const int32* p, uint nx, uint ny, int sx, int sy); +uint zfp_encode_partial_block_strided_int64_2(zfp_stream* stream, const int64* p, uint nx, uint ny, int sx, int sy); +uint zfp_encode_partial_block_strided_float_2(zfp_stream* stream, const float* p, uint nx, uint ny, int sx, int sy); +uint zfp_encode_partial_block_strided_double_2(zfp_stream* stream, const double* p, uint 
nx, uint ny, int sx, int sy); +uint zfp_encode_block_strided_int32_2(zfp_stream* stream, const int32* p, int sx, int sy); +uint zfp_encode_block_strided_int64_2(zfp_stream* stream, const int64* p, int sx, int sy); +uint zfp_encode_block_strided_float_2(zfp_stream* stream, const float* p, int sx, int sy); +uint zfp_encode_block_strided_double_2(zfp_stream* stream, const double* p, int sx, int sy); + +/* encode 3D contiguous block of 4x4x4 values */ +uint zfp_encode_block_int32_3(zfp_stream* stream, const int32* block); +uint zfp_encode_block_int64_3(zfp_stream* stream, const int64* block); +uint zfp_encode_block_float_3(zfp_stream* stream, const float* block); +uint zfp_encode_block_double_3(zfp_stream* stream, const double* block); + +/* encode 3D complete or partial block from strided array */ +uint zfp_encode_block_strided_int32_3(zfp_stream* stream, const int32* p, int sx, int sy, int sz); +uint zfp_encode_block_strided_int64_3(zfp_stream* stream, const int64* p, int sx, int sy, int sz); +uint zfp_encode_block_strided_float_3(zfp_stream* stream, const float* p, int sx, int sy, int sz); +uint zfp_encode_block_strided_double_3(zfp_stream* stream, const double* p, int sx, int sy, int sz); +uint zfp_encode_partial_block_strided_int32_3(zfp_stream* stream, const int32* p, uint nx, uint ny, uint nz, int sx, int sy, int sz); +uint zfp_encode_partial_block_strided_int64_3(zfp_stream* stream, const int64* p, uint nx, uint ny, uint nz, int sx, int sy, int sz); +uint zfp_encode_partial_block_strided_float_3(zfp_stream* stream, const float* p, uint nx, uint ny, uint nz, int sx, int sy, int sz); +uint zfp_encode_partial_block_strided_double_3(zfp_stream* stream, const double* p, uint nx, uint ny, uint nz, int sx, int sy, int sz); + +/* encode 4D contiguous block of 4x4x4x4 values */ +uint zfp_encode_block_int32_4(zfp_stream* stream, const int32* block); +uint zfp_encode_block_int64_4(zfp_stream* stream, const int64* block); +uint zfp_encode_block_float_4(zfp_stream* stream, 
const float* block); +uint zfp_encode_block_double_4(zfp_stream* stream, const double* block); + +/* encode 4D complete or partial block from strided array */ +uint zfp_encode_block_strided_int32_4(zfp_stream* stream, const int32* p, int sx, int sy, int sz, int sw); +uint zfp_encode_block_strided_int64_4(zfp_stream* stream, const int64* p, int sx, int sy, int sz, int sw); +uint zfp_encode_block_strided_float_4(zfp_stream* stream, const float* p, int sx, int sy, int sz, int sw); +uint zfp_encode_block_strided_double_4(zfp_stream* stream, const double* p, int sx, int sy, int sz, int sw); +uint zfp_encode_partial_block_strided_int32_4(zfp_stream* stream, const int32* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw); +uint zfp_encode_partial_block_strided_int64_4(zfp_stream* stream, const int64* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw); +uint zfp_encode_partial_block_strided_float_4(zfp_stream* stream, const float* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw); +uint zfp_encode_partial_block_strided_double_4(zfp_stream* stream, const double* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw); + +/* low-level API: decoder -------------------------------------------------- */ + +/* +Each function below decompresses a single block and returns the number of bits +of compressed storage consumed. See corresponding encoder functions above for +further details. 
+*/ + +/* decode 1D contiguous block of 4 values */ +uint zfp_decode_block_int32_1(zfp_stream* stream, int32* block); +uint zfp_decode_block_int64_1(zfp_stream* stream, int64* block); +uint zfp_decode_block_float_1(zfp_stream* stream, float* block); +uint zfp_decode_block_double_1(zfp_stream* stream, double* block); + +/* decode 1D complete or partial block from strided array */ +uint zfp_decode_block_strided_int32_1(zfp_stream* stream, int32* p, int sx); +uint zfp_decode_block_strided_int64_1(zfp_stream* stream, int64* p, int sx); +uint zfp_decode_block_strided_float_1(zfp_stream* stream, float* p, int sx); +uint zfp_decode_block_strided_double_1(zfp_stream* stream, double* p, int sx); +uint zfp_decode_partial_block_strided_int32_1(zfp_stream* stream, int32* p, uint nx, int sx); +uint zfp_decode_partial_block_strided_int64_1(zfp_stream* stream, int64* p, uint nx, int sx); +uint zfp_decode_partial_block_strided_float_1(zfp_stream* stream, float* p, uint nx, int sx); +uint zfp_decode_partial_block_strided_double_1(zfp_stream* stream, double* p, uint nx, int sx); + +/* decode 2D contiguous block of 4x4 values */ +uint zfp_decode_block_int32_2(zfp_stream* stream, int32* block); +uint zfp_decode_block_int64_2(zfp_stream* stream, int64* block); +uint zfp_decode_block_float_2(zfp_stream* stream, float* block); +uint zfp_decode_block_double_2(zfp_stream* stream, double* block); + +/* decode 2D complete or partial block from strided array */ +uint zfp_decode_block_strided_int32_2(zfp_stream* stream, int32* p, int sx, int sy); +uint zfp_decode_block_strided_int64_2(zfp_stream* stream, int64* p, int sx, int sy); +uint zfp_decode_block_strided_float_2(zfp_stream* stream, float* p, int sx, int sy); +uint zfp_decode_block_strided_double_2(zfp_stream* stream, double* p, int sx, int sy); +uint zfp_decode_partial_block_strided_int32_2(zfp_stream* stream, int32* p, uint nx, uint ny, int sx, int sy); +uint zfp_decode_partial_block_strided_int64_2(zfp_stream* stream, int64* p, uint 
nx, uint ny, int sx, int sy); +uint zfp_decode_partial_block_strided_float_2(zfp_stream* stream, float* p, uint nx, uint ny, int sx, int sy); +uint zfp_decode_partial_block_strided_double_2(zfp_stream* stream, double* p, uint nx, uint ny, int sx, int sy); + +/* decode 3D contiguous block of 4x4x4 values */ +uint zfp_decode_block_int32_3(zfp_stream* stream, int32* block); +uint zfp_decode_block_int64_3(zfp_stream* stream, int64* block); +uint zfp_decode_block_float_3(zfp_stream* stream, float* block); +uint zfp_decode_block_double_3(zfp_stream* stream, double* block); + +/* decode 3D complete or partial block from strided array */ +uint zfp_decode_block_strided_int32_3(zfp_stream* stream, int32* p, int sx, int sy, int sz); +uint zfp_decode_block_strided_int64_3(zfp_stream* stream, int64* p, int sx, int sy, int sz); +uint zfp_decode_block_strided_float_3(zfp_stream* stream, float* p, int sx, int sy, int sz); +uint zfp_decode_block_strided_double_3(zfp_stream* stream, double* p, int sx, int sy, int sz); +uint zfp_decode_partial_block_strided_int32_3(zfp_stream* stream, int32* p, uint nx, uint ny, uint nz, int sx, int sy, int sz); +uint zfp_decode_partial_block_strided_int64_3(zfp_stream* stream, int64* p, uint nx, uint ny, uint nz, int sx, int sy, int sz); +uint zfp_decode_partial_block_strided_float_3(zfp_stream* stream, float* p, uint nx, uint ny, uint nz, int sx, int sy, int sz); +uint zfp_decode_partial_block_strided_double_3(zfp_stream* stream, double* p, uint nx, uint ny, uint nz, int sx, int sy, int sz); + +/* decode 4D contiguous block of 4x4x4x4 values */ +uint zfp_decode_block_int32_4(zfp_stream* stream, int32* block); +uint zfp_decode_block_int64_4(zfp_stream* stream, int64* block); +uint zfp_decode_block_float_4(zfp_stream* stream, float* block); +uint zfp_decode_block_double_4(zfp_stream* stream, double* block); + +/* decode 4D complete or partial block from strided array */ +uint zfp_decode_block_strided_int32_4(zfp_stream* stream, int32* p, int sx, int 
sy, int sz, int sw); +uint zfp_decode_block_strided_int64_4(zfp_stream* stream, int64* p, int sx, int sy, int sz, int sw); +uint zfp_decode_block_strided_float_4(zfp_stream* stream, float* p, int sx, int sy, int sz, int sw); +uint zfp_decode_block_strided_double_4(zfp_stream* stream, double* p, int sx, int sy, int sz, int sw); +uint zfp_decode_partial_block_strided_int32_4(zfp_stream* stream, int32* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw); +uint zfp_decode_partial_block_strided_int64_4(zfp_stream* stream, int64* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw); +uint zfp_decode_partial_block_strided_float_4(zfp_stream* stream, float* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw); +uint zfp_decode_partial_block_strided_double_4(zfp_stream* stream, double* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw); + +/* low-level API: utility functions ---------------------------------------- */ + +/* convert dims-dimensional contiguous block to 32-bit integer type */ +void zfp_promote_int8_to_int32(int32* oblock, const int8* iblock, uint dims); +void zfp_promote_uint8_to_int32(int32* oblock, const uint8* iblock, uint dims); +void zfp_promote_int16_to_int32(int32* oblock, const int16* iblock, uint dims); +void zfp_promote_uint16_to_int32(int32* oblock, const uint16* iblock, uint dims); + +/* convert dims-dimensional contiguous block from 32-bit integer type */ +void zfp_demote_int32_to_int8(int8* oblock, const int32* iblock, uint dims); +void zfp_demote_int32_to_uint8(uint8* oblock, const int32* iblock, uint dims); +void zfp_demote_int32_to_int16(int16* oblock, const int32* iblock, uint dims); +void zfp_demote_int32_to_uint16(uint16* oblock, const int32* iblock, uint dims); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/zfp/include/zfp/macros.h b/zfp/include/zfp/macros.h new file mode 100644 index 
0000000000000000000000000000000000000000..be3655c701356db28da52af3ee63bf2a13615b7e --- /dev/null +++ b/zfp/include/zfp/macros.h @@ -0,0 +1,7 @@ +#ifndef ZFP_MACROS_H +#define ZFP_MACROS_H + +#define MIN(x, y) ((x) < (y) ? (x) : (y)) +#define MAX(x, y) ((x) > (y) ? (x) : (y)) + +#endif diff --git a/zfp/include/zfp/system.h b/zfp/include/zfp/system.h new file mode 100644 index 0000000000000000000000000000000000000000..5394196482551e342c9f673d2f398469a487cd59 --- /dev/null +++ b/zfp/include/zfp/system.h @@ -0,0 +1,47 @@ +#ifndef ZFP_SYSTEM_H +#define ZFP_SYSTEM_H + +#if __STDC_VERSION__ >= 199901L + #define restrict_ restrict +#else + #define restrict_ +#endif + +/* macros for exporting and importing symbols */ +#ifdef _MSC_VER + #define export_ __declspec(dllexport) + /* export (import) symbols when ZFP_SOURCE is (is not) defined */ + #ifdef ZFP_SOURCE + #ifdef __cplusplus + #define extern_ extern "C" __declspec(dllexport) + #else + #define extern_ extern __declspec(dllexport) + #endif + #else + #ifdef __cplusplus + #define extern_ extern "C" __declspec(dllimport) + #else + #define extern_ extern __declspec(dllimport) + #endif + #endif +#else /* !_MSC_VER */ + #define export_ + #ifdef __cplusplus + #define extern_ extern "C" + #else + #define extern_ extern + #endif +#endif + +#ifdef __GNUC__ + /* L1 cache line size for alignment purposes */ + #ifndef ZFP_CACHE_LINE_SIZE + #define ZFP_CACHE_LINE_SIZE 0x100 + #endif + #define align_(n) __attribute__((aligned(n))) + #define cache_align_(x) x align_(ZFP_CACHE_LINE_SIZE) +#else + #define cache_align_(x) x +#endif + +#endif diff --git a/zfp/include/zfp/types.h b/zfp/include/zfp/types.h new file mode 100644 index 0000000000000000000000000000000000000000..b501ca2932d958023d75faf32e994593114b41de --- /dev/null +++ b/zfp/include/zfp/types.h @@ -0,0 +1,74 @@ +#ifndef ZFP_TYPES_H +#define ZFP_TYPES_H + +typedef unsigned char uchar; +typedef unsigned short ushort; +typedef unsigned int uint; + +#if __STDC_VERSION__ >= 199901L + 
/* C99: use standard integer types */ + #include <stdint.h> + #define INT64C(x) INT64_C(x) + #define UINT64C(x) UINT64_C(x) + typedef int8_t int8; + typedef uint8_t uint8; + typedef int16_t int16; + typedef uint16_t uint16; + typedef int32_t int32; + typedef uint32_t uint32; + typedef int64_t int64; + typedef uint64_t uint64; +#else + /* C89: assume common integer types */ + typedef signed char int8; + typedef unsigned char uint8; + typedef signed short int16; + typedef unsigned short uint16; + + /* assume 32-bit integers (LP64, LLP64) */ + typedef signed int int32; + typedef unsigned int uint32; + + /* determine 64-bit data model */ + #if defined(_WIN32) || defined(_WIN64) + /* assume ILP32 or LLP64 (MSVC, MinGW) */ + #define ZFP_LLP64 1 + #else + /* assume LP64 (Linux, macOS, ...) */ + #define ZFP_LP64 1 + #endif + + /* concatenation for literal suffixes */ + #define _zfp_cat_(x, y) x ## y + #define _zfp_cat(x, y) _zfp_cat_(x, y) + + /* signed 64-bit integers */ + #if defined(ZFP_INT64) && defined(ZFP_INT64_SUFFIX) + #define INT64C(x) _zfp_cat(x, ZFP_INT64_SUFFIX) + typedef ZFP_INT64 int64; + #elif ZFP_LP64 + #define INT64C(x) x ## l + typedef signed long int64; + #elif ZFP_LLP64 + #define INT64C(x) x ## ll + typedef signed long long int64; + #else + #error "unknown 64-bit signed integer type" + #endif + + /* unsigned 64-bit integers */ + #if defined(ZFP_UINT64) && defined(ZFP_UINT64_SUFFIX) + #define UINT64C(x) _zfp_cat(x, ZFP_UINT64_SUFFIX) + typedef ZFP_UINT64 uint64; + #elif ZFP_LP64 + #define UINT64C(x) x ## ul + typedef unsigned long uint64; + #elif ZFP_LLP64 + #define UINT64C(x) x ## ull + typedef unsigned long long uint64; + #else + #error "unknown 64-bit unsigned integer type" + #endif +#endif + +#endif diff --git a/zfp/src/CMakeLists.txt b/zfp/src/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..4119245c0482c94a7fd4be15e76bfaf3266ffe26 --- /dev/null +++ b/zfp/src/CMakeLists.txt @@ -0,0 +1,63 @@ +if(ZFP_WITH_CUDA) + 
SET(CMAKE_CXX_FLAGS_PREVIOUS ${CMAKE_CXX_FLAGS}) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" ) + + add_subdirectory(cuda_zfp) + cuda_wrap_srcs(zfp OBJ zfp_cuda_backend_obj cuda_zfp/cuZFP.cu) + SET(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS_PREVIOUS}) + add_definitions(-DZFP_WITH_CUDA) +endif() + + +set(zfp_source + zfp.c + bitstream.c + traitsf.h traitsd.h block1.h block2.h block3.h block4.h + encode1f.c encode1d.c encode1i.c encode1l.c + decode1f.c decode1d.c decode1i.c decode1l.c + encode2f.c encode2d.c encode2i.c encode2l.c + decode2f.c decode2d.c decode2i.c decode2l.c + encode3f.c encode3d.c encode3i.c encode3l.c + decode3f.c decode3d.c decode3i.c decode3l.c + encode4f.c encode4d.c encode4i.c encode4l.c + decode4f.c decode4d.c decode4i.c decode4l.c) + +add_library(zfp ${zfp_source} + ${zfp_cuda_backend_obj}) +add_library(zfp::zfp ALIAS zfp) + +if(ZFP_WITH_OPENMP) + target_compile_options(zfp PRIVATE ${OpenMP_C_FLAGS}) + target_link_libraries(zfp PRIVATE ${OpenMP_C_LIBRARIES}) +endif() + +if(HAVE_LIBM_MATH) + target_link_libraries(zfp PRIVATE m) +endif() + +if(WIN32) + # Define ZFP_SOURCE when compiling libzfp to export symbols to Windows DLL + list(APPEND zfp_defs ZFP_SOURCE) +endif() + +if(ZFP_WITH_CUDA) + target_link_libraries(zfp PRIVATE ${CUDA_CUDART_LIBRARY} stdc++) +endif() + +target_compile_definitions(zfp PRIVATE ${zfp_defs}) + +target_include_directories(zfp + PUBLIC + $<BUILD_INTERFACE:${ZFP_SOURCE_DIR}/include> + $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}> + INTERFACE + $<BUILD_INTERFACE:${ZFP_SOURCE_DIR}/array>) + +set_property(TARGET zfp PROPERTY VERSION ${ZFP_VERSION}) +set_property(TARGET zfp PROPERTY SOVERSION ${ZFP_VERSION_MAJOR}) +set_property(TARGET zfp PROPERTY OUTPUT_NAME ${ZFP_LIBRARY_PREFIX}zfp) + +install(TARGETS zfp EXPORT zfp-targets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/zfp/src/Makefile b/zfp/src/Makefile new file mode 
100644 index 0000000000000000000000000000000000000000..e4347096d196ab851958b521afdb699be475af16 --- /dev/null +++ b/zfp/src/Makefile @@ -0,0 +1,31 @@ +#include ../Config +include ../../Make_include + +CFLAGS += -std=c99 -I../include + +LIBDIR = ../lib +TARGETS = $(LIBDIR)/libzfp.a $(LIBDIR)/libzfp.so +OBJECTS = bitstream.o decode1i.o decode1l.o decode1f.o decode1d.o encode1i.o encode1l.o encode1f.o encode1d.o decode2i.o decode2l.o decode2f.o decode2d.o encode2i.o encode2l.o encode2f.o encode2d.o decode3i.o decode3l.o decode3f.o decode3d.o encode3i.o encode3l.o encode3f.o encode3d.o decode4i.o decode4l.o decode4f.o decode4d.o encode4i.o encode4l.o encode4f.o encode4d.o zfp.o + +static: $(LIBDIR)/libzfp.a + +shared: $(LIBDIR)/libzfp.so + +clean: + rm -f $(OBJECTS) + +realclean: + rm -f $(TARGETS) $(OBJECTS) + +$(LIBDIR)/libzfp.a: $(OBJECTS) + mkdir -p $(LIBDIR) + ar rc $@ $^ + +# rm -f $@ +# +$(LIBDIR)/libzfp.so: $(OBJECTS) + mkdir -p $(LIBDIR) + $(CC) $(CFLAGS) -shared $^ -o $@ + +#.c.o: +# $(CC) $(CFLAGS) -c $< diff --git a/zfp/src/bitstream.c b/zfp/src/bitstream.c new file mode 100644 index 0000000000000000000000000000000000000000..05094c6d31befd5947e7b49164cc7eeb315fa95c --- /dev/null +++ b/zfp/src/bitstream.c @@ -0,0 +1,4 @@ +#include "bitstream.h" +#include "inline/bitstream.c" + +export_ const size_t stream_word_bits = wsize; diff --git a/zfp/src/block1.h b/zfp/src/block1.h new file mode 100644 index 0000000000000000000000000000000000000000..035d9c9523d63ff46b76dd648bf99b5503781016 --- /dev/null +++ b/zfp/src/block1.h @@ -0,0 +1 @@ +#define DIMS 1 diff --git a/zfp/src/block2.h b/zfp/src/block2.h new file mode 100644 index 0000000000000000000000000000000000000000..e87ab62995ad655de67ed64a1e466e7622a3a758 --- /dev/null +++ b/zfp/src/block2.h @@ -0,0 +1 @@ +#define DIMS 2 diff --git a/zfp/src/block3.h b/zfp/src/block3.h new file mode 100644 index 0000000000000000000000000000000000000000..a683568673a3f619d23ece9a3abd92b965a6aea6 --- /dev/null +++ 
b/zfp/src/block3.h @@ -0,0 +1 @@ +#define DIMS 3 diff --git a/zfp/src/block4.h b/zfp/src/block4.h new file mode 100644 index 0000000000000000000000000000000000000000..6737fb25eaf4ed27653c2fc737d6d73ee058ffe7 --- /dev/null +++ b/zfp/src/block4.h @@ -0,0 +1 @@ +#define DIMS 4 diff --git a/zfp/src/cuda_zfp/CMakeLists.txt b/zfp/src/cuda_zfp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2fe402fa060f8fba83064356715fb1725fb9416c --- /dev/null +++ b/zfp/src/cuda_zfp/CMakeLists.txt @@ -0,0 +1,25 @@ +############################################################################### +# +# file: src/cuZFP/CMakeLists.txt +# +############################################################################### + +set(cuZFP_sources + cuZFP.cu # main entry point + decode.cuh + decode1.cuh + decode2.cuh + decode3.cuh + encode.cuh + encode1.cuh + encode2.cuh + encode3.cuh + pointers.cuh + type_info.cuh) + +set(cuZFP_headers + constant_setup.cuh + shared.h + cuZFP.h + ErrorCheck.h) + diff --git a/zfp/src/cuda_zfp/ErrorCheck.h b/zfp/src/cuda_zfp/ErrorCheck.h new file mode 100644 index 0000000000000000000000000000000000000000..90a7ac47f7609034429ef1462ff6c065a4b32027 --- /dev/null +++ b/zfp/src/cuda_zfp/ErrorCheck.h @@ -0,0 +1,35 @@ +#ifndef ERRORCHECK_H +#define ERRORCHECK_H +#include <iostream> +#include <string> +#include <sstream> + +using std::stringstream; +class ErrorCheck +{ +public: + ErrorCheck() + { + + } + + void chk(std::string msg) + { + error = cudaGetLastError(); + if (error != cudaSuccess) + { + std::cout << msg << " : " << error; + std::cout << " " << cudaGetErrorString(error) << std::endl; + } + } + + void chk() + { + chk(str.str()); + str.str(""); + } + cudaError error; + stringstream str; +}; + +#endif // ERRORCHECK_H diff --git a/zfp/src/cuda_zfp/constant_setup.cuh b/zfp/src/cuda_zfp/constant_setup.cuh new file mode 100644 index 0000000000000000000000000000000000000000..1c1221adda9bf2edf2dfc1b1a849682b6d20f29a --- /dev/null +++ 
b/zfp/src/cuda_zfp/constant_setup.cuh @@ -0,0 +1,39 @@ +#ifndef cuZFP_CONSTANT_SETUP +#define cuZFP_CONSTANT_SETUP + +#include "constants.h" +#include "shared.h" +#include "ErrorCheck.h" +#include "type_info.cuh" + +namespace cuZFP { + +class ConstantSetup +{ +public: + static void setup_3d() + { + ErrorCheck ec; + cudaMemcpyToSymbol(c_perm, perm_3d, sizeof(unsigned char) * 64, 0); + ec.chk("setupConst: c_perm"); + } + + static void setup_2d() + { + ErrorCheck ec; + cudaMemcpyToSymbol(c_perm_2, perm_2, sizeof(unsigned char) * 16, 0); + ec.chk("setupConst: c_perm_2"); + } + + static void setup_1d() + { + ErrorCheck ec; + cudaMemcpyToSymbol(c_perm_1, perm_1, sizeof(unsigned char) * 4, 0); + ec.chk("setupConst: c_perm_1"); + } +}; + + +} //namespace + +#endif diff --git a/zfp/src/cuda_zfp/constants.h b/zfp/src/cuda_zfp/constants.h new file mode 100644 index 0000000000000000000000000000000000000000..423ac91cc60018ce7b4e810f7984c2208e33171e --- /dev/null +++ b/zfp/src/cuda_zfp/constants.h @@ -0,0 +1,136 @@ +#ifndef cuZFP_CONSTANTS_H +#define cuZFP_CONSTANTS_H + +namespace cuZFP { + +#define index_3d(x, y, z) ((x) + 4 * ((y) + 4 * (z))) + +static const unsigned char +perm_3d[64] = { + index_3d(0, 0, 0), // 0 : 0 + + index_3d(1, 0, 0), // 1 : 1 + index_3d(0, 1, 0), // 2 : 1 + index_3d(0, 0, 1), // 3 : 1 + + index_3d(0, 1, 1), // 4 : 2 + index_3d(1, 0, 1), // 5 : 2 + index_3d(1, 1, 0), // 6 : 2 + + index_3d(2, 0, 0), // 7 : 2 + index_3d(0, 2, 0), // 8 : 2 + index_3d(0, 0, 2), // 9 : 2 + + index_3d(1, 1, 1), // 10 : 3 + + index_3d(2, 1, 0), // 11 : 3 + index_3d(2, 0, 1), // 12 : 3 + index_3d(0, 2, 1), // 13 : 3 + index_3d(1, 2, 0), // 14 : 3 + index_3d(1, 0, 2), // 15 : 3 + index_3d(0, 1, 2), // 16 : 3 + + index_3d(3, 0, 0), // 17 : 3 + index_3d(0, 3, 0), // 18 : 3 + index_3d(0, 0, 3), // 19 : 3 + + index_3d(2, 1, 1), // 20 : 4 + index_3d(1, 2, 1), // 21 : 4 + index_3d(1, 1, 2), // 22 : 4 + + index_3d(0, 2, 2), // 23 : 4 + index_3d(2, 0, 2), // 24 : 4 + index_3d(2, 2, 0), 
// 25 : 4 + + index_3d(3, 1, 0), // 26 : 4 + index_3d(3, 0, 1), // 27 : 4 + index_3d(0, 3, 1), // 28 : 4 + index_3d(1, 3, 0), // 29 : 4 + index_3d(1, 0, 3), // 30 : 4 + index_3d(0, 1, 3), // 31 : 4 + + index_3d(1, 2, 2), // 32 : 5 + index_3d(2, 1, 2), // 33 : 5 + index_3d(2, 2, 1), // 34 : 5 + + index_3d(3, 1, 1), // 35 : 5 + index_3d(1, 3, 1), // 36 : 5 + index_3d(1, 1, 3), // 37 : 5 + + index_3d(3, 2, 0), // 38 : 5 + index_3d(3, 0, 2), // 39 : 5 + index_3d(0, 3, 2), // 40 : 5 + index_3d(2, 3, 0), // 41 : 5 + index_3d(2, 0, 3), // 42 : 5 + index_3d(0, 2, 3), // 43 : 5 + + index_3d(2, 2, 2), // 44 : 6 + + index_3d(3, 2, 1), // 45 : 6 + index_3d(3, 1, 2), // 46 : 6 + index_3d(1, 3, 2), // 47 : 6 + index_3d(2, 3, 1), // 48 : 6 + index_3d(2, 1, 3), // 49 : 6 + index_3d(1, 2, 3), // 50 : 6 + + index_3d(0, 3, 3), // 51 : 6 + index_3d(3, 0, 3), // 52 : 6 + index_3d(3, 3, 0), // 53 : 6 + + index_3d(3, 2, 2), // 54 : 7 + index_3d(2, 3, 2), // 55 : 7 + index_3d(2, 2, 3), // 56 : 7 + + index_3d(1, 3, 3), // 57 : 7 + index_3d(3, 1, 3), // 58 : 7 + index_3d(3, 3, 1), // 59 : 7 + + index_3d(2, 3, 3), // 60 : 8 + index_3d(3, 2, 3), // 61 : 8 + index_3d(3, 3, 2), // 62 : 8 + + index_3d(3, 3, 3), // 63 : 9 +}; + +#undef index_3d + +static const unsigned char perm_1[4] = +{ + 0, 1, 2, 3 +}; + +#define index(i, j) ((i) + 4 * (j)) + +/* order coefficients (i, j) by i + j, then i^2 + j^2 */ +static const unsigned char perm_2[16] = { + index(0, 0), /* 0 : 0 */ + + index(1, 0), /* 1 : 1 */ + index(0, 1), /* 2 : 1 */ + + index(1, 1), /* 3 : 2 */ + + index(2, 0), /* 4 : 2 */ + index(0, 2), /* 5 : 2 */ + + index(2, 1), /* 6 : 3 */ + index(1, 2), /* 7 : 3 */ + + index(3, 0), /* 8 : 3 */ + index(0, 3), /* 9 : 3 */ + + index(2, 2), /* 10 : 4 */ + + index(3, 1), /* 11 : 4 */ + index(1, 3), /* 12 : 4 */ + + index(3, 2), /* 13 : 5 */ + index(2, 3), /* 14 : 5 */ + + index(3, 3), /* 15 : 6 */ +}; + +#undef index + +} // namespace cuZFP +#endif diff --git a/zfp/src/cuda_zfp/cuZFP.cu 
b/zfp/src/cuda_zfp/cuZFP.cu new file mode 100644 index 0000000000000000000000000000000000000000..46815a6ac0a25f2828bba07f1dfb700d39777b94 --- /dev/null +++ b/zfp/src/cuda_zfp/cuZFP.cu @@ -0,0 +1,447 @@ +#include <assert.h> + +#include "cuZFP.h" + +#include "encode1.cuh" +#include "encode2.cuh" +#include "encode3.cuh" + +#include "decode1.cuh" +#include "decode2.cuh" +#include "decode3.cuh" + +#include "ErrorCheck.h" + +#include "constant_setup.cuh" +#include "pointers.cuh" +#include "type_info.cuh" +#include <iostream> +#include <assert.h> + +// we need to know about bitstream, but we don't +// want duplicate symbols. +#ifndef inline_ + #define inline_ inline +#endif + +#include "../inline/bitstream.c" +namespace internal +{ + +bool is_contigous3d(const uint dims[3], const int3 &stride, long long int &offset) +{ + typedef long long int int64; + int64 idims[3]; + idims[0] = dims[0]; + idims[1] = dims[1]; + idims[2] = dims[2]; + + int64 imin = std::min(stride.x,0) * (idims[0] - 1) + + std::min(stride.y,0) * (idims[1] - 1) + + std::min(stride.z,0) * (idims[2] - 1); + + int64 imax = std::max(stride.x,0) * (idims[0] - 1) + + std::max(stride.y,0) * (idims[1] - 1) + + std::max(stride.z,0) * (idims[2] - 1); + offset = imin; + int64 ns = idims[0] * idims[1] * idims[2]; + + return (imax - imin + 1 == ns); +} + +bool is_contigous2d(const uint dims[3], const int3 &stride, long long int &offset) +{ + typedef long long int int64; + int64 idims[2]; + idims[0] = dims[0]; + idims[1] = dims[1]; + + int64 imin = std::min(stride.x,0) * (idims[0] - 1) + + std::min(stride.y,0) * (idims[1] - 1); + + int64 imax = std::max(stride.x,0) * (idims[0] - 1) + + std::max(stride.y,0) * (idims[1] - 1); + + offset = imin; + return (imax - imin + 1) == (idims[0] * idims[1]); +} + +bool is_contigous1d(uint dim, const int &stride, long long int &offset) +{ + offset = 0; + if(stride < 0) offset = stride * (int(dim) - 1); + return std::abs(stride) == 1; +} + +bool is_contigous(const uint dims[3], const 
int3 &stride, long long int &offset) +{ + int d = 0; + + if(dims[0] != 0) d++; + if(dims[1] != 0) d++; + if(dims[2] != 0) d++; + + if(d == 3) + { + return is_contigous3d(dims, stride, offset); + } + else if(d == 2) + { + return is_contigous2d(dims, stride, offset); + } + else + { + return is_contigous1d(dims[0], stride.x, offset); + } + +} +// +// encode expects device pointers +// +template<typename T> +size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_stream) +{ + + int d = 0; + size_t len = 1; + for(int i = 0; i < 3; ++i) + { + if(dims[i] != 0) + { + d++; + len *= dims[i]; + } + } + + ErrorCheck errors; + size_t stream_size = 0; + if(d == 1) + { + int dim = dims[0]; + int sx = stride.x; + cuZFP::ConstantSetup::setup_1d(); + stream_size = cuZFP::encode1<T>(dim, sx, d_data, d_stream, bits_per_block); + } + else if(d == 2) + { + uint2 ndims = make_uint2(dims[0], dims[1]); + int2 s; + s.x = stride.x; + s.y = stride.y; + cuZFP::ConstantSetup::setup_2d(); + stream_size = cuZFP::encode2<T>(ndims, s, d_data, d_stream, bits_per_block); + } + else if(d == 3) + { + int3 s; + s.x = stride.x; + s.y = stride.y; + s.z = stride.z; + uint3 ndims = make_uint3(dims[0], dims[1], dims[2]); + cuZFP::ConstantSetup::setup_3d(); + stream_size = cuZFP::encode<T>(ndims, s, d_data, d_stream, bits_per_block); + } + + errors.chk("Encode"); + + return stream_size; +} + +template<typename T> +size_t decode(uint ndims[3], int3 stride, int bits_per_block, Word *stream, T *out) +{ + + int d = 0; + size_t out_size = 1; + size_t stream_bytes = 0; + for(int i = 0; i < 3; ++i) + { + if(ndims[i] != 0) + { + d++; + out_size *= ndims[i]; + } + } + + if(d == 3) + { + uint3 dims = make_uint3(ndims[0], ndims[1], ndims[2]); + + int3 s; + s.x = stride.x; + s.y = stride.y; + s.z = stride.z; + + cuZFP::ConstantSetup::setup_3d(); + stream_bytes = cuZFP::decode3<T>(dims, s, stream, out, bits_per_block); + } + else if(d == 1) + { + uint dim = ndims[0]; + int sx = stride.x; + + 
cuZFP::ConstantSetup::setup_1d(); + stream_bytes = cuZFP::decode1<T>(dim, sx, stream, out, bits_per_block); + + } + else if(d == 2) + { + uint2 dims; + dims.x = ndims[0]; + dims.y = ndims[1]; + + int2 s; + s.x = stride.x; + s.y = stride.y; + + cuZFP::ConstantSetup::setup_2d(); + stream_bytes = cuZFP::decode2<T>(dims, s, stream, out, bits_per_block); + } + else std::cerr<<" d == "<<d<<" not implemented\n"; + + return stream_bytes; +} + +Word *setup_device_stream(zfp_stream *stream,const zfp_field *field) +{ + bool stream_device = cuZFP::is_gpu_ptr(stream->stream->begin); + assert(sizeof(word) == sizeof(Word)); // "CUDA version currently only supports 64bit words"); + + if(stream_device) + { + return (Word*) stream->stream->begin; + } + + Word *d_stream = NULL; + // TODO: we we have a real stream we can just ask it how big it is + size_t max_size = zfp_stream_maximum_size(stream, field); + cudaMalloc(&d_stream, max_size); + cudaMemcpy(d_stream, stream->stream->begin, max_size, cudaMemcpyHostToDevice); + return d_stream; +} + +void * offset_void(zfp_type type, void *ptr, long long int offset) +{ + void * offset_ptr = NULL; + if(type == zfp_type_float) + { + float* data = (float*) ptr; + offset_ptr = (void*)(&data[offset]); + } + else if(type == zfp_type_double) + { + double* data = (double*) ptr; + offset_ptr = (void*)(&data[offset]); + } + else if(type == zfp_type_int32) + { + int * data = (int*) ptr; + offset_ptr = (void*)(&data[offset]); + } + else if(type == zfp_type_int64) + { + long long int * data = (long long int*) ptr; + offset_ptr = (void*)(&data[offset]); + } + return offset_ptr; +} + +void *setup_device_field(const zfp_field *field, const int3 &stride, long long int &offset) +{ + bool field_device = cuZFP::is_gpu_ptr(field->data); + + if(field_device) + { + offset = 0; + return field->data; + } + + uint dims[3]; + dims[0] = field->nx; + dims[1] = field->ny; + dims[2] = field->nz; + + size_t type_size = zfp_type_size(field->type); + + size_t field_size = 1; 
+ for(int i = 0; i < 3; ++i) + { + if(dims[i] != 0) + { + field_size *= dims[i]; + } + } + + bool contig = internal::is_contigous(dims, stride, offset); + + void * host_ptr = offset_void(field->type, field->data, offset);; + + void *d_data = NULL; + if(contig) + { + size_t field_bytes = type_size * field_size; + cudaMalloc(&d_data, field_bytes); + + cudaMemcpy(d_data, host_ptr, field_bytes, cudaMemcpyHostToDevice); + } + return offset_void(field->type, d_data, -offset); +} + +void cleanup_device_ptr(void *orig_ptr, void *d_ptr, size_t bytes, long long int offset, zfp_type type) +{ + bool device = cuZFP::is_gpu_ptr(orig_ptr); + if(device) + { + return; + } + // from whence it came + void *d_offset_ptr = offset_void(type, d_ptr, offset); + void *h_offset_ptr = offset_void(type, orig_ptr, offset); + + if(bytes > 0) + { + cudaMemcpy(h_offset_ptr, d_offset_ptr, bytes, cudaMemcpyDeviceToHost); + } + + cudaFree(d_offset_ptr); +} + +} // namespace internal + +size_t +cuda_compress(zfp_stream *stream, const zfp_field *field) +{ + uint dims[3]; + dims[0] = field->nx; + dims[1] = field->ny; + dims[2] = field->nz; + + int3 stride; + stride.x = field->sx ? field->sx : 1; + stride.y = field->sy ? field->sy : field->nx; + stride.z = field->sz ? 
field->sz : field->nx * field->ny; + + size_t stream_bytes = 0; + long long int offset = 0; + void *d_data = internal::setup_device_field(field, stride, offset); + + if(d_data == NULL) + { + // null means the array is non-contiguous host mem which is not supported + return 0; + } + + Word *d_stream = internal::setup_device_stream(stream, field); + + if(field->type == zfp_type_float) + { + float* data = (float*) d_data; + stream_bytes = internal::encode<float>(dims, stride, (int)stream->maxbits, data, d_stream); + } + else if(field->type == zfp_type_double) + { + double* data = (double*) d_data; + stream_bytes = internal::encode<double>(dims, stride, (int)stream->maxbits, data, d_stream); + } + else if(field->type == zfp_type_int32) + { + int * data = (int*) d_data; + stream_bytes = internal::encode<int>(dims, stride, (int)stream->maxbits, data, d_stream); + } + else if(field->type == zfp_type_int64) + { + long long int * data = (long long int*) d_data; + stream_bytes = internal::encode<long long int>(dims, stride, (int)stream->maxbits, data, d_stream); + } + + internal::cleanup_device_ptr(stream->stream->begin, d_stream, stream_bytes, 0, field->type); + internal::cleanup_device_ptr(field->data, d_data, 0, offset, field->type); + + // zfp wants to flush the stream. + // set bits to wsize because we already did that. + size_t compressed_size = stream_bytes / sizeof(Word); + stream->stream->bits = wsize; + // set stream pointer to end of stream + stream->stream->ptr = stream->stream->begin + compressed_size; + + return stream_bytes; +} + +void +cuda_decompress(zfp_stream *stream, zfp_field *field) +{ + uint dims[3]; + dims[0] = field->nx; + dims[1] = field->ny; + dims[2] = field->nz; + + int3 stride; + stride.x = field->sx ? field->sx : 1; + stride.y = field->sy ? field->sy : field->nx; + stride.z = field->sz ? 
field->sz : field->nx * field->ny; + + size_t decoded_bytes = 0; + long long int offset = 0; + void *d_data = internal::setup_device_field(field, stride, offset); + + if(d_data == NULL) + { + // null means the array is non-contiguous host mem which is not supported + return; + } + + Word *d_stream = internal::setup_device_stream(stream, field); + + if(field->type == zfp_type_float) + { + float *data = (float*) d_data; + decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data); + d_data = (void*) data; + } + else if(field->type == zfp_type_double) + { + double *data = (double*) d_data; + decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data); + d_data = (void*) data; + } + else if(field->type == zfp_type_int32) + { + int *data = (int*) d_data; + decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data); + d_data = (void*) data; + } + else if(field->type == zfp_type_int64) + { + long long int *data = (long long int*) d_data; + decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data); + d_data = (void*) data; + } + else + { + std::cerr<<"Cannot decompress: type unknown\n"; + } + + + size_t type_size = zfp_type_size(field->type); + + size_t field_size = 1; + for(int i = 0; i < 3; ++i) + { + if(dims[i] != 0) + { + field_size *= dims[i]; + } + } + + size_t bytes = type_size * field_size; + internal::cleanup_device_ptr(stream->stream, d_stream,0, 0, field->type); + internal::cleanup_device_ptr(field->data, d_data, bytes, offset, field->type); + + // this is how zfp determins if this was a success + size_t words_read = decoded_bytes / sizeof(Word); + stream->stream->bits = wsize; + // set stream pointer to end of stream + stream->stream->ptr = stream->stream->begin + words_read; + +} + diff --git a/zfp/src/cuda_zfp/cuZFP.h b/zfp/src/cuda_zfp/cuZFP.h new file mode 100644 index 0000000000000000000000000000000000000000..c88fe1e4aa6383c048bc7306a020590af2ea4749 --- 
/dev/null +++ b/zfp/src/cuda_zfp/cuZFP.h @@ -0,0 +1,15 @@ +#ifndef cuZFP_h +#define cuZFP_h + +#include "zfp.h" + +#ifdef __cplusplus +extern "C" { +#endif + size_t cuda_compress(zfp_stream *stream, const zfp_field *field); + void cuda_decompress(zfp_stream *stream, zfp_field *field); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/zfp/src/cuda_zfp/decode.cuh b/zfp/src/cuda_zfp/decode.cuh new file mode 100644 index 0000000000000000000000000000000000000000..656cf0b36196a3aca1e2654d65976b9576e4eef6 --- /dev/null +++ b/zfp/src/cuda_zfp/decode.cuh @@ -0,0 +1,264 @@ +#ifndef CU_ZFP_DECODE_CUH +#define CU_ZFP_DECODE_CUH + +#include "shared.h" + +namespace cuZFP +{ + +/* map two's complement signed integer to negabinary unsigned integer */ +inline __device__ +long long int uint2int(unsigned long long int x) +{ + return (x ^0xaaaaaaaaaaaaaaaaull) - 0xaaaaaaaaaaaaaaaaull; +} + +inline __device__ +int uint2int(unsigned int x) +{ + return (x ^0xaaaaaaaau) - 0xaaaaaaaau; +} + +template<int block_size> +class BlockReader +{ +private: + const int m_maxbits; + int m_current_bit; + Word *m_words; + Word m_buffer; + bool m_valid_block; + int m_block_idx; + + __device__ BlockReader() + : m_maxbits(0) + { + } + +public: + __device__ BlockReader(Word *b, const int &maxbits, const int &block_idx, const int &num_blocks) + : m_maxbits(maxbits), m_valid_block(true) + { + if(block_idx >= num_blocks) m_valid_block = false; + int word_index = (block_idx * maxbits) / (sizeof(Word) * 8); + m_words = b + word_index; + m_buffer = *m_words; + m_current_bit = (block_idx * maxbits) % (sizeof(Word) * 8); + + m_buffer >>= m_current_bit; + m_block_idx = block_idx; + + } + inline __device__ + void print() + { + print_bits(m_buffer); + } + + inline __device__ + uint read_bit() + { + uint bit = m_buffer & 1; + ++m_current_bit; + m_buffer >>= 1; + // handle moving into next word + if(m_current_bit >= sizeof(Word) * 8) + { + m_current_bit = 0; + ++m_words; + m_buffer = *m_words; + } + return bit; + } 
+ + + // note this assumes that n_bits is <= 64 + inline __device__ + uint64 read_bits(const uint &n_bits) + { + uint64 bits; + // rem bits will always be positive + int rem_bits = sizeof(Word) * 8 - m_current_bit; + + int first_read = min(rem_bits, n_bits); + // first mask + Word mask = ((Word)1<<((first_read)))-1; + bits = m_buffer & mask; + m_buffer >>= n_bits; + m_current_bit += first_read; + int next_read = 0; + if(n_bits >= rem_bits) + { + ++m_words; + m_buffer = *m_words; + m_current_bit = 0; + next_read = n_bits - first_read; + } + + // this is basically a no-op when first read constained + // all the bits. TODO: if we have aligned reads, this could + // be a conditional without divergence + mask = ((Word)1<<((next_read)))-1; + bits += (m_buffer & mask) << first_read; + m_buffer >>= next_read; + m_current_bit += next_read; + return bits; + } + +}; // block reader + +template<typename Scalar, int Size, typename UInt> +inline __device__ +void decode_ints(BlockReader<Size> &reader, uint &max_bits, UInt *data) +{ + const int intprec = get_precision<Scalar>(); + memset(data, 0, sizeof(UInt) * Size); + uint64 x; + // maxprec = 64; + const uint kmin = 0; //= intprec > maxprec ? 
intprec - maxprec : 0; + int bits = max_bits; + for (uint k = intprec, n = 0; bits && k-- > kmin;) + { + // read bit plane + uint m = MIN(n, bits); + bits -= m; + x = reader.read_bits(m); + for (; n < Size && bits && (bits--, reader.read_bit()); x += (Word) 1 << n++) + for (; n < (Size - 1) && bits && (bits--, !reader.read_bit()); n++); + + // deposit bit plane + #pragma unroll + for (int i = 0; x; i++, x >>= 1) + { + data[i] += (UInt)(x & 1u) << k; + } + } +} + + +template<int BlockSize> +struct inv_transform; + +template<> +struct inv_transform<64> +{ + template<typename Int> + __device__ void inv_xform(Int *p) + { + uint x, y, z; + /* transform along z */ + for (y = 0; y < 4; y++) + for (x = 0; x < 4; x++) + inv_lift<Int,16>(p + 1 * x + 4 * y); + /* transform along y */ + for (x = 0; x < 4; x++) + for (z = 0; z < 4; z++) + inv_lift<Int,4>(p + 16 * z + 1 * x); + /* transform along x */ + for (z = 0; z < 4; z++) + for (y = 0; y < 4; y++) + inv_lift<Int,1>(p + 4 * y + 16 * z); + } + +}; + +template<> +struct inv_transform<16> +{ + template<typename Int> + __device__ void inv_xform(Int *p) + { + + for(int x = 0; x < 4; ++x) + { + inv_lift<Int,4>(p + 1 * x); + } + for(int y = 0; y < 4; ++y) + { + inv_lift<Int,1>(p + 4 * y); + } + } + +}; + +template<> +struct inv_transform<4> +{ + template<typename Int> + __device__ void inv_xform(Int *p) + { + inv_lift<Int,1>(p); + } + +}; + +template<typename Scalar, int BlockSize> +__device__ void zfp_decode(BlockReader<BlockSize> &reader, Scalar *fblock, uint maxbits) +{ + typedef typename zfp_traits<Scalar>::UInt UInt; + typedef typename zfp_traits<Scalar>::Int Int; + + uint s_cont = 1; + // + // there is no skip path for integers so just continue + // + if(!is_int<Scalar>()) + { + s_cont = reader.read_bit(); + } + + if(s_cont) + { + uint ebits = get_ebits<Scalar>() + 1; + + uint emax; + if(!is_int<Scalar>()) + { + // read in the shared exponent + emax = reader.read_bits(ebits - 1) - get_ebias<Scalar>(); + } + else + { + // no 
exponent bits + ebits = 0; + } + + maxbits -= ebits; + + UInt ublock[BlockSize]; + + decode_ints<Scalar, BlockSize, UInt>(reader, maxbits, ublock); + + Int iblock[BlockSize]; + unsigned char *perm = get_perm<BlockSize>(); +#if (CUDART_VERSION < 8000) + #pragma unroll +#else + #pragma unroll BlockSize +#endif + for(int i = 0; i < BlockSize; ++i) + { + iblock[perm[i]] = uint2int(ublock[i]); + } + + inv_transform<BlockSize> trans; + trans.inv_xform(iblock); + + Scalar inv_w = dequantize<Int, Scalar>(1, emax); + +#if (CUDART_VERSION < 8000) + #pragma unroll +#else + #pragma unroll BlockSize +#endif + for(int i = 0; i < BlockSize; ++i) + { + fblock[i] = inv_w * (Scalar)iblock[i]; + } + + } +} + + +} // namespace cuZFP +#endif diff --git a/zfp/src/cuda_zfp/decode1.cuh b/zfp/src/cuda_zfp/decode1.cuh new file mode 100644 index 0000000000000000000000000000000000000000..996d9ed1ef17af1ff5dbf6c2e9fa7ee5a96bb484 --- /dev/null +++ b/zfp/src/cuda_zfp/decode1.cuh @@ -0,0 +1,155 @@ +#ifndef CUZFP_DECODE1_CUH +#define CUZFP_DECODE1_CUH + +#include "shared.h" +#include "decode.cuh" +#include "type_info.cuh" + +namespace cuZFP { + + +template<typename Scalar> +__device__ __host__ inline +void scatter_partial1(const Scalar* q, Scalar* p, int nx, int sx) +{ + uint x; + for (x = 0; x < nx; x++, p += sx) + *p = *q++; +} + +template<typename Scalar> +__device__ __host__ inline +void scatter1(const Scalar* q, Scalar* p, int sx) +{ + uint x; + for (x = 0; x < 4; x++, p += sx) + *p = *q++; +} + +template<class Scalar> +__global__ +void +cudaDecode1(Word *blocks, + Scalar *out, + const uint dim, + const int stride, + const uint padded_dim, + const uint total_blocks, + uint maxbits) +{ + typedef unsigned long long int ull; + typedef long long int ll; + typedef typename zfp_traits<Scalar>::UInt UInt; + typedef typename zfp_traits<Scalar>::Int Int; + + const int intprec = get_precision<Scalar>(); + + const ull blockId = blockIdx.x + + blockIdx.y * gridDim.x + + gridDim.x * gridDim.y * 
blockIdx.z; + + // each thread gets a block so the block index is + // the global thread index + const ull block_idx = blockId * blockDim.x + threadIdx.x; + + if(block_idx >= total_blocks) return; + + BlockReader<4> reader(blocks, maxbits, block_idx, total_blocks); + Scalar result[4] = {0,0,0,0}; + + zfp_decode(reader, result, maxbits); + + uint block; + block = block_idx * 4ull; + const ll offset = (ll)block * stride; + + bool partial = false; + if(block + 4 > dim) partial = true; + if(partial) + { + const uint nx = 4u - (padded_dim - dim); + scatter_partial1(result, out + offset, nx, stride); + } + else + { + scatter1(result, out + offset, stride); + } +} + +template<class Scalar> +size_t decode1launch(uint dim, + int stride, + Word *stream, + Scalar *d_data, + uint maxbits) +{ + const int cuda_block_size = 128; + + uint zfp_pad(dim); + if(zfp_pad % 4 != 0) zfp_pad += 4 - dim % 4; + + uint zfp_blocks = (zfp_pad) / 4; + + if(dim % 4 != 0) zfp_blocks = (dim + (4 - dim % 4)) / 4; + + int block_pad = 0; + if(zfp_blocks % cuda_block_size != 0) + { + block_pad = cuda_block_size - zfp_blocks % cuda_block_size; + } + + size_t total_blocks = block_pad + zfp_blocks; + size_t stream_bytes = calc_device_mem1d(zfp_pad, maxbits); + + dim3 block_size = dim3(cuda_block_size, 1, 1); + dim3 grid_size = calculate_grid_size(total_blocks, cuda_block_size); + +#ifdef CUDA_ZFP_RATE_PRINT + // setup some timing code + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); +#endif + + cudaDecode1<Scalar> << < grid_size, block_size >> > + (stream, + d_data, + dim, + stride, + zfp_pad, + zfp_blocks, // total blocks to decode + maxbits); + +#ifdef CUDA_ZFP_RATE_PRINT + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaStreamSynchronize(0); + + float miliseconds = 0; + cudaEventElapsedTime(&miliseconds, start, stop); + float seconds = miliseconds / 1000.f; + float rate = (float(dim) * sizeof(Scalar) ) / seconds; + rate /= 1024.f; + 
rate /= 1024.f; + rate /= 1024.f; + printf("Decode elapsed time: %.5f (s)\n", seconds); + printf("# decode1 rate: %.2f (GB / sec) %d\n", rate, maxbits); +#endif + return stream_bytes; +} + +template<class Scalar> +size_t decode1(int dim, + int stride, + Word *stream, + Scalar *d_data, + uint maxbits) +{ + return decode1launch<Scalar>(dim, stride, stream, d_data, maxbits); +} + +} // namespace cuZFP + +#endif diff --git a/zfp/src/cuda_zfp/decode2.cuh b/zfp/src/cuda_zfp/decode2.cuh new file mode 100644 index 0000000000000000000000000000000000000000..41e112b5a90a6ecb46c4cac4603fd6bd8e3ab172 --- /dev/null +++ b/zfp/src/cuda_zfp/decode2.cuh @@ -0,0 +1,172 @@ +#ifndef CUZFP_DECODE2_CUH +#define CUZFP_DECODE2_CUH + +#include "shared.h" +#include "decode.cuh" +#include "type_info.cuh" + +namespace cuZFP { + +template<typename Scalar> +__device__ __host__ inline +void scatter_partial2(const Scalar* q, Scalar* p, int nx, int ny, int sx, int sy) +{ + uint x, y; + for (y = 0; y < ny; y++, p += sy - nx * sx, q += 4 - nx) + for (x = 0; x < nx; x++, p += sx, q++) + *p = *q; +} + +template<typename Scalar> +__device__ __host__ inline +void scatter2(const Scalar* q, Scalar* p, int sx, int sy) +{ + uint x, y; + for (y = 0; y < 4; y++, p += sy - 4 * sx) + for (x = 0; x < 4; x++, p += sx) + *p = *q++; +} + + +template<class Scalar, int BlockSize> +__global__ +void +cudaDecode2(Word *blocks, + Scalar *out, + const uint2 dims, + const int2 stride, + const uint2 padded_dims, + uint maxbits) +{ + typedef unsigned long long int ull; + typedef long long int ll; + const ull blockId = blockIdx.x + + blockIdx.y * gridDim.x + + gridDim.x * gridDim.y * blockIdx.z; + + // each thread gets a block so the block index is + // the global thread index + const ull block_idx = blockId * blockDim.x + threadIdx.x; + + const int total_blocks = (padded_dims.x * padded_dims.y) / 16; + + if(block_idx >= total_blocks) + { + return; + } + + BlockReader<BlockSize> reader(blocks, maxbits, block_idx, 
total_blocks); + + Scalar result[BlockSize]; + memset(result, 0, sizeof(Scalar) * BlockSize); + + zfp_decode(reader, result, maxbits); + + // logical block dims + uint2 block_dims; + block_dims.x = padded_dims.x >> 2; + block_dims.y = padded_dims.y >> 2; + // logical pos in 3d array + uint2 block; + block.x = (block_idx % block_dims.x) * 4; + block.y = ((block_idx/ block_dims.x) % block_dims.y) * 4; + + const ll offset = (ll)block.x * stride.x + (ll)block.y * stride.y; + + bool partial = false; + if(block.x + 4 > dims.x) partial = true; + if(block.y + 4 > dims.y) partial = true; + if(partial) + { + const uint nx = block.x + 4 > dims.x ? dims.x - block.x : 4; + const uint ny = block.y + 4 > dims.y ? dims.y - block.y : 4; + scatter_partial2(result, out + offset, nx, ny, stride.x, stride.y); + } + else + { + scatter2(result, out + offset, stride.x, stride.y); + } +} + +template<class Scalar> +size_t decode2launch(uint2 dims, + int2 stride, + Word *stream, + Scalar *d_data, + uint maxbits) +{ + const int cuda_block_size = 128; + dim3 block_size; + block_size = dim3(cuda_block_size, 1, 1); + + uint2 zfp_pad(dims); + // ensure that we have block sizes + // that are a multiple of 4 + if(zfp_pad.x % 4 != 0) zfp_pad.x += 4 - dims.x % 4; + if(zfp_pad.y % 4 != 0) zfp_pad.y += 4 - dims.y % 4; + + const int zfp_blocks = (zfp_pad.x * zfp_pad.y) / 16; + + + // + // we need to ensure that we launch a multiple of the + // cuda block size + // + int block_pad = 0; + if(zfp_blocks % cuda_block_size != 0) + { + block_pad = cuda_block_size - zfp_blocks % cuda_block_size; + } + + + size_t stream_bytes = calc_device_mem2d(zfp_pad, maxbits); + size_t total_blocks = block_pad + zfp_blocks; + dim3 grid_size = calculate_grid_size(total_blocks, cuda_block_size); + +#ifdef CUDA_ZFP_RATE_PRINT + // setup some timing code + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); +#endif + + cudaDecode2<Scalar, 16> << < grid_size, block_size >> > + 
(stream, + d_data, + dims, + stride, + zfp_pad, + maxbits); + +#ifdef CUDA_ZFP_RATE_PRINT + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaStreamSynchronize(0); + + float miliseconds = 0; + cudaEventElapsedTime(&miliseconds, start, stop); + float seconds = miliseconds / 1000.f; + float rate = (float(dims.x * dims.y) * sizeof(Scalar) ) / seconds; + rate /= 1024.f; + rate /= 1024.f; + rate /= 1024.f; + printf("Decode elapsed time: %.5f (s)\n", seconds); + printf("# decode2 rate: %.2f (GB / sec) %d\n", rate, maxbits); +#endif + return stream_bytes; +} + +template<class Scalar> +size_t decode2(uint2 dims, + int2 stride, + Word *stream, + Scalar *d_data, + uint maxbits) +{ + return decode2launch<Scalar>(dims, stride, stream, d_data, maxbits); +} + +} // namespace cuZFP + +#endif diff --git a/zfp/src/cuda_zfp/decode3.cuh b/zfp/src/cuda_zfp/decode3.cuh new file mode 100644 index 0000000000000000000000000000000000000000..2a3ef00804de5261588622fa89d567da2f456f4c --- /dev/null +++ b/zfp/src/cuda_zfp/decode3.cuh @@ -0,0 +1,183 @@ +#ifndef CUZFP_DECODE3_CUH +#define CUZFP_DECODE3_CUH + +#include "shared.h" +#include "decode.cuh" +#include "type_info.cuh" + +namespace cuZFP { + +template<typename Scalar> +__device__ __host__ inline +void scatter_partial3(const Scalar* q, Scalar* p, int nx, int ny, int nz, int sx, int sy, int sz) +{ + uint x, y, z; + for (z = 0; z < nz; z++, p += sz - ny * sy, q += 4 * (4 - ny)) + for (y = 0; y < ny; y++, p += sy - nx * sx, q += 4 - nx) + for (x = 0; x < nx; x++, p += sx, q++) + *p = *q; +} + +template<typename Scalar> +__device__ __host__ inline +void scatter3(const Scalar* q, Scalar* p, int sx, int sy, int sz) +{ + uint x, y, z; + for (z = 0; z < 4; z++, p += sz - 4 * sy) + for (y = 0; y < 4; y++, p += sy - 4 * sx) + for (x = 0; x < 4; x++, p += sx) + *p = *q++; +} + + +template<class Scalar, int BlockSize> +__global__ +void +cudaDecode3(Word *blocks, + Scalar *out, + const uint3 dims, + const int3 stride, + const uint3 
padded_dims, + uint maxbits) +{ + + typedef unsigned long long int ull; + typedef long long int ll; + + const ull blockId = blockIdx.x + + blockIdx.y * gridDim.x + + gridDim.x * gridDim.y * blockIdx.z; + // each thread gets a block so the block index is + // the global thread index + const ull block_idx = blockId * blockDim.x + threadIdx.x; + + const int total_blocks = (padded_dims.x * padded_dims.y * padded_dims.z) / 64; + + if(block_idx >= total_blocks) + { + return; + } + + BlockReader<BlockSize> reader(blocks, maxbits, block_idx, total_blocks); + + Scalar result[BlockSize]; + memset(result, 0, sizeof(Scalar) * BlockSize); + + zfp_decode<Scalar,BlockSize>(reader, result, maxbits); + + // logical block dims + uint3 block_dims; + block_dims.x = padded_dims.x >> 2; + block_dims.y = padded_dims.y >> 2; + block_dims.z = padded_dims.z >> 2; + // logical pos in 3d array + uint3 block; + block.x = (block_idx % block_dims.x) * 4; + block.y = ((block_idx/ block_dims.x) % block_dims.y) * 4; + block.z = (block_idx/ (block_dims.x * block_dims.y)) * 4; + + // default strides + const ll offset = (ll)block.x * stride.x + (ll)block.y * stride.y + (ll)block.z * stride.z; + + bool partial = false; + if(block.x + 4 > dims.x) partial = true; + if(block.y + 4 > dims.y) partial = true; + if(block.z + 4 > dims.z) partial = true; + if(partial) + { + const uint nx = block.x + 4u > dims.x ? dims.x - block.x : 4; + const uint ny = block.y + 4u > dims.y ? dims.y - block.y : 4; + const uint nz = block.z + 4u > dims.z ? 
dims.z - block.z : 4; + + scatter_partial3(result, out + offset, nx, ny, nz, stride.x, stride.y, stride.z); + } + else + { + scatter3(result, out + offset, stride.x, stride.y, stride.z); + } +} +template<class Scalar> +size_t decode3launch(uint3 dims, + int3 stride, + Word *stream, + Scalar *d_data, + uint maxbits) +{ + const int cuda_block_size = 128; + dim3 block_size; + block_size = dim3(cuda_block_size, 1, 1); + + uint3 zfp_pad(dims); + // ensure that we have block sizes + // that are a multiple of 4 + if(zfp_pad.x % 4 != 0) zfp_pad.x += 4 - dims.x % 4; + if(zfp_pad.y % 4 != 0) zfp_pad.y += 4 - dims.y % 4; + if(zfp_pad.z % 4 != 0) zfp_pad.z += 4 - dims.z % 4; + + const int zfp_blocks = (zfp_pad.x * zfp_pad.y * zfp_pad.z) / 64; + + + // + // we need to ensure that we launch a multiple of the + // cuda block size + // + int block_pad = 0; + if(zfp_blocks % cuda_block_size != 0) + { + block_pad = cuda_block_size - zfp_blocks % cuda_block_size; + } + + size_t total_blocks = block_pad + zfp_blocks; + size_t stream_bytes = calc_device_mem3d(zfp_pad, maxbits); + + dim3 grid_size = calculate_grid_size(total_blocks, cuda_block_size); + +#ifdef CUDA_ZFP_RATE_PRINT + // setup some timing code + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); +#endif + + cudaDecode3<Scalar, 64> << < grid_size, block_size >> > + (stream, + d_data, + dims, + stride, + zfp_pad, + maxbits); + +#ifdef CUDA_ZFP_RATE_PRINT + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaStreamSynchronize(0); + + float miliseconds = 0; + cudaEventElapsedTime(&miliseconds, start, stop); + float seconds = miliseconds / 1000.f; + float rate = (float(dims.x * dims.y * dims.z) * sizeof(Scalar) ) / seconds; + rate /= 1024.f; + rate /= 1024.f; + rate /= 1024.f; + printf("Decode elapsed time: %.5f (s)\n", seconds); + printf("# decode3 rate: %.2f (GB / sec) %d\n", rate, maxbits); +#endif + + return stream_bytes; +} + +template<class Scalar> +size_t 
decode3(uint3 dims, + int3 stride, + Word *stream, + Scalar *d_data, + uint maxbits) +{ + return decode3launch<Scalar>(dims, stride, stream, d_data, maxbits); +} + +} // namespace cuZFP + +#endif diff --git a/zfp/src/cuda_zfp/encode.cuh b/zfp/src/cuda_zfp/encode.cuh new file mode 100644 index 0000000000000000000000000000000000000000..c65bd356a2f93f4d5627aa1523c9acabecc0b6e1 --- /dev/null +++ b/zfp/src/cuda_zfp/encode.cuh @@ -0,0 +1,419 @@ +#ifndef CU_ZFP_ENCODE_CUH +#define CU_ZFP_ENCODE_CUH + +#include "shared.h" + +namespace cuZFP +{ + +// maximum number of bit planes to encode +__device__ +static int +precision(int maxexp, int maxprec, int minexp) +{ + return MIN(maxprec, MAX(0, maxexp - minexp + 8)); +} + +template<typename Scalar> +inline __device__ +void pad_block(Scalar *p, uint n, uint s) +{ + switch (n) + { + case 0: + p[0 * s] = 0; + /* FALLTHROUGH */ + case 1: + p[1 * s] = p[0 * s]; + /* FALLTHROUGH */ + case 2: + p[2 * s] = p[1 * s]; + /* FALLTHROUGH */ + case 3: + p[3 * s] = p[0 * s]; + /* FALLTHROUGH */ + default: + break; + } +} + +template<class Scalar> +__device__ +static int +exponent(Scalar x) +{ + if (x > 0) { + int e; + frexp(x, &e); + // clamp exponent in case x is denormalized + return max(e, 1 - get_ebias<Scalar>()); + } + return -get_ebias<Scalar>(); +} + +template<class Scalar, int BlockSize> +__device__ +static int +max_exponent(const Scalar* p) +{ + Scalar max_val = 0; + for(int i = 0; i < BlockSize; ++i) + { + Scalar f = fabs(p[i]); + max_val = max(max_val,f); + } + return exponent<Scalar>(max_val); +} + +// lifting transform of 4-vector +template <class Int, uint s> +__device__ +static void +fwd_lift(Int* p) +{ + Int x = *p; p += s; + Int y = *p; p += s; + Int z = *p; p += s; + Int w = *p; p += s; + + // default, non-orthogonal transform (preferred due to speed and quality) + // ( 4 4 4 4) (x) + // 1/16 * ( 5 1 -1 -5) (y) + // (-4 4 4 -4) (z) + // (-2 6 -6 2) (w) + x += w; x >>= 1; w -= x; + z += y; z >>= 1; y -= z; + x += z; x >>= 1; 
z -= x; + w += y; w >>= 1; y -= w; + w += y >> 1; y -= w >> 1; + + p -= s; *p = w; + p -= s; *p = z; + p -= s; *p = y; + p -= s; *p = x; +} + +template<typename Scalar> +Scalar +inline __device__ +quantize_factor(const int &exponent, Scalar); + +template<> +float +inline __device__ +quantize_factor<float>(const int &exponent, float) +{ + return LDEXP(1.0, get_precision<float>() - 2 - exponent); +} + +template<> +double +inline __device__ +quantize_factor<double>(const int &exponent, double) +{ + return LDEXP(1.0, get_precision<double>() - 2 - exponent); +} + +template<typename Scalar, typename Int, int BlockSize> +void __device__ fwd_cast(Int *iblock, const Scalar *fblock, int emax) +{ + Scalar s = quantize_factor(emax, Scalar()); + for(int i = 0; i < BlockSize; ++i) + { + iblock[i] = (Int) (s * fblock[i]); + } +} + +template<int BlockSize> +struct transform; + +template<> +struct transform<64> +{ + template<typename Int> + __device__ void fwd_xform(Int *p) + { + + uint x, y, z; + /* transform along x */ + for (z = 0; z < 4; z++) + for (y = 0; y < 4; y++) + fwd_lift<Int,1>(p + 4 * y + 16 * z); + /* transform along y */ + for (x = 0; x < 4; x++) + for (z = 0; z < 4; z++) + fwd_lift<Int,4>(p + 16 * z + 1 * x); + /* transform along z */ + for (y = 0; y < 4; y++) + for (x = 0; x < 4; x++) + fwd_lift<Int,16>(p + 1 * x + 4 * y); + + } + +}; + +template<> +struct transform<16> +{ + template<typename Int> + __device__ void fwd_xform(Int *p) + { + + uint x, y; + /* transform along x */ + for (y = 0; y < 4; y++) + fwd_lift<Int,1>(p + 4 * y); + /* transform along y */ + for (x = 0; x < 4; x++) + fwd_lift<Int,4>(p + 1 * x); + } + +}; + +template<> +struct transform<4> +{ + template<typename Int> + __device__ void fwd_xform(Int *p) + { + fwd_lift<Int,1>(p); + } + +}; + +template<typename Int, typename UInt, int BlockSize> +__device__ void fwd_order(UInt *ublock, const Int *iblock) +{ + unsigned char *perm = get_perm<BlockSize>(); + for(int i = 0; i < BlockSize; ++i) + { + 
ublock[i] = int2uint(iblock[perm[i]]); + } +} + +template<int block_size> +struct BlockWriter +{ + + uint m_word_index; + uint m_start_bit; + uint m_current_bit; + const int m_maxbits; + Word *m_stream; + + __device__ BlockWriter(Word *stream, const int &maxbits, const uint &block_idx) + : m_current_bit(0), + m_maxbits(maxbits), + m_stream(stream) + { + m_word_index = (block_idx * maxbits) / (sizeof(Word) * 8); + m_start_bit = uint((block_idx * maxbits) % (sizeof(Word) * 8)); + } + + template<typename T> + __device__ + void print_bits(T bits) + { + const int bit_size = sizeof(T) * 8; + for(int i = bit_size - 1; i >=0; --i) + { + T one = 1; + T mask = one << i; + int val = (bits & mask) >> i; + printf("%d", val); + } + printf("\n"); + } + __device__ + void print(int index) + { + print_bits(m_stream[index]); + } + + + __device__ + long long unsigned int + write_bits(const long long unsigned int &bits, const uint &n_bits) + { + const uint wbits = sizeof(Word) * 8; + uint seg_start = (m_start_bit + m_current_bit) % wbits; + uint write_index = m_word_index + uint((m_start_bit + m_current_bit) / wbits); + uint seg_end = seg_start + n_bits - 1; + uint shift = seg_start; + // we may be asked to write less bits than exist in 'bits' + // so we have to make sure that anything after n is zero. 
+ // If this does not happen, then we may write into a zfp + // block not at the specified index + // uint zero_shift = sizeof(Word) * 8 - n_bits; + Word left = (bits >> n_bits) << n_bits; + + Word b = bits - left; + Word add = b << shift; + atomicAdd(&m_stream[write_index], add); + // n_bits straddles the word boundary + bool straddle = seg_start < sizeof(Word) * 8 && seg_end >= sizeof(Word) * 8; + if(straddle) + { + Word rem = b >> (sizeof(Word) * 8 - shift); + atomicAdd(&m_stream[write_index + 1], rem); + } + m_current_bit += n_bits; + return bits >> (Word)n_bits; + } + + __device__ + uint write_bit(const unsigned int &bit) + { + const uint wbits = sizeof(Word) * 8; + uint seg_start = (m_start_bit + m_current_bit) % wbits; + uint write_index = m_word_index + uint((m_start_bit + m_current_bit) / wbits); + uint shift = seg_start; + // we may be asked to write less bits than exist in 'bits' + // so we have to make sure that anything after n is zero. + // If this does not happen, then we may write into a zfp + // block not at the specified index + // uint zero_shift = sizeof(Word) * 8 - n_bits; + + Word add = (Word)bit << shift; + atomicAdd(&m_stream[write_index], add); + m_current_bit += 1; + + return bit; + } + +}; + +template<typename Int, int BlockSize> +void inline __device__ encode_block(BlockWriter<BlockSize> &stream, + int maxbits, + int maxprec, + Int *iblock) +{ + transform<BlockSize> tform; + tform.fwd_xform(iblock); + + typedef typename zfp_traits<Int>::UInt UInt; + UInt ublock[BlockSize]; + fwd_order<Int, UInt, BlockSize>(ublock, iblock); + + uint intprec = CHAR_BIT * (uint)sizeof(UInt); + uint kmin = intprec > maxprec ? 
intprec - maxprec : 0; + uint bits = maxbits; + uint i, k, m, n; + uint64 x; + + for (k = intprec, n = 0; bits && k-- > kmin;) { + /* step 1: extract bit plane #k to x */ + x = 0; + for (i = 0; i < BlockSize; i++) + { + x += (uint64)((ublock[i] >> k) & 1u) << i; + } + /* step 2: encode first n bits of bit plane */ + m = min(n, bits); + //uint temp = bits; + bits -= m; + x = stream.write_bits(x, m); + + /* step 3: unary run-length encode remainder of bit plane */ + for (; n < BlockSize && bits && (bits--, stream.write_bit(!!x)); x >>= 1, n++) + { + for (; n < BlockSize - 1 && bits && (bits--, !stream.write_bit(x & 1u)); x >>= 1, n++) + { + } + } + } + +} + +template<typename Scalar, int BlockSize> +void inline __device__ zfp_encode_block(Scalar *fblock, + const int maxbits, + const uint block_idx, + Word *stream) +{ + BlockWriter<BlockSize> block_writer(stream, maxbits, block_idx); + int emax = max_exponent<Scalar, BlockSize>(fblock); + int maxprec = precision(emax, get_precision<Scalar>(), get_min_exp<Scalar>()); + uint e = maxprec ? 
emax + get_ebias<Scalar>() : 0; + if(e) + { + const uint ebits = get_ebits<Scalar>()+1; + block_writer.write_bits(2 * e + 1, ebits); + typedef typename zfp_traits<Scalar>::Int Int; + Int iblock[BlockSize]; + fwd_cast<Scalar, Int, BlockSize>(iblock, fblock, emax); + + + encode_block<Int, BlockSize>(block_writer, maxbits - ebits, maxprec, iblock); + } +} + +template<> +void inline __device__ zfp_encode_block<int, 64>(int *fblock, + const int maxbits, + const uint block_idx, + Word *stream) +{ + BlockWriter<64> block_writer(stream, maxbits, block_idx); + const int intprec = get_precision<int>(); + encode_block<int, 64>(block_writer, maxbits, intprec, fblock); +} + +template<> +void inline __device__ zfp_encode_block<long long int, 64>(long long int *fblock, + const int maxbits, + const uint block_idx, + Word *stream) +{ + BlockWriter<64> block_writer(stream, maxbits, block_idx); + const int intprec = get_precision<long long int>(); + encode_block<long long int, 64>(block_writer, maxbits, intprec, fblock); +} + +template<> +void inline __device__ zfp_encode_block<int, 16>(int *fblock, + const int maxbits, + const uint block_idx, + Word *stream) +{ + BlockWriter<16> block_writer(stream, maxbits, block_idx); + const int intprec = get_precision<int>(); + encode_block<int, 16>(block_writer, maxbits, intprec, fblock); +} + +template<> +void inline __device__ zfp_encode_block<long long int, 16>(long long int *fblock, + const int maxbits, + const uint block_idx, + Word *stream) +{ + BlockWriter<16> block_writer(stream, maxbits, block_idx); + const int intprec = get_precision<long long int>(); + encode_block<long long int, 16>(block_writer, maxbits, intprec, fblock); +} + +template<> +void inline __device__ zfp_encode_block<int, 4>(int *fblock, + const int maxbits, + const uint block_idx, + Word *stream) +{ + BlockWriter<4> block_writer(stream, maxbits, block_idx); + const int intprec = get_precision<int>(); + encode_block<int, 4>(block_writer, maxbits, intprec, fblock); +} + 
+template<> +void inline __device__ zfp_encode_block<long long int, 4>(long long int *fblock, + const int maxbits, + const uint block_idx, + Word *stream) +{ + BlockWriter<4> block_writer(stream, maxbits, block_idx); + const int intprec = get_precision<long long int>(); + encode_block<long long int, 4>(block_writer, maxbits, intprec, fblock); +} + +} // namespace cuZFP +#endif diff --git a/zfp/src/cuda_zfp/encode1.cuh b/zfp/src/cuda_zfp/encode1.cuh new file mode 100644 index 0000000000000000000000000000000000000000..9353f8c02073fec44a7030d5ed39b66f3a62ff32 --- /dev/null +++ b/zfp/src/cuda_zfp/encode1.cuh @@ -0,0 +1,174 @@ +#ifndef CUZFP_ENCODE1_CUH +#define CUZFP_ENCODE1_CUH + +#include "cuZFP.h" +#include "shared.h" +#include "encode.cuh" +#include "type_info.cuh" + +#include <iostream> +#define ZFP_1D_BLOCK_SIZE 4 + +namespace cuZFP +{ + +template<typename Scalar> +__device__ __host__ inline +void gather_partial1(Scalar* q, const Scalar* p, int nx, int sx) +{ + uint x; + for (x = 0; x < nx; x++, p += sx) + q[x] = *p; + pad_block(q, nx, 1); +} + +template<typename Scalar> +__device__ __host__ inline +void gather1(Scalar* q, const Scalar* p, int sx) +{ + uint x; + for (x = 0; x < 4; x++, p += sx) + *q++ = *p; +} + +template<class Scalar> +__global__ +void +cudaEncode1(const uint maxbits, + const Scalar* scalars, + Word *stream, + const uint dim, + const int sx, + const uint padded_dim, + const uint tot_blocks) +{ + + typedef unsigned long long int ull; + typedef long long int ll; + const ull blockId = blockIdx.x + + blockIdx.y * gridDim.x + + gridDim.x * gridDim.y * blockIdx.z; + + // each thread gets a block so the block index is + // the global thread index + const uint block_idx = blockId * blockDim.x + threadIdx.x; + + if(block_idx >= tot_blocks) + { + // we can't launch the exact number of blocks + // so just exit if this isn't real + return; + } + + uint block_dim; + block_dim = padded_dim >> 2; + + // logical pos in 3d array + uint block; + block = 
(block_idx % block_dim) * 4; + + const ll offset = (ll)block * sx; + + Scalar fblock[ZFP_1D_BLOCK_SIZE]; + + bool partial = false; + if(block + 4 > dim) partial = true; + + if(partial) + { + uint nx = 4 - (padded_dim - dim); + gather_partial1(fblock, scalars + offset, nx, sx); + } + else + { + gather1(fblock, scalars + offset, sx); + } + + zfp_encode_block<Scalar, ZFP_1D_BLOCK_SIZE>(fblock, maxbits, block_idx, stream); + +} +// +// Launch the encode kernel +// +template<class Scalar> +size_t encode1launch(uint dim, + int sx, + const Scalar *d_data, + Word *stream, + const int maxbits) +{ + const int cuda_block_size = 128; + dim3 block_size = dim3(cuda_block_size, 1, 1); + + uint zfp_pad(dim); + if(zfp_pad % 4 != 0) zfp_pad += 4 - dim % 4; + + const uint zfp_blocks = (zfp_pad) / 4; + // + // we need to ensure that we launch a multiple of the + // cuda block size + // + int block_pad = 0; + if(zfp_blocks % cuda_block_size != 0) + { + block_pad = cuda_block_size - zfp_blocks % cuda_block_size; + } + + size_t total_blocks = block_pad + zfp_blocks; + + dim3 grid_size = calculate_grid_size(total_blocks, cuda_block_size); + + // + size_t stream_bytes = calc_device_mem1d(zfp_pad, maxbits); + // ensure we have zeros + cudaMemset(stream, 0, stream_bytes); + +#ifdef CUDA_ZFP_RATE_PRINT + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); +#endif + + cudaEncode1<Scalar> << <grid_size, block_size>> > + (maxbits, + d_data, + stream, + dim, + sx, + zfp_pad, + zfp_blocks); + +#ifdef CUDA_ZFP_RATE_PRINT + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaStreamSynchronize(0); + + float miliseconds = 0.f; + cudaEventElapsedTime(&miliseconds, start, stop); + float seconds = miliseconds / 1000.f; + float gb = (float(dim) * float(sizeof(Scalar))) / (1024.f * 1024.f * 1024.f); + float rate = gb / seconds; + printf("Encode elapsed time: %.5f (s)\n", seconds); + printf("# encode1 rate: %.2f (GB / sec) %d\n", rate, maxbits); 
+#endif + return stream_bytes; +} + +// +// Encode a host vector and output a encoded device vector +// +template<class Scalar> +size_t encode1(int dim, + int sx, + Scalar *d_data, + Word *stream, + const int maxbits) +{ + return encode1launch<Scalar>(dim, sx, d_data, stream, maxbits); +} + +} + +#endif diff --git a/zfp/src/cuda_zfp/encode2.cuh b/zfp/src/cuda_zfp/encode2.cuh new file mode 100644 index 0000000000000000000000000000000000000000..7d9ebfe07ddd1b78f876609a1d6da8f61f0a544f --- /dev/null +++ b/zfp/src/cuda_zfp/encode2.cuh @@ -0,0 +1,184 @@ +#ifndef CUZFP_ENCODE2_CUH +#define CUZFP_ENCODE2_CUH + +#include "cuZFP.h" +#include "shared.h" +#include "encode.cuh" +#include "ErrorCheck.h" +#include "type_info.cuh" + +#define ZFP_2D_BLOCK_SIZE 16 + +namespace cuZFP +{ + +template<typename Scalar> +__device__ __host__ inline +void gather_partial2(Scalar* q, const Scalar* p, int nx, int ny, int sx, int sy) +{ + uint x, y; + for (y = 0; y < ny; y++, p += sy - nx * sx) { + for (x = 0; x < nx; x++, p += sx) + q[4 * y + x] = *p; + pad_block(q + 4 * y, nx, 1); + } + for (x = 0; x < 4; x++) + pad_block(q + x, ny, 4); +} + +template<typename Scalar> +__device__ __host__ inline +void gather2(Scalar* q, const Scalar* p, int sx, int sy) +{ + uint x, y; + for (y = 0; y < 4; y++, p += sy - 4 * sx) + for (x = 0; x < 4; x++, p += sx) + *q++ = *p; +} + +template<class Scalar> +__global__ +void +cudaEncode2(const uint maxbits, + const Scalar* scalars, + Word *stream, + const uint2 dims, + const int2 stride, + const uint2 padded_dims, + const uint tot_blocks) +{ + + typedef unsigned long long int ull; + typedef long long int ll; + const ull blockId = blockIdx.x + + blockIdx.y * gridDim.x + + gridDim.x * gridDim.y * blockIdx.z; + + // each thread gets a block so the block index is + // the global thread index + const uint block_idx = blockId * blockDim.x + threadIdx.x; + + if(block_idx >= tot_blocks) + { + // we can't launch the exact number of blocks + // so just exit if this isn't 
real + return; + } + + uint2 block_dims; + block_dims.x = padded_dims.x >> 2; + block_dims.y = padded_dims.y >> 2; + + // logical pos in 3d array + uint2 block; + block.x = (block_idx % block_dims.x) * 4; + block.y = ((block_idx/ block_dims.x) % block_dims.y) * 4; + + const ll offset = (ll)block.x * stride.x + (ll)block.y * stride.y; + + Scalar fblock[ZFP_2D_BLOCK_SIZE]; + + bool partial = false; + if(block.x + 4 > dims.x) partial = true; + if(block.y + 4 > dims.y) partial = true; + + if(partial) + { + const uint nx = block.x + 4 > dims.x ? dims.x - block.x : 4; + const uint ny = block.y + 4 > dims.y ? dims.y - block.y : 4; + gather_partial2(fblock, scalars + offset, nx, ny, stride.x, stride.y); + + } + else + { + gather2(fblock, scalars + offset, stride.x, stride.y); + } + + zfp_encode_block<Scalar, ZFP_2D_BLOCK_SIZE>(fblock, maxbits, block_idx, stream); + +} + +// +// Launch the encode kernel +// +template<class Scalar> +size_t encode2launch(uint2 dims, + int2 stride, + const Scalar *d_data, + Word *stream, + const int maxbits) +{ + const int cuda_block_size = 128; + dim3 block_size = dim3(cuda_block_size, 1, 1); + + uint2 zfp_pad(dims); + if(zfp_pad.x % 4 != 0) zfp_pad.x += 4 - dims.x % 4; + if(zfp_pad.y % 4 != 0) zfp_pad.y += 4 - dims.y % 4; + + const uint zfp_blocks = (zfp_pad.x * zfp_pad.y) / 16; + + // + // we need to ensure that we launch a multiple of the + // cuda block size + // + int block_pad = 0; + if(zfp_blocks % cuda_block_size != 0) + { + block_pad = cuda_block_size - zfp_blocks % cuda_block_size; + } + + size_t total_blocks = block_pad + zfp_blocks; + + dim3 grid_size = calculate_grid_size(total_blocks, cuda_block_size); + + // + size_t stream_bytes = calc_device_mem2d(zfp_pad, maxbits); + // ensure we have zeros + cudaMemset(stream, 0, stream_bytes); + +#ifdef CUDA_ZFP_RATE_PRINT + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); +#endif + + cudaEncode2<Scalar> << <grid_size, block_size>> > + 
(maxbits, + d_data, + stream, + dims, + stride, + zfp_pad, + zfp_blocks); + +#ifdef CUDA_ZFP_RATE_PRINT + cudaDeviceSynchronize(); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaStreamSynchronize(0); + + float miliseconds = 0.f; + cudaEventElapsedTime(&miliseconds, start, stop); + float seconds = miliseconds / 1000.f; + float mb = (float(dims.x * dims.y) * sizeof(Scalar)) / (1024.f * 1024.f *1024.f); + float rate = mb / seconds; + printf("Encode elapsed time: %.5f (s)\n", seconds); + printf("# encode2 rate: %.2f (GB / sec) %d\n", rate, maxbits); +#endif + return stream_bytes; +} + +template<class Scalar> +size_t encode2(uint2 dims, + int2 stride, + Scalar *d_data, + Word *stream, + const int maxbits) +{ + return encode2launch<Scalar>(dims, stride, d_data, stream, maxbits); +} + +} + +#endif diff --git a/zfp/src/cuda_zfp/encode3.cuh b/zfp/src/cuda_zfp/encode3.cuh new file mode 100644 index 0000000000000000000000000000000000000000..9fe7ddd24df1244db414a901cf55ef951e27e84a --- /dev/null +++ b/zfp/src/cuda_zfp/encode3.cuh @@ -0,0 +1,194 @@ +#ifndef CUZFP_ENCODE3_CUH +#define CUZFP_ENCODE3_CUH + +#include "cuZFP.h" +#include "shared.h" +#include "encode.cuh" +#include "type_info.cuh" + +#define ZFP_3D_BLOCK_SIZE 64 +namespace cuZFP{ + +template<typename Scalar> +__device__ __host__ inline +void gather_partial3(Scalar* q, const Scalar* p, int nx, int ny, int nz, int sx, int sy, int sz) +{ + uint x, y, z; + for (z = 0; z < nz; z++, p += sz - ny * sy) { + for (y = 0; y < ny; y++, p += sy - nx * sx) { + for (x = 0; x < nx; x++, p += sx) + q[16 * z + 4 * y + x] = *p; + pad_block(q + 16 * z + 4 * y, nx, 1); + } + for (x = 0; x < 4; x++) + pad_block(q + 16 * z + x, ny, 4); + } + for (y = 0; y < 4; y++) + for (x = 0; x < 4; x++) + pad_block(q + 4 * y + x, nz, 16); +} + +template<typename Scalar> +__device__ __host__ inline +void gather3(Scalar* q, const Scalar* p, int sx, int sy, int sz) +{ + uint x, y, z; + for (z = 0; z < 4; z++, p += sz - 4 * sy) + for (y = 0; 
y < 4; y++, p += sy - 4 * sx) + for (x = 0; x < 4; x++, p += sx) + *q++ = *p; +} + +template<class Scalar> +__global__ +void +cudaEncode(const uint maxbits, + const Scalar* scalars, + Word *stream, + const uint3 dims, + const int3 stride, + const uint3 padded_dims, + const uint tot_blocks) +{ + + typedef unsigned long long int ull; + typedef long long int ll; + const ull blockId = blockIdx.x + + blockIdx.y * gridDim.x + + gridDim.x * gridDim.y * blockIdx.z; + + // each thread gets a block so the block index is + // the global thread index + const uint block_idx = blockId * blockDim.x + threadIdx.x; + + if(block_idx >= tot_blocks) + { + // we can't launch the exact number of blocks + // so just exit if this isn't real + return; + } + + uint3 block_dims; + block_dims.x = padded_dims.x >> 2; + block_dims.y = padded_dims.y >> 2; + block_dims.z = padded_dims.z >> 2; + + // logical pos in 3d array + uint3 block; + block.x = (block_idx % block_dims.x) * 4; + block.y = ((block_idx/ block_dims.x) % block_dims.y) * 4; + block.z = (block_idx/ (block_dims.x * block_dims.y)) * 4; + + // default strides + ll offset = (ll)block.x * stride.x + (ll)block.y * stride.y + (ll)block.z * stride.z; + Scalar fblock[ZFP_3D_BLOCK_SIZE]; + + bool partial = false; + if(block.x + 4 > dims.x) partial = true; + if(block.y + 4 > dims.y) partial = true; + if(block.z + 4 > dims.z) partial = true; + + if(partial) + { + const uint nx = block.x + 4 > dims.x ? dims.x - block.x : 4; + const uint ny = block.y + 4 > dims.y ? dims.y - block.y : 4; + const uint nz = block.z + 4 > dims.z ? 
dims.z - block.z : 4; + gather_partial3(fblock, scalars + offset, nx, ny, nz, stride.x, stride.y, stride.z); + + } + else + { + gather3(fblock, scalars + offset, stride.x, stride.y, stride.z); + } + zfp_encode_block<Scalar, ZFP_3D_BLOCK_SIZE>(fblock, maxbits, block_idx, stream); + +} + +// +// Launch the encode kernel +// +template<class Scalar> +size_t encode3launch(uint3 dims, + int3 stride, + const Scalar *d_data, + Word *stream, + const int maxbits) +{ + + const int cuda_block_size = 128; + dim3 block_size = dim3(cuda_block_size, 1, 1); + + uint3 zfp_pad(dims); + if(zfp_pad.x % 4 != 0) zfp_pad.x += 4 - dims.x % 4; + if(zfp_pad.y % 4 != 0) zfp_pad.y += 4 - dims.y % 4; + if(zfp_pad.z % 4 != 0) zfp_pad.z += 4 - dims.z % 4; + + const uint zfp_blocks = (zfp_pad.x * zfp_pad.y * zfp_pad.z) / 64; + + // + // we need to ensure that we launch a multiple of the + // cuda block size + // + int block_pad = 0; + if(zfp_blocks % cuda_block_size != 0) + { + block_pad = cuda_block_size - zfp_blocks % cuda_block_size; + } + + size_t total_blocks = block_pad + zfp_blocks; + + dim3 grid_size = calculate_grid_size(total_blocks, cuda_block_size); + + size_t stream_bytes = calc_device_mem3d(zfp_pad, maxbits); + //ensure we start with 0s + cudaMemset(stream, 0, stream_bytes); + +#ifdef CUDA_ZFP_RATE_PRINT + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); +#endif + + cudaEncode<Scalar> << <grid_size, block_size>> > + (maxbits, + d_data, + stream, + dims, + stride, + zfp_pad, + zfp_blocks); + +#ifdef CUDA_ZFP_RATE_PRINT + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaStreamSynchronize(0); + + float miliseconds = 0; + cudaEventElapsedTime(&miliseconds, start, stop); + float seconds = miliseconds / 1000.f; + float rate = (float(dims.x * dims.y * dims.z) * sizeof(Scalar) ) / seconds; + rate /= 1024.f; + rate /= 1024.f; + rate /= 1024.f; + printf("Encode elapsed time: %.5f (s)\n", seconds); + printf("# encode3 rate: %.2f 
(GB / sec) \n", rate); +#endif + return stream_bytes; +} + +// +// Just pass the raw pointer to the "real" encode +// +template<class Scalar> +size_t encode(uint3 dims, + int3 stride, + Scalar *d_data, + Word *stream, + const int bits_per_block) +{ + return encode3launch<Scalar>(dims, stride, d_data, stream, bits_per_block); +} + +} +#endif diff --git a/zfp/src/cuda_zfp/pointers.cuh b/zfp/src/cuda_zfp/pointers.cuh new file mode 100644 index 0000000000000000000000000000000000000000..ee8d773bc806ced70d0d5bab707bbaec83e903af --- /dev/null +++ b/zfp/src/cuda_zfp/pointers.cuh @@ -0,0 +1,25 @@ +#ifndef CUZFP_POINTERS_CUH +#define CUZFP_POINTERS_CUH + +#include "ErrorCheck.h" +#include <iostream> + + +namespace cuZFP +{ +// https://gitlab.kitware.com/third-party/nvpipe/blob/master/encode.c +bool is_gpu_ptr(const void *ptr) +{ + cudaPointerAttributes atts; + const cudaError_t perr = cudaPointerGetAttributes(&atts, ptr); + + // clear last error so other error checking does + // not pick it up + cudaError_t error = cudaGetLastError(); + + return perr == cudaSuccess && atts.memoryType == cudaMemoryTypeDevice; +} + +} // namespace cuZFP + +#endif diff --git a/zfp/src/cuda_zfp/shared.h b/zfp/src/cuda_zfp/shared.h new file mode 100644 index 0000000000000000000000000000000000000000..52de03adc38cd2e9872ac0ae194698384ead298a --- /dev/null +++ b/zfp/src/cuda_zfp/shared.h @@ -0,0 +1,274 @@ +#ifndef CUZFP_SHARED_H +#define CUZFP_SHARED_H + +//#define CUDA_ZFP_RATE_PRINT 1 +typedef unsigned long long Word; +#define Wsize ((uint)(CHAR_BIT * sizeof(Word))) + +#include "type_info.cuh" +#include "zfp.h" +#include <stdio.h> + +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define MIN(x, y) ((x) < (y) ? 
(x) : (y)) +#define bitsize(x) (CHAR_BIT * (uint)sizeof(x)) + +#define LDEXP(x, e) ldexp(x, e) + +#define NBMASK 0xaaaaaaaaaaaaaaaaull + +__constant__ unsigned char c_perm_1[4]; +__constant__ unsigned char c_perm_2[16]; +__constant__ unsigned char c_perm[64]; + +namespace cuZFP +{ + +template<typename T> +__device__ void print_bits(const T &bits) +{ + const int bit_size = sizeof(T) * 8; + + for(int i = bit_size - 1; i >= 0; --i) + { + T one = 1; + T mask = one << i; + T val = (bits & mask) >> i ; + printf("%d", (int) val); + } + printf("\n"); +} + +size_t calc_device_mem1d(const int dim, + const int maxbits) +{ + + const size_t vals_per_block = 4; + size_t total_blocks = dim / vals_per_block; + if(dim % vals_per_block != 0) + { + total_blocks++; + } + const size_t bits_per_block = maxbits; + const size_t bits_per_word = sizeof(Word) * 8; + const size_t total_bits = bits_per_block * total_blocks; + size_t alloc_size = total_bits / bits_per_word; + if(total_bits % bits_per_word != 0) alloc_size++; + // ensure we have zeros + return alloc_size * sizeof(Word); +} + +size_t calc_device_mem2d(const uint2 dims, + const int maxbits) +{ + + const size_t vals_per_block = 16; + size_t total_blocks = (dims.x * dims.y) / vals_per_block; + if((dims.x * dims.y) % vals_per_block != 0) total_blocks++; + const size_t bits_per_block = maxbits; + const size_t bits_per_word = sizeof(Word) * 8; + const size_t total_bits = bits_per_block * total_blocks; + size_t alloc_size = total_bits / bits_per_word; + if(total_bits % bits_per_word != 0) alloc_size++; + return alloc_size * sizeof(Word); +} + +size_t calc_device_mem3d(const uint3 encoded_dims, + const int bits_per_block) +{ + const size_t vals_per_block = 64; + const size_t size = encoded_dims.x * encoded_dims.y * encoded_dims.z; + size_t total_blocks = size / vals_per_block; + const size_t bits_per_word = sizeof(Word) * 8; + const size_t total_bits = bits_per_block * total_blocks; + const size_t alloc_size = total_bits / bits_per_word; 
+ return alloc_size * sizeof(Word); +} + +dim3 get_max_grid_dims() +{ + cudaDeviceProp prop; + int device = 0; + cudaGetDeviceProperties(&prop, device); + dim3 grid_dims; + grid_dims.x = prop.maxGridSize[0]; + grid_dims.y = prop.maxGridSize[1]; + grid_dims.z = prop.maxGridSize[2]; + return grid_dims; +} + +// size is assumed to have a pad to the nearest cuda block size +dim3 calculate_grid_size(size_t size, size_t cuda_block_size) +{ + size_t grids = size / cuda_block_size; // because of pad this will be exact + dim3 max_grid_dims = get_max_grid_dims(); + int dims = 1; + // check to see if we need to add more grids + if( grids > max_grid_dims.x) + { + dims = 2; + } + if(grids > max_grid_dims.x * max_grid_dims.y) + { + dims = 3; + } + + dim3 grid_size; + grid_size.x = 1; + grid_size.y = 1; + grid_size.z = 1; + + if(dims == 1) + { + grid_size.x = grids; + } + + if(dims == 2) + { + float sq_r = sqrt((float)grids); + float intpart = 0.; + modf(sq_r,&intpart); + uint base = intpart; + grid_size.x = base; + grid_size.y = base; + // figure out how many y to add + uint rem = (size - base * base); + uint y_rows = rem / base; + if(rem % base != 0) y_rows ++; + grid_size.y += y_rows; + } + + if(dims == 3) + { + float cub_r = pow((float)grids, 1.f/3.f);; + float intpart = 0.; + modf(cub_r,&intpart); + int base = intpart; + grid_size.x = base; + grid_size.y = base; + grid_size.z = base; + // figure out how many z to add + uint rem = (size - base * base * base); + uint z_rows = rem / (base * base); + if(rem % (base * base) != 0) z_rows ++; + grid_size.z += z_rows; + } + + + return grid_size; +} + + +// map two's complement signed integer to negabinary unsigned integer +inline __device__ +unsigned long long int int2uint(const long long int x) +{ + return (x + (unsigned long long int)0xaaaaaaaaaaaaaaaaull) ^ + (unsigned long long int)0xaaaaaaaaaaaaaaaaull; +} + +inline __device__ +unsigned int int2uint(const int x) +{ + return (x + (unsigned int)0xaaaaaaaau) ^ + (unsigned 
int)0xaaaaaaaau; +} + + +template<typename Int, typename Scalar> +__device__ +Scalar +dequantize(const Int &x, const int &e); + +template<> +__device__ +double +dequantize<long long int, double>(const long long int &x, const int &e) +{ + return LDEXP((double)x, e - (CHAR_BIT * scalar_sizeof<double>() - 2)); +} + +template<> +__device__ +float +dequantize<int, float>(const int &x, const int &e) +{ + return LDEXP((float)x, e - (CHAR_BIT * scalar_sizeof<float>() - 2)); +} + +template<> +__device__ +int +dequantize<int, int>(const int &x, const int &e) +{ + return 1; +} + +template<> +__device__ +long long int +dequantize<long long int, long long int>(const long long int &x, const int &e) +{ + return 1; +} + +/* inverse lifting transform of 4-vector */ +template<class Int, uint s> +__device__ +static void +inv_lift(Int* p) +{ + Int x, y, z, w; + x = *p; p += s; + y = *p; p += s; + z = *p; p += s; + w = *p; p += s; + + /* + ** non-orthogonal transform + ** ( 4 6 -4 -1) (x) + ** 1/4 * ( 4 2 4 5) (y) + ** ( 4 -2 4 -5) (z) + ** ( 4 -6 -4 1) (w) + */ + y += w >> 1; w -= y >> 1; + y += w; w <<= 1; w -= y; + z += x; x <<= 1; x -= z; + y += z; z <<= 1; z -= y; + w += x; x <<= 1; x -= w; + + p -= s; *p = w; + p -= s; *p = z; + p -= s; *p = y; + p -= s; *p = x; +} + + +template<int BlockSize> +__device__ +unsigned char* get_perm(); + +template<> +__device__ +unsigned char* get_perm<64>() +{ + return c_perm; +} + +template<> +__device__ +unsigned char* get_perm<16>() +{ + return c_perm_2; +} + +template<> +__device__ +unsigned char* get_perm<4>() +{ + return c_perm_1; +} + + +} // namespace cuZFP +#endif diff --git a/zfp/src/cuda_zfp/type_info.cuh b/zfp/src/cuda_zfp/type_info.cuh new file mode 100644 index 0000000000000000000000000000000000000000..969f5532eed77b705b40f8da3c1181f427e28226 --- /dev/null +++ b/zfp/src/cuda_zfp/type_info.cuh @@ -0,0 +1,92 @@ +#ifndef cuZFP_TYPE_INFO +#define cuZFP_TYPE_INFO + +namespace cuZFP { + +template<typename T> inline __host__ __device__ int 
get_ebias(); +template<> inline __host__ __device__ int get_ebias<double>() { return 1023; } +template<> inline __host__ __device__ int get_ebias<float>() { return 127; } +template<> inline __host__ __device__ int get_ebias<long long int>() { return 0; } +template<> inline __host__ __device__ int get_ebias<int>() { return 0; } + +template<typename T> inline __host__ __device__ int get_ebits(); +template<> inline __host__ __device__ int get_ebits<double>() { return 11; } +template<> inline __host__ __device__ int get_ebits<float>() { return 8; } +template<> inline __host__ __device__ int get_ebits<int>() { return 0; } +template<> inline __host__ __device__ int get_ebits<long long int>() { return 0; } + +template<typename T> inline __host__ __device__ int get_precision(); +template<> inline __host__ __device__ int get_precision<double>() { return 64; } +template<> inline __host__ __device__ int get_precision<long long int>() { return 64; } +template<> inline __host__ __device__ int get_precision<float>() { return 32; } +template<> inline __host__ __device__ int get_precision<int>() { return 32; } + +template<typename T> inline __host__ __device__ int get_min_exp(); +template<> inline __host__ __device__ int get_min_exp<double>() { return -1074; } +template<> inline __host__ __device__ int get_min_exp<float>() { return -1074; } +template<> inline __host__ __device__ int get_min_exp<long long int>() { return 0; } +template<> inline __host__ __device__ int get_min_exp<int>() { return 0; } + +template<typename T> inline __host__ __device__ int scalar_sizeof(); + +template<> inline __host__ __device__ int scalar_sizeof<double>() { return 8; } +template<> inline __host__ __device__ int scalar_sizeof<long long int>() { return 8; } +template<> inline __host__ __device__ int scalar_sizeof<float>() { return 4; } +template<> inline __host__ __device__ int scalar_sizeof<int>() { return 4; } + +template<typename T> struct zfp_traits; + +template<> struct zfp_traits<double> +{ + 
typedef unsigned long long int UInt; + typedef long long int Int; +}; + +template<> struct zfp_traits<long long int> +{ + typedef unsigned long long int UInt; + typedef long long int Int; +}; + +template<> struct zfp_traits<float> +{ + typedef unsigned int UInt; + typedef int Int; +}; + +template<> struct zfp_traits<int> +{ + typedef unsigned int UInt; + typedef int Int; +}; + +template<typename T> inline __host__ __device__ bool is_int() +{ + return false; +} + +template<> inline __host__ __device__ bool is_int<int>() +{ + return true; +} + +template<> inline __host__ __device__ bool is_int<long long int>() +{ + return true; +} + +template<int T> struct block_traits; + +template<> struct block_traits<1> +{ + typedef unsigned char PlaneType; +}; + +template<> struct block_traits<2> +{ + typedef unsigned short PlaneType; +}; + + +} // namespace cuZFP +#endif diff --git a/zfp/src/decode1d.c b/zfp/src/decode1d.c new file mode 100644 index 0000000000000000000000000000000000000000..93756bf2b6527e6983d1b6248a0b084227fe3233 --- /dev/null +++ b/zfp/src/decode1d.c @@ -0,0 +1,13 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block1.h" +#include "traitsd.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codecf.c" +#include "template/codec1.c" +#include "template/decode.c" +#include "template/decodef.c" +#include "template/decode1.c" diff --git a/zfp/src/decode1f.c b/zfp/src/decode1f.c new file mode 100644 index 0000000000000000000000000000000000000000..55808b474f080058c443712dd2a1cc60615a5518 --- /dev/null +++ b/zfp/src/decode1f.c @@ -0,0 +1,13 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block1.h" +#include "traitsf.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codecf.c" +#include "template/codec1.c" +#include "template/decode.c" +#include "template/decodef.c" 
+#include "template/decode1.c" diff --git a/zfp/src/decode1i.c b/zfp/src/decode1i.c new file mode 100644 index 0000000000000000000000000000000000000000..22529cc25d887d54e0c47e3374fb99f0a5033f0a --- /dev/null +++ b/zfp/src/decode1i.c @@ -0,0 +1,12 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block1.h" +#include "traitsi.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codec1.c" +#include "template/decode.c" +#include "template/decodei.c" +#include "template/decode1.c" diff --git a/zfp/src/decode1l.c b/zfp/src/decode1l.c new file mode 100644 index 0000000000000000000000000000000000000000..b980cc5d697095560179177b803d73c91d4c9573 --- /dev/null +++ b/zfp/src/decode1l.c @@ -0,0 +1,12 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block1.h" +#include "traitsl.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codec1.c" +#include "template/decode.c" +#include "template/decodei.c" +#include "template/decode1.c" diff --git a/zfp/src/decode2d.c b/zfp/src/decode2d.c new file mode 100644 index 0000000000000000000000000000000000000000..2f72c9fc6f5586a0145ecc4ae7aec53d1a5af56b --- /dev/null +++ b/zfp/src/decode2d.c @@ -0,0 +1,13 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block2.h" +#include "traitsd.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codecf.c" +#include "template/codec2.c" +#include "template/decode.c" +#include "template/decodef.c" +#include "template/decode2.c" diff --git a/zfp/src/decode2f.c b/zfp/src/decode2f.c new file mode 100644 index 0000000000000000000000000000000000000000..a1caffb2a466a056753ad7ee68e1a25e8605c77b --- /dev/null +++ b/zfp/src/decode2f.c @@ -0,0 +1,13 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" 
+#include "block2.h" +#include "traitsf.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codecf.c" +#include "template/codec2.c" +#include "template/decode.c" +#include "template/decodef.c" +#include "template/decode2.c" diff --git a/zfp/src/decode2i.c b/zfp/src/decode2i.c new file mode 100644 index 0000000000000000000000000000000000000000..65de16ba8e30e97c30d8f5441fdd1ead6a365202 --- /dev/null +++ b/zfp/src/decode2i.c @@ -0,0 +1,12 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block2.h" +#include "traitsi.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codec2.c" +#include "template/decode.c" +#include "template/decodei.c" +#include "template/decode2.c" diff --git a/zfp/src/decode2l.c b/zfp/src/decode2l.c new file mode 100644 index 0000000000000000000000000000000000000000..0ced03504bedd79f58679f51eecf959ad7f941da --- /dev/null +++ b/zfp/src/decode2l.c @@ -0,0 +1,12 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block2.h" +#include "traitsl.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codec2.c" +#include "template/decode.c" +#include "template/decodei.c" +#include "template/decode2.c" diff --git a/zfp/src/decode3d.c b/zfp/src/decode3d.c new file mode 100644 index 0000000000000000000000000000000000000000..918741fc2dff7ed08fc270b1306a3f2d416f27fd --- /dev/null +++ b/zfp/src/decode3d.c @@ -0,0 +1,13 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block3.h" +#include "traitsd.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codecf.c" +#include "template/codec3.c" +#include "template/decode.c" +#include "template/decodef.c" +#include "template/decode3.c" diff --git a/zfp/src/decode3f.c 
b/zfp/src/decode3f.c new file mode 100644 index 0000000000000000000000000000000000000000..30587a7709e015891a40a2e3d27a25a99172b14d --- /dev/null +++ b/zfp/src/decode3f.c @@ -0,0 +1,13 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block3.h" +#include "traitsf.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codecf.c" +#include "template/codec3.c" +#include "template/decode.c" +#include "template/decodef.c" +#include "template/decode3.c" diff --git a/zfp/src/decode3i.c b/zfp/src/decode3i.c new file mode 100644 index 0000000000000000000000000000000000000000..aa30070dcd6ad637089338034b4595003bcde4ac --- /dev/null +++ b/zfp/src/decode3i.c @@ -0,0 +1,12 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block3.h" +#include "traitsi.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codec3.c" +#include "template/decode.c" +#include "template/decodei.c" +#include "template/decode3.c" diff --git a/zfp/src/decode3l.c b/zfp/src/decode3l.c new file mode 100644 index 0000000000000000000000000000000000000000..1796b79355b6661795bf7f5a7df58376744945e6 --- /dev/null +++ b/zfp/src/decode3l.c @@ -0,0 +1,12 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block3.h" +#include "traitsl.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codec3.c" +#include "template/decode.c" +#include "template/decodei.c" +#include "template/decode3.c" diff --git a/zfp/src/decode4d.c b/zfp/src/decode4d.c new file mode 100644 index 0000000000000000000000000000000000000000..500e802a69f783a8c89c5398e73413adbe46741b --- /dev/null +++ b/zfp/src/decode4d.c @@ -0,0 +1,13 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block4.h" +#include "traitsd.h" +#include 
"template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codecf.c" +#include "template/codec4.c" +#include "template/decode.c" +#include "template/decodef.c" +#include "template/decode4.c" diff --git a/zfp/src/decode4f.c b/zfp/src/decode4f.c new file mode 100644 index 0000000000000000000000000000000000000000..de15b84fa06f9599cb48a67d9a34be82035153d7 --- /dev/null +++ b/zfp/src/decode4f.c @@ -0,0 +1,13 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block4.h" +#include "traitsf.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codecf.c" +#include "template/codec4.c" +#include "template/decode.c" +#include "template/decodef.c" +#include "template/decode4.c" diff --git a/zfp/src/decode4i.c b/zfp/src/decode4i.c new file mode 100644 index 0000000000000000000000000000000000000000..1bfe4aaf28ec2a744adc7e7633c6064e5266d4e1 --- /dev/null +++ b/zfp/src/decode4i.c @@ -0,0 +1,12 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block4.h" +#include "traitsi.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codec4.c" +#include "template/decode.c" +#include "template/decodei.c" +#include "template/decode4.c" diff --git a/zfp/src/decode4l.c b/zfp/src/decode4l.c new file mode 100644 index 0000000000000000000000000000000000000000..950f8a0b1a963923fa723970a7a55d0bb48eb456 --- /dev/null +++ b/zfp/src/decode4l.c @@ -0,0 +1,12 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block4.h" +#include "traitsl.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codec4.c" +#include "template/decode.c" +#include "template/decodei.c" +#include "template/decode4.c" diff --git a/zfp/src/encode1d.c b/zfp/src/encode1d.c new file mode 100644 index 
0000000000000000000000000000000000000000..c96147497272674d1019c3d0ca2033ebedaa7f0e --- /dev/null +++ b/zfp/src/encode1d.c @@ -0,0 +1,13 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block1.h" +#include "traitsd.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codecf.c" +#include "template/codec1.c" +#include "template/encode.c" +#include "template/encodef.c" +#include "template/encode1.c" diff --git a/zfp/src/encode1f.c b/zfp/src/encode1f.c new file mode 100644 index 0000000000000000000000000000000000000000..9e922e516e9ac3d9ab4d804e12f11a77d4c5a27c --- /dev/null +++ b/zfp/src/encode1f.c @@ -0,0 +1,13 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block1.h" +#include "traitsf.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codecf.c" +#include "template/codec1.c" +#include "template/encode.c" +#include "template/encodef.c" +#include "template/encode1.c" diff --git a/zfp/src/encode1i.c b/zfp/src/encode1i.c new file mode 100644 index 0000000000000000000000000000000000000000..2d4a8b6a5eec2ec115e282e79030ce4f3ce29a83 --- /dev/null +++ b/zfp/src/encode1i.c @@ -0,0 +1,12 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block1.h" +#include "traitsi.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codec1.c" +#include "template/encode.c" +#include "template/encodei.c" +#include "template/encode1.c" diff --git a/zfp/src/encode1l.c b/zfp/src/encode1l.c new file mode 100644 index 0000000000000000000000000000000000000000..746539bbef13cf1a1348af5efa12dbc4b1f5dd00 --- /dev/null +++ b/zfp/src/encode1l.c @@ -0,0 +1,12 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block1.h" +#include "traitsl.h" +#include "template/template.h" +#include 
"template/codec.h" +#include "inline/bitstream.c" +#include "template/codec1.c" +#include "template/encode.c" +#include "template/encodei.c" +#include "template/encode1.c" diff --git a/zfp/src/encode2d.c b/zfp/src/encode2d.c new file mode 100644 index 0000000000000000000000000000000000000000..053efe5e51dbddf40ba6a9b69a4e49dddb3be082 --- /dev/null +++ b/zfp/src/encode2d.c @@ -0,0 +1,13 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block2.h" +#include "traitsd.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codecf.c" +#include "template/codec2.c" +#include "template/encode.c" +#include "template/encodef.c" +#include "template/encode2.c" diff --git a/zfp/src/encode2f.c b/zfp/src/encode2f.c new file mode 100644 index 0000000000000000000000000000000000000000..52321e798dbddba9c169ab74dab252ee108d8e40 --- /dev/null +++ b/zfp/src/encode2f.c @@ -0,0 +1,13 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block2.h" +#include "traitsf.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codecf.c" +#include "template/codec2.c" +#include "template/encode.c" +#include "template/encodef.c" +#include "template/encode2.c" diff --git a/zfp/src/encode2i.c b/zfp/src/encode2i.c new file mode 100644 index 0000000000000000000000000000000000000000..c67d0ed019a87c284fdbda56884bb7458aad3a66 --- /dev/null +++ b/zfp/src/encode2i.c @@ -0,0 +1,12 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block2.h" +#include "traitsi.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codec2.c" +#include "template/encode.c" +#include "template/encodei.c" +#include "template/encode2.c" diff --git a/zfp/src/encode2l.c b/zfp/src/encode2l.c new file mode 100644 index 
0000000000000000000000000000000000000000..990bc0104dac9124a146345160aa13bf9257119b --- /dev/null +++ b/zfp/src/encode2l.c @@ -0,0 +1,12 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block2.h" +#include "traitsl.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codec2.c" +#include "template/encode.c" +#include "template/encodei.c" +#include "template/encode2.c" diff --git a/zfp/src/encode3d.c b/zfp/src/encode3d.c new file mode 100644 index 0000000000000000000000000000000000000000..4d82484906f22cfe503383fbcf60eb466d946a0f --- /dev/null +++ b/zfp/src/encode3d.c @@ -0,0 +1,13 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block3.h" +#include "traitsd.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codecf.c" +#include "template/codec3.c" +#include "template/encode.c" +#include "template/encodef.c" +#include "template/encode3.c" diff --git a/zfp/src/encode3f.c b/zfp/src/encode3f.c new file mode 100644 index 0000000000000000000000000000000000000000..0a95c899781bd80d355fec90d1ad424dde58b6bd --- /dev/null +++ b/zfp/src/encode3f.c @@ -0,0 +1,13 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block3.h" +#include "traitsf.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codecf.c" +#include "template/codec3.c" +#include "template/encode.c" +#include "template/encodef.c" +#include "template/encode3.c" diff --git a/zfp/src/encode3i.c b/zfp/src/encode3i.c new file mode 100644 index 0000000000000000000000000000000000000000..6c78aac34aa4973aad965580cb1d3938408ba19b --- /dev/null +++ b/zfp/src/encode3i.c @@ -0,0 +1,12 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block3.h" +#include "traitsi.h" +#include "template/template.h" +#include 
"template/codec.h" +#include "inline/bitstream.c" +#include "template/codec3.c" +#include "template/encode.c" +#include "template/encodei.c" +#include "template/encode3.c" diff --git a/zfp/src/encode3l.c b/zfp/src/encode3l.c new file mode 100644 index 0000000000000000000000000000000000000000..931c7424b85e1ad99ddba1b8ae2fa592e935d8e4 --- /dev/null +++ b/zfp/src/encode3l.c @@ -0,0 +1,12 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block3.h" +#include "traitsl.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codec3.c" +#include "template/encode.c" +#include "template/encodei.c" +#include "template/encode3.c" diff --git a/zfp/src/encode4d.c b/zfp/src/encode4d.c new file mode 100644 index 0000000000000000000000000000000000000000..5ff58e7c4f2b421262759b7e6a17f36e6b9d5f68 --- /dev/null +++ b/zfp/src/encode4d.c @@ -0,0 +1,13 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block4.h" +#include "traitsd.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codecf.c" +#include "template/codec4.c" +#include "template/encode.c" +#include "template/encodef.c" +#include "template/encode4.c" diff --git a/zfp/src/encode4f.c b/zfp/src/encode4f.c new file mode 100644 index 0000000000000000000000000000000000000000..ba24f586787370fe629d17b7b746c15948e7fbeb --- /dev/null +++ b/zfp/src/encode4f.c @@ -0,0 +1,13 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block4.h" +#include "traitsf.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codecf.c" +#include "template/codec4.c" +#include "template/encode.c" +#include "template/encodef.c" +#include "template/encode4.c" diff --git a/zfp/src/encode4i.c b/zfp/src/encode4i.c new file mode 100644 index 
0000000000000000000000000000000000000000..6e9fc1bfc3e2ec0c6ba39d5823ff4ff57ec30f5c --- /dev/null +++ b/zfp/src/encode4i.c @@ -0,0 +1,12 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block4.h" +#include "traitsi.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codec4.c" +#include "template/encode.c" +#include "template/encodei.c" +#include "template/encode4.c" diff --git a/zfp/src/encode4l.c b/zfp/src/encode4l.c new file mode 100644 index 0000000000000000000000000000000000000000..d5bf86c0916e7f22d378de4f96b4276e3bf6becb --- /dev/null +++ b/zfp/src/encode4l.c @@ -0,0 +1,12 @@ +#include "inline/inline.h" +#include "zfp.h" +#include "zfp/macros.h" +#include "block4.h" +#include "traitsl.h" +#include "template/template.h" +#include "template/codec.h" +#include "inline/bitstream.c" +#include "template/codec4.c" +#include "template/encode.c" +#include "template/encodei.c" +#include "template/encode4.c" diff --git a/zfp/src/inline/bitstream.c b/zfp/src/inline/bitstream.c new file mode 100644 index 0000000000000000000000000000000000000000..6e96629254b33f39ea5bdb0baa6284c4992ea7cd --- /dev/null +++ b/zfp/src/inline/bitstream.c @@ -0,0 +1,450 @@ +/* +High-speed in-memory bit stream I/O that supports reading and writing between +0 and 64 bits at a time. The implementation, which relies heavily on bit +shifts, has been carefully written to ensure that all shifts are between +zero and one less the width of the type being shifted to avoid undefined +behavior. This occasionally causes somewhat convoluted code. + +The following assumptions and restrictions apply: + +1. The user must allocate a memory buffer large enough to hold the bit stream, + whether for reading, writing, or both. This buffer is associated with the + bit stream via stream_open(buffer, bytes), which allocates and returns a + pointer to an opaque bit stream struct. 
Call stream_close(stream) to + deallocate this struct. + +2. The stream is either in a read or write state (or, initially, in both + states). When done writing, call stream_flush(stream) before entering + read mode to ensure any buffered bits are output. To enter read mode, + call stream_rewind(stream) or stream_rseek(stream, offset) to position + the stream at the beginning or at a particular bit offset. Conversely, + stream_rewind(stream) or stream_wseek(stream, offset) positions the + stream for writing. In read mode, the following functions may be called: + + size_t stream_size(stream); + size_t stream_rtell(stream); + void stream_rewind(stream); + void stream_rseek(stream, offset); + void stream_skip(stream, uint n); + size_t stream_align(stream); + uint stream_read_bit(stream); + uint64 stream_read_bits(stream, n); + + Each of the above read calls has a corresponding write call: + + size_t stream_size(stream); + size_t stream_wtell(stream); + void stream_rewind(stream); + void stream_wseek(stream, offset); + void stream_pad(stream, n); + size_t stream_flush(stream); + uint stream_write_bit(stream, bit); + uint64 stream_write_bits(stream, value, n); + +3. The stream buffer is an unsigned integer of a user-specified type given + by the BIT_STREAM_WORD_TYPE macro. Bits are read and written in units of + this integer word type. Supported types are 8, 16, 32, or 64 bits wide. + The bit width of the buffer is denoted by 'wsize' and can be accessed via + the global constant stream_word_bits. A small wsize allows for fine + granularity reads and writes, and may be preferable when working with many + small blocks of data that require non-sequential access. The default + maximum size of 64 bits ensures maximum speed. Note that even when + wsize < 64, it is still possible to read and write up to 64 bits at a time + using stream_read_bits() and stream_write_bits(). + +4. 
If BIT_STREAM_STRIDED is defined, words read from or written to the stream + may be accessed noncontiguously by setting a power-of-two block size (which + by default is one word) and a block stride (defaults to zero blocks). The + word pointer is always incremented by one word each time a word is accessed. + Once advanced past a block boundary, the word pointer is also advanced by + the stride to the next block. This feature may be used to store blocks of + data interleaved, e.g. for progressive coding or for noncontiguous parallel + access to the bit stream Note that the block size is measured in words, + while the stride is measured in multiples of the block size. Strided access + can have a significant performance penalty. + +5. Multiple bits are read and written in order of least to most significant + bit. Thus, the statement + + value = stream_write_bits(stream, value, n); + + is essentially equivalent to (but faster than) + + for (i = 0; i < n; i++, value >>= 1) + stream_write_bit(value & 1); + + when 0 <= n <= 64. The same holds for read calls, and thus + + value = stream_read_bits(stream, n); + + is essentially equivalent to + + for (i = 0, value = 0; i < n; i++) + value += (uint64)stream_read_bit() << i; + + Note that it is possible to write fewer bits than the argument 'value' + holds (possibly even no bits), in which case any unwritten bits are + returned. + +6. Although the stream_wseek(stream, offset) call allows positioning the + stream for writing at any bit offset without any data loss (i.e. all + previously written bits preceding the offset remain valid), for efficiency + the stream_flush(stream) operation will zero all bits up to the next + multiple of wsize bits, thus overwriting bits that were previously stored + at that location. Consequently, random write access is effectively + supported only at wsize granularity. For sequential access, the largest + possible wsize is preferred due to higher speed. + +7. 
It is up to the user to adhere to these rules. For performance reasons, + no error checking is done, and in particular buffer overruns are not + caught. +*/ + +#include <limits.h> +#include <stdlib.h> + +#ifndef inline_ + #define inline_ +#endif + +/* bit stream word/buffer type; granularity of stream I/O operations */ +#ifdef BIT_STREAM_WORD_TYPE + /* may be 8-, 16-, 32-, or 64-bit unsigned integer type */ + typedef BIT_STREAM_WORD_TYPE word; +#else + /* use maximum word size by default for highest speed */ + typedef uint64 word; +#endif + +/* number of bits in a buffered word */ +#define wsize ((uint)(CHAR_BIT * sizeof(word))) + +/* bit stream structure (opaque to caller) */ +struct bitstream { + uint bits; /* number of buffered bits (0 <= bits < wsize) */ + word buffer; /* buffer for incoming/outgoing bits (buffer < 2^bits) */ + word* ptr; /* pointer to next word to be read/written */ + word* begin; /* beginning of stream */ + word* end; /* end of stream (currently unused) */ +#ifdef BIT_STREAM_STRIDED + size_t mask; /* one less the block size in number of words */ + ptrdiff_t delta; /* number of words between consecutive blocks */ +#endif +}; + +/* private functions ------------------------------------------------------- */ + +/* read a single word from memory */ +static word +stream_read_word(bitstream* s) +{ + word w = *s->ptr++; +#ifdef BIT_STREAM_STRIDED + if (!((s->ptr - s->begin) & s->mask)) + s->ptr += s->delta; +#endif + return w; +} + +/* write a single word to memory */ +static void +stream_write_word(bitstream* s, word value) +{ + *s->ptr++ = value; +#ifdef BIT_STREAM_STRIDED + if (!((s->ptr - s->begin) & s->mask)) + s->ptr += s->delta; +#endif +} + +/* public functions -------------------------------------------------------- */ + +/* pointer to beginning of stream */ +inline_ void* +stream_data(const bitstream* s) +{ + return s->begin; +} + +/* current byte size of stream (if flushed) */ +inline_ size_t +stream_size(const bitstream* s) +{ + return 
sizeof(word) * (s->ptr - s->begin); +} + +/* byte capacity of stream */ +inline_ size_t +stream_capacity(const bitstream* s) +{ + return sizeof(word) * (s->end - s->begin); +} + +/* number of words per block */ +inline_ size_t +stream_stride_block(const bitstream* s) +{ +#ifdef BIT_STREAM_STRIDED + return s->mask + 1; +#else + return 1; +#endif +} + +/* number of blocks between consecutive stream blocks */ +inline_ ptrdiff_t +stream_stride_delta(const bitstream* s) +{ +#ifdef BIT_STREAM_STRIDED + return s->delta / (s->mask + 1); +#else + return 0; +#endif +} + +/* read single bit (0 or 1) */ +inline_ uint +stream_read_bit(bitstream* s) +{ + uint bit; + if (!s->bits) { + s->buffer = stream_read_word(s); + s->bits = wsize; + } + s->bits--; + bit = (uint)s->buffer & 1u; + s->buffer >>= 1; + return bit; +} + +/* write single bit (must be 0 or 1) */ +inline_ uint +stream_write_bit(bitstream* s, uint bit) +{ + s->buffer += (word)bit << s->bits; + if (++s->bits == wsize) { + stream_write_word(s, s->buffer); + s->buffer = 0; + s->bits = 0; + } + return bit; +} + +/* read 0 <= n <= 64 bits */ +inline_ uint64 +stream_read_bits(bitstream* s, uint n) +{ + uint64 value = s->buffer; + if (s->bits < n) { + /* keep fetching wsize bits until enough bits are buffered */ + do { + /* assert: 0 <= s->bits < n <= 64 */ + s->buffer = stream_read_word(s); + value += (uint64)s->buffer << s->bits; + s->bits += wsize; + } while (sizeof(s->buffer) < sizeof(value) && s->bits < n); + /* assert: 1 <= n <= s->bits < n + wsize */ + s->bits -= n; + if (!s->bits) { + /* value holds exactly n bits; no need for masking */ + s->buffer = 0; + } + else { + /* assert: 1 <= s->bits < wsize */ + s->buffer >>= wsize - s->bits; + /* assert: 1 <= n <= 64 */ + value &= ((uint64)2 << (n - 1)) - 1; + } + } + else { + /* assert: 0 <= n <= s->bits < wsize <= 64 */ + s->bits -= n; + s->buffer >>= n; + value &= ((uint64)1 << n) - 1; + } + return value; +} + +/* write 0 <= n <= 64 low bits of value and return 
remaining bits */ +inline_ uint64 +stream_write_bits(bitstream* s, uint64 value, uint n) +{ + /* append bit string to buffer */ + s->buffer += (word)(value << s->bits); + s->bits += n; + /* is buffer full? */ + if (s->bits >= wsize) { + /* 1 <= n <= 64; decrement n to ensure valid right shifts below */ + value >>= 1; + n--; + /* assert: 0 <= n < 64; wsize <= s->bits <= wsize + n */ + do { + /* output wsize bits while buffer is full */ + s->bits -= wsize; + /* assert: 0 <= s->bits <= n */ + stream_write_word(s, s->buffer); + /* assert: 0 <= n - s->bits < 64 */ + s->buffer = (word)(value >> (n - s->bits)); + } while (sizeof(s->buffer) < sizeof(value) && s->bits >= wsize); + } + /* assert: 0 <= s->bits < wsize */ + s->buffer &= ((word)1 << s->bits) - 1; + /* assert: 0 <= n < 64 */ + return value >> n; +} + +/* return bit offset to next bit to be read */ +inline_ size_t +stream_rtell(const bitstream* s) +{ + return wsize * (s->ptr - s->begin) - s->bits; +} + +/* return bit offset to next bit to be written */ +inline_ size_t +stream_wtell(const bitstream* s) +{ + return wsize * (s->ptr - s->begin) + s->bits; +} + +/* position stream for reading or writing at beginning */ +inline_ void +stream_rewind(bitstream* s) +{ + s->ptr = s->begin; + s->buffer = 0; + s->bits = 0; +} + +/* position stream for reading at given bit offset */ +inline_ void +stream_rseek(bitstream* s, size_t offset) +{ + uint n = offset % wsize; + s->ptr = s->begin + offset / wsize; + if (n) { + s->buffer = stream_read_word(s) >> n; + s->bits = wsize - n; + } + else { + s->buffer = 0; + s->bits = 0; + } +} + +/* position stream for writing at given bit offset */ +inline_ void +stream_wseek(bitstream* s, size_t offset) +{ + uint n = offset % wsize; + s->ptr = s->begin + offset / wsize; + if (n) { + word buffer = *s->ptr; + buffer &= ((word)1 << n) - 1; + s->buffer = buffer; + s->bits = n; + } + else { + s->buffer = 0; + s->bits = 0; + } +} + +/* skip over the next n bits (n >= 0) */ +inline_ void 
+stream_skip(bitstream* s, uint n) +{ + stream_rseek(s, stream_rtell(s) + n); +} + +/* append n zero-bits to stream (n >= 0) */ +inline_ void +stream_pad(bitstream* s, uint n) +{ + for (s->bits += n; s->bits >= wsize; s->bits -= wsize) { + stream_write_word(s, s->buffer); + s->buffer = 0; + } +} + +/* align stream on next word boundary */ +inline_ size_t +stream_align(bitstream* s) +{ + uint bits = s->bits; + if (bits) + stream_skip(s, bits); + return bits; +} + +/* write any remaining buffered bits and align stream on next word boundary */ +inline_ size_t +stream_flush(bitstream* s) +{ + uint bits = (wsize - s->bits) % wsize; + if (bits) + stream_pad(s, bits); + return bits; +} + +/* copy n bits from one bit stream to another */ +inline_ void +stream_copy(bitstream* dst, bitstream* src, size_t n) +{ + while (n > wsize) { + word w = (word)stream_read_bits(src, wsize); + stream_write_bits(dst, w, wsize); + n -= wsize; + } + if (n) { + word w = (word)stream_read_bits(src, (uint)n); + stream_write_bits(dst, w, (uint)n); + } +} + +#ifdef BIT_STREAM_STRIDED +/* set block size in number of words and spacing in number of blocks */ +inline_ int +stream_set_stride(bitstream* s, size_t block, ptrdiff_t delta) +{ + /* ensure block size is a power of two */ + if (block & (block - 1)) + return 0; + s->mask = block - 1; + s->delta = delta * block; + return 1; +} +#endif + +/* allocate and initialize bit stream to user-allocated buffer */ +inline_ bitstream* +stream_open(void* buffer, size_t bytes) +{ + bitstream* s = (bitstream*)malloc(sizeof(bitstream)); + if (s) { + s->begin = (word*)buffer; + s->end = s->begin + bytes / sizeof(word); +#ifdef BIT_STREAM_STRIDED + stream_set_stride(s, 0, 0); +#endif + stream_rewind(s); + } + return s; +} + +/* close and deallocate bit stream */ +inline_ void +stream_close(bitstream* s) +{ + free(s); +} + +/* make a copy of bit stream to shared memory buffer */ +inline_ bitstream* +stream_clone(const bitstream* s) +{ + bitstream* c = 
(bitstream*)malloc(sizeof(bitstream)); + if (c) + *c = *s; + return c; +} diff --git a/zfp/src/inline/inline.h b/zfp/src/inline/inline.h new file mode 100644 index 0000000000000000000000000000000000000000..e9ade3f11d8f85b64711880cbee8198e737a6448 --- /dev/null +++ b/zfp/src/inline/inline.h @@ -0,0 +1,12 @@ +#ifndef INLINE_H +#define INLINE_H + +#ifndef inline_ + #if __STDC_VERSION__ >= 199901L + #define inline_ static inline + #else + #define inline_ static + #endif +#endif + +#endif diff --git a/zfp/src/share/omp.c b/zfp/src/share/omp.c new file mode 100644 index 0000000000000000000000000000000000000000..9ee26b9a68aae56f3fb6e75054fb4b4ddb60d786 --- /dev/null +++ b/zfp/src/share/omp.c @@ -0,0 +1,25 @@ +#ifdef _OPENMP +#include <omp.h> + +/* number of omp threads to use */ +static int +thread_count_omp(const zfp_stream* stream) +{ + int count = stream->exec.params.omp.threads; + /* if no thread count is specified, use default number of threads */ + if (!count) + count = omp_get_max_threads(); + return count; +} + +/* number of chunks to partition array into */ +static uint +chunk_count_omp(const zfp_stream* stream, uint blocks, uint threads) +{ + uint chunk_size = stream->exec.params.omp.chunk_size; + /* if no chunk size is specified, assign one chunk per thread */ + uint chunks = chunk_size ? 
(blocks + chunk_size - 1) / chunk_size : threads; + return MIN(chunks, blocks); +} + +#endif diff --git a/zfp/src/share/parallel.c b/zfp/src/share/parallel.c new file mode 100644 index 0000000000000000000000000000000000000000..8c67d8f49e77800b2f044125c4060f94b11c73b4 --- /dev/null +++ b/zfp/src/share/parallel.c @@ -0,0 +1,86 @@ +#ifdef _OPENMP + +/* block index at which chunk begins */ +static uint +chunk_offset(uint blocks, uint chunks, uint chunk) +{ + return (uint)((blocks * (uint64)chunk) / chunks); +} + +/* initialize per-thread bit streams for parallel compression */ +static bitstream** +compress_init_par(zfp_stream* stream, const zfp_field* field, uint chunks, uint blocks) +{ + bitstream** bs; + size_t size; + int copy = 0; + uint i; + + /* determine maximum size buffer needed per thread */ + zfp_field f = *field; + switch (zfp_field_dimensionality(field)) { + case 1: + f.nx = 4 * (blocks + chunks - 1) / chunks; + break; + case 2: + f.nx = 4; + f.ny = 4 * (blocks + chunks - 1) / chunks; + break; + case 3: + f.nx = 4; + f.ny = 4; + f.nz = 4 * (blocks + chunks - 1) / chunks; + break; + case 4: + f.nx = 4; + f.ny = 4; + f.nz = 4; + f.nw = 4 * (blocks + chunks - 1) / chunks; + break; + default: + return 0; + } + size = zfp_stream_maximum_size(stream, &f); + + /* avoid copies in fixed-rate mode when each bitstream is word aligned */ + copy |= stream->minbits != stream->maxbits; + copy |= (stream->maxbits % stream_word_bits) != 0; + copy |= (stream_wtell(stream->stream) % stream_word_bits) != 0; + + /* set up buffer for each thread to compress to */ + bs = (bitstream**)malloc(chunks * sizeof(bitstream*)); + for (i = 0; i < chunks; i++) { + uint block = chunk_offset(blocks, chunks, i); + void* buffer = copy ? 
malloc(size) : (uchar*)stream_data(stream->stream) + stream_size(stream->stream) + block * stream->maxbits / CHAR_BIT; + bs[i] = stream_open(buffer, size); + } + + return bs; +} + +/* flush and concatenate bit streams if needed */ +static void +compress_finish_par(zfp_stream* stream, bitstream** src, uint chunks) +{ + bitstream* dst = zfp_stream_bit_stream(stream); + int copy = (stream_data(dst) != stream_data(*src)); + size_t offset = stream_wtell(dst); + uint i; + for (i = 0; i < chunks; i++) { + size_t bits = stream_wtell(src[i]); + offset += bits; + stream_flush(src[i]); + /* concatenate streams if they are not already contiguous */ + if (copy) { + stream_rewind(src[i]); + stream_copy(dst, src[i], bits); + free(stream_data(src[i])); + } + stream_close(src[i]); + } + free(src); + if (!copy) + stream_wseek(dst, offset); +} + +#endif diff --git a/zfp/src/template/codec.h b/zfp/src/template/codec.h new file mode 100644 index 0000000000000000000000000000000000000000..e7149a98ea5f770e6db30a2cd1db9e59e9ed02c7 --- /dev/null +++ b/zfp/src/template/codec.h @@ -0,0 +1,3 @@ +#define PERM _t1(perm, DIMS) /* coefficient order */ +#define BLOCK_SIZE (1 << (2 * DIMS)) /* values per block */ +#define EBIAS ((1 << (EBITS - 1)) - 1) /* exponent bias */ diff --git a/zfp/src/template/codec1.c b/zfp/src/template/codec1.c new file mode 100644 index 0000000000000000000000000000000000000000..5a4786471a940610d67472fd90858177b7d452d6 --- /dev/null +++ b/zfp/src/template/codec1.c @@ -0,0 +1,4 @@ +/* order coefficients by polynomial degree/frequency */ +cache_align_(static const uchar perm_1[4]) = { + 0, 1, 2, 3 +}; diff --git a/zfp/src/template/codec2.c b/zfp/src/template/codec2.c new file mode 100644 index 0000000000000000000000000000000000000000..a0f977024d6e73203ecbd14e6b605c949d799023 --- /dev/null +++ b/zfp/src/template/codec2.c @@ -0,0 +1,32 @@ +#define index(i, j) ((i) + 4 * (j)) + +/* order coefficients (i, j) by i + j, then i^2 + j^2 */ +cache_align_(static const uchar 
perm_2[16]) = { + index(0, 0), /* 0 : 0 */ + + index(1, 0), /* 1 : 1 */ + index(0, 1), /* 2 : 1 */ + + index(1, 1), /* 3 : 2 */ + + index(2, 0), /* 4 : 2 */ + index(0, 2), /* 5 : 2 */ + + index(2, 1), /* 6 : 3 */ + index(1, 2), /* 7 : 3 */ + + index(3, 0), /* 8 : 3 */ + index(0, 3), /* 9 : 3 */ + + index(2, 2), /* 10 : 4 */ + + index(3, 1), /* 11 : 4 */ + index(1, 3), /* 12 : 4 */ + + index(3, 2), /* 13 : 5 */ + index(2, 3), /* 14 : 5 */ + + index(3, 3), /* 15 : 6 */ +}; + +#undef index diff --git a/zfp/src/template/codec3.c b/zfp/src/template/codec3.c new file mode 100644 index 0000000000000000000000000000000000000000..b95f302720317b13bea8b53814646dccc42b6de2 --- /dev/null +++ b/zfp/src/template/codec3.c @@ -0,0 +1,90 @@ +#define index(i, j, k) ((i) + 4 * ((j) + 4 * (k))) + +/* order coefficients (i, j, k) by i + j + k, then i^2 + j^2 + k^2 */ +cache_align_(static const uchar perm_3[64]) = { + index(0, 0, 0), /* 0 : 0 */ + + index(1, 0, 0), /* 1 : 1 */ + index(0, 1, 0), /* 2 : 1 */ + index(0, 0, 1), /* 3 : 1 */ + + index(0, 1, 1), /* 4 : 2 */ + index(1, 0, 1), /* 5 : 2 */ + index(1, 1, 0), /* 6 : 2 */ + + index(2, 0, 0), /* 7 : 2 */ + index(0, 2, 0), /* 8 : 2 */ + index(0, 0, 2), /* 9 : 2 */ + + index(1, 1, 1), /* 10 : 3 */ + + index(2, 1, 0), /* 11 : 3 */ + index(2, 0, 1), /* 12 : 3 */ + index(0, 2, 1), /* 13 : 3 */ + index(1, 2, 0), /* 14 : 3 */ + index(1, 0, 2), /* 15 : 3 */ + index(0, 1, 2), /* 16 : 3 */ + + index(3, 0, 0), /* 17 : 3 */ + index(0, 3, 0), /* 18 : 3 */ + index(0, 0, 3), /* 19 : 3 */ + + index(2, 1, 1), /* 20 : 4 */ + index(1, 2, 1), /* 21 : 4 */ + index(1, 1, 2), /* 22 : 4 */ + + index(0, 2, 2), /* 23 : 4 */ + index(2, 0, 2), /* 24 : 4 */ + index(2, 2, 0), /* 25 : 4 */ + + index(3, 1, 0), /* 26 : 4 */ + index(3, 0, 1), /* 27 : 4 */ + index(0, 3, 1), /* 28 : 4 */ + index(1, 3, 0), /* 29 : 4 */ + index(1, 0, 3), /* 30 : 4 */ + index(0, 1, 3), /* 31 : 4 */ + + index(1, 2, 2), /* 32 : 5 */ + index(2, 1, 2), /* 33 : 5 */ + index(2, 2, 1), /* 34 : 5 
*/ + + index(3, 1, 1), /* 35 : 5 */ + index(1, 3, 1), /* 36 : 5 */ + index(1, 1, 3), /* 37 : 5 */ + + index(3, 2, 0), /* 38 : 5 */ + index(3, 0, 2), /* 39 : 5 */ + index(0, 3, 2), /* 40 : 5 */ + index(2, 3, 0), /* 41 : 5 */ + index(2, 0, 3), /* 42 : 5 */ + index(0, 2, 3), /* 43 : 5 */ + + index(2, 2, 2), /* 44 : 6 */ + + index(3, 2, 1), /* 45 : 6 */ + index(3, 1, 2), /* 46 : 6 */ + index(1, 3, 2), /* 47 : 6 */ + index(2, 3, 1), /* 48 : 6 */ + index(2, 1, 3), /* 49 : 6 */ + index(1, 2, 3), /* 50 : 6 */ + + index(0, 3, 3), /* 51 : 6 */ + index(3, 0, 3), /* 52 : 6 */ + index(3, 3, 0), /* 53 : 6 */ + + index(3, 2, 2), /* 54 : 7 */ + index(2, 3, 2), /* 55 : 7 */ + index(2, 2, 3), /* 56 : 7 */ + + index(1, 3, 3), /* 57 : 7 */ + index(3, 1, 3), /* 58 : 7 */ + index(3, 3, 1), /* 59 : 7 */ + + index(2, 3, 3), /* 60 : 8 */ + index(3, 2, 3), /* 61 : 8 */ + index(3, 3, 2), /* 62 : 8 */ + + index(3, 3, 3), /* 63 : 9 */ +}; + +#undef index diff --git a/zfp/src/template/codec4.c b/zfp/src/template/codec4.c new file mode 100644 index 0000000000000000000000000000000000000000..b8314525cd42802a3ad8463191cea96bddef9661 --- /dev/null +++ b/zfp/src/template/codec4.c @@ -0,0 +1,297 @@ +#define index(i, j, k, l) ((i) + 4 * ((j) + 4 * ((k) + 4 * (l)))) + +/* order coefficients (i, j, k, l) by i + j + k + l, then i^2 + j^2 + k^2 + l^2 */ +cache_align_(static const uchar perm_4[256]) = { + index(0, 0, 0, 0), /* 0 : 0 */ + + index(1, 0, 0, 0), /* 1 : 1 */ + index(0, 1, 0, 0), /* 2 : 1 */ + index(0, 0, 1, 0), /* 3 : 1 */ + index(0, 0, 0, 1), /* 4 : 1 */ + + index(1, 1, 0, 0), /* 5 : 2 */ + index(0, 0, 1, 1), /* 6 : 2 */ + index(1, 0, 1, 0), /* 7 : 2 */ + index(0, 1, 0, 1), /* 8 : 2 */ + index(1, 0, 0, 1), /* 9 : 2 */ + index(0, 1, 1, 0), /* 10 : 2 */ + + index(2, 0, 0, 0), /* 11 : 2 */ + index(0, 2, 0, 0), /* 12 : 2 */ + index(0, 0, 2, 0), /* 13 : 2 */ + index(0, 0, 0, 2), /* 14 : 2 */ + + index(0, 1, 1, 1), /* 15 : 3 */ + index(1, 0, 1, 1), /* 16 : 3 */ + index(1, 1, 0, 1), /* 17 : 3 */ + 
index(1, 1, 1, 0), /* 18 : 3 */ + + index(2, 1, 0, 0), /* 19 : 3 */ + index(2, 0, 1, 0), /* 20 : 3 */ + index(2, 0, 0, 1), /* 21 : 3 */ + index(0, 2, 1, 0), /* 22 : 3 */ + index(0, 2, 0, 1), /* 23 : 3 */ + index(1, 2, 0, 0), /* 24 : 3 */ + index(0, 0, 2, 1), /* 25 : 3 */ + index(1, 0, 2, 0), /* 26 : 3 */ + index(0, 1, 2, 0), /* 27 : 3 */ + index(1, 0, 0, 2), /* 28 : 3 */ + index(0, 1, 0, 2), /* 29 : 3 */ + index(0, 0, 1, 2), /* 30 : 3 */ + + index(3, 0, 0, 0), /* 31 : 3 */ + index(0, 3, 0, 0), /* 32 : 3 */ + index(0, 0, 3, 0), /* 33 : 3 */ + index(0, 0, 0, 3), /* 34 : 3 */ + + index(1, 1, 1, 1), /* 35 : 4 */ + + index(2, 0, 1, 1), /* 36 : 4 */ + index(2, 1, 0, 1), /* 37 : 4 */ + index(2, 1, 1, 0), /* 38 : 4 */ + index(1, 2, 0, 1), /* 39 : 4 */ + index(1, 2, 1, 0), /* 40 : 4 */ + index(0, 2, 1, 1), /* 41 : 4 */ + index(1, 1, 2, 0), /* 42 : 4 */ + index(0, 1, 2, 1), /* 43 : 4 */ + index(1, 0, 2, 1), /* 44 : 4 */ + index(0, 1, 1, 2), /* 45 : 4 */ + index(1, 0, 1, 2), /* 46 : 4 */ + index(1, 1, 0, 2), /* 47 : 4 */ + + index(2, 2, 0, 0), /* 48 : 4 */ + index(0, 0, 2, 2), /* 49 : 4 */ + index(2, 0, 2, 0), /* 50 : 4 */ + index(0, 2, 0, 2), /* 51 : 4 */ + index(2, 0, 0, 2), /* 52 : 4 */ + index(0, 2, 2, 0), /* 53 : 4 */ + + index(3, 1, 0, 0), /* 54 : 4 */ + index(3, 0, 1, 0), /* 55 : 4 */ + index(3, 0, 0, 1), /* 56 : 4 */ + index(0, 3, 1, 0), /* 57 : 4 */ + index(0, 3, 0, 1), /* 58 : 4 */ + index(1, 3, 0, 0), /* 59 : 4 */ + index(0, 0, 3, 1), /* 60 : 4 */ + index(1, 0, 3, 0), /* 61 : 4 */ + index(0, 1, 3, 0), /* 62 : 4 */ + index(1, 0, 0, 3), /* 63 : 4 */ + index(0, 1, 0, 3), /* 64 : 4 */ + index(0, 0, 1, 3), /* 65 : 4 */ + + index(2, 1, 1, 1), /* 66 : 5 */ + index(1, 2, 1, 1), /* 67 : 5 */ + index(1, 1, 2, 1), /* 68 : 5 */ + index(1, 1, 1, 2), /* 69 : 5 */ + + index(1, 0, 2, 2), /* 70 : 5 */ + index(1, 2, 0, 2), /* 71 : 5 */ + index(1, 2, 2, 0), /* 72 : 5 */ + index(2, 1, 0, 2), /* 73 : 5 */ + index(2, 1, 2, 0), /* 74 : 5 */ + index(0, 1, 2, 2), /* 75 : 5 */ + index(2, 2, 
1, 0), /* 76 : 5 */ + index(0, 2, 1, 2), /* 77 : 5 */ + index(2, 0, 1, 2), /* 78 : 5 */ + index(0, 2, 2, 1), /* 79 : 5 */ + index(2, 0, 2, 1), /* 80 : 5 */ + index(2, 2, 0, 1), /* 81 : 5 */ + + index(3, 0, 1, 1), /* 82 : 5 */ + index(3, 1, 0, 1), /* 83 : 5 */ + index(3, 1, 1, 0), /* 84 : 5 */ + index(1, 3, 0, 1), /* 85 : 5 */ + index(1, 3, 1, 0), /* 86 : 5 */ + index(0, 3, 1, 1), /* 87 : 5 */ + index(1, 1, 3, 0), /* 88 : 5 */ + index(0, 1, 3, 1), /* 89 : 5 */ + index(1, 0, 3, 1), /* 90 : 5 */ + index(0, 1, 1, 3), /* 91 : 5 */ + index(1, 0, 1, 3), /* 92 : 5 */ + index(1, 1, 0, 3), /* 93 : 5 */ + + index(3, 2, 0, 0), /* 94 : 5 */ + index(3, 0, 2, 0), /* 95 : 5 */ + index(3, 0, 0, 2), /* 96 : 5 */ + index(0, 3, 2, 0), /* 97 : 5 */ + index(0, 3, 0, 2), /* 98 : 5 */ + index(2, 3, 0, 0), /* 99 : 5 */ + index(0, 0, 3, 2), /* 100 : 5 */ + index(2, 0, 3, 0), /* 101 : 5 */ + index(0, 2, 3, 0), /* 102 : 5 */ + index(2, 0, 0, 3), /* 103 : 5 */ + index(0, 2, 0, 3), /* 104 : 5 */ + index(0, 0, 2, 3), /* 105 : 5 */ + + index(2, 2, 1, 1), /* 106 : 6 */ + index(1, 1, 2, 2), /* 107 : 6 */ + index(2, 1, 2, 1), /* 108 : 6 */ + index(1, 2, 1, 2), /* 109 : 6 */ + index(2, 1, 1, 2), /* 110 : 6 */ + index(1, 2, 2, 1), /* 111 : 6 */ + + index(0, 2, 2, 2), /* 112 : 6 */ + index(2, 0, 2, 2), /* 113 : 6 */ + index(2, 2, 0, 2), /* 114 : 6 */ + index(2, 2, 2, 0), /* 115 : 6 */ + + index(3, 1, 1, 1), /* 116 : 6 */ + index(1, 3, 1, 1), /* 117 : 6 */ + index(1, 1, 3, 1), /* 118 : 6 */ + index(1, 1, 1, 3), /* 119 : 6 */ + + index(3, 2, 1, 0), /* 120 : 6 */ + index(3, 2, 0, 1), /* 121 : 6 */ + index(3, 0, 2, 1), /* 122 : 6 */ + index(3, 1, 2, 0), /* 123 : 6 */ + index(3, 1, 0, 2), /* 124 : 6 */ + index(3, 0, 1, 2), /* 125 : 6 */ + index(0, 3, 2, 1), /* 126 : 6 */ + index(1, 3, 2, 0), /* 127 : 6 */ + index(1, 3, 0, 2), /* 128 : 6 */ + index(0, 3, 1, 2), /* 129 : 6 */ + index(2, 3, 1, 0), /* 130 : 6 */ + index(2, 3, 0, 1), /* 131 : 6 */ + index(1, 0, 3, 2), /* 132 : 6 */ + index(0, 1, 3, 2), /* 133 : 
6 */ + index(2, 1, 3, 0), /* 134 : 6 */ + index(2, 0, 3, 1), /* 135 : 6 */ + index(0, 2, 3, 1), /* 136 : 6 */ + index(1, 2, 3, 0), /* 137 : 6 */ + index(2, 1, 0, 3), /* 138 : 6 */ + index(2, 0, 1, 3), /* 139 : 6 */ + index(0, 2, 1, 3), /* 140 : 6 */ + index(1, 2, 0, 3), /* 141 : 6 */ + index(1, 0, 2, 3), /* 142 : 6 */ + index(0, 1, 2, 3), /* 143 : 6 */ + + index(3, 3, 0, 0), /* 144 : 6 */ + index(0, 0, 3, 3), /* 145 : 6 */ + index(3, 0, 3, 0), /* 146 : 6 */ + index(0, 3, 0, 3), /* 147 : 6 */ + index(3, 0, 0, 3), /* 148 : 6 */ + index(0, 3, 3, 0), /* 149 : 6 */ + + index(1, 2, 2, 2), /* 150 : 7 */ + index(2, 1, 2, 2), /* 151 : 7 */ + index(2, 2, 1, 2), /* 152 : 7 */ + index(2, 2, 2, 1), /* 153 : 7 */ + + index(3, 2, 1, 1), /* 154 : 7 */ + index(3, 1, 2, 1), /* 155 : 7 */ + index(3, 1, 1, 2), /* 156 : 7 */ + index(1, 3, 2, 1), /* 157 : 7 */ + index(1, 3, 1, 2), /* 158 : 7 */ + index(2, 3, 1, 1), /* 159 : 7 */ + index(1, 1, 3, 2), /* 160 : 7 */ + index(2, 1, 3, 1), /* 161 : 7 */ + index(1, 2, 3, 1), /* 162 : 7 */ + index(2, 1, 1, 3), /* 163 : 7 */ + index(1, 2, 1, 3), /* 164 : 7 */ + index(1, 1, 2, 3), /* 165 : 7 */ + + index(3, 0, 2, 2), /* 166 : 7 */ + index(3, 2, 0, 2), /* 167 : 7 */ + index(3, 2, 2, 0), /* 168 : 7 */ + index(2, 3, 0, 2), /* 169 : 7 */ + index(2, 3, 2, 0), /* 170 : 7 */ + index(0, 3, 2, 2), /* 171 : 7 */ + index(2, 2, 3, 0), /* 172 : 7 */ + index(0, 2, 3, 2), /* 173 : 7 */ + index(2, 0, 3, 2), /* 174 : 7 */ + index(0, 2, 2, 3), /* 175 : 7 */ + index(2, 0, 2, 3), /* 176 : 7 */ + index(2, 2, 0, 3), /* 177 : 7 */ + + index(1, 0, 3, 3), /* 178 : 7 */ + index(1, 3, 0, 3), /* 179 : 7 */ + index(1, 3, 3, 0), /* 180 : 7 */ + index(3, 1, 0, 3), /* 181 : 7 */ + index(3, 1, 3, 0), /* 182 : 7 */ + index(0, 1, 3, 3), /* 183 : 7 */ + index(3, 3, 1, 0), /* 184 : 7 */ + index(0, 3, 1, 3), /* 185 : 7 */ + index(3, 0, 1, 3), /* 186 : 7 */ + index(0, 3, 3, 1), /* 187 : 7 */ + index(3, 0, 3, 1), /* 188 : 7 */ + index(3, 3, 0, 1), /* 189 : 7 */ + + index(2, 2, 2, 2), 
/* 190 : 8 */ + + index(3, 1, 2, 2), /* 191 : 8 */ + index(3, 2, 1, 2), /* 192 : 8 */ + index(3, 2, 2, 1), /* 193 : 8 */ + index(2, 3, 1, 2), /* 194 : 8 */ + index(2, 3, 2, 1), /* 195 : 8 */ + index(1, 3, 2, 2), /* 196 : 8 */ + index(2, 2, 3, 1), /* 197 : 8 */ + index(1, 2, 3, 2), /* 198 : 8 */ + index(2, 1, 3, 2), /* 199 : 8 */ + index(1, 2, 2, 3), /* 200 : 8 */ + index(2, 1, 2, 3), /* 201 : 8 */ + index(2, 2, 1, 3), /* 202 : 8 */ + + index(3, 3, 1, 1), /* 203 : 8 */ + index(1, 1, 3, 3), /* 204 : 8 */ + index(3, 1, 3, 1), /* 205 : 8 */ + index(1, 3, 1, 3), /* 206 : 8 */ + index(3, 1, 1, 3), /* 207 : 8 */ + index(1, 3, 3, 1), /* 208 : 8 */ + + index(2, 0, 3, 3), /* 209 : 8 */ + index(2, 3, 0, 3), /* 210 : 8 */ + index(2, 3, 3, 0), /* 211 : 8 */ + index(3, 2, 0, 3), /* 212 : 8 */ + index(3, 2, 3, 0), /* 213 : 8 */ + index(0, 2, 3, 3), /* 214 : 8 */ + index(3, 3, 2, 0), /* 215 : 8 */ + index(0, 3, 2, 3), /* 216 : 8 */ + index(3, 0, 2, 3), /* 217 : 8 */ + index(0, 3, 3, 2), /* 218 : 8 */ + index(3, 0, 3, 2), /* 219 : 8 */ + index(3, 3, 0, 2), /* 220 : 8 */ + + index(3, 2, 2, 2), /* 221 : 9 */ + index(2, 3, 2, 2), /* 222 : 9 */ + index(2, 2, 3, 2), /* 223 : 9 */ + index(2, 2, 2, 3), /* 224 : 9 */ + + index(2, 1, 3, 3), /* 225 : 9 */ + index(2, 3, 1, 3), /* 226 : 9 */ + index(2, 3, 3, 1), /* 227 : 9 */ + index(3, 2, 1, 3), /* 228 : 9 */ + index(3, 2, 3, 1), /* 229 : 9 */ + index(1, 2, 3, 3), /* 230 : 9 */ + index(3, 3, 2, 1), /* 231 : 9 */ + index(1, 3, 2, 3), /* 232 : 9 */ + index(3, 1, 2, 3), /* 233 : 9 */ + index(1, 3, 3, 2), /* 234 : 9 */ + index(3, 1, 3, 2), /* 235 : 9 */ + index(3, 3, 1, 2), /* 236 : 9 */ + + index(0, 3, 3, 3), /* 237 : 9 */ + index(3, 0, 3, 3), /* 238 : 9 */ + index(3, 3, 0, 3), /* 239 : 9 */ + index(3, 3, 3, 0), /* 240 : 9 */ + + index(3, 3, 2, 2), /* 241 : 10 */ + index(2, 2, 3, 3), /* 242 : 10 */ + index(3, 2, 3, 2), /* 243 : 10 */ + index(2, 3, 2, 3), /* 244 : 10 */ + index(3, 2, 2, 3), /* 245 : 10 */ + index(2, 3, 3, 2), /* 246 : 10 */ + + 
index(1, 3, 3, 3), /* 247 : 10 */ + index(3, 1, 3, 3), /* 248 : 10 */ + index(3, 3, 1, 3), /* 249 : 10 */ + index(3, 3, 3, 1), /* 250 : 10 */ + + index(2, 3, 3, 3), /* 251 : 11 */ + index(3, 2, 3, 3), /* 252 : 11 */ + index(3, 3, 2, 3), /* 253 : 11 */ + index(3, 3, 3, 2), /* 254 : 11 */ + + index(3, 3, 3, 3), /* 255 : 12 */ +}; + +#undef index diff --git a/zfp/src/template/codecf.c b/zfp/src/template/codecf.c new file mode 100644 index 0000000000000000000000000000000000000000..61003cfb286d6c78afe20e6cdd108c1855f5bf45 --- /dev/null +++ b/zfp/src/template/codecf.c @@ -0,0 +1,6 @@ +/* maximum number of bit planes to encode */ +static uint +precision(int maxexp, uint maxprec, int minexp, int dims) +{ + return MIN(maxprec, (uint)MAX(0, maxexp - minexp + 2 * (dims + 1))); +} diff --git a/zfp/src/template/compress.c b/zfp/src/template/compress.c new file mode 100644 index 0000000000000000000000000000000000000000..6a4b370c85630679f48924dca9af1ca90efa1224 --- /dev/null +++ b/zfp/src/template/compress.c @@ -0,0 +1,128 @@ +/* compress 1d contiguous array */ +static void +_t2(compress, Scalar, 1)(zfp_stream* stream, const zfp_field* field) +{ + const Scalar* data = (const Scalar*)field->data; + uint nx = field->nx; + uint mx = nx & ~3u; + uint x; + + /* compress array one block of 4 values at a time */ + for (x = 0; x < mx; x += 4, data += 4) + _t2(zfp_encode_block, Scalar, 1)(stream, data); + if (x < nx) + _t2(zfp_encode_partial_block_strided, Scalar, 1)(stream, data, nx - x, 1); +} + +#if 0 +/* compress 1d strided array */ +static void +_t2(compress_strided, Scalar, 1)(zfp_stream* stream, const zfp_field* field) +{ + const Scalar* data = (const Scalar*)field->data; + uint nx = field->nx; + uint mx = nx & ~3u; + int sx = field->sx ? 
field->sx : 1; + uint x; + + /* compress array one block of 4 values at a time */ + for (x = 0; x < mx; x += 4, data += 4 * sx) + _t2(zfp_encode_block_strided, Scalar, 1)(stream, data, sx); + if (x < nx) + _t2(zfp_encode_partial_block_strided, Scalar, 1)(stream, data, nx - x, sx); +} +#else +/* compress 1d strided array */ +static void +_t2(compress_strided, Scalar, 1)(zfp_stream* stream, const zfp_field* field) +{ + const Scalar* data = field->data; + uint nx = field->nx; + int sx = field->sx ? field->sx : 1; /* zero stride means contiguous */ + uint x; + + /* compress array one block of 4 values at a time */ + for (x = 0; x < nx; x += 4) { + const Scalar* p = data + sx * (ptrdiff_t)x; + if (nx - x < 4) + _t2(zfp_encode_partial_block_strided, Scalar, 1)(stream, p, nx - x, sx); + else + _t2(zfp_encode_block_strided, Scalar, 1)(stream, p, sx); + } +} +#endif + +/* compress 2d strided array */ +static void +_t2(compress_strided, Scalar, 2)(zfp_stream* stream, const zfp_field* field) +{ + const Scalar* data = (const Scalar*)field->data; + uint nx = field->nx; + uint ny = field->ny; + int sx = field->sx ? field->sx : 1; + int sy = field->sy ? field->sy : nx; /* default y stride: one row of nx values */ + uint x, y; + + /* compress array one block of 4x4 values at a time */ + for (y = 0; y < ny; y += 4) + for (x = 0; x < nx; x += 4) { + const Scalar* p = data + sx * (ptrdiff_t)x + sy * (ptrdiff_t)y; + if (nx - x < 4 || ny - y < 4) + _t2(zfp_encode_partial_block_strided, Scalar, 2)(stream, p, MIN(nx - x, 4u), MIN(ny - y, 4u), sx, sy); + else + _t2(zfp_encode_block_strided, Scalar, 2)(stream, p, sx, sy); + } +} + +/* compress 3d strided array */ +static void +_t2(compress_strided, Scalar, 3)(zfp_stream* stream, const zfp_field* field) +{ + const Scalar* data = (const Scalar*)field->data; + uint nx = field->nx; + uint ny = field->ny; + uint nz = field->nz; + int sx = field->sx ? field->sx : 1; + int sy = field->sy ? field->sy : nx; + int sz = field->sz ?
field->sz : nx * ny; /* NOTE(review): default z stride nx * ny is evaluated in int here, unlike the 4d case below which casts to ptrdiff_t first; this can overflow int for large fields -- confirm intended array-size limits */ + uint x, y, z; + + /* compress array one block of 4x4x4 values at a time */ + for (z = 0; z < nz; z += 4) + for (y = 0; y < ny; y += 4) + for (x = 0; x < nx; x += 4) { + const Scalar* p = data + sx * (ptrdiff_t)x + sy * (ptrdiff_t)y + sz * (ptrdiff_t)z; + if (nx - x < 4 || ny - y < 4 || nz - z < 4) + _t2(zfp_encode_partial_block_strided, Scalar, 3)(stream, p, MIN(nx - x, 4u), MIN(ny - y, 4u), MIN(nz - z, 4u), sx, sy, sz); + else + _t2(zfp_encode_block_strided, Scalar, 3)(stream, p, sx, sy, sz); + } +} + +/* compress 4d strided array */ +static void +_t2(compress_strided, Scalar, 4)(zfp_stream* stream, const zfp_field* field) +{ + const Scalar* data = field->data; + uint nx = field->nx; + uint ny = field->ny; + uint nz = field->nz; + uint nw = field->nw; + int sx = field->sx ? field->sx : 1; + int sy = field->sy ? field->sy : nx; + int sz = field->sz ? field->sz : (ptrdiff_t)nx * ny; + int sw = field->sw ? field->sw : (ptrdiff_t)nx * ny * nz; /* NOTE(review): the default products are formed in ptrdiff_t but then truncated into int sz/sw on assignment -- confirm the encode API's int-stride limits cover all supported field sizes */ + uint x, y, z, w; + + /* compress array one block of 4x4x4x4 values at a time */ + for (w = 0; w < nw; w += 4) + for (z = 0; z < nz; z += 4) + for (y = 0; y < ny; y += 4) + for (x = 0; x < nx; x += 4) { + const Scalar* p = data + sx * (ptrdiff_t)x + sy * (ptrdiff_t)y + sz * (ptrdiff_t)z + sw * (ptrdiff_t)w; + if (nx - x < 4 || ny - y < 4 || nz - z < 4 || nw - w < 4) + _t2(zfp_encode_partial_block_strided, Scalar, 4)(stream, p, MIN(nx - x, 4u), MIN(ny - y, 4u), MIN(nz - z, 4u), MIN(nw - w, 4u), sx, sy, sz, sw); + else + _t2(zfp_encode_block_strided, Scalar, 4)(stream, p, sx, sy, sz, sw); + } +} diff --git a/zfp/src/template/cudacompress.c b/zfp/src/template/cudacompress.c new file mode 100644 index 0000000000000000000000000000000000000000..1d685c92f395f87742f1bd5fe4f5776b5b24a5d2 --- /dev/null +++ b/zfp/src/template/cudacompress.c @@ -0,0 +1,44 @@ +#ifdef ZFP_WITH_CUDA + +#include "../cuda_zfp/cuZFP.h" + +static void +_t2(compress_cuda, Scalar, 1)(zfp_stream* stream, const zfp_field* field) +{ + 
if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) + { + cuda_compress(stream, field); + } +} + +/* compress 1d strided array */ +static void +_t2(compress_strided_cuda, Scalar, 1)(zfp_stream* stream, const zfp_field* field) +{ + if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) + { + cuda_compress(stream, field); + } +} + +/* compress 2d strided array */ +static void +_t2(compress_strided_cuda, Scalar, 2)(zfp_stream* stream, const zfp_field* field) +{ + if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) + { + cuda_compress(stream, field); + } +} + +/* compress 3d strided array */ +static void +_t2(compress_strided_cuda, Scalar, 3)(zfp_stream* stream, const zfp_field* field) +{ + if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) + { + cuda_compress(stream, field); + } +} + +#endif diff --git a/zfp/src/template/cudadecompress.c b/zfp/src/template/cudadecompress.c new file mode 100644 index 0000000000000000000000000000000000000000..4ea4e5bf04fe0d768f53fabc31ce883ca192dd47 --- /dev/null +++ b/zfp/src/template/cudadecompress.c @@ -0,0 +1,44 @@ +#ifdef ZFP_WITH_CUDA + +#include "../cuda_zfp/cuZFP.h" + +static void +_t2(decompress_cuda, Scalar, 1)(zfp_stream* stream, zfp_field* field) +{ + if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) + { + cuda_decompress(stream, field); + } +} + +/* compress 1d strided array */ +static void +_t2(decompress_strided_cuda, Scalar, 1)(zfp_stream* stream, zfp_field* field) +{ + if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) + { + cuda_decompress(stream, field); + } +} + +/* compress 2d strided array */ +static void +_t2(decompress_strided_cuda, Scalar, 2)(zfp_stream* stream, zfp_field* field) +{ + if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) + { + cuda_decompress(stream, field); + } +} + +/* compress 3d strided array */ +static void +_t2(decompress_strided_cuda, Scalar, 3)(zfp_stream* stream, zfp_field* field) +{ + 
if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) + { + cuda_decompress(stream, field); + } +} + +#endif diff --git a/zfp/src/template/decode.c b/zfp/src/template/decode.c new file mode 100644 index 0000000000000000000000000000000000000000..e2a2f276da0099f3435264621410d0089137bf94 --- /dev/null +++ b/zfp/src/template/decode.c @@ -0,0 +1,141 @@ +#include <limits.h> + +static void _t2(inv_xform, Int, DIMS)(Int* p); + +/* private functions ------------------------------------------------------- */ + +/* inverse lifting transform of 4-vector */ +static void +_t1(inv_lift, Int)(Int* p, uint s) +{ + Int x, y, z, w; + x = *p; p += s; + y = *p; p += s; + z = *p; p += s; + w = *p; p += s; + + /* + ** non-orthogonal transform + ** ( 4 6 -4 -1) (x) + ** 1/4 * ( 4 2 4 5) (y) + ** ( 4 -2 4 -5) (z) + ** ( 4 -6 -4 1) (w) + */ /* NOTE(review): the <<= steps left-shift values that may be negative, which is formally undefined in ISO C; this relies on two's-complement wrap-around -- confirm build flags/platform assumptions */ + y += w >> 1; w -= y >> 1; + y += w; w <<= 1; w -= y; + z += x; x <<= 1; x -= z; + y += z; z <<= 1; z -= y; + w += x; x <<= 1; x -= w; + + p -= s; *p = w; + p -= s; *p = z; + p -= s; *p = y; + p -= s; *p = x; +} + +/* map negabinary unsigned integer to two's complement signed integer (inverse of the encoder's int2uint; previous comment described the forward mapping) */ +static Int +_t1(uint2int, UInt)(UInt x) +{ + return (Int)((x ^ NBMASK) - NBMASK); +} + +/* reorder unsigned coefficients and convert to signed integer */ /* perm[i] is the block-storage index of the i-th sequentially stored coefficient; n must be nonzero since the do/while body runs at least once */ +static void +_t1(inv_order, Int)(const UInt* ublock, Int* iblock, const uchar* perm, uint n) +{ + do + iblock[*perm++] = _t1(uint2int, UInt)(*ublock++); + while (--n); +} + +/* decompress sequence of size unsigned integers */ +static uint +_t1(decode_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec, UInt* restrict_ data, uint size) +{ + /* make a copy of bit stream to avoid aliasing */ + bitstream s = *stream; + uint intprec = CHAR_BIT * (uint)sizeof(UInt); + uint kmin = intprec > maxprec ?
intprec - maxprec : 0; + uint bits = maxbits; /* remaining bit budget */ + uint i, k, m, n; /* n = count of coefficients found significant so far; grows monotonically across bit planes */ + uint64 x; /* buffers bit plane #k; fits because callers use this path only when size <= 64 (see decode_block) */ + + /* initialize data array to all zeros */ + for (i = 0; i < size; i++) + data[i] = 0; + + /* decode one bit plane at a time from MSB to LSB */ + for (k = intprec, n = 0; bits && k-- > kmin;) { + /* decode first n bits of bit plane #k */ + m = MIN(n, bits); + bits -= m; + x = stream_read_bits(&s, m); + /* unary run-length decode remainder of bit plane */ /* outer loop: a 1 bit marks coefficient n as newly significant; inner loop: consume 0 bits; both charge the budget via bits-- inside the comma expression before each read */ + for (; n < size && bits && (bits--, stream_read_bit(&s)); x += (uint64)1 << n++) + for (; n < size - 1 && bits && (bits--, !stream_read_bit(&s)); n++) + ; + /* deposit bit plane from x */ + for (i = 0; x; i++, x >>= 1) + data[i] += (UInt)(x & 1u) << k; + } + + *stream = s; + return maxbits - bits; /* number of bits actually consumed */ +} + +/* decompress sequence of size > 64 unsigned integers */ +static uint +_t1(decode_many_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec, UInt* restrict_ data, uint size) +{ + /* make a copy of bit stream to avoid aliasing */ + bitstream s = *stream; + uint intprec = CHAR_BIT * (uint)sizeof(UInt); + uint kmin = intprec > maxprec ?
intprec - maxprec : 0; + uint bits = maxbits; + uint i, k, m, n; + + /* initialize data array to all zeros */ + for (i = 0; i < size; i++) + data[i] = 0; + + /* decode one bit plane at a time from MSB to LSB */ + for (k = intprec, n = 0; bits && k-- > kmin;) { + /* decode first n bits of bit plane #k */ + m = MIN(n, bits); + bits -= m; + for (i = 0; i < m; i++) + if (stream_read_bit(&s)) + data[i] += (UInt)1 << k; + /* unary run-length decode remainder of bit plane */ + for (; n < size && bits && (--bits, stream_read_bit(&s)); data[n] += (UInt)1 << k, n++) + for (; n < size - 1 && bits && (--bits, !stream_read_bit(&s)); n++) + ; + } + + *stream = s; + return maxbits - bits; +} + +/* decode block of integers */ +static uint +_t2(decode_block, Int, DIMS)(bitstream* stream, int minbits, int maxbits, int maxprec, Int* iblock) +{ + int bits; + cache_align_(UInt ublock[BLOCK_SIZE]); + /* decode integer coefficients */ + if (BLOCK_SIZE <= 64) + bits = _t1(decode_ints, UInt)(stream, maxbits, maxprec, ublock, BLOCK_SIZE); + else + bits = _t1(decode_many_ints, UInt)(stream, maxbits, maxprec, ublock, BLOCK_SIZE); + /* read at least minbits bits */ + if (bits < minbits) { + stream_skip(stream, minbits - bits); + bits = minbits; + } + /* reorder unsigned coefficients and convert to signed integer */ + _t1(inv_order, Int)(ublock, iblock, PERM, BLOCK_SIZE); + /* perform decorrelating transform */ + _t2(inv_xform, Int, DIMS)(iblock); + return bits; +} diff --git a/zfp/src/template/decode1.c b/zfp/src/template/decode1.c new file mode 100644 index 0000000000000000000000000000000000000000..68ee0793e82a64c4733d59c255255c7588e3b424 --- /dev/null +++ b/zfp/src/template/decode1.c @@ -0,0 +1,53 @@ +/* private functions ------------------------------------------------------- */ + +/* scatter 4-value block to strided array */ +static void +_t2(scatter, Scalar, 1)(const Scalar* q, Scalar* p, int sx) +{ + uint x; + for (x = 0; x < 4; x++, p += sx) + *p = *q++; +} + +/* scatter nx-value block 
to strided array */ +static void +_t2(scatter_partial, Scalar, 1)(const Scalar* q, Scalar* p, uint nx, int sx) +{ + uint x; + for (x = 0; x < nx; x++, p += sx) + *p = *q++; +} + +/* inverse decorrelating 1D transform */ +static void +_t2(inv_xform, Int, 1)(Int* p) +{ + /* transform along x */ + _t1(inv_lift, Int)(p, 1); +} + +/* public functions -------------------------------------------------------- */ + +/* decode 4-value floating-point block and store at p using stride sx */ +uint +_t2(zfp_decode_block_strided, Scalar, 1)(zfp_stream* stream, Scalar* p, int sx) +{ + /* decode contiguous block */ + cache_align_(Scalar fblock[4]); + uint bits = _t2(zfp_decode_block, Scalar, 1)(stream, fblock); + /* scatter block to strided array */ + _t2(scatter, Scalar, 1)(fblock, p, sx); + return bits; +} + +/* decode nx-value floating-point block and store at p using stride sx */ +uint +_t2(zfp_decode_partial_block_strided, Scalar, 1)(zfp_stream* stream, Scalar* p, uint nx, int sx) +{ + /* decode contiguous block */ + cache_align_(Scalar fblock[4]); + uint bits = _t2(zfp_decode_block, Scalar, 1)(stream, fblock); + /* scatter block to strided array */ + _t2(scatter_partial, Scalar, 1)(fblock, p, nx, sx); + return bits; +} diff --git a/zfp/src/template/decode2.c b/zfp/src/template/decode2.c new file mode 100644 index 0000000000000000000000000000000000000000..23e1892cb5726a1483eccd86e81206f62820ea28 --- /dev/null +++ b/zfp/src/template/decode2.c @@ -0,0 +1,60 @@ +/* private functions ------------------------------------------------------- */ + +/* scatter 4*4 block to strided array */ +static void +_t2(scatter, Scalar, 2)(const Scalar* q, Scalar* p, int sx, int sy) +{ + uint x, y; + for (y = 0; y < 4; y++, p += sy - 4 * sx) + for (x = 0; x < 4; x++, p += sx) + *p = *q++; +} + +/* scatter nx*ny block to strided array */ +static void +_t2(scatter_partial, Scalar, 2)(const Scalar* q, Scalar* p, uint nx, uint ny, int sx, int sy) +{ + uint x, y; + for (y = 0; y < ny; y++, p += sy - 
(ptrdiff_t)nx * sx, q += 4 - nx) + for (x = 0; x < nx; x++, p += sx, q++) + *p = *q; +} + +/* inverse decorrelating 2D transform */ +static void +_t2(inv_xform, Int, 2)(Int* p) +{ + uint x, y; + /* transform along y */ + for (x = 0; x < 4; x++) + _t1(inv_lift, Int)(p + 1 * x, 4); + /* transform along x */ + for (y = 0; y < 4; y++) + _t1(inv_lift, Int)(p + 4 * y, 1); +} + +/* public functions -------------------------------------------------------- */ + +/* decode 4*4 floating-point block and store at p using strides (sx, sy) */ +uint +_t2(zfp_decode_block_strided, Scalar, 2)(zfp_stream* stream, Scalar* p, int sx, int sy) +{ + /* decode contiguous block */ + cache_align_(Scalar fblock[16]); + uint bits = _t2(zfp_decode_block, Scalar, 2)(stream, fblock); + /* scatter block to strided array */ + _t2(scatter, Scalar, 2)(fblock, p, sx, sy); + return bits; +} + +/* decode nx*ny floating-point block and store at p using strides (sx, sy) */ +uint +_t2(zfp_decode_partial_block_strided, Scalar, 2)(zfp_stream* stream, Scalar* p, uint nx, uint ny, int sx, int sy) +{ + /* decode contiguous block */ + cache_align_(Scalar fblock[16]); + uint bits = _t2(zfp_decode_block, Scalar, 2)(stream, fblock); + /* scatter block to strided array */ + _t2(scatter_partial, Scalar, 2)(fblock, p, nx, ny, sx, sy); + return bits; +} diff --git a/zfp/src/template/decode3.c b/zfp/src/template/decode3.c new file mode 100644 index 0000000000000000000000000000000000000000..b48411821f1cfeb8be349d15645e6a137ec20a42 --- /dev/null +++ b/zfp/src/template/decode3.c @@ -0,0 +1,68 @@ +/* private functions ------------------------------------------------------- */ + +/* scatter 4*4*4 block to strided array */ +static void +_t2(scatter, Scalar, 3)(const Scalar* q, Scalar* p, int sx, int sy, int sz) +{ + uint x, y, z; + for (z = 0; z < 4; z++, p += sz - 4 * sy) + for (y = 0; y < 4; y++, p += sy - 4 * sx) + for (x = 0; x < 4; x++, p += sx) + *p = *q++; +} + +/* scatter nx*ny*nz block to strided array */ +static 
void +_t2(scatter_partial, Scalar, 3)(const Scalar* q, Scalar* p, uint nx, uint ny, uint nz, int sx, int sy, int sz) +{ + uint x, y, z; + for (z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 4 * (4 - ny)) + for (y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 1 * (4 - nx)) + for (x = 0; x < nx; x++, p += sx, q++) + *p = *q; +} + +/* inverse decorrelating 3D transform */ +static void +_t2(inv_xform, Int, 3)(Int* p) +{ + uint x, y, z; + /* transform along z */ + for (y = 0; y < 4; y++) + for (x = 0; x < 4; x++) + _t1(inv_lift, Int)(p + 1 * x + 4 * y, 16); + /* transform along y */ + for (x = 0; x < 4; x++) + for (z = 0; z < 4; z++) + _t1(inv_lift, Int)(p + 16 * z + 1 * x, 4); + /* transform along x */ + for (z = 0; z < 4; z++) + for (y = 0; y < 4; y++) + _t1(inv_lift, Int)(p + 4 * y + 16 * z, 1); +} + +/* public functions -------------------------------------------------------- */ + +/* decode 4*4*4 floating-point block and store at p using strides (sx, sy, sz) */ +uint +_t2(zfp_decode_block_strided, Scalar, 3)(zfp_stream* stream, Scalar* p, int sx, int sy, int sz) +{ + /* decode contiguous block */ + cache_align_(Scalar fblock[64]); + uint bits = _t2(zfp_decode_block, Scalar, 3)(stream, fblock); + /* scatter block to strided array */ + _t2(scatter, Scalar, 3)(fblock, p, sx, sy, sz); + return bits; +} + +/* decode nx*ny*nz floating-point block and store at p using strides (sx, sy, sz) */ +uint +_t2(zfp_decode_partial_block_strided, Scalar, 3)(zfp_stream* stream, Scalar* p, uint nx, uint ny, uint nz, int sx, int sy, int sz) +{ + /* decode contiguous block */ + cache_align_(Scalar fblock[64]); + uint bits = _t2(zfp_decode_block, Scalar, 3)(stream, fblock); + /* scatter block to strided array */ + _t2(scatter_partial, Scalar, 3)(fblock, p, nx, ny, nz, sx, sy, sz); + return bits; +} diff --git a/zfp/src/template/decode4.c b/zfp/src/template/decode4.c new file mode 100644 index 
0000000000000000000000000000000000000000..8d34abfce25c99ba326da7bfbb8ba68a59830f10 --- /dev/null +++ b/zfp/src/template/decode4.c @@ -0,0 +1,78 @@ +/* private functions ------------------------------------------------------- */ + +/* scatter 4*4*4*4 block to strided array */ +static void +_t2(scatter, Scalar, 4)(const Scalar* q, Scalar* p, int sx, int sy, int sz, int sw) +{ + uint x, y, z, w; + for (w = 0; w < 4; w++, p += sw - 4 * sz) + for (z = 0; z < 4; z++, p += sz - 4 * sy) + for (y = 0; y < 4; y++, p += sy - 4 * sx) + for (x = 0; x < 4; x++, p += sx) + *p = *q++; +} + +/* scatter nx*ny*nz*nw block to strided array */ +static void +_t2(scatter_partial, Scalar, 4)(const Scalar* q, Scalar* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw) +{ + uint x, y, z, w; + for (w = 0; w < nw; w++, p += sw - (ptrdiff_t)nz * sz, q += 16 * (4 - nz)) + for (z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 4 * (4 - ny)) + for (y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 1 * (4 - nx)) + for (x = 0; x < nx; x++, p += sx, q++) + *p = *q; +} + +/* inverse decorrelating 4D transform */ +static void +_t2(inv_xform, Int, 4)(Int* p) +{ + uint x, y, z, w; + /* transform along w */ + for (z = 0; z < 4; z++) + for (y = 0; y < 4; y++) + for (x = 0; x < 4; x++) + _t1(inv_lift, Int)(p + 1 * x + 4 * y + 16 * z, 64); + /* transform along z */ + for (y = 0; y < 4; y++) + for (x = 0; x < 4; x++) + for (w = 0; w < 4; w++) + _t1(inv_lift, Int)(p + 64 * w + 1 * x + 4 * y, 16); + /* transform along y */ + for (x = 0; x < 4; x++) + for (w = 0; w < 4; w++) + for (z = 0; z < 4; z++) + _t1(inv_lift, Int)(p + 16 * z + 64 * w + 1 * x, 4); + /* transform along x */ + for (w = 0; w < 4; w++) + for (z = 0; z < 4; z++) + for (y = 0; y < 4; y++) + _t1(inv_lift, Int)(p + 4 * y + 16 * z + 64 * w, 1); +} + +/* public functions -------------------------------------------------------- */ + +/* decode 4*4*4*4 floating-point block and store at p using strides (sx, sy, sz, sw) */ 
+uint +_t2(zfp_decode_block_strided, Scalar, 4)(zfp_stream* stream, Scalar* p, int sx, int sy, int sz, int sw) +{ + /* decode contiguous block */ + cache_align_(Scalar fblock[256]); + uint bits = _t2(zfp_decode_block, Scalar, 4)(stream, fblock); + /* scatter block to strided array */ + _t2(scatter, Scalar, 4)(fblock, p, sx, sy, sz, sw); + return bits; +} + +/* decode nx*ny*nz*nw floating-point block and store at p using strides (sx, sy, sz, sw) */ +uint +_t2(zfp_decode_partial_block_strided, Scalar, 4)(zfp_stream* stream, Scalar* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw) +{ + /* decode contiguous block */ + cache_align_(Scalar fblock[256]); + uint bits = _t2(zfp_decode_block, Scalar, 4)(stream, fblock); + /* scatter block to strided array */ + _t2(scatter_partial, Scalar, 4)(fblock, p, nx, ny, nz, nw, sx, sy, sz, sw); + return bits; +} diff --git a/zfp/src/template/decodef.c b/zfp/src/template/decodef.c new file mode 100644 index 0000000000000000000000000000000000000000..b6abec939201fc0d94f79f77957519ec53e35ced --- /dev/null +++ b/zfp/src/template/decodef.c @@ -0,0 +1,56 @@ +#include <limits.h> +#include <math.h> + +/* private functions ------------------------------------------------------- */ + +/* map integer x relative to exponent e to floating-point number */ +static Scalar +_t1(dequantize, Scalar)(Int x, int e) +{ + return LDEXP((Scalar)x, e - (CHAR_BIT * (int)sizeof(Scalar) - 2)); +} + +/* inverse block-floating-point transform from signed integers */ +static void +_t1(inv_cast, Scalar)(const Int* iblock, Scalar* fblock, uint n, int emax) +{ + /* compute power-of-two scale factor s */ + Scalar s = _t1(dequantize, Scalar)(1, emax); + /* compute p-bit float x = s*y where |y| <= 2^(p-2) - 1 */ + do + *fblock++ = (Scalar)(s * *iblock++); + while (--n); +} + +/* public functions -------------------------------------------------------- */ + +/* decode contiguous floating-point block */ +uint +_t2(zfp_decode_block, Scalar, 
DIMS)(zfp_stream* zfp, Scalar* fblock) +{ + /* test if block has nonzero values */ + if (stream_read_bit(zfp->stream)) { + cache_align_(Int iblock[BLOCK_SIZE]); + /* decode common exponent */ + uint ebits = EBITS + 1; + int emax = (int)stream_read_bits(zfp->stream, ebits - 1) - EBIAS; + int maxprec = precision(emax, zfp->maxprec, zfp->minexp, DIMS); + /* decode integer block */ + uint bits = _t2(decode_block, Int, DIMS)(zfp->stream, zfp->minbits - ebits, zfp->maxbits - ebits, maxprec, iblock); + /* perform inverse block-floating-point transform */ + _t1(inv_cast, Scalar)(iblock, fblock, BLOCK_SIZE, emax); + return ebits + bits; + } + else { + /* set all values to zero */ + uint i; + for (i = 0; i < BLOCK_SIZE; i++) + *fblock++ = 0; + if (zfp->minbits > 1) { + stream_skip(zfp->stream, zfp->minbits - 1); + return zfp->minbits; + } + else + return 1; + } +} diff --git a/zfp/src/template/decodei.c b/zfp/src/template/decodei.c new file mode 100644 index 0000000000000000000000000000000000000000..b2fb4f440f93abfe65609df81e1a1f9d7da203e0 --- /dev/null +++ b/zfp/src/template/decodei.c @@ -0,0 +1,8 @@ +/* public functions -------------------------------------------------------- */ + +/* decode contiguous integer block */ +uint +_t2(zfp_decode_block, Int, DIMS)(zfp_stream* zfp, Int* iblock) +{ + return _t2(decode_block, Int, DIMS)(zfp->stream, zfp->minbits, zfp->maxbits, zfp->maxprec, iblock); +} diff --git a/zfp/src/template/decompress.c b/zfp/src/template/decompress.c new file mode 100644 index 0000000000000000000000000000000000000000..db7bb512fc4c29e1d20eb6849ca87a6ca2f6ffae --- /dev/null +++ b/zfp/src/template/decompress.c @@ -0,0 +1,128 @@ +/* decompress 1d contiguous array */ +static void +_t2(decompress, Scalar, 1)(zfp_stream* stream, zfp_field* field) +{ + Scalar* data = (Scalar*)field->data; + uint nx = field->nx; + uint mx = nx & ~3u; + uint x; + + /* decompress array one block of 4 values at a time */ + for (x = 0; x < mx; x += 4, data += 4) + _t2(zfp_decode_block, 
Scalar, 1)(stream, data); + if (x < nx) + _t2(zfp_decode_partial_block_strided, Scalar, 1)(stream, data, nx - x, 1); +} + +#if 0 +/* decompress 1d strided array */ +static void +_t2(decompress_strided, Scalar, 1)(zfp_stream* stream, zfp_field* field) +{ + Scalar* data = (Scalar*)field->data; + uint nx = field->nx; + uint mx = nx & ~3u; + int sx = field->sx ? field->sx : 1; + uint x; + + /* decompress array one block of 4 values at a time */ + for (x = 0; x < mx; x += 4, data += 4 * sx) + _t2(zfp_decode_block_strided, Scalar, 1)(stream, data, sx); + if (x < nx) + _t2(zfp_decode_partial_block_strided, Scalar, 1)(stream, data, nx - x, sx); +} +#else +/* decompress 1d strided array */ +static void +_t2(decompress_strided, Scalar, 1)(zfp_stream* stream, zfp_field* field) +{ + Scalar* data = field->data; + uint nx = field->nx; + int sx = field->sx ? field->sx : 1; + uint x; + + /* decompress array one block of 4 values at a time */ + for (x = 0; x < nx; x += 4) { + Scalar* p = data + sx * (ptrdiff_t)x; + if (nx - x < 4) + _t2(zfp_decode_partial_block_strided, Scalar, 1)(stream, p, nx - x, sx); + else + _t2(zfp_decode_block_strided, Scalar, 1)(stream, p, sx); + } +} +#endif + +/* decompress 2d strided array */ +static void +_t2(decompress_strided, Scalar, 2)(zfp_stream* stream, zfp_field* field) +{ + Scalar* data = (Scalar*)field->data; + uint nx = field->nx; + uint ny = field->ny; + int sx = field->sx ? field->sx : 1; + int sy = field->sy ? 
field->sy : nx; + uint x, y; + + /* decompress array one block of 4x4 values at a time */ + for (y = 0; y < ny; y += 4) + for (x = 0; x < nx; x += 4) { + Scalar* p = data + sx * (ptrdiff_t)x + sy * (ptrdiff_t)y; + if (nx - x < 4 || ny - y < 4) + _t2(zfp_decode_partial_block_strided, Scalar, 2)(stream, p, MIN(nx - x, 4u), MIN(ny - y, 4u), sx, sy); + else + _t2(zfp_decode_block_strided, Scalar, 2)(stream, p, sx, sy); + } +} + +/* decompress 3d strided array */ +static void +_t2(decompress_strided, Scalar, 3)(zfp_stream* stream, zfp_field* field) +{ + Scalar* data = (Scalar*)field->data; + uint nx = field->nx; + uint ny = field->ny; + uint nz = field->nz; + int sx = field->sx ? field->sx : 1; + int sy = field->sy ? field->sy : nx; + int sz = field->sz ? field->sz : nx * ny; + uint x, y, z; + + /* decompress array one block of 4x4x4 values at a time */ + for (z = 0; z < nz; z += 4) + for (y = 0; y < ny; y += 4) + for (x = 0; x < nx; x += 4) { + Scalar* p = data + sx * (ptrdiff_t)x + sy * (ptrdiff_t)y + sz * (ptrdiff_t)z; + if (nx - x < 4 || ny - y < 4 || nz - z < 4) + _t2(zfp_decode_partial_block_strided, Scalar, 3)(stream, p, MIN(nx - x, 4u), MIN(ny - y, 4u), MIN(nz - z, 4u), sx, sy, sz); + else + _t2(zfp_decode_block_strided, Scalar, 3)(stream, p, sx, sy, sz); + } +} + +/* decompress 4d strided array */ +static void +_t2(decompress_strided, Scalar, 4)(zfp_stream* stream, zfp_field* field) +{ + Scalar* data = field->data; + uint nx = field->nx; + uint ny = field->ny; + uint nz = field->nz; + uint nw = field->nw; + int sx = field->sx ? field->sx : 1; + int sy = field->sy ? field->sy : nx; + int sz = field->sz ? field->sz : (ptrdiff_t)nx * ny; + int sw = field->sw ? 
field->sw : (ptrdiff_t)nx * ny * nz; + uint x, y, z, w; + + /* decompress array one block of 4x4x4x4 values at a time */ + for (w = 0; w < nw; w += 4) + for (z = 0; z < nz; z += 4) + for (y = 0; y < ny; y += 4) + for (x = 0; x < nx; x += 4) { + Scalar* p = data + sx * (ptrdiff_t)x + sy * (ptrdiff_t)y + sz * (ptrdiff_t)z + sw * (ptrdiff_t)w; + if (nx - x < 4 || ny - y < 4 || nz - z < 4 || nw - w < 4) + _t2(zfp_decode_partial_block_strided, Scalar, 4)(stream, p, MIN(nx - x, 4u), MIN(ny - y, 4u), MIN(nz - z, 4u), MIN(nw - w, 4u), sx, sy, sz, sw); + else + _t2(zfp_decode_block_strided, Scalar, 4)(stream, p, sx, sy, sz, sw); + } +} diff --git a/zfp/src/template/encode.c b/zfp/src/template/encode.c new file mode 100644 index 0000000000000000000000000000000000000000..bba18f60dc6b644ae0f48f3a6ca229af0150e30e --- /dev/null +++ b/zfp/src/template/encode.c @@ -0,0 +1,159 @@ +#include <limits.h> + +static void _t2(fwd_xform, Int, DIMS)(Int* p); + +/* private functions ------------------------------------------------------- */ + +/* pad partial block of width n <= 4 and stride s */ +static void +_t1(pad_block, Scalar)(Scalar* p, uint n, uint s) +{ + switch (n) { + case 0: + p[0 * s] = 0; + /* FALLTHROUGH */ + case 1: + p[1 * s] = p[0 * s]; + /* FALLTHROUGH */ + case 2: + p[2 * s] = p[1 * s]; + /* FALLTHROUGH */ + case 3: + p[3 * s] = p[0 * s]; + /* FALLTHROUGH */ + default: + break; + } +} + +/* forward lifting transform of 4-vector */ +static void +_t1(fwd_lift, Int)(Int* p, uint s) +{ + Int x, y, z, w; + x = *p; p += s; + y = *p; p += s; + z = *p; p += s; + w = *p; p += s; + + /* + ** non-orthogonal transform + ** ( 4 4 4 4) (x) + ** 1/16 * ( 5 1 -1 -5) (y) + ** (-4 4 4 -4) (z) + ** (-2 6 -6 2) (w) + */ + x += w; x >>= 1; w -= x; + z += y; z >>= 1; y -= z; + x += z; x >>= 1; z -= x; + w += y; w >>= 1; y -= w; + w += y >> 1; y -= w >> 1; + + p -= s; *p = w; + p -= s; *p = z; + p -= s; *p = y; + p -= s; *p = x; +} + +/* map two's complement signed integer to negabinary unsigned 
integer */ +static UInt +_t1(int2uint, Int)(Int x) +{ + return ((UInt)x + NBMASK) ^ NBMASK; +} + +/* reorder signed coefficients and convert to unsigned integer */ +static void +_t1(fwd_order, Int)(UInt* ublock, const Int* iblock, const uchar* perm, uint n) +{ + do + *ublock++ = _t1(int2uint, Int)(iblock[*perm++]); + while (--n); +} + +/* compress sequence of size unsigned integers */ +static uint +_t1(encode_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec, const UInt* restrict_ data, uint size) +{ + /* make a copy of bit stream to avoid aliasing */ + bitstream s = *stream; + uint intprec = CHAR_BIT * (uint)sizeof(UInt); + uint kmin = intprec > maxprec ? intprec - maxprec : 0; + uint bits = maxbits; + uint i, k, m, n; + uint64 x; + + /* encode one bit plane at a time from MSB to LSB */ + for (k = intprec, n = 0; bits && k-- > kmin;) { + /* step 1: extract bit plane #k to x */ + x = 0; + for (i = 0; i < size; i++) + x += (uint64)((data[i] >> k) & 1u) << i; + /* step 2: encode first n bits of bit plane */ + m = MIN(n, bits); + bits -= m; + x = stream_write_bits(&s, x, m); + /* step 3: unary run-length encode remainder of bit plane */ + for (; n < size && bits && (bits--, stream_write_bit(&s, !!x)); x >>= 1, n++) + for (; n < size - 1 && bits && (bits--, !stream_write_bit(&s, x & 1u)); x >>= 1, n++) + ; + } + + *stream = s; + return maxbits - bits; +} + +/* compress sequence of size > 64 unsigned integers */ +static uint +_t1(encode_many_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec, const UInt* restrict_ data, uint size) +{ + /* make a copy of bit stream to avoid aliasing */ + bitstream s = *stream; + uint intprec = CHAR_BIT * (uint)sizeof(UInt); + uint kmin = intprec > maxprec ? 
intprec - maxprec : 0; + uint bits = maxbits; + uint i, k, m, n, c; + + /* encode one bit plane at a time from MSB to LSB */ + for (k = intprec, n = 0; bits && k-- > kmin;) { + /* step 1: encode first n bits of bit plane #k */ + m = MIN(n, bits); + bits -= m; + for (i = 0; i < m; i++) + stream_write_bit(&s, (data[i] >> k) & 1u); + /* step 2: count remaining one-bits in bit plane */ + c = 0; + for (i = m; i < size; i++) + c += (data[i] >> k) & 1u; + /* step 3: unary run-length encode remainder of bit plane */ + for (; n < size && bits && (--bits, stream_write_bit(&s, !!c)); c--, n++) + for (; n < size - 1 && bits && (--bits, !stream_write_bit(&s, (data[n] >> k) & 1u)); n++) + ; + } + + *stream = s; + return maxbits - bits; +} + +/* encode block of integers */ +static uint +_t2(encode_block, Int, DIMS)(bitstream* stream, int minbits, int maxbits, int maxprec, Int* iblock) +{ + int bits; + cache_align_(UInt ublock[BLOCK_SIZE]); + /* perform decorrelating transform */ + _t2(fwd_xform, Int, DIMS)(iblock); + /* reorder signed coefficients and convert to unsigned integer */ + _t1(fwd_order, Int)(ublock, iblock, PERM, BLOCK_SIZE); + /* encode integer coefficients */ + if (BLOCK_SIZE <= 64) + bits = _t1(encode_ints, UInt)(stream, maxbits, maxprec, ublock, BLOCK_SIZE); + else + bits = _t1(encode_many_ints, UInt)(stream, maxbits, maxprec, ublock, BLOCK_SIZE); + /* write at least minbits bits by padding with zeros */ + if (bits < minbits) { + stream_pad(stream, minbits - bits); + bits = minbits; + } + return bits; +} diff --git a/zfp/src/template/encode1.c b/zfp/src/template/encode1.c new file mode 100644 index 0000000000000000000000000000000000000000..c61849299a4ffbcde0c99ee7b5c90546cfcc3c9c --- /dev/null +++ b/zfp/src/template/encode1.c @@ -0,0 +1,52 @@ +/* private functions ------------------------------------------------------- */ + +/* gather 4-value block from strided array */ +static void +_t2(gather, Scalar, 1)(Scalar* q, const Scalar* p, int sx) +{ + uint x; + for (x 
= 0; x < 4; x++, p += sx) + *q++ = *p; +} + +/* gather nx-value block from strided array */ +static void +_t2(gather_partial, Scalar, 1)(Scalar* q, const Scalar* p, uint nx, int sx) +{ + uint x; + for (x = 0; x < nx; x++, p += sx) + q[x] = *p; + _t1(pad_block, Scalar)(q, nx, 1); +} + +/* forward decorrelating 1D transform */ +static void +_t2(fwd_xform, Int, 1)(Int* p) +{ + /* transform along x */ + _t1(fwd_lift, Int)(p, 1); +} + +/* public functions -------------------------------------------------------- */ + +/* encode 4-value floating-point block stored at p using stride sx */ +uint +_t2(zfp_encode_block_strided, Scalar, 1)(zfp_stream* stream, const Scalar* p, int sx) +{ + /* gather block from strided array */ + cache_align_(Scalar fblock[4]); + _t2(gather, Scalar, 1)(fblock, p, sx); + /* encode floating-point block */ + return _t2(zfp_encode_block, Scalar, 1)(stream, fblock); +} + +/* encode nx-value floating-point block stored at p using stride sx */ +uint +_t2(zfp_encode_partial_block_strided, Scalar, 1)(zfp_stream* stream, const Scalar* p, uint nx, int sx) +{ + /* gather block from strided array */ + cache_align_(Scalar fblock[4]); + _t2(gather_partial, Scalar, 1)(fblock, p, nx, sx); + /* encode floating-point block */ + return _t2(zfp_encode_block, Scalar, 1)(stream, fblock); +} diff --git a/zfp/src/template/encode2.c b/zfp/src/template/encode2.c new file mode 100644 index 0000000000000000000000000000000000000000..4bec256a630c54e700c43e909bd6e479dbf5baf4 --- /dev/null +++ b/zfp/src/template/encode2.c @@ -0,0 +1,62 @@ +/* private functions ------------------------------------------------------- */ + +/* gather 4*4 block from strided array */ +static void +_t2(gather, Scalar, 2)(Scalar* q, const Scalar* p, int sx, int sy) +{ + uint x, y; + for (y = 0; y < 4; y++, p += sy - 4 * sx) + for (x = 0; x < 4; x++, p += sx) + *q++ = *p; +} + +/* gather nx*ny block from strided array */ +static void +_t2(gather_partial, Scalar, 2)(Scalar* q, const Scalar* p, uint nx, 
uint ny, int sx, int sy) +{ + uint x, y; + for (y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx) { + for (x = 0; x < nx; x++, p += sx) + q[4 * y + x] = *p; + _t1(pad_block, Scalar)(q + 4 * y, nx, 1); + } + for (x = 0; x < 4; x++) + _t1(pad_block, Scalar)(q + x, ny, 4); +} + +/* forward decorrelating 2D transform */ +static void +_t2(fwd_xform, Int, 2)(Int* p) +{ + uint x, y; + /* transform along x */ + for (y = 0; y < 4; y++) + _t1(fwd_lift, Int)(p + 4 * y, 1); + /* transform along y */ + for (x = 0; x < 4; x++) + _t1(fwd_lift, Int)(p + 1 * x, 4); +} + +/* public functions -------------------------------------------------------- */ + +/* encode 4*4 floating-point block stored at p using strides (sx, sy) */ +uint +_t2(zfp_encode_block_strided, Scalar, 2)(zfp_stream* stream, const Scalar* p, int sx, int sy) +{ + /* gather block from strided array */ + cache_align_(Scalar fblock[16]); + _t2(gather, Scalar, 2)(fblock, p, sx, sy); + /* encode floating-point block */ + return _t2(zfp_encode_block, Scalar, 2)(stream, fblock); +} + +/* encode nx*ny floating-point block stored at p using strides (sx, sy) */ +uint +_t2(zfp_encode_partial_block_strided, Scalar, 2)(zfp_stream* stream, const Scalar* p, uint nx, uint ny, int sx, int sy) +{ + /* gather block from strided array */ + cache_align_(Scalar fblock[16]); + _t2(gather_partial, Scalar, 2)(fblock, p, nx, ny, sx, sy); + /* encode floating-point block */ + return _t2(zfp_encode_block, Scalar, 2)(stream, fblock); +} diff --git a/zfp/src/template/encode3.c b/zfp/src/template/encode3.c new file mode 100644 index 0000000000000000000000000000000000000000..a16a8add97dd5f3ecd3036982f384f68d9b4dfb3 --- /dev/null +++ b/zfp/src/template/encode3.c @@ -0,0 +1,74 @@ +/* private functions ------------------------------------------------------- */ + +/* gather 4*4*4 block from strided array */ +static void +_t2(gather, Scalar, 3)(Scalar* q, const Scalar* p, int sx, int sy, int sz) +{ + uint x, y, z; + for (z = 0; z < 4; z++, p += sz - 4 * 
sy) + for (y = 0; y < 4; y++, p += sy - 4 * sx) + for (x = 0; x < 4; x++, p += sx) + *q++ = *p; +} + +/* gather nx*ny*nz block from strided array */ +static void +_t2(gather_partial, Scalar, 3)(Scalar* q, const Scalar* p, uint nx, uint ny, uint nz, int sx, int sy, int sz) +{ + uint x, y, z; + for (z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy) { + for (y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx) { + for (x = 0; x < nx; x++, p += sx) + q[16 * z + 4 * y + x] = *p; + _t1(pad_block, Scalar)(q + 16 * z + 4 * y, nx, 1); + } + for (x = 0; x < 4; x++) + _t1(pad_block, Scalar)(q + 16 * z + x, ny, 4); + } + for (y = 0; y < 4; y++) + for (x = 0; x < 4; x++) + _t1(pad_block, Scalar)(q + 4 * y + x, nz, 16); +} + +/* forward decorrelating 3D transform */ +static void +_t2(fwd_xform, Int, 3)(Int* p) +{ + uint x, y, z; + /* transform along x */ + for (z = 0; z < 4; z++) + for (y = 0; y < 4; y++) + _t1(fwd_lift, Int)(p + 4 * y + 16 * z, 1); + /* transform along y */ + for (x = 0; x < 4; x++) + for (z = 0; z < 4; z++) + _t1(fwd_lift, Int)(p + 16 * z + 1 * x, 4); + /* transform along z */ + for (y = 0; y < 4; y++) + for (x = 0; x < 4; x++) + _t1(fwd_lift, Int)(p + 1 * x + 4 * y, 16); +} + +/* public functions -------------------------------------------------------- */ + +/* encode 4*4*4 floating-point block stored at p using strides (sx, sy, sz) */ +uint +_t2(zfp_encode_block_strided, Scalar, 3)(zfp_stream* stream, const Scalar* p, int sx, int sy, int sz) +{ + /* gather block from strided array */ + cache_align_(Scalar fblock[64]); + _t2(gather, Scalar, 3)(fblock, p, sx, sy, sz); + /* encode floating-point block */ + return _t2(zfp_encode_block, Scalar, 3)(stream, fblock); +} + +/* encode nx*ny*nz floating-point block stored at p using strides (sx, sy, sz) */ +uint +_t2(zfp_encode_partial_block_strided, Scalar, 3)(zfp_stream* stream, const Scalar* p, uint nx, uint ny, uint nz, int sx, int sy, int sz) +{ + /* gather block from strided array */ + cache_align_(Scalar 
fblock[64]); + _t2(gather_partial, Scalar, 3)(fblock, p, nx, ny, nz, sx, sy, sz); + /* encode floating-point block */ + return _t2(zfp_encode_block, Scalar, 3)(stream, fblock); +} diff --git a/zfp/src/template/encode4.c b/zfp/src/template/encode4.c new file mode 100644 index 0000000000000000000000000000000000000000..c9ed5425a3b1a8b3cdfd13676904983bebb55427 --- /dev/null +++ b/zfp/src/template/encode4.c @@ -0,0 +1,89 @@ +/* private functions ------------------------------------------------------- */ + +/* gather 4*4*4*4 block from strided array */ +static void +_t2(gather, Scalar, 4)(Scalar* q, const Scalar* p, int sx, int sy, int sz, int sw) +{ + uint x, y, z, w; + for (w = 0; w < 4; w++, p += sw - 4 * sz) + for (z = 0; z < 4; z++, p += sz - 4 * sy) + for (y = 0; y < 4; y++, p += sy - 4 * sx) + for (x = 0; x < 4; x++, p += sx) + *q++ = *p; +} + +/* gather nx*ny*nz*nw block from strided array */ +static void +_t2(gather_partial, Scalar, 4)(Scalar* q, const Scalar* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw) +{ + uint x, y, z, w; + for (w = 0; w < nw; w++, p += sw - (ptrdiff_t)nz * sz) { + for (z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy) { + for (y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx) { + for (x = 0; x < nx; x++, p += sx) + q[64 * w + 16 * z + 4 * y + x] = *p; + _t1(pad_block, Scalar)(q + 64 * w + 16 * z + 4 * y, nx, 1); + } + for (x = 0; x < 4; x++) + _t1(pad_block, Scalar)(q + 64 * w + 16 * z + x, ny, 4); + } + for (y = 0; y < 4; y++) + for (x = 0; x < 4; x++) + _t1(pad_block, Scalar)(q + 64 * w + 4 * y + x, nz, 16); + } + for (z = 0; z < 4; z++) + for (y = 0; y < 4; y++) + for (x = 0; x < 4; x++) + _t1(pad_block, Scalar)(q + 16 * z + 4 * y + x, nw, 64); +} + +/* forward decorrelating 4D transform */ +static void +_t2(fwd_xform, Int, 4)(Int* p) +{ + uint x, y, z, w; + /* transform along x */ + for (w = 0; w < 4; w++) + for (z = 0; z < 4; z++) + for (y = 0; y < 4; y++) + _t1(fwd_lift, Int)(p + 4 * y + 16 * z + 64 * w, 1); 
+ /* transform along y */ + for (x = 0; x < 4; x++) + for (w = 0; w < 4; w++) + for (z = 0; z < 4; z++) + _t1(fwd_lift, Int)(p + 16 * z + 64 * w + 1 * x, 4); + /* transform along z */ + for (y = 0; y < 4; y++) + for (x = 0; x < 4; x++) + for (w = 0; w < 4; w++) + _t1(fwd_lift, Int)(p + 64 * w + 1 * x + 4 * y, 16); + /* transform along w */ + for (z = 0; z < 4; z++) + for (y = 0; y < 4; y++) + for (x = 0; x < 4; x++) + _t1(fwd_lift, Int)(p + 1 * x + 4 * y + 16 * z, 64); +} + +/* public functions -------------------------------------------------------- */ + +/* encode 4*4*4*4 floating-point block stored at p using strides (sx, sy, sz, sw) */ +uint +_t2(zfp_encode_block_strided, Scalar, 4)(zfp_stream* stream, const Scalar* p, int sx, int sy, int sz, int sw) +{ + /* gather block from strided array */ + cache_align_(Scalar fblock[256]); + _t2(gather, Scalar, 4)(fblock, p, sx, sy, sz, sw); + /* encode floating-point block */ + return _t2(zfp_encode_block, Scalar, 4)(stream, fblock); +} + +/* encode nx*ny*nz*nw floating-point block stored at p using strides (sx, sy, sz, sw) */ +uint +_t2(zfp_encode_partial_block_strided, Scalar, 4)(zfp_stream* stream, const Scalar* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw) +{ + /* gather block from strided array */ + cache_align_(Scalar fblock[256]); + _t2(gather_partial, Scalar, 4)(fblock, p, nx, ny, nz, nw, sx, sy, sz, sw); + /* encode floating-point block */ + return _t2(zfp_encode_block, Scalar, 4)(stream, fblock); +} diff --git a/zfp/src/template/encodef.c b/zfp/src/template/encodef.c new file mode 100644 index 0000000000000000000000000000000000000000..874597a779a1edf014d4e810e1917b76a027d024 --- /dev/null +++ b/zfp/src/template/encodef.c @@ -0,0 +1,82 @@ +#include <limits.h> +#include <math.h> + +/* private functions ------------------------------------------------------- */ + +/* return normalized floating-point exponent for x >= 0 */ +static int +_t1(exponent, Scalar)(Scalar x) +{ + if (x > 0) { + int 
e; + FREXP(x, &e); + /* clamp exponent in case x is denormal */ + return MAX(e, 1 - EBIAS); + } + return -EBIAS; +} + +/* compute maximum floating-point exponent in block of n values */ +static int +_t1(exponent_block, Scalar)(const Scalar* p, uint n) +{ + Scalar max = 0; + do { + Scalar f = FABS(*p++); + if (max < f) + max = f; + } while (--n); + return _t1(exponent, Scalar)(max); +} + +/* map floating-point number x to integer relative to exponent e */ +static Scalar +_t1(quantize, Scalar)(Scalar x, int e) +{ + return LDEXP(x, (CHAR_BIT * (int)sizeof(Scalar) - 2) - e); +} + +/* forward block-floating-point transform to signed integers */ +static void +_t1(fwd_cast, Scalar)(Int* iblock, const Scalar* fblock, uint n, int emax) +{ + /* compute power-of-two scale factor s */ + Scalar s = _t1(quantize, Scalar)(1, emax); + /* compute p-bit int y = s*x where x is floating and |y| <= 2^(p-2) - 1 */ + do + *iblock++ = (Int)(s * *fblock++); + while (--n); +} + +/* public functions -------------------------------------------------------- */ + +/* encode contiguous floating-point block */ +uint +_t2(zfp_encode_block, Scalar, DIMS)(zfp_stream* zfp, const Scalar* fblock) +{ + /* compute maximum exponent */ + int emax = _t1(exponent_block, Scalar)(fblock, BLOCK_SIZE); + int maxprec = precision(emax, zfp->maxprec, zfp->minexp, DIMS); + uint e = maxprec ? 
emax + EBIAS : 0; + /* encode block only if biased exponent is nonzero */ + if (e) { + cache_align_(Int iblock[BLOCK_SIZE]); + /* encode common exponent; LSB indicates that exponent is nonzero */ + int ebits = EBITS + 1; + stream_write_bits(zfp->stream, 2 * e + 1, ebits); + /* perform forward block-floating-point transform */ + _t1(fwd_cast, Scalar)(iblock, fblock, BLOCK_SIZE, emax); + /* encode integer block */ + return ebits + _t2(encode_block, Int, DIMS)(zfp->stream, zfp->minbits - ebits, zfp->maxbits - ebits, maxprec, iblock); + } + else { + /* write single zero-bit to indicate that all values are zero */ + stream_write_bit(zfp->stream, 0); + if (zfp->minbits > 1) { + stream_pad(zfp->stream, zfp->minbits - 1); + return zfp->minbits; + } + else + return 1; + } +} diff --git a/zfp/src/template/encodei.c b/zfp/src/template/encodei.c new file mode 100644 index 0000000000000000000000000000000000000000..6f4cb2c11eef2b4e738afcfec6db093d1a851c47 --- /dev/null +++ b/zfp/src/template/encodei.c @@ -0,0 +1,13 @@ +/* public functions -------------------------------------------------------- */ + +/* encode contiguous integer block */ +uint +_t2(zfp_encode_block, Int, DIMS)(zfp_stream* zfp, const Int* iblock) +{ + cache_align_(Int block[BLOCK_SIZE]); + uint i; + /* copy block */ + for (i = 0; i < BLOCK_SIZE; i++) + block[i] = iblock[i]; + return _t2(encode_block, Int, DIMS)(zfp->stream, zfp->minbits, zfp->maxbits, zfp->maxprec, block); +} diff --git a/zfp/src/template/ompcompress.c b/zfp/src/template/ompcompress.c new file mode 100644 index 0000000000000000000000000000000000000000..a654ac91a7fe067fc88f7b09068123621b3fea01 --- /dev/null +++ b/zfp/src/template/ompcompress.c @@ -0,0 +1,265 @@ +#ifdef _OPENMP + +/* compress 1d contiguous array in parallel */ +static void +_t2(compress_omp, Scalar, 1)(zfp_stream* stream, const zfp_field* field) +{ + /* array metadata */ + const Scalar* data = (const Scalar*)field->data; + uint nx = field->nx; + + /* number of omp threads, blocks, 
and chunks */ + uint threads = thread_count_omp(stream); + uint blocks = (nx + 3) / 4; + uint chunks = chunk_count_omp(stream, blocks, threads); + + /* allocate per-thread streams */ + bitstream** bs = compress_init_par(stream, field, chunks, blocks); + + /* compress chunks of blocks in parallel */ + int chunk; + #pragma omp parallel for num_threads(threads) + for (chunk = 0; chunk < (int)chunks; chunk++) { + /* determine range of block indices assigned to this thread */ + uint bmin = chunk_offset(blocks, chunks, chunk + 0); + uint bmax = chunk_offset(blocks, chunks, chunk + 1); + uint block; + /* set up thread-local bit stream */ + zfp_stream s = *stream; + zfp_stream_set_bit_stream(&s, bs[chunk]); + /* compress sequence of blocks */ + for (block = bmin; block < bmax; block++) { + /* determine block origin x within array */ + const Scalar* p = data; + uint x = 4 * block; + p += x; + /* compress partial or full block */ + if (nx - x < 4) + _t2(zfp_encode_partial_block_strided, Scalar, 1)(&s, p, MIN(nx - x, 4u), 1); + else + _t2(zfp_encode_block, Scalar, 1)(&s, p); + } + } + + /* concatenate per-thread streams */ + compress_finish_par(stream, bs, chunks); +} + +/* compress 1d strided array in parallel */ +static void +_t2(compress_strided_omp, Scalar, 1)(zfp_stream* stream, const zfp_field* field) +{ + /* array metadata */ + const Scalar* data = (const Scalar*)field->data; + uint nx = field->nx; + int sx = field->sx ? 
field->sx : 1; + + /* number of omp threads, blocks, and chunks */ + uint threads = thread_count_omp(stream); + uint blocks = (nx + 3) / 4; + uint chunks = chunk_count_omp(stream, blocks, threads); + + /* allocate per-thread streams */ + bitstream** bs = compress_init_par(stream, field, chunks, blocks); + + /* compress chunks of blocks in parallel */ + int chunk; + #pragma omp parallel for num_threads(threads) + for (chunk = 0; chunk < (int)chunks; chunk++) { + /* determine range of block indices assigned to this thread */ + uint bmin = chunk_offset(blocks, chunks, chunk + 0); + uint bmax = chunk_offset(blocks, chunks, chunk + 1); + uint block; + /* set up thread-local bit stream */ + zfp_stream s = *stream; + zfp_stream_set_bit_stream(&s, bs[chunk]); + /* compress sequence of blocks */ + for (block = bmin; block < bmax; block++) { + /* determine block origin x within array */ + const Scalar* p = data; + uint x = 4 * block; + p += sx * (ptrdiff_t)x; + /* compress partial or full block */ + if (nx - x < 4) + _t2(zfp_encode_partial_block_strided, Scalar, 1)(&s, p, MIN(nx - x, 4u), sx); + else + _t2(zfp_encode_block_strided, Scalar, 1)(&s, p, sx); + } + } + + /* concatenate per-thread streams */ + compress_finish_par(stream, bs, chunks); +} + +/* compress 2d strided array in parallel */ +static void +_t2(compress_strided_omp, Scalar, 2)(zfp_stream* stream, const zfp_field* field) +{ + /* array metadata */ + const Scalar* data = (const Scalar*)field->data; + uint nx = field->nx; + uint ny = field->ny; + int sx = field->sx ? field->sx : 1; + int sy = field->sy ? 
field->sy : nx; + + /* number of omp threads, blocks, and chunks */ + uint threads = thread_count_omp(stream); + uint bx = (nx + 3) / 4; + uint by = (ny + 3) / 4; + uint blocks = bx * by; + uint chunks = chunk_count_omp(stream, blocks, threads); + + /* allocate per-thread streams */ + bitstream** bs = compress_init_par(stream, field, chunks, blocks); + + /* compress chunks of blocks in parallel */ + int chunk; + #pragma omp parallel for num_threads(threads) + for (chunk = 0; chunk < (int)chunks; chunk++) { + /* determine range of block indices assigned to this thread */ + uint bmin = chunk_offset(blocks, chunks, chunk + 0); + uint bmax = chunk_offset(blocks, chunks, chunk + 1); + uint block; + /* set up thread-local bit stream */ + zfp_stream s = *stream; + zfp_stream_set_bit_stream(&s, bs[chunk]); + /* compress sequence of blocks */ + for (block = bmin; block < bmax; block++) { + /* determine block origin (x, y) within array */ + const Scalar* p = data; + uint b = block; + uint x, y; + x = 4 * (b % bx); b /= bx; + y = 4 * b; + p += sx * (ptrdiff_t)x + sy * (ptrdiff_t)y; + /* compress partial or full block */ + if (nx - x < 4 || ny - y < 4) + _t2(zfp_encode_partial_block_strided, Scalar, 2)(&s, p, MIN(nx - x, 4u), MIN(ny - y, 4u), sx, sy); + else + _t2(zfp_encode_block_strided, Scalar, 2)(&s, p, sx, sy); + } + } + + /* concatenate per-thread streams */ + compress_finish_par(stream, bs, chunks); +} + +/* compress 3d strided array in parallel */ +static void +_t2(compress_strided_omp, Scalar, 3)(zfp_stream* stream, const zfp_field* field) +{ + /* array metadata */ + const Scalar* data = (const Scalar*)field->data; + uint nx = field->nx; + uint ny = field->ny; + uint nz = field->nz; + int sx = field->sx ? field->sx : 1; + int sy = field->sy ? field->sy : nx; + int sz = field->sz ? 
field->sz : (ptrdiff_t)nx * ny; + + /* number of omp threads, blocks, and chunks */ + uint threads = thread_count_omp(stream); + uint bx = (nx + 3) / 4; + uint by = (ny + 3) / 4; + uint bz = (nz + 3) / 4; + uint blocks = bx * by * bz; + uint chunks = chunk_count_omp(stream, blocks, threads); + + /* allocate per-thread streams */ + bitstream** bs = compress_init_par(stream, field, chunks, blocks); + + /* compress chunks of blocks in parallel */ + int chunk; + #pragma omp parallel for num_threads(threads) + for (chunk = 0; chunk < (int)chunks; chunk++) { + /* determine range of block indices assigned to this thread */ + uint bmin = chunk_offset(blocks, chunks, chunk + 0); + uint bmax = chunk_offset(blocks, chunks, chunk + 1); + uint block; + /* set up thread-local bit stream */ + zfp_stream s = *stream; + zfp_stream_set_bit_stream(&s, bs[chunk]); + /* compress sequence of blocks */ + for (block = bmin; block < bmax; block++) { + /* determine block origin (x, y, z) within array */ + const Scalar* p = data; + uint b = block; + uint x, y, z; + x = 4 * (b % bx); b /= bx; + y = 4 * (b % by); b /= by; + z = 4 * b; + p += sx * (ptrdiff_t)x + sy * (ptrdiff_t)y + sz * (ptrdiff_t)z; + /* compress partial or full block */ + if (nx - x < 4 || ny - y < 4 || nz - z < 4) + _t2(zfp_encode_partial_block_strided, Scalar, 3)(&s, p, MIN(nx - x, 4u), MIN(ny - y, 4u), MIN(nz - z, 4u), sx, sy, sz); + else + _t2(zfp_encode_block_strided, Scalar, 3)(&s, p, sx, sy, sz); + } + } + + /* concatenate per-thread streams */ + compress_finish_par(stream, bs, chunks); +} + +/* compress 4d strided array in parallel */ +static void +_t2(compress_strided_omp, Scalar, 4)(zfp_stream* stream, const zfp_field* field) +{ + /* array metadata */ + const Scalar* data = field->data; + uint nx = field->nx; + uint ny = field->ny; + uint nz = field->nz; + uint nw = field->nw; + int sx = field->sx ? field->sx : 1; + int sy = field->sy ? field->sy : nx; + int sz = field->sz ? 
field->sz : (ptrdiff_t)nx * ny; + int sw = field->sw ? field->sw : (ptrdiff_t)nx * ny * nz; + + /* number of omp threads, blocks, and chunks */ + uint threads = thread_count_omp(stream); + uint bx = (nx + 3) / 4; + uint by = (ny + 3) / 4; + uint bz = (nz + 3) / 4; + uint bw = (nw + 3) / 4; + uint blocks = bx * by * bz * bw; + uint chunks = chunk_count_omp(stream, blocks, threads); + + /* allocate per-thread streams */ + bitstream** bs = compress_init_par(stream, field, chunks, blocks); + + /* compress chunks of blocks in parallel */ + int chunk; + #pragma omp parallel for num_threads(threads) + for (chunk = 0; chunk < (int)chunks; chunk++) { + /* determine range of block indices assigned to this thread */ + uint bmin = chunk_offset(blocks, chunks, chunk + 0); + uint bmax = chunk_offset(blocks, chunks, chunk + 1); + uint block; + /* set up thread-local bit stream */ + zfp_stream s = *stream; + zfp_stream_set_bit_stream(&s, bs[chunk]); + /* compress sequence of blocks */ + for (block = bmin; block < bmax; block++) { + /* determine block origin (x, y, z, w) within array */ + const Scalar* p = data; + uint b = block; + uint x, y, z, w; + x = 4 * (b % bx); b /= bx; + y = 4 * (b % by); b /= by; + z = 4 * (b % bz); b /= bz; + w = 4 * b; + p += sx * (ptrdiff_t)x + sy * (ptrdiff_t)y + sz * (ptrdiff_t)z + sw * (ptrdiff_t)w; + /* compress partial or full block */ + if (nx - x < 4 || ny - y < 4 || nz - z < 4 || nw - w < 4) + _t2(zfp_encode_partial_block_strided, Scalar, 4)(&s, p, MIN(nx - x, 4u), MIN(ny - y, 4u), MIN(nz - z, 4u), MIN(nw - w, 4u), sx, sy, sz, sw); + else + _t2(zfp_encode_block_strided, Scalar, 4)(&s, p, sx, sy, sz, sw); + } + } + + /* concatenate per-thread streams */ + compress_finish_par(stream, bs, chunks); +} + +#endif diff --git a/zfp/src/template/template.h b/zfp/src/template/template.h new file mode 100644 index 0000000000000000000000000000000000000000..fd5becf7e4c25c9bfb326deb68c490b57ddcef41 --- /dev/null +++ b/zfp/src/template/template.h @@ -0,0 +1,12 
@@ +#ifndef TEMPLATE_H +#define TEMPLATE_H + +/* concatenation */ +#define _cat2(x, y) x ## _ ## y +#define _cat3(x, y, z) x ## _ ## y ## _ ## z + +/* 1- and 2-argument function templates */ +#define _t1(function, arg) _cat2(function, arg) +#define _t2(function, type, dims) _cat3(function, type, dims) + +#endif diff --git a/zfp/src/traitsd.h b/zfp/src/traitsd.h new file mode 100644 index 0000000000000000000000000000000000000000..cc612b4934e689ad358bf1f4bc87c3b0a9895d4c --- /dev/null +++ b/zfp/src/traitsd.h @@ -0,0 +1,11 @@ +/* double-precision floating-point traits */ + +#define Scalar double /* floating-point type */ +#define Int int64 /* corresponding signed integer type */ +#define UInt uint64 /* corresponding unsigned integer type */ +#define EBITS 11 /* number of exponent bits */ +#define NBMASK UINT64C(0xaaaaaaaaaaaaaaaa) /* negabinary mask */ + +#define FABS(x) fabs(x) +#define FREXP(x, e) frexp(x, e) +#define LDEXP(x, e) ldexp(x, e) diff --git a/zfp/src/traitsf.h b/zfp/src/traitsf.h new file mode 100644 index 0000000000000000000000000000000000000000..ba262ac09f8da3efc5e0b67b144e6cc9f9f9cd72 --- /dev/null +++ b/zfp/src/traitsf.h @@ -0,0 +1,17 @@ +/* single-precision floating-point traits */ + +#define Scalar float /* floating-point type */ +#define Int int32 /* corresponding signed integer type */ +#define UInt uint32 /* corresponding unsigned integer type */ +#define EBITS 8 /* number of exponent bits */ +#define NBMASK 0xaaaaaaaau /* negabinary mask */ + +#if __STDC_VERSION__ >= 199901L + #define FABS(x) fabsf(x) + #define FREXP(x, e) frexpf(x, e) + #define LDEXP(x, e) ldexpf(x, e) +#else + #define FABS(x) (float)fabs(x) + #define FREXP(x, e) (void)frexp(x, e) + #define LDEXP(x, e) (float)ldexp(x, e) +#endif diff --git a/zfp/src/traitsi.h b/zfp/src/traitsi.h new file mode 100644 index 0000000000000000000000000000000000000000..1daca09f428056d7b9fa0979ae0fa03fc3e7cde9 --- /dev/null +++ b/zfp/src/traitsi.h @@ -0,0 +1,6 @@ +/* 32-bit integer traits */ + 
+#define Scalar int32 /* integer type */ +#define Int int32 /* corresponding signed integer type */ +#define UInt uint32 /* corresponding unsigned integer type */ +#define NBMASK 0xaaaaaaaau /* negabinary mask */ diff --git a/zfp/src/traitsl.h b/zfp/src/traitsl.h new file mode 100644 index 0000000000000000000000000000000000000000..c4c853467066db9522703023c10f75cad1538ed1 --- /dev/null +++ b/zfp/src/traitsl.h @@ -0,0 +1,6 @@ +/* 64-bit integer traits */ + +#define Scalar int64 /* integer type */ +#define Int int64 /* corresponding signed integer type */ +#define UInt uint64 /* corresponding unsigned integer type */ +#define NBMASK UINT64C(0xaaaaaaaaaaaaaaaa) /* negabinary mask */ diff --git a/zfp/src/zfp.c b/zfp/src/zfp.c new file mode 100644 index 0000000000000000000000000000000000000000..049a58692464ec123c8d5e6977504c50971c5490 --- /dev/null +++ b/zfp/src/zfp.c @@ -0,0 +1,1025 @@ +#include <limits.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include "zfp.h" +#include "zfp/macros.h" +#include "template/template.h" + +/* public data ------------------------------------------------------------- */ + +export_ const uint zfp_codec_version = ZFP_CODEC; +export_ const uint zfp_library_version = ZFP_VERSION; +export_ const char* const zfp_version_string = "zfp version " ZFP_VERSION_STRING " (October 1, 2018)"; + +/* private functions ------------------------------------------------------- */ + +static uint +type_precision(zfp_type type) +{ + switch (type) { + case zfp_type_int32: + return CHAR_BIT * (uint)sizeof(int32); + case zfp_type_int64: + return CHAR_BIT * (uint)sizeof(int64); + case zfp_type_float: + return CHAR_BIT * (uint)sizeof(float); + case zfp_type_double: + return CHAR_BIT * (uint)sizeof(double); + default: + return 0; + } +} + +/* shared code across template instances ------------------------------------*/ + +#include "share/parallel.c" +#include "share/omp.c" + +/* template instantiation of integer and float compressor 
-------------------*/ + +#define Scalar int32 +#include "template/compress.c" +#include "template/decompress.c" +#include "template/ompcompress.c" +#include "template/cudacompress.c" +#include "template/cudadecompress.c" +#undef Scalar + +#define Scalar int64 +#include "template/compress.c" +#include "template/decompress.c" +#include "template/ompcompress.c" +#include "template/cudacompress.c" +#include "template/cudadecompress.c" +#undef Scalar + +#define Scalar float +#include "template/compress.c" +#include "template/decompress.c" +#include "template/ompcompress.c" +#include "template/cudacompress.c" +#include "template/cudadecompress.c" +#undef Scalar + +#define Scalar double +#include "template/compress.c" +#include "template/decompress.c" +#include "template/ompcompress.c" +#include "template/cudacompress.c" +#include "template/cudadecompress.c" +#undef Scalar + +/* public functions: miscellaneous ----------------------------------------- */ + +size_t +zfp_type_size(zfp_type type) +{ + switch (type) { + case zfp_type_int32: + return sizeof(int32); + case zfp_type_int64: + return sizeof(int64); + case zfp_type_float: + return sizeof(float); + case zfp_type_double: + return sizeof(double); + default: + return 0; + } +} + +/* public functions: fields ------------------------------------------------ */ + +zfp_field* +zfp_field_alloc() +{ + zfp_field* field = (zfp_field*)malloc(sizeof(zfp_field)); + if (field) { + field->type = zfp_type_none; + field->nx = field->ny = field->nz = field->nw = 0; + field->sx = field->sy = field->sz = field->sw = 0; + field->data = 0; + } + return field; +} + +zfp_field* +zfp_field_1d(void* data, zfp_type type, uint nx) +{ + zfp_field* field = zfp_field_alloc(); + if (field) { + field->type = type; + field->nx = nx; + field->data = data; + } + return field; +} + +zfp_field* +zfp_field_2d(void* data, zfp_type type, uint nx, uint ny) +{ + zfp_field* field = zfp_field_alloc(); + if (field) { + field->type = type; + field->nx = nx; + 
field->ny = ny; + field->data = data; + } + return field; +} + +zfp_field* +zfp_field_3d(void* data, zfp_type type, uint nx, uint ny, uint nz) +{ + zfp_field* field = zfp_field_alloc(); + if (field) { + field->type = type; + field->nx = nx; + field->ny = ny; + field->nz = nz; + field->data = data; + } + return field; +} + +zfp_field* +zfp_field_4d(void* data, zfp_type type, uint nx, uint ny, uint nz, uint nw) +{ + zfp_field* field = zfp_field_alloc(); + if (field) { + field->type = type; + field->nx = nx; + field->ny = ny; + field->nz = nz; + field->nw = nw; + field->data = data; + } + return field; +} + +void +zfp_field_free(zfp_field* field) +{ + free(field); +} + +void* +zfp_field_pointer(const zfp_field* field) +{ + return field->data; +} + +zfp_type +zfp_field_type(const zfp_field* field) +{ + return field->type; +} + +uint +zfp_field_precision(const zfp_field* field) +{ + return type_precision(field->type); +} + +uint +zfp_field_dimensionality(const zfp_field* field) +{ + return field->nx ? field->ny ? field->nz ? field->nw ? 4 : 3 : 2 : 1 : 0; +} + +size_t +zfp_field_size(const zfp_field* field, uint* size) +{ + if (size) + switch (zfp_field_dimensionality(field)) { + case 4: + size[3] = field->nw; + /* FALLTHROUGH */ + case 3: + size[2] = field->nz; + /* FALLTHROUGH */ + case 2: + size[1] = field->ny; + /* FALLTHROUGH */ + case 1: + size[0] = field->nx; + break; + } + return (size_t)MAX(field->nx, 1u) * (size_t)MAX(field->ny, 1u) * (size_t)MAX(field->nz, 1u) * (size_t)MAX(field->nw, 1u); +} + +int +zfp_field_stride(const zfp_field* field, int* stride) +{ + if (stride) + switch (zfp_field_dimensionality(field)) { + case 4: + stride[3] = field->sw ? field->sw : field->nx * field->ny * field->nz; + /* FALLTHROUGH */ + case 3: + stride[2] = field->sz ? field->sz : field->nx * field->ny; + /* FALLTHROUGH */ + case 2: + stride[1] = field->sy ? field->sy : field->nx; + /* FALLTHROUGH */ + case 1: + stride[0] = field->sx ? 
field->sx : 1; + break; + } + return field->sx || field->sy || field->sz || field->sw; +} + +uint64 +zfp_field_metadata(const zfp_field* field) +{ + uint64 meta = 0; + /* 48 bits for dimensions */ + switch (zfp_field_dimensionality(field)) { + case 1: + meta <<= 48; meta += field->nx - 1; + break; + case 2: + meta <<= 24; meta += field->ny - 1; + meta <<= 24; meta += field->nx - 1; + break; + case 3: + meta <<= 16; meta += field->nz - 1; + meta <<= 16; meta += field->ny - 1; + meta <<= 16; meta += field->nx - 1; + break; + case 4: + meta <<= 12; meta += field->nw - 1; + meta <<= 12; meta += field->nz - 1; + meta <<= 12; meta += field->ny - 1; + meta <<= 12; meta += field->nx - 1; + break; + } + /* 2 bits for dimensionality (1D, 2D, 3D, 4D) */ + meta <<= 2; meta += zfp_field_dimensionality(field) - 1; + /* 2 bits for scalar type */ + meta <<= 2; meta += field->type - 1; + return meta; +} + +void +zfp_field_set_pointer(zfp_field* field, void* data) +{ + field->data = data; +} + +zfp_type +zfp_field_set_type(zfp_field* field, zfp_type type) +{ + switch (type) { + case zfp_type_int32: + case zfp_type_int64: + case zfp_type_float: + case zfp_type_double: + field->type = type; + return type; + default: + return zfp_type_none; + } +} + +void +zfp_field_set_size_1d(zfp_field* field, uint n) +{ + field->nx = n; + field->ny = 0; + field->nz = 0; + field->nw = 0; +} + +void +zfp_field_set_size_2d(zfp_field* field, uint nx, uint ny) +{ + field->nx = nx; + field->ny = ny; + field->nz = 0; + field->nw = 0; +} + +void +zfp_field_set_size_3d(zfp_field* field, uint nx, uint ny, uint nz) +{ + field->nx = nx; + field->ny = ny; + field->nz = nz; + field->nw = 0; +} + +void +zfp_field_set_size_4d(zfp_field* field, uint nx, uint ny, uint nz, uint nw) +{ + field->nx = nx; + field->ny = ny; + field->nz = nz; + field->nw = nw; +} + +void +zfp_field_set_stride_1d(zfp_field* field, int sx) +{ + field->sx = sx; + field->sy = 0; + field->sz = 0; + field->sw = 0; +} + +void 
+zfp_field_set_stride_2d(zfp_field* field, int sx, int sy) +{ + field->sx = sx; + field->sy = sy; + field->sz = 0; + field->sw = 0; +} + +void +zfp_field_set_stride_3d(zfp_field* field, int sx, int sy, int sz) +{ + field->sx = sx; + field->sy = sy; + field->sz = sz; + field->sw = 0; +} + +void +zfp_field_set_stride_4d(zfp_field* field, int sx, int sy, int sz, int sw) +{ + field->sx = sx; + field->sy = sy; + field->sz = sz; + field->sw = sw; +} + +int +zfp_field_set_metadata(zfp_field* field, uint64 meta) +{ + uint64 dims; + field->type = (zfp_type)((meta & 0x3u) + 1); meta >>= 2; + dims = (meta & 0x3u) + 1; meta >>= 2; + switch (dims) { + case 1: + /* currently dimensions are limited to 2^32 - 1 */ + field->nx = (meta & UINT64C(0x0000ffffffff)) + 1; meta >>= 48; + field->ny = 0; + field->nz = 0; + field->nw = 0; + break; + case 2: + field->nx = (meta & UINT64C(0xffffff)) + 1; meta >>= 24; + field->ny = (meta & UINT64C(0xffffff)) + 1; meta >>= 24; + field->nz = 0; + field->nw = 0; + break; + case 3: + field->nx = (meta & UINT64C(0xffff)) + 1; meta >>= 16; + field->ny = (meta & UINT64C(0xffff)) + 1; meta >>= 16; + field->nz = (meta & UINT64C(0xffff)) + 1; meta >>= 16; + field->nw = 0; + break; + case 4: + field->nx = (meta & UINT64C(0xfff)) + 1; meta >>= 12; + field->ny = (meta & UINT64C(0xfff)) + 1; meta >>= 12; + field->nz = (meta & UINT64C(0xfff)) + 1; meta >>= 12; + field->nw = (meta & UINT64C(0xfff)) + 1; meta >>= 12; + break; + } + field->sx = field->sy = field->sz = field->sw = 0; + return 1; +} + +/* public functions: zfp compressed stream --------------------------------- */ + +zfp_stream* +zfp_stream_open(bitstream* stream) +{ + zfp_stream* zfp = (zfp_stream*)malloc(sizeof(zfp_stream)); + if (zfp) { + zfp->stream = stream; + zfp->minbits = ZFP_MIN_BITS; + zfp->maxbits = ZFP_MAX_BITS; + zfp->maxprec = ZFP_MAX_PREC; + zfp->minexp = ZFP_MIN_EXP; + zfp->exec.policy = zfp_exec_serial; + } + return zfp; +} + +void +zfp_stream_close(zfp_stream* zfp) +{ + 
free(zfp); +} + +bitstream* +zfp_stream_bit_stream(const zfp_stream* zfp) +{ + return zfp->stream; +} + +zfp_mode +zfp_stream_compression_mode(const zfp_stream* zfp) +{ + if (zfp->minbits > zfp->maxbits || !(0 < zfp->maxprec && zfp->maxprec <= 64)) + return zfp_mode_null; + + /* default values are considered expert mode */ + if (zfp->minbits == ZFP_MIN_BITS && + zfp->maxbits == ZFP_MAX_BITS && + zfp->maxprec == ZFP_MAX_PREC && + zfp->minexp == ZFP_MIN_EXP) + return zfp_mode_expert; + + /* fixed rate? */ + if (zfp->minbits == zfp->maxbits && + 1 <= zfp->maxbits && zfp->maxbits <= ZFP_MAX_BITS && + zfp->maxprec >= ZFP_MAX_PREC && + zfp->minexp <= ZFP_MIN_EXP) + return zfp_mode_fixed_rate; + + /* fixed precision? */ + if (zfp->minbits <= ZFP_MIN_BITS && + zfp->maxbits >= ZFP_MAX_BITS && + zfp->maxprec >= 1 && + zfp->minexp <= ZFP_MIN_EXP) + return zfp_mode_fixed_precision; + + /* fixed accuracy? */ + if (zfp->minbits <= ZFP_MIN_BITS && + zfp->maxbits >= ZFP_MAX_BITS && + zfp->maxprec >= ZFP_MAX_PREC && + ZFP_MIN_EXP <= zfp->minexp) + return zfp_mode_fixed_accuracy; + + return zfp_mode_expert; +} + +uint64 +zfp_stream_mode(const zfp_stream* zfp) +{ + uint64 mode = 0; + uint minbits; + uint maxbits; + uint maxprec; + uint minexp; + + /* common configurations mapped to short representation */ + switch(zfp_stream_compression_mode(zfp)) { + case zfp_mode_fixed_rate: + if (zfp->maxbits <= 2048) + /* maxbits is [1, 2048] */ + /* returns [0, 2047] */ + return (zfp->maxbits - 1); + else + break; + + case zfp_mode_fixed_precision: + if (zfp->maxprec <= 128) + /* maxprec is [1, 128] */ + /* returns [2048, 2175] */ + return (zfp->maxprec - 1) + (2048); + else + break; + + case zfp_mode_fixed_accuracy: + if (zfp->minexp <= 843) + /* minexp is [ZFP_MIN_EXP=-1074, 843] */ + /* [2177, ZFP_MODE_SHORT_MAX=4094] */ + /* +1 because skipped 2176 */ + return (zfp->minexp - ZFP_MIN_EXP) + (2048 + 128 + 1); + + default: + break; + } + + /* encode each parameter separately */ + minbits = 
MAX(1, MIN(zfp->minbits, 0x8000u)) - 1; + maxbits = MAX(1, MIN(zfp->maxbits, 0x8000u)) - 1; + maxprec = MAX(1, MIN(zfp->maxprec, 0x0080u)) - 1; + minexp = MAX(0, MIN(zfp->minexp + 16495, 0x7fff)); + mode <<= 15; mode += minexp; + mode <<= 7; mode += maxprec; + mode <<= 15; mode += maxbits; + mode <<= 15; mode += minbits; + mode <<= 12; mode += 0xfffu; + + return mode; +} + +void +zfp_stream_params(const zfp_stream* zfp, uint* minbits, uint* maxbits, uint* maxprec, int* minexp) +{ + if (minbits) + *minbits = zfp->minbits; + if (maxbits) + *maxbits = zfp->maxbits; + if (maxprec) + *maxprec = zfp->maxprec; + if (minexp) + *minexp = zfp->minexp; +} + +size_t +zfp_stream_compressed_size(const zfp_stream* zfp) +{ + return stream_size(zfp->stream); +} + +size_t +zfp_stream_maximum_size(const zfp_stream* zfp, const zfp_field* field) +{ + uint dims = zfp_field_dimensionality(field); + uint mx = (MAX(field->nx, 1u) + 3) / 4; + uint my = (MAX(field->ny, 1u) + 3) / 4; + uint mz = (MAX(field->nz, 1u) + 3) / 4; + uint mw = (MAX(field->nw, 1u) + 3) / 4; + size_t blocks = (size_t)mx * (size_t)my * (size_t)mz * (size_t)mw; + uint values = 1u << (2 * dims); + uint maxbits = 1; + + if (!dims) + return 0; + switch (field->type) { + case zfp_type_none: + return 0; + case zfp_type_float: + maxbits += 8; + break; + case zfp_type_double: + maxbits += 11; + break; + default: + break; + } + maxbits += values - 1 + values * MIN(zfp->maxprec, type_precision(field->type)); + maxbits = MIN(maxbits, zfp->maxbits); + maxbits = MAX(maxbits, zfp->minbits); + return ((ZFP_HEADER_MAX_BITS + blocks * maxbits + stream_word_bits - 1) & ~(stream_word_bits - 1)) / CHAR_BIT; +} + +void +zfp_stream_set_bit_stream(zfp_stream* zfp, bitstream* stream) +{ + zfp->stream = stream; +} + +double +zfp_stream_set_rate(zfp_stream* zfp, double rate, zfp_type type, uint dims, int wra) +{ + uint n = 1u << (2 * dims); + uint bits = (uint)floor(n * rate + 0.5); + switch (type) { + case zfp_type_float: + bits = MAX(bits, 1 
+ 8u); + break; + case zfp_type_double: + bits = MAX(bits, 1 + 11u); + break; + default: + break; + } + if (wra) { + /* for write random access, round up to next multiple of stream word size */ + bits += (uint)stream_word_bits - 1; + bits &= ~(stream_word_bits - 1); + } + zfp->minbits = bits; + zfp->maxbits = bits; + zfp->maxprec = ZFP_MAX_PREC; + zfp->minexp = ZFP_MIN_EXP; + return (double)bits / n; +} + +uint +zfp_stream_set_precision(zfp_stream* zfp, uint precision) +{ + zfp->minbits = ZFP_MIN_BITS; + zfp->maxbits = ZFP_MAX_BITS; + zfp->maxprec = precision ? MIN(precision, ZFP_MAX_PREC) : ZFP_MAX_PREC; + zfp->minexp = ZFP_MIN_EXP; + return zfp->maxprec; +} + +double +zfp_stream_set_accuracy(zfp_stream* zfp, double tolerance) +{ + int emin = ZFP_MIN_EXP; + if (tolerance > 0) { + /* tolerance = x * 2^emin, with 0.5 <= x < 1 */ + frexp(tolerance, &emin); + emin--; + /* assert: 2^emin <= tolerance < 2^(emin+1) */ + } + zfp->minbits = ZFP_MIN_BITS; + zfp->maxbits = ZFP_MAX_BITS; + zfp->maxprec = ZFP_MAX_PREC; + zfp->minexp = emin; + return tolerance > 0 ? 
ldexp(1.0, emin) : 0; +} + +zfp_mode +zfp_stream_set_mode(zfp_stream* zfp, uint64 mode) +{ + uint minbits, maxbits, maxprec; + int minexp; + + if (mode <= ZFP_MODE_SHORT_MAX) { + /* 12-bit (short) encoding of one of three modes */ + if (mode < 2048) { + /* fixed rate */ + minbits = maxbits = (uint)mode + 1; + maxprec = ZFP_MAX_PREC; + minexp = ZFP_MIN_EXP; + } + else if (mode < (2048 + 128)) { + /* fixed precision */ + minbits = ZFP_MIN_BITS; + maxbits = ZFP_MAX_BITS; + maxprec = (uint)mode + 1 - (2048); + minexp = ZFP_MIN_EXP; + } + else { + /* fixed accuracy */ + minbits = ZFP_MIN_BITS; + maxbits = ZFP_MAX_BITS; + maxprec = ZFP_MAX_PREC; + minexp = (uint)mode + ZFP_MIN_EXP - (2048 + 128 + 1); + } + } + else { + /* 64-bit encoding */ + mode >>= 12; minbits = ((uint)mode & 0x7fffu) + 1; + mode >>= 15; maxbits = ((uint)mode & 0x7fffu) + 1; + mode >>= 15; maxprec = ((uint)mode & 0x007fu) + 1; + mode >>= 7; minexp = ((uint)mode & 0x7fffu) - 16495; + } + + if (!zfp_stream_set_params(zfp, minbits, maxbits, maxprec, minexp)) + return zfp_mode_null; + + return zfp_stream_compression_mode(zfp); +} + +int +zfp_stream_set_params(zfp_stream* zfp, uint minbits, uint maxbits, uint maxprec, int minexp) +{ + if (minbits > maxbits || !(0 < maxprec && maxprec <= 64)) + return 0; + zfp->minbits = minbits; + zfp->maxbits = maxbits; + zfp->maxprec = maxprec; + zfp->minexp = minexp; + return 1; +} + +size_t +zfp_stream_flush(zfp_stream* zfp) +{ + return stream_flush(zfp->stream); +} + +size_t +zfp_stream_align(zfp_stream* zfp) +{ + return stream_align(zfp->stream); +} + +void +zfp_stream_rewind(zfp_stream* zfp) +{ + stream_rewind(zfp->stream); +} + +/* public functions: execution policy -------------------------------------- */ + +zfp_exec_policy +zfp_stream_execution(const zfp_stream* zfp) +{ + return zfp->exec.policy; +} + +uint +zfp_stream_omp_threads(const zfp_stream* zfp) +{ + return zfp->exec.params.omp.threads; +} + +uint +zfp_stream_omp_chunk_size(const zfp_stream* zfp) +{ + 
return zfp->exec.params.omp.chunk_size; +} + +int +zfp_stream_set_execution(zfp_stream* zfp, zfp_exec_policy policy) +{ + switch (policy) { + case zfp_exec_serial: + break; +#ifdef ZFP_WITH_CUDA + case zfp_exec_cuda: + break; +#endif + case zfp_exec_omp: +#ifdef _OPENMP + if (zfp->exec.policy != policy) { + zfp->exec.params.omp.threads = 0; + zfp->exec.params.omp.chunk_size = 0; + } + break; +#else + return 0; +#endif + default: + return 0; + } + zfp->exec.policy = policy; + return 1; +} + +int +zfp_stream_set_omp_threads(zfp_stream* zfp, uint threads) +{ + if (!zfp_stream_set_execution(zfp, zfp_exec_omp)) + return 0; + zfp->exec.params.omp.threads = threads; + return 1; +} + +int +zfp_stream_set_omp_chunk_size(zfp_stream* zfp, uint chunk_size) +{ + if (!zfp_stream_set_execution(zfp, zfp_exec_omp)) + return 0; + zfp->exec.params.omp.chunk_size = chunk_size; + return 1; +} + +/* public functions: utility functions --------------------------------------*/ + +void +zfp_promote_int8_to_int32(int32* oblock, const int8* iblock, uint dims) +{ + uint count = 1u << (2 * dims); + while (count--) + *oblock++ = (int32)*iblock++ << 23; +} + +void +zfp_promote_uint8_to_int32(int32* oblock, const uint8* iblock, uint dims) +{ + uint count = 1u << (2 * dims); + while (count--) + *oblock++ = ((int32)*iblock++ - 0x80) << 23; +} + +void +zfp_promote_int16_to_int32(int32* oblock, const int16* iblock, uint dims) +{ + uint count = 1u << (2 * dims); + while (count--) + *oblock++ = (int32)*iblock++ << 15; +} + +void +zfp_promote_uint16_to_int32(int32* oblock, const uint16* iblock, uint dims) +{ + uint count = 1u << (2 * dims); + while (count--) + *oblock++ = ((int32)*iblock++ - 0x8000) << 15; +} + +void +zfp_demote_int32_to_int8(int8* oblock, const int32* iblock, uint dims) +{ + uint count = 1u << (2 * dims); + while (count--) { + int32 i = *iblock++ >> 23; + *oblock++ = (int8)MAX(-0x80, MIN(i, 0x7f)); + } +} + +void +zfp_demote_int32_to_uint8(uint8* oblock, const int32* iblock, uint dims) 
+{ + uint count = 1u << (2 * dims); + while (count--) { + int32 i = (*iblock++ >> 23) + 0x80; + *oblock++ = (uint8)MAX(0x00, MIN(i, 0xff)); + } +} + +void +zfp_demote_int32_to_int16(int16* oblock, const int32* iblock, uint dims) +{ + uint count = 1u << (2 * dims); + while (count--) { + int32 i = *iblock++ >> 15; + *oblock++ = (int16)MAX(-0x8000, MIN(i, 0x7fff)); + } +} + +void +zfp_demote_int32_to_uint16(uint16* oblock, const int32* iblock, uint dims) +{ + uint count = 1u << (2 * dims); + while (count--) { + int32 i = (*iblock++ >> 15) + 0x8000; + *oblock++ = (uint16)MAX(0x0000, MIN(i, 0xffff)); + } +} + +/* public functions: compression and decompression --------------------------*/ + +size_t +zfp_compress(zfp_stream* zfp, const zfp_field* field) +{ + /* function table [execution][strided][dimensionality][scalar type] */ + void (*ftable[3][2][4][4])(zfp_stream*, const zfp_field*) = { + /* serial */ + {{{ compress_int32_1, compress_int64_1, compress_float_1, compress_double_1 }, + { compress_strided_int32_2, compress_strided_int64_2, compress_strided_float_2, compress_strided_double_2 }, + { compress_strided_int32_3, compress_strided_int64_3, compress_strided_float_3, compress_strided_double_3 }, + { compress_strided_int32_4, compress_strided_int64_4, compress_strided_float_4, compress_strided_double_4 }}, + {{ compress_strided_int32_1, compress_strided_int64_1, compress_strided_float_1, compress_strided_double_1 }, + { compress_strided_int32_2, compress_strided_int64_2, compress_strided_float_2, compress_strided_double_2 }, + { compress_strided_int32_3, compress_strided_int64_3, compress_strided_float_3, compress_strided_double_3 }, + { compress_strided_int32_4, compress_strided_int64_4, compress_strided_float_4, compress_strided_double_4 }}}, + + /* OpenMP */ +#ifdef _OPENMP + {{{ compress_omp_int32_1, compress_omp_int64_1, compress_omp_float_1, compress_omp_double_1 }, + { compress_strided_omp_int32_2, compress_strided_omp_int64_2, compress_strided_omp_float_2, 
compress_strided_omp_double_2 }, + { compress_strided_omp_int32_3, compress_strided_omp_int64_3, compress_strided_omp_float_3, compress_strided_omp_double_3 }, + { compress_strided_omp_int32_4, compress_strided_omp_int64_4, compress_strided_omp_float_4, compress_strided_omp_double_4 }}, + {{ compress_strided_omp_int32_1, compress_strided_omp_int64_1, compress_strided_omp_float_1, compress_strided_omp_double_1 }, + { compress_strided_omp_int32_2, compress_strided_omp_int64_2, compress_strided_omp_float_2, compress_strided_omp_double_2 }, + { compress_strided_omp_int32_3, compress_strided_omp_int64_3, compress_strided_omp_float_3, compress_strided_omp_double_3 }, + { compress_strided_omp_int32_4, compress_strided_omp_int64_4, compress_strided_omp_float_4, compress_strided_omp_double_4 }}}, +#else + {{{ NULL }}}, +#endif + + /* CUDA */ +#ifdef ZFP_WITH_CUDA + {{{ compress_cuda_int32_1, compress_cuda_int64_1, compress_cuda_float_1, compress_cuda_double_1 }, + { compress_strided_cuda_int32_2, compress_strided_cuda_int64_2, compress_strided_cuda_float_2, compress_strided_cuda_double_2 }, + { compress_strided_cuda_int32_3, compress_strided_cuda_int64_3, compress_strided_cuda_float_3, compress_strided_cuda_double_3 }, + { NULL, NULL, NULL, NULL }}, + {{ compress_strided_cuda_int32_1, compress_strided_cuda_int64_1, compress_strided_cuda_float_1, compress_strided_cuda_double_1 }, + { compress_strided_cuda_int32_2, compress_strided_cuda_int64_2, compress_strided_cuda_float_2, compress_strided_cuda_double_2 }, + { compress_strided_cuda_int32_3, compress_strided_cuda_int64_3, compress_strided_cuda_float_3, compress_strided_cuda_double_3 }, + { NULL, NULL, NULL, NULL }}}, +#else + {{{ NULL }}}, +#endif + }; + uint exec = zfp->exec.policy; + uint strided = zfp_field_stride(field, NULL); + uint dims = zfp_field_dimensionality(field); + uint type = field->type; + + switch (type) { + case zfp_type_int32: + case zfp_type_int64: + case zfp_type_float: + case zfp_type_double: + break; 
+ default: + return 0; + } + + /* return 0 if compression mode is not supported */ + void (*compress)(zfp_stream*, const zfp_field*) = ftable[exec][strided][dims - 1][type - zfp_type_int32]; + if (!compress) + return 0; + + /* compress field and align bit stream on word boundary */ + compress(zfp, field); + stream_flush(zfp->stream); + + return stream_size(zfp->stream); +} + +size_t +zfp_decompress(zfp_stream* zfp, zfp_field* field) +{ + /* function table [execution][strided][dimensionality][scalar type] */ + void (*ftable[3][2][4][4])(zfp_stream*, zfp_field*) = { + /* serial */ + {{{ decompress_int32_1, decompress_int64_1, decompress_float_1, decompress_double_1 }, + { decompress_strided_int32_2, decompress_strided_int64_2, decompress_strided_float_2, decompress_strided_double_2 }, + { decompress_strided_int32_3, decompress_strided_int64_3, decompress_strided_float_3, decompress_strided_double_3 }, + { decompress_strided_int32_4, decompress_strided_int64_4, decompress_strided_float_4, decompress_strided_double_4 }}, + {{ decompress_strided_int32_1, decompress_strided_int64_1, decompress_strided_float_1, decompress_strided_double_1 }, + { decompress_strided_int32_2, decompress_strided_int64_2, decompress_strided_float_2, decompress_strided_double_2 }, + { decompress_strided_int32_3, decompress_strided_int64_3, decompress_strided_float_3, decompress_strided_double_3 }, + { decompress_strided_int32_4, decompress_strided_int64_4, decompress_strided_float_4, decompress_strided_double_4 }}}, + + /* OpenMP; not yet supported */ + {{{ NULL }}}, + + /* CUDA */ +#ifdef ZFP_WITH_CUDA + {{{ decompress_cuda_int32_1, decompress_cuda_int64_1, decompress_cuda_float_1, decompress_cuda_double_1 }, + { decompress_strided_cuda_int32_2, decompress_strided_cuda_int64_2, decompress_strided_cuda_float_2, decompress_strided_cuda_double_2 }, + { decompress_strided_cuda_int32_3, decompress_strided_cuda_int64_3, decompress_strided_cuda_float_3, decompress_strided_cuda_double_3 }, + { NULL, 
NULL, NULL, NULL }}, + {{ decompress_strided_cuda_int32_1, decompress_strided_cuda_int64_1, decompress_strided_cuda_float_1, decompress_strided_cuda_double_1 }, + { decompress_strided_cuda_int32_2, decompress_strided_cuda_int64_2, decompress_strided_cuda_float_2, decompress_strided_cuda_double_2 }, + { decompress_strided_cuda_int32_3, decompress_strided_cuda_int64_3, decompress_strided_cuda_float_3, decompress_strided_cuda_double_3 }, + { NULL, NULL, NULL, NULL }}}, +#else + {{{ NULL }}}, +#endif + }; + uint exec = zfp->exec.policy; + uint strided = zfp_field_stride(field, NULL); + uint dims = zfp_field_dimensionality(field); + uint type = field->type; + + switch (type) { + case zfp_type_int32: + case zfp_type_int64: + case zfp_type_float: + case zfp_type_double: + break; + default: + return 0; + } + + /* return 0 if decompression mode is not supported */ + void (*decompress)(zfp_stream*, zfp_field*) = ftable[exec][strided][dims - 1][type - zfp_type_int32]; + if (!decompress) + return 0; + + /* decompress field and align bit stream on word boundary */ + decompress(zfp, field); + stream_align(zfp->stream); + + return stream_size(zfp->stream); +} + +size_t +zfp_write_header(zfp_stream* zfp, const zfp_field* field, uint mask) +{ + size_t bits = 0; + /* 32-bit magic */ + if (mask & ZFP_HEADER_MAGIC) { + stream_write_bits(zfp->stream, 'z', 8); + stream_write_bits(zfp->stream, 'f', 8); + stream_write_bits(zfp->stream, 'p', 8); + stream_write_bits(zfp->stream, zfp_codec_version, 8); + bits += ZFP_MAGIC_BITS; + } + /* 52-bit field metadata */ + if (mask & ZFP_HEADER_META) { + uint64 meta = zfp_field_metadata(field); + stream_write_bits(zfp->stream, meta, ZFP_META_BITS); + bits += ZFP_META_BITS; + } + /* 12- or 64-bit compression parameters */ + if (mask & ZFP_HEADER_MODE) { + uint64 mode = zfp_stream_mode(zfp); + uint size = mode > ZFP_MODE_SHORT_MAX ? 
ZFP_MODE_LONG_BITS : ZFP_MODE_SHORT_BITS; + stream_write_bits(zfp->stream, mode, size); + bits += size; + } + return bits; +} + +size_t +zfp_read_header(zfp_stream* zfp, zfp_field* field, uint mask) +{ + size_t bits = 0; + if (mask & ZFP_HEADER_MAGIC) { + if (stream_read_bits(zfp->stream, 8) != 'z' || + stream_read_bits(zfp->stream, 8) != 'f' || + stream_read_bits(zfp->stream, 8) != 'p' || + stream_read_bits(zfp->stream, 8) != zfp_codec_version) + return 0; + bits += ZFP_MAGIC_BITS; + } + if (mask & ZFP_HEADER_META) { + uint64 meta = stream_read_bits(zfp->stream, ZFP_META_BITS); + if (!zfp_field_set_metadata(field, meta)) + return 0; + bits += ZFP_META_BITS; + } + if (mask & ZFP_HEADER_MODE) { + uint64 mode = stream_read_bits(zfp->stream, ZFP_MODE_SHORT_BITS); + bits += ZFP_MODE_SHORT_BITS; + if (mode > ZFP_MODE_SHORT_MAX) { + uint size = ZFP_MODE_LONG_BITS - ZFP_MODE_SHORT_BITS; + mode += stream_read_bits(zfp->stream, size) << ZFP_MODE_SHORT_BITS; + bits += size; + } + if (zfp_stream_set_mode(zfp, mode) == zfp_mode_null) + return 0; + } + return bits; +} diff --git a/zfp/tests/CMakeLists.txt b/zfp/tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a44af0160ad3becccbd8f9e2930870c372ecabb --- /dev/null +++ b/zfp/tests/CMakeLists.txt @@ -0,0 +1,21 @@ +add_executable(testzfp testzfp.cpp) +target_link_libraries(testzfp zfp) +target_compile_definitions(testzfp PRIVATE ${zfp_defs}) + +option(ZFP_BUILD_TESTING_SMALL "Enable small-sized array testing" ON) +if(ZFP_BUILD_TESTING_SMALL) + foreach(D IN ITEMS 1 2 3 4) + foreach(P IN ITEMS 32 64) + add_test(NAME small-arrays-${D}d-fp${P} COMMAND testzfp small ${D}d fp${P}) + endforeach() + endforeach() +endif() + +option(ZFP_BUILD_TESTING_LARGE "Enable large-sized array testing" OFF) +if(ZFP_BUILD_TESTING_LARGE) + foreach(D IN ITEMS 1 2 3 4) + foreach(P IN ITEMS 32 64) + add_test(NAME large-arrays-${D}d-fp${P} COMMAND testzfp large ${D}d fp${P}) + endforeach() + endforeach() +endif() diff 
--git a/zfp/tests/Makefile b/zfp/tests/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..2c496ee3513eb6dc3294065052e67c749affa2d6 --- /dev/null +++ b/zfp/tests/Makefile @@ -0,0 +1,16 @@ +include ../Config + +BINDIR = ../bin +TARGETS = $(BINDIR)/testzfp +CXXLIBS = -L../lib -lzfp + +all: $(TARGETS) + +$(BINDIR)/testzfp: testzfp.cpp ../lib/$(LIBZFP) + $(CXX) $(CXXFLAGS) -I../array testzfp.cpp $(CXXLIBS) -o $@ + +test: $(BINDIR)/testzfp + $(BINDIR)/testzfp + +clean: + rm -f $(TARGETS) diff --git a/zfp/tests/testzfp.cpp b/zfp/tests/testzfp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..883e4d56a348cf405cfbe596f494526ffc5df59d --- /dev/null +++ b/zfp/tests/testzfp.cpp @@ -0,0 +1,1017 @@ +#include <algorithm> +#include <cmath> +#include <ctime> +#include <cstdio> +#include <cstdlib> +#include <iomanip> +#include <iostream> +#include <limits> +#include <numeric> +#include <sstream> +#include <string> +#include "zfp.h" +#include "zfparray1.h" +#include "zfparray2.h" +#include "zfparray3.h" + +enum ArraySize { + Small = 0, // 2^12 = 4096 scalars (2^12 = (2^6)^2 = (2^4)^3 = (2^3)^4) + Large = 1 // 2^24 = 16 M scalars (2^24 = (2^12)^2 = (2^8)^3 = (2^6)^4) +}; + +enum ScalarType { + Float = 0, // 32-bit single precision + Double = 1 // 64-bit double precision +}; + +static const int width = 72; // characters per line + +inline uint +mask(uint i) +{ + return 1u << i; +} + +inline uint +test_size(ArraySize size) +{ + return 2u << size; +} + +// refine 1D array f[m] to g[2m] +inline void +refine1d(int* g, const int* f, size_t m) +{ + const int weight[4] = { -1, 9, 9, -1 }; + const size_t n = 2 * m; + + for (size_t x = 0; x < n; x++) { + int s = 0; + for (int i = 0; i < 4; i++) { + size_t xx = x & 1u ? 
(x / 2 + i - 1 + m) % m : x / 2; + s += weight[i] * f[xx]; + } + g[x] = s / 16; + } +} + +// refine 2D array f[m][m] to g[2m][2m] +inline void +refine2d(int* g, const int* f, size_t m) +{ + const int weight[4] = { -1, 9, 9, -1 }; + const size_t n = 2 * m; + + for (size_t y = 0; y < n; y++) + for (size_t x = 0; x < n; x++) { + int s = 0; + for (int j = 0; j < 4; j++) { + size_t yy = y & 1u ? (y / 2 + j - 1 + m) % m : y / 2; + for (int i = 0; i < 4; i++) { + size_t xx = x & 1u ? (x / 2 + i - 1 + m) % m : x / 2; + s += weight[i] * weight[j] * f[xx + m * yy]; + } + } + g[x + n * y] = s / (16 * 16); + } +} + +// refine 3D array f[m][m][m] to g[2m][2m][2m] +inline void +refine3d(int* g, const int* f, size_t m) +{ + const int weight[4] = { -1, 9, 9, -1 }; + const size_t n = 2 * m; + + for (size_t z = 0; z < n; z++) + for (size_t y = 0; y < n; y++) + for (size_t x = 0; x < n; x++) { + int s = 0; + for (int k = 0; k < 4; k++) { + size_t zz = z & 1u ? (z / 2 + k - 1 + m) % m : z / 2; + for (int j = 0; j < 4; j++) { + size_t yy = y & 1u ? (y / 2 + j - 1 + m) % m : y / 2; + for (int i = 0; i < 4; i++) { + size_t xx = x & 1u ? (x / 2 + i - 1 + m) % m : x / 2; + s += weight[i] * weight[j] * weight[k] * f[xx + m * (yy + m * zz)]; + } + } + } + g[x + n * (y + n * z)] = s / (16 * 16 * 16); + } +} + +// refine 4D array f[m][m][m][m] to g[2m][2m][2m][2m] +inline void +refine4d(int* g, const int* f, size_t m) +{ + const int weight[4] = { -1, 9, 9, -1 }; + const size_t n = 2 * m; + + for (size_t w = 0; w < n; w++) + for (size_t z = 0; z < n; z++) + for (size_t y = 0; y < n; y++) + for (size_t x = 0; x < n; x++) { + int s = 0; + for (int l = 0; l < 4; l++) { + size_t ww = w & 1u ? (w / 2 + l - 1 + m) % m : w / 2; + for (int k = 0; k < 4; k++) { + size_t zz = z & 1u ? (z / 2 + k - 1 + m) % m : z / 2; + for (int j = 0; j < 4; j++) { + size_t yy = y & 1u ? (y / 2 + j - 1 + m) % m : y / 2; + for (int i = 0; i < 4; i++) { + size_t xx = x & 1u ? 
(x / 2 + i - 1 + m) % m : x / 2; + s += weight[i] * weight[j] * weight[k] * weight[l] * f[xx + m * (yy + m * (zz + m * ww))]; + } + } + } + } + g[x + n * (y + n * (z + n * w))] = s / (16 * 16 * 16 * 16); + } +} + +template <typename real> +inline void +convert_ints_to_reals(real* data, const int* f, size_t n) +{ + for (size_t i = 0; i < n; i++) + data[i] = std::ldexp(real(f[i]), -12); +} + +// generate 1D test array of size n +template <typename real> +inline bool +gen_array_1d(real* data, size_t n) +{ + // ensure n >= 4 is a power of two + if (n < 4 || n & (n - 1)) + return false; + + // initialize 4-element integer array + int* f = new int[n]; + std::fill(f, f + 4, 0); + for (uint x = 1; x < 3; x++) + f[x] = 0x10000 * (1 - 2 * int(x & 1u)); + + // refine to n-element array + int* g = new int[n]; + for (size_t m = 4; m < n; m *= 2) { + refine1d(g, f, m); + std::swap(f, g); + } + delete[] g; + + // convert ints to real type + convert_ints_to_reals(data, f, n); + delete[] f; + + return true; +} + +// generate 2D test array of size n^2 +template <typename real> +inline bool +gen_array_2d(real* data, size_t n) +{ + // ensure n >= 4 is a power of two + if (n < 4 || n & (n - 1)) + return false; + + // initialize 4x4 integer array + int* f = new int[n * n]; + std::fill(f, f + 4 * 4, 0); + for (uint y = 1; y < 3; y++) + for (uint x = 1; x < 3; x++) + f[x + 4 * y] = 0x10000 * (1 - 2 * int((x ^ y) & 1u)); + + // refine to n^2 array + int* g = new int[n * n]; + for (size_t m = 4; m < n; m *= 2) { + refine2d(g, f, m); + std::swap(f, g); + } + delete[] g; + + // convert ints to real type + convert_ints_to_reals(data, f, n * n); + delete[] f; + + return true; +} + +// generate 3D test array of size n^3 +template <typename real> +inline bool +gen_array_3d(real* data, size_t n) +{ + // ensure n >= 4 is a power of two + if (n < 4 || n & (n - 1)) + return false; + + // initialize 4x4x4 integer array + int* f = new int[n * n * n]; + std::fill(f, f + 4 * 4 * 4, 0); + for (uint z = 1; 
z <= 2u; z++) + for (uint y = 1; y <= 2u; y++) + for (uint x = 1; x <= 2u; x++) + f[x + 4 * (y + 4 * z)] = 0x10000 * (1 - 2 * int((x ^ y ^ z) & 1u)); + + // refine to n^3 array + int* g = new int[n * n * n]; + for (size_t m = 4; m < n; m *= 2) { + refine3d(g, f, m); + std::swap(f, g); + } + delete[] g; + + // convert ints to real type + convert_ints_to_reals(data, f, n * n * n); + delete[] f; + + return true; +} + +// generate 4D test array of size n^4 +template <typename real> +inline bool +gen_array_4d(real* data, size_t n) +{ + // ensure n >= 4 is a power of two + if (n < 4 || n & (n - 1)) + return false; + + // initialize 4x4x4x4 integer array + int* f = new int[n * n * n * n]; + std::fill(f, f + 4 * 4 * 4 * 4, 0); + for (uint w = 1; w < 3; w++) + for (uint z = 1; z < 3; z++) + for (uint y = 1; y < 3; y++) + for (uint x = 1; x < 3; x++) + f[x + 4 * (y + 4 * (z + 4 * w))] = 0x10000 * (1 - 2 * int((x ^ y ^ z ^ w) & 1u)); + + // refine to n^4 array + int* g = new int[n * n * n * n]; + for (size_t m = 4; m < n; m *= 2) { + refine4d(g, f, m); + std::swap(f, g); + } + delete[] g; + + // convert ints to real type + convert_ints_to_reals(data, f, n * n * n * n); + delete[] f; + + return true; +} + +// initialize array +template <typename Scalar> +inline void +initialize(Scalar* p, uint dims, ArraySize array_size) +{ + size_t size = 1ul << ((array_size == Small ? 
12 : 24) / dims); + + switch (dims) { + default: + case 1: + gen_array_1d<Scalar>(p, size); + break; + case 2: + gen_array_2d<Scalar>(p, size); + break; + case 3: + gen_array_3d<Scalar>(p, size); + break; + case 4: + gen_array_4d<Scalar>(p, size); + break; + } +} + +// compute checksum +inline uint32 +hash(const void* p, size_t n) +{ + uint32 h = 0; + for (const uchar* q = static_cast<const uchar*>(p); n; q++, n--) { + // Jenkins one-at-a-time hash; see http://www.burtleburtle.net/bob/hash/doobs.html + h += *q; + h += h << 10; + h ^= h >> 6; + } + h += h << 3; + h ^= h >> 11; + h += h << 15; + return h; +} + +// test fixed-rate mode +template <typename Scalar> +inline uint +test_rate(zfp_stream* stream, const zfp_field* input, double rate, Scalar tolerance, bool timings = false) +{ + uint failures = 0; + size_t n = zfp_field_size(input, NULL); + uint dims = zfp_field_dimensionality(input); + zfp_type type = zfp_field_type(input); + + // allocate memory for compressed data + rate = zfp_stream_set_rate(stream, rate, type, dims, 0); + size_t bufsize = zfp_stream_maximum_size(stream, input); + uchar* buffer = new uchar[bufsize]; + bitstream* s = stream_open(buffer, bufsize); + zfp_stream_set_bit_stream(stream, s); + + // perform compression test + std::ostringstream status; + status << " compress: "; + status << " rate=" << std::fixed << std::setprecision(0) << std::setw(2) << rate; + clock_t c = clock(); + zfp_stream_rewind(stream); + size_t outsize = zfp_compress(stream, input); + double time = double(clock() - c) / CLOCKS_PER_SEC; + double throughput = (n * sizeof(Scalar)) / (0x100000 * time); + if (timings) + status << " throughput=" << std::setprecision(1) << std::setw(6) << throughput << " MB/s"; + bool pass = true; + // make sure compressed size matches rate + size_t bytes = (size_t)floor(rate * zfp_field_size(input, NULL) / CHAR_BIT + 0.5); + if (outsize != bytes) { + status << " [" << outsize << " != " << bytes << "]"; + pass = false; + } + std::cout << 
std::setw(width) << std::left << status.str() << (pass ? " OK " : "FAIL") << std::endl; + if (!pass) + failures++; + + // perform decompression test + status.str(""); + status << " decompress:"; + status << " rate=" << std::fixed << std::setprecision(0) << std::setw(2) << rate; + Scalar* g = new Scalar[n]; + zfp_field* output = zfp_field_alloc(); + *output = *input; + zfp_field_set_pointer(output, g); + c = clock(); + zfp_stream_rewind(stream); + pass = !!zfp_decompress(stream, output); + if (!pass) + status << " [decompression failed]"; + else { + double time = double(clock() - c) / CLOCKS_PER_SEC; + double throughput = (n * sizeof(Scalar)) / (0x100000 * time); + if (timings) + status << " throughput=" << std::setprecision(1) << std::setw(6) << throughput << " MB/s"; + // compute max error + Scalar* f = static_cast<Scalar*>(zfp_field_pointer(input)); + Scalar emax = 0; + for (uint i = 0; i < n; i++) + emax = std::max(emax, std::abs(f[i] - g[i])); + status << std::scientific; + status.precision(3); + // make sure max error is within tolerance + if (emax <= tolerance) + status << " " << emax << " <= " << tolerance; + else { + status << " [" << emax << " > " << tolerance << "]"; + pass = false; + } + } + zfp_field_free(output); + delete[] g; + stream_close(s); + delete[] buffer; + std::cout << std::setw(width) << std::left << status.str() << (pass ? 
" OK " : "FAIL") << std::endl; + if (!pass) + failures++; + + return failures; +} + +// test fixed-precision mode +template <typename Scalar> +inline uint +test_precision(zfp_stream* stream, const zfp_field* input, uint precision, size_t bytes) +{ + uint failures = 0; + size_t n = zfp_field_size(input, NULL); + + // allocate memory for compressed data + zfp_stream_set_precision(stream, precision); + size_t bufsize = zfp_stream_maximum_size(stream, input); + uchar* buffer = new uchar[bufsize]; + bitstream* s = stream_open(buffer, bufsize); + zfp_stream_set_bit_stream(stream, s); + + // perform compression test + std::ostringstream status; + status << " compress: "; + status << " precision=" << std::setw(2) << precision; + zfp_stream_rewind(stream); + size_t outsize = zfp_compress(stream, input); + double ratio = double(n * sizeof(Scalar)) / outsize; + status << " ratio=" << std::fixed << std::setprecision(3) << std::setw(7) << ratio; + bool pass = true; + // make sure compressed size agrees + if (outsize != bytes) { + status << " [" << outsize << " != " << bytes << "]"; + pass = false; + } + std::cout << std::setw(width) << std::left << status.str() << (pass ? " OK " : "FAIL") << std::endl; + if (!pass) + failures++; + + // perform decompression test + status.str(""); + status << " decompress:"; + status << " precision=" << std::setw(2) << precision; + Scalar* g = new Scalar[n]; + zfp_field* output = zfp_field_alloc(); + *output = *input; + zfp_field_set_pointer(output, g); + zfp_stream_rewind(stream); + pass = !!zfp_decompress(stream, output); + if (!pass) + status << " [decompression failed]"; + zfp_field_free(output); + delete[] g; + stream_close(s); + delete[] buffer; + std::cout << std::setw(width) << std::left << status.str() << (pass ? 
" OK " : "FAIL") << std::endl; + if (!pass) + failures++; + + return failures; +} + +// test fixed-accuracy mode +template <typename Scalar> +inline uint +test_accuracy(zfp_stream* stream, const zfp_field* input, Scalar tolerance, size_t bytes) +{ + uint failures = 0; + size_t n = zfp_field_size(input, NULL); + + // allocate memory for compressed data + tolerance = static_cast<Scalar>(zfp_stream_set_accuracy(stream, tolerance)); + size_t bufsize = zfp_stream_maximum_size(stream, input); + uchar* buffer = new uchar[bufsize]; + bitstream* s = stream_open(buffer, bufsize); + zfp_stream_set_bit_stream(stream, s); + + // perform compression test + std::ostringstream status; + status << " compress: "; + status << " tolerance=" << std::scientific << std::setprecision(3) << tolerance; + zfp_stream_rewind(stream); + size_t outsize = zfp_compress(stream, input); + double ratio = double(n * sizeof(Scalar)) / outsize; + status << " ratio=" << std::fixed << std::setprecision(3) << std::setw(7) << ratio; + bool pass = true; + // make sure compressed size agrees + if (outsize != bytes) { + status << " [" << outsize << " != " << bytes << "]"; + pass = false; + } + std::cout << std::setw(width) << std::left << status.str() << (pass ? 
" OK " : "FAIL") << std::endl; + if (!pass) + failures++; + + // perform decompression test + status.str(""); + status << " decompress:"; + status << " tolerance=" << std::scientific << std::setprecision(3) << tolerance; + Scalar* g = new Scalar[n]; + zfp_field* output = zfp_field_alloc(); + *output = *input; + zfp_field_set_pointer(output, g); + zfp_stream_rewind(stream); + pass = !!zfp_decompress(stream, output); + if (!pass) + status << " [decompression failed]"; + else { + // compute max error + Scalar* f = static_cast<Scalar*>(zfp_field_pointer(input)); + Scalar emax = 0; + for (uint i = 0; i < n; i++) + emax = std::max(emax, std::abs(f[i] - g[i])); + status << std::scientific << std::setprecision(3) << " "; + // make sure max error is within tolerance + if (emax <= tolerance) + status << emax << " <= " << tolerance; + else if (tolerance == 0) + status << "(" << emax << " > 0)"; + else { + status << "[" << emax << " > " << tolerance << "]"; + pass = false; + } + } + zfp_field_free(output); + delete[] g; + stream_close(s); + delete[] buffer; + std::cout << std::setw(width) << std::left << status.str() << (pass ? 
" OK " : "FAIL") << std::endl; + if (!pass) + failures++; + + return failures; +} + +// perform 1D differencing +template <typename Scalar> +inline void +update_array1(zfp::array1<Scalar>& a) +{ + for (uint i = 0; i < a.size() - 1; i++) + a(i) -= a(i + 1); + for (uint i = 0; i < a.size() - 1; i++) + a(0) = std::max(a(0), a(i)); +} + +// perform 2D differencing +template <typename Scalar> +inline void +update_array2(zfp::array2<Scalar>& a) +{ + for (uint j = 0; j < a.size_y(); j++) + for (uint i = 0; i < a.size_x() - 1; i++) + a(i, j) -= a(i + 1, j); + for (uint j = 0; j < a.size_y() - 1; j++) + for (uint i = 0; i < a.size_x(); i++) + a(i, j) -= a(i, j + 1); + for (uint j = 0; j < a.size_y() - 1; j++) + for (uint i = 0; i < a.size_x() - 1; i++) + a(0, 0) = std::max(a(0, 0), a(i, j)); +} + +// perform 3D differencing +template <typename Scalar> +inline void +update_array3(zfp::array3<Scalar>& a) +{ + for (uint k = 0; k < a.size_z(); k++) + for (uint j = 0; j < a.size_y(); j++) + for (uint i = 0; i < a.size_x() - 1; i++) + a(i, j, k) -= a(i + 1, j, k); + for (uint k = 0; k < a.size_z(); k++) + for (uint j = 0; j < a.size_y() - 1; j++) + for (uint i = 0; i < a.size_x(); i++) + a(i, j, k) -= a(i, j + 1, k); + for (uint k = 0; k < a.size_z() - 1; k++) + for (uint j = 0; j < a.size_y(); j++) + for (uint i = 0; i < a.size_x(); i++) + a(i, j, k) -= a(i, j, k + 1); + for (uint k = 0; k < a.size_z() - 1; k++) + for (uint j = 0; j < a.size_y() - 1; j++) + for (uint i = 0; i < a.size_x() - 1; i++) + a(0, 0, 0) = std::max(a(0, 0, 0), a(i, j, k)); +} + +template <class Array> +inline void update_array(Array& a); + +template <> +inline void +update_array(zfp::array1<float>& a) { update_array1(a); } + +template <> +inline void +update_array(zfp::array1<double>& a) { update_array1(a); } + +template <> +inline void +update_array(zfp::array2<float>& a) { update_array2(a); } + +template <> +inline void +update_array(zfp::array2<double>& a) { update_array2(a); } + +template <> +inline 
void +update_array(zfp::array3<float>& a) { update_array3(a); } + +template <> +inline void +update_array(zfp::array3<double>& a) { update_array3(a); } + +// test random-accessible array primitive +template <class Array, typename Scalar> +inline uint +test_array(Array& a, const Scalar* f, uint n, double tolerance, double dfmax) +{ + uint failures = 0; + + // test construction + std::ostringstream status; + status << " construct: "; + Scalar emax = 0; + for (uint i = 0; i < n; i++) + emax = std::max(emax, std::abs(f[i] - a[i])); + status << std::scientific; + status.precision(3); + // make sure max error is within tolerance + bool pass = true; + if (emax <= tolerance) + status << " " << emax << " <= " << tolerance; + else { + status << " [" << emax << " > " << tolerance << "]"; + pass = false; + } + + std::cout << std::setw(width) << std::left << status.str() << (pass ? " OK " : "FAIL") << std::endl; + if (!pass) + failures++; + + // test array updates + status.str(""); + status << " update: "; + update_array(a); + Scalar amax = a[0]; + pass = true; + if (std::abs(amax - dfmax) <= 1e-3 * dfmax) + status << " " << amax << " ~ " << dfmax; + else { + status << " [" << amax << " != " << dfmax << "]"; + pass = false; + } + + std::cout << std::setw(width) << std::left << status.str() << (pass ? 
" OK " : "FAIL") << std::endl; + if (!pass) + failures++; + + return failures; +} + +// test small or large d-dimensional arrays of type Scalar +template <typename Scalar> +inline uint +test(uint dims, ArraySize array_size) +{ + uint failures = 0; + uint m = test_size(array_size); + uint n = m * m * m * m * m * m * m * m * m * m * m * m; + Scalar* f = new Scalar[n]; + + // determine array size + uint nx, ny, nz ,nw; + zfp_field* field = zfp_field_alloc(); + zfp_field_set_type(field, zfp::codec<Scalar>::type); + zfp_field_set_pointer(field, f); + switch (dims) { + case 1: + nx = n; + ny = nz = nw = 0; + zfp_field_set_size_1d(field, nx); + break; + case 2: + nx = ny = m * m * m * m * m * m; + nz = nw = 0; + zfp_field_set_size_2d(field, nx, ny); + break; + case 3: + nx = ny = nz = m * m * m * m; + nw = 0; + zfp_field_set_size_3d(field, nx, ny, nz); + break; + case 4: + nx = ny = nz = nw = m * m * m; + zfp_field_set_size_4d(field, nx, ny, nz, nw); + break; + default: + std::cout << "invalid dimensions " << dims << std::endl; + return 1; + } + initialize<Scalar>(f, dims, array_size); + uint t = (zfp_field_type(field) == zfp_type_float ? 0 : 1); + std::cout << "testing " << dims << "D array of " << (t == 0 ? 
"floats" : "doubles") << std::endl; + + // test data integrity + uint32 checksum[2][2][4] = { + // small + {{ 0x54174c44u, 0x86609589u, 0xfc0a6a76u, 0xa3481e00u }, + { 0x7d257bb6u, 0x294bb210u, 0x68614d26u, 0xf6bd3a21u }}, + // large + {{ 0xd1ce1aceu, 0x644274dau, 0xc0ad63fau, 0x700de480u }, + { 0xc3ed7116u, 0x644e2117u, 0xd7464b07u, 0x2516382eu }}, + }; + uint32 h = hash(f, n * sizeof(Scalar)); + if (h != checksum[array_size][t][dims - 1]) + std::cout << "warning: test data checksum " << std::hex << h << " != " << checksum[array_size][t][dims - 1] << "; tests below may fail" << std::endl; + + // open compressed stream + zfp_stream* stream = zfp_stream_open(0); + + // test fixed rate + for (uint rate = 2u >> t, i = 0; rate <= 32 * (t + 1); rate *= 4, i++) { + // expected max errors + double emax[2][2][4][4] = { + // small + { + { + {1.627e+01, 8.277e-02, 0.000e+00}, + {1.500e+00, 3.663e-03, 0.000e+00}, + {1.500e+00, 9.583e-03, 0.000e+00}, + {1.373e+01, 6.633e-01, 0.000e+00}, + }, + { + {1.627e+01, 1.601e+01, 1.832e-04, 0.000e+00}, + {2.376e+01, 1.797e-01, 8.584e-06, 0.000e+00}, + {5.210e+00, 2.002e-01, 3.338e-05, 0.000e+00}, + {1.016e+01, 8.985e+00, 3.312e-03, 0.000e+00}, + }, + }, + // large + { + { + {1.627e+01, 2.100e-02, 0.000e+00}, + {1.624e-01, 7.439e-05, 0.000e+00}, + {1.001e-02, 7.248e-05, 0.000e+00}, + {2.527e-02, 2.460e-04, 0.000e+00}, + }, + { + {1.627e+01, 1.601e+01, 2.289e-05, 0.000e+00}, + {1.607e+01, 2.076e-03, 0.000e+00, 0.000e+00}, + {1.407e-01, 7.344e-04, 0.000e+00, 0.000e+00}, + {1.436e-01, 2.659e-03, 8.801e-08, 0.000e+00}, + } + } + }; + failures += test_rate<Scalar>(stream, field, rate, static_cast<Scalar>(emax[array_size][t][dims - 1][i]), array_size == Large); + } + + if (stream_word_bits != 64) + std::cout << "warning: stream word size is smaller than 64; tests below may fail" << std::endl; + + // test fixed precision + for (uint prec = 4u << t, i = 0; i < 3; prec *= 2, i++) { + // expected compressed sizes + size_t bytes[2][2][4][3] = { + 
// small + { + { + {2192, 3280, 6328}, + { 592, 1328, 4384}, + { 152, 1040, 4600}, + { 64, 1760, 5856}, + }, + { + {3664, 6712, 14104}, + {1424, 4480, 12616}, + {1064, 4624, 12808}, + {1768, 5864, 14056}, + }, + }, + // large + { + { + {8965672, 13160560, 21835352}, + {2235560, 3512848, 10309240}, + { 568456, 1361056, 8759696}, + { 134344, 739632, 8896360}, + }, + { + {14733112, 23407904, 44997832}, + { 3905240, 10701640, 40856544}, + { 1458368, 8857008, 41270184}, + { 763928, 8920656, 41574712}, + }, + } + }; + failures += test_precision<Scalar>(stream, field, prec, bytes[array_size][t][dims - 1][i]); + } + + // test fixed accuracy + for (uint i = 0; i < 3; i++) { + Scalar tol[] = { Scalar(1e-3), 2 * std::numeric_limits<Scalar>::epsilon(), 0 }; + // expected compressed sizes + size_t bytes[2][2][4][3] = { + // small + { + { + {6328, 11944, 13720}, + {4936, 11064, 12520}, + {6104, 11752, 12784}, + {9440, 14048, 14048}, + }, + { + {6712, 25888, 29064}, + {5032, 26016, 28984}, + {6128, 27120, 29192}, + {9448, 30440, 30440}, + }, + }, + // large + { + { + {21815976, 38285256, 43425280}, + { 9187232, 32695984, 40464144}, + { 8914336, 33364208, 41172864}, + {12109200, 35921784, 41550416}, + }, + { + {23388528, 79426016, 88659304}, + { 9579632, 89770896, 103388072}, + { 9011648, 94009072, 107606336}, + {12133496, 97126288, 107911568}, + }, + } + }; + failures += test_accuracy<Scalar>(stream, field, tol[i], bytes[array_size][t][dims - 1][i]); + } + + // test compressed array support + double emax[2][2][3] = { + // small + { + {4.578e-05, 7.630e-06, 3.148e-05}, + {1.832e-04, 8.584e-06, 3.338e-05}, + }, + // large + { + {0.000e+00, 0.000e+00, 0.000e+00}, + {2.289e-05, 0.000e+00, 0.000e+00}, + } + }; + double dfmax[2][2][3] = { + // small + { + {2.155e-02, 3.755e-01, 1.846e+00}, + {2.155e-02, 3.755e-01, 1.846e+00}, + }, + // large + { + {2.441e-04, 4.883e-04, 1.221e-03}, + {2.670e-04, 4.883e-04, 1.221e-03}, + } + }; + double rate = 16; + switch (dims) { + case 1: { + 
zfp::array1<Scalar> a(nx, rate, f); + failures += test_array(a, f, n, static_cast<Scalar>(emax[array_size][t][dims - 1]), static_cast<Scalar>(dfmax[array_size][t][dims - 1])); + } + break; + case 2: { + zfp::array2<Scalar> a(nx, ny, rate, f); + failures += test_array(a, f, n, static_cast<Scalar>(emax[array_size][t][dims - 1]), static_cast<Scalar>(dfmax[array_size][t][dims - 1])); + } + break; + case 3: { + zfp::array3<Scalar> a(nx, ny, nz, rate, f); + failures += test_array(a, f, n, static_cast<Scalar>(emax[array_size][t][dims - 1]), static_cast<Scalar>(dfmax[array_size][t][dims - 1])); + } + break; + case 4: // 4D arrays not yet supported + break; + } + + std::cout << std::endl; + zfp_stream_close(stream); + zfp_field_free(field); + + delete[] f; + return failures; +} + +// various library and compiler sanity checks +inline uint +common_tests() +{ + uint failures = 0; + // test library version + if (zfp_codec_version != ZFP_CODEC || zfp_library_version != ZFP_VERSION) { + std::cout << "library header and binary version mismatch" << std::endl; + failures++; + } + // ensure integer type sizes are correct + if (sizeof(int8) != 1u || sizeof(uint8) != 1u) { + std::cout << "8-bit integer type is not one byte wide" << std::endl; + failures++; + } + if (sizeof(int16) != 2u || sizeof(uint16) != 2u) { + std::cout << "16-bit integer type is not two bytes wide" << std::endl; + failures++; + } + if (sizeof(int32) != 4u || sizeof(uint32) != 4u) { + std::cout << "32-bit integer type is not four bytes wide" << std::endl; + failures++; + } + if (sizeof(int64) != 8u || sizeof(uint64) != 8u) { + std::cout << "64-bit integer type is not eight bytes wide" << std::endl; + failures++; + } + // ensure signed right shifts are arithmetic + int32 x32 = -2; + if ((x32 >> 1) != -1 || (x32 >> 2) != -1) { + std::cout << "32-bit arithmetic right shift not supported" << std::endl; + failures++; + } + int64 x64 = -2; + if ((x64 >> 1) != INT64C(-1) || (x64 >> 2) != INT64C(-1)) { + std::cout << 
"64-bit arithmetic right shift not supported" << std::endl; + failures++; + } + // testing requires default (64-bit) stream words + if (stream_word_bits != 64) { + std::cout << "regression testing requires BIT_STREAM_WORD_TYPE=uint64" << std::endl; + failures++; + } + return failures; +} + +int main(int argc, char* argv[]) +{ + std::cout << zfp_version_string << std::endl; + std::cout << "library version " << zfp_library_version << std::endl; + std::cout << "CODEC version " << zfp_codec_version << std::endl; + std::cout << "data model "; + size_t model = ((sizeof(uint64) - 1) << 12) + + ((sizeof(void*) - 1) << 8) + + ((sizeof(unsigned long int) - 1) << 4) + + ((sizeof(unsigned int) - 1) << 0); + switch (model) { + case 0x7331u: + std::cout << "LP32"; + break; + case 0x7333u: + std::cout << "ILP32"; + break; + case 0x7733u: + std::cout << "LLP64"; + break; + case 0x7773u: + std::cout << "LP64"; + break; + case 0x7777u: + std::cout << "ILP64"; + break; + default: + std::cout << "unknown (0x" << std::hex << model << ")"; + break; + } + std::cout << std::endl; + std::cout << std::endl; + + uint sizes = 0; + uint types = 0; + uint dims = 0; + + for (int i = 1; i < argc; i++) + if (std::string(argv[i]) == "small") + sizes |= mask(Small); + else if (std::string(argv[i]) == "large") + sizes |= mask(Large); + else if (std::string(argv[i]) == "float" || std::string(argv[i]) == "fp32") + types |= mask(Float); + else if (std::string(argv[i]) == "double" || std::string(argv[i]) == "fp64") + types |= mask(Double); + else if (std::string(argv[i]) == "1d") + dims |= mask(1); + else if (std::string(argv[i]) == "2d") + dims |= mask(2); + else if (std::string(argv[i]) == "3d") + dims |= mask(3); + else if (std::string(argv[i]) == "4d") + dims |= mask(4); + else if (std::string(argv[i]) == "all") { + sizes |= mask(Small) | mask(Large); + types |= mask(Float) | mask(Double); + dims |= mask(1) | mask(2) | mask(3) | mask(4); + } + else { + std::cerr << "Usage: testzfp [all] [small|large] 
[fp32|fp64|float|double] [1d|2d|3d|4d]" << std::endl; + return EXIT_FAILURE; + } + + // use defaults if not specified + if (!sizes) + sizes = mask(Small); + if (!types) + types = mask(Float) | mask(Double); + if (!dims) + dims = mask(1) | mask(2) | mask(3) | mask(4); + + // test library and compiler + uint failures = common_tests(); + if (failures) + return EXIT_FAILURE; + + // test arrays + for (int size = Small; size <= Large; size++) + if (sizes & mask(ArraySize(size))) { + for (uint d = 1; d <= 4; d++) + if (dims & mask(d)) { + if (types & mask(Float)) + failures += test<float>(d, ArraySize(size)); + if (types & mask(Double)) + failures += test<double>(d, ArraySize(size)); + } + } + + if (failures) + std::cout << failures << " test(s) failed" << std::endl; + else + std::cout << "all tests passed" << std::endl; + + return failures ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/zfp/travis.sh b/zfp/travis.sh new file mode 100755 index 0000000000000000000000000000000000000000..2314e0376889f7d8a7a8bc8de1343c3abb334f99 --- /dev/null +++ b/zfp/travis.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env sh +set -e + +mkdir build +cd build + +# build/test without OpenMP, with CFP +cmake .. -DCMAKE_C_STANDARD=${C_STANDARD:-99} -DCMAKE_CXX_STANDARD=${CXX_STANDARD:-98} -DZFP_WITH_OPENMP=OFF -DBUILD_CFP=ON +cmake --build . +ctest -V -C "Debug" + +rm -rf ./* + +# build/test with OpenMP, with CFP custom namespace +cmake .. -DCMAKE_C_STANDARD=${C_STANDARD:-99} -DCMAKE_CXX_STANDARD=${CXX_STANDARD:-98} -DBUILD_CFP=ON -DCFP_NAMESPACE=cfp2 +cmake --build . 
+ctest -V -C "Debug" diff --git a/zfp/utils/CMakeLists.txt b/zfp/utils/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a960d2c9cdbf2363689bbe47130578d8655bd7f4 --- /dev/null +++ b/zfp/utils/CMakeLists.txt @@ -0,0 +1,6 @@ +add_executable(zfpcmd zfp.c) +set_property(TARGET zfpcmd PROPERTY OUTPUT_NAME zfp) +target_link_libraries(zfpcmd zfp) +if(HAVE_LIBM_MATH) + target_link_libraries(zfpcmd m) +endif() diff --git a/zfp/utils/Makefile b/zfp/utils/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..50a40ce9e7d8dfd7c15f48468fefb9db0ccc08df --- /dev/null +++ b/zfp/utils/Makefile @@ -0,0 +1,12 @@ +include ../Config + +TARGET = ../bin/zfp + +all: $(TARGET) + +$(TARGET): zfp.c ../lib/$(LIBZFP) + mkdir -p ../bin + $(CC) $(CFLAGS) zfp.c -L../lib -lzfp -lm -o $(TARGET) + +clean: + rm -f $(TARGET) fields.o diff --git a/zfp/utils/zfp.c b/zfp/utils/zfp.c new file mode 100644 index 0000000000000000000000000000000000000000..d4f4cf6f512fe8a3bfa8b5f412948eb9b4a24ee1 --- /dev/null +++ b/zfp/utils/zfp.c @@ -0,0 +1,646 @@ +#include <float.h> +#include <limits.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "zfp.h" +#include "zfp/macros.h" + +/* +File I/O is done using the following combinations of i, o, s, and z: +- i : read uncompressed +- z : read compressed +- i, s: read uncompressed, print stats +- i, o: read and write uncompressed +- i, z: read uncompressed, write compressed +- z, o: read compressed, write uncompressed + +The 7 major tasks to be accomplished are: +- read uncompressed: i +- read compressed: !i +- compress: i +- write compressed: i && z +- decompress: o || s || (!i && z) +- write uncompressed: o +- compute stats: s +*/ + +/* compute and print reconstruction error */ +static void +print_error(const void* fin, const void* fout, zfp_type type, size_t n) +{ + const int32* i32i = (const int32*)fin; + const int64* i64i = (const int64*)fin; + const float* f32i = (const 
float*)fin; + const double* f64i = (const double*)fin; + const int32* i32o = (const int32*)fout; + const int64* i64o = (const int64*)fout; + const float* f32o = (const float*)fout; + const double* f64o = (const double*)fout; + double fmin = +DBL_MAX; + double fmax = -DBL_MAX; + double erms = 0; + double ermsn = 0; + double emax = 0; + double psnr = 0; + size_t i; + + for (i = 0; i < n; i++) { + double d, val; + switch (type) { + case zfp_type_int32: + d = fabs((double)(i32i[i] - i32o[i])); + val = (double)i32i[i]; + break; + case zfp_type_int64: + d = fabs((double)(i64i[i] - i64o[i])); + val = (double)i64i[i]; + break; + case zfp_type_float: + d = fabs((double)(f32i[i] - f32o[i])); + val = (double)f32i[i]; + break; + case zfp_type_double: + d = fabs(f64i[i] - f64o[i]); + val = f64i[i]; + break; + default: + return; + } + emax = MAX(emax, d); + erms += d * d; + fmin = MIN(fmin, val); + fmax = MAX(fmax, val); + } + erms = sqrt(erms / n); + ermsn = erms / (fmax - fmin); + psnr = 20 * log10((fmax - fmin) / (2 * erms)); + fprintf(stderr, " rmse=%.4g nrmse=%.4g maxe=%.4g psnr=%.2f", erms, ermsn, emax, psnr); +} + +static void +usage() +{ + fprintf(stderr, "%s\n", zfp_version_string); + fprintf(stderr, "Usage: zfp <options>\n"); + fprintf(stderr, "General options:\n"); + fprintf(stderr, " -h : read/write array and compression parameters from/to compressed header\n"); + fprintf(stderr, " -q : quiet mode; suppress output\n"); + fprintf(stderr, " -s : print error statistics\n"); + fprintf(stderr, "Input and output:\n"); + fprintf(stderr, " -i <path> : uncompressed binary input file (\"-\" for stdin)\n"); + fprintf(stderr, " -o <path> : decompressed binary output file (\"-\" for stdout)\n"); + fprintf(stderr, " -z <path> : compressed input (w/o -i) or output file (\"-\" for stdin/stdout)\n"); + fprintf(stderr, "Array type and dimensions (needed with -i):\n"); + fprintf(stderr, " -f : single precision (float type)\n"); + fprintf(stderr, " -d : double precision (double 
type)\n"); + fprintf(stderr, " -t <i32|i64|f32|f64> : integer or floating scalar type\n"); + fprintf(stderr, " -1 <nx> : dimensions for 1D array a[nx]\n"); + fprintf(stderr, " -2 <nx> <ny> : dimensions for 2D array a[ny][nx]\n"); + fprintf(stderr, " -3 <nx> <ny> <nz> : dimensions for 3D array a[nz][ny][nx]\n"); + fprintf(stderr, " -4 <nx> <ny> <nz> <nw> : dimensions for 4D array a[nw][nz][ny][nx]\n"); + fprintf(stderr, "Compression parameters (needed with -i):\n"); + fprintf(stderr, " -r <rate> : fixed rate (# compressed bits per floating-point value)\n"); + fprintf(stderr, " -p <precision> : fixed precision (# uncompressed bits per value)\n"); + fprintf(stderr, " -a <tolerance> : fixed accuracy (absolute error tolerance)\n"); + fprintf(stderr, " -c <minbits> <maxbits> <maxprec> <minexp> : advanced usage\n"); + fprintf(stderr, " minbits : min # bits per 4^d values in d dimensions\n"); + fprintf(stderr, " maxbits : max # bits per 4^d values in d dimensions (0 for unlimited)\n"); + fprintf(stderr, " maxprec : max # bits of precision per value (0 for full)\n"); + fprintf(stderr, " minexp : min bit plane # coded (-1074 for all bit planes)\n"); + fprintf(stderr, "Execution parameters:\n"); + fprintf(stderr, " -x serial : serial compression (default)\n"); + fprintf(stderr, " -x omp[=threads[,chunk_size]] : OpenMP parallel compression\n"); + fprintf(stderr, " -x cuda : CUDA fixed rate parallel compression/decompression\n"); + fprintf(stderr, "Examples:\n"); + fprintf(stderr, " -i file : read uncompressed file and compress to memory\n"); + fprintf(stderr, " -z file : read compressed file and decompress to memory\n"); + fprintf(stderr, " -i ifile -z zfile : read uncompressed ifile, write compressed zfile\n"); + fprintf(stderr, " -z zfile -o ofile : read compressed zfile, write decompressed ofile\n"); + fprintf(stderr, " -i ifile -o ofile : read ifile, compress, decompress, write ofile\n"); + fprintf(stderr, " -i file -s : read uncompressed file, compress to memory, print 
stats\n"); + fprintf(stderr, " -i - -o - -s : read stdin, compress, decompress, write stdout, print stats\n"); + fprintf(stderr, " -f -3 100 100 100 -r 16 : 2x fixed-rate compression of 100x100x100 floats\n"); + fprintf(stderr, " -d -1 1000000 -r 32 : 2x fixed-rate compression of 1M doubles\n"); + fprintf(stderr, " -d -2 1000 1000 -p 32 : 32-bit precision compression of 1000x1000 doubles\n"); + fprintf(stderr, " -d -1 1000000 -a 1e-9 : compression of 1M doubles with < 1e-9 max error\n"); + fprintf(stderr, " -d -1 1000000 -c 64 64 0 -1074 : 4x fixed-rate compression of 1M doubles\n"); + fprintf(stderr, " -x omp=16,256 : parallel compression with 16 threads, 256-block chunks\n"); + exit(EXIT_FAILURE); +} + +int main(int argc, char* argv[]) +{ + /* default settings */ + zfp_type type = zfp_type_none; + size_t typesize = 0; + uint dims = 0; + uint nx = 0; + uint ny = 0; + uint nz = 0; + uint nw = 0; + size_t count = 0; + double rate = 0; + uint precision = 0; + double tolerance = 0; + uint minbits = ZFP_MIN_BITS; + uint maxbits = ZFP_MAX_BITS; + uint maxprec = ZFP_MAX_PREC; + int minexp = ZFP_MIN_EXP; + int header = 0; + int quiet = 0; + int stats = 0; + char* inpath = 0; + char* zfppath = 0; + char* outpath = 0; + char mode = 0; + zfp_exec_policy exec = zfp_exec_serial; + uint threads = 0; + uint chunk_size = 0; + + /* local variables */ + int i; + zfp_field* field = NULL; + zfp_stream* zfp = NULL; + bitstream* stream = NULL; + void* fi = NULL; + void* fo = NULL; + void* buffer = NULL; + size_t rawsize = 0; + size_t zfpsize = 0; + size_t bufsize = 0; + + if (argc == 1) + usage(); + + /* parse command-line arguments */ + for (i = 1; i < argc; i++) { + if (argv[i][0] != '-' || argv[i][2]) + usage(); + switch (argv[i][1]) { + case '1': + if (++i == argc || sscanf(argv[i], "%u", &nx) != 1) + usage(); + ny = nz = nw = 1; + dims = 1; + break; + case '2': + if (++i == argc || sscanf(argv[i], "%u", &nx) != 1 || + ++i == argc || sscanf(argv[i], "%u", &ny) != 1) + usage(); + nz 
= nw = 1; + dims = 2; + break; + case '3': + if (++i == argc || sscanf(argv[i], "%u", &nx) != 1 || + ++i == argc || sscanf(argv[i], "%u", &ny) != 1 || + ++i == argc || sscanf(argv[i], "%u", &nz) != 1) + usage(); + nw = 1; + dims = 3; + break; + case '4': + if (++i == argc || sscanf(argv[i], "%u", &nx) != 1 || + ++i == argc || sscanf(argv[i], "%u", &ny) != 1 || + ++i == argc || sscanf(argv[i], "%u", &nz) != 1 || + ++i == argc || sscanf(argv[i], "%u", &nw) != 1) + usage(); + dims = 4; + break; + case 'a': + if (++i == argc || sscanf(argv[i], "%lf", &tolerance) != 1) + usage(); + mode = 'a'; + break; + case 'c': + if (++i == argc || sscanf(argv[i], "%u", &minbits) != 1 || + ++i == argc || sscanf(argv[i], "%u", &maxbits) != 1 || + ++i == argc || sscanf(argv[i], "%u", &maxprec) != 1 || + ++i == argc || sscanf(argv[i], "%d", &minexp) != 1) + usage(); + mode = 'c'; + break; + case 'd': + type = zfp_type_double; + break; + case 'f': + type = zfp_type_float; + break; + case 'h': + header = 1; + break; + case 'i': + if (++i == argc) + usage(); + inpath = argv[i]; + break; + case 'o': + if (++i == argc) + usage(); + outpath = argv[i]; + break; + case 'p': + if (++i == argc || sscanf(argv[i], "%u", &precision) != 1) + usage(); + mode = 'p'; + break; + case 'q': + quiet = 1; + break; + case 'r': + if (++i == argc || sscanf(argv[i], "%lf", &rate) != 1) + usage(); + mode = 'r'; + break; + case 's': + stats = 1; + break; + case 't': + if (++i == argc) + usage(); + if (!strcmp(argv[i], "i32")) + type = zfp_type_int32; + else if (!strcmp(argv[i], "i64")) + type = zfp_type_int64; + else if (!strcmp(argv[i], "f32")) + type = zfp_type_float; + else if (!strcmp(argv[i], "f64")) + type = zfp_type_double; + else + usage(); + break; + case 'x': + if (++i == argc) + usage(); + if (!strcmp(argv[i], "serial")) + exec = zfp_exec_serial; + else if (!strcmp(argv[i], "cuda")) + exec = zfp_exec_cuda; + else if (sscanf(argv[i], "omp=%u,%u", &threads, &chunk_size) == 2) + exec = zfp_exec_omp; + else 
if (sscanf(argv[i], "omp=%u", &threads) == 1) { + exec = zfp_exec_omp; + chunk_size = 0; + } + else if (!strcmp(argv[i], "omp")) { + exec = zfp_exec_omp; + threads = 0; + chunk_size = 0; + } + else + usage(); + break; + case 'z': + if (++i == argc) + usage(); + zfppath = argv[i]; + break; + default: + usage(); + break; + } + } + + typesize = zfp_type_size(type); + count = (size_t)nx * (size_t)ny * (size_t)nz * (size_t)nw; + + /* make sure one of the array dimensions is not zero */ + if (!count) { + fprintf(stderr, "array size must be nonzero\n"); + return EXIT_FAILURE; + } + + /* make sure we have an input file */ + if (!inpath && !zfppath) { + fprintf(stderr, "must specify uncompressed or compressed input file via -i or -z\n"); + return EXIT_FAILURE; + } + + /* make sure we know floating-point type */ + if ((inpath || !header) && !typesize) { + fprintf(stderr, "must specify scalar type via -f, -d, or -t or header via -h\n"); + return EXIT_FAILURE; + } + + /* make sure we know array dimensions */ + if ((inpath || !header) && !dims) { + fprintf(stderr, "must specify array dimensions via -1, -2, or -3 or header via -h\n"); + return EXIT_FAILURE; + } + + /* make sure we know (de)compression mode and parameters */ + if ((inpath || !header) && !mode) { + fprintf(stderr, "must specify compression parameters via -a, -c, -p, or -r or header via -h\n"); + return EXIT_FAILURE; + } + + /* make sure we have input file for stats */ + if (stats && !inpath) { + fprintf(stderr, "must specify input file via -i to compute stats\n"); + return EXIT_FAILURE; + } + + /* make sure meta data comes from header or command line, not both */ + if (!inpath && zfppath && header && (typesize || dims)) { + fprintf(stderr, "cannot specify both field type/size and header\n"); + return EXIT_FAILURE; + } + + zfp = zfp_stream_open(NULL); + field = zfp_field_alloc(); + + /* read uncompressed or compressed file */ + if (inpath) { + /* read uncompressed input file */ + FILE* file = !strcmp(inpath, "-") ? 
stdin : fopen(inpath, "rb"); + if (!file) { + fprintf(stderr, "cannot open input file\n"); + return EXIT_FAILURE; + } + rawsize = typesize * count; + fi = malloc(rawsize); + if (!fi) { + fprintf(stderr, "cannot allocate memory\n"); + return EXIT_FAILURE; + } + if (fread(fi, typesize, count, file) != count) { + fprintf(stderr, "cannot read input file\n"); + return EXIT_FAILURE; + } + fclose(file); + zfp_field_set_pointer(field, fi); + } + else { + /* read compressed input file in increasingly large chunks */ + FILE* file = !strcmp(zfppath, "-") ? stdin : fopen(zfppath, "rb"); + if (!file) { + fprintf(stderr, "cannot open compressed file\n"); + return EXIT_FAILURE; + } + bufsize = 0x100; + do { + bufsize *= 2; + buffer = realloc(buffer, bufsize); + if (!buffer) { + fprintf(stderr, "cannot allocate memory\n"); + return EXIT_FAILURE; + } + zfpsize += fread((uchar*)buffer + zfpsize, 1, bufsize - zfpsize, file); + } while (zfpsize == bufsize); + if (ferror(file)) { + fprintf(stderr, "cannot read compressed file\n"); + return EXIT_FAILURE; + } + fclose(file); + + /* associate bit stream with buffer */ + stream = stream_open(buffer, bufsize); + if (!stream) { + fprintf(stderr, "cannot open compressed stream\n"); + return EXIT_FAILURE; + } + zfp_stream_set_bit_stream(zfp, stream); + } + + /* set field dimensions and (de)compression parameters */ + if (inpath || !header) { + /* initialize uncompressed field */ + zfp_field_set_type(field, type); + switch (dims) { + case 1: + zfp_field_set_size_1d(field, nx); + break; + case 2: + zfp_field_set_size_2d(field, nx, ny); + break; + case 3: + zfp_field_set_size_3d(field, nx, ny, nz); + break; + case 4: + zfp_field_set_size_4d(field, nx, ny, nz, nw); + break; + } + + /* set (de)compression mode */ + switch (mode) { + case 'a': + zfp_stream_set_accuracy(zfp, tolerance); + break; + case 'p': + zfp_stream_set_precision(zfp, precision); + break; + case 'r': + zfp_stream_set_rate(zfp, rate, type, dims, 0); + break; + case 'c': + if 
(!maxbits) + maxbits = ZFP_MAX_BITS; + if (!maxprec) + maxprec = zfp_field_precision(field); + if (!zfp_stream_set_params(zfp, minbits, maxbits, maxprec, minexp)) { + fprintf(stderr, "invalid compression parameters\n"); + return EXIT_FAILURE; + } + break; + } + } + + /* specify execution policy */ + switch (exec) { + case zfp_exec_omp: + if (!zfp_stream_set_execution(zfp, exec) || + !zfp_stream_set_omp_threads(zfp, threads) || + !zfp_stream_set_omp_chunk_size(zfp, chunk_size)) { + fprintf(stderr, "OpenMP execution not available\n"); + return EXIT_FAILURE; + } + break; + case zfp_exec_serial: + default: + if (!zfp_stream_set_execution(zfp, exec)) { + fprintf(stderr, "serial execution not available\n"); + return EXIT_FAILURE; + } + break; + } + + /* compress input file if provided */ + if (inpath) { + /* allocate buffer for compressed data */ + bufsize = zfp_stream_maximum_size(zfp, field); + if (!bufsize) { + fprintf(stderr, "invalid compression parameters\n"); + return EXIT_FAILURE; + } + buffer = malloc(bufsize); + if (!buffer) { + fprintf(stderr, "cannot allocate memory\n"); + return EXIT_FAILURE; + } + + /* associate compressed bit stream with memory buffer */ + stream = stream_open(buffer, bufsize); + if (!stream) { + fprintf(stderr, "cannot open compressed stream\n"); + return EXIT_FAILURE; + } + zfp_stream_set_bit_stream(zfp, stream); + + /* specify execution policy */ + switch (exec) { + case zfp_exec_omp: + if (!zfp_stream_set_execution(zfp, exec) || + !zfp_stream_set_omp_threads(zfp, threads) || + !zfp_stream_set_omp_chunk_size(zfp, chunk_size)) { + fprintf(stderr, "OpenMP execution not available\n"); + return EXIT_FAILURE; + } + break; + case zfp_exec_cuda: + if (!zfp_stream_set_execution(zfp, exec)) { + fprintf(stderr, "cuda execution not available\n"); + return EXIT_FAILURE; + } + case zfp_exec_serial: + default: + if (!zfp_stream_set_execution(zfp, exec)) { + fprintf(stderr, "serial execution not available\n"); + return EXIT_FAILURE; + } + break; + } + 
+ /* optionally write header */ + if (header && !zfp_write_header(zfp, field, ZFP_HEADER_FULL)) { + fprintf(stderr, "cannot write header\n"); + return EXIT_FAILURE; + } + + /* compress data */ + zfpsize = zfp_compress(zfp, field); + if (zfpsize == 0) { + fprintf(stderr, "compression failed\n"); + return EXIT_FAILURE; + } + + /* optionally write compressed data */ + if (zfppath) { + FILE* file = !strcmp(zfppath, "-") ? stdout : fopen(zfppath, "wb"); + if (!file) { + fprintf(stderr, "cannot create compressed file\n"); + return EXIT_FAILURE; + } + if (fwrite(buffer, 1, zfpsize, file) != zfpsize) { + fprintf(stderr, "cannot write compressed file\n"); + return EXIT_FAILURE; + } + fclose(file); + } + } + + /* decompress data if necessary */ + if ((!inpath && zfppath) || outpath || stats) { + /* obtain metadata from header when present */ + zfp_stream_rewind(zfp); + if (header) { + if (!zfp_read_header(zfp, field, ZFP_HEADER_FULL)) { + fprintf(stderr, "incorrect or missing header\n"); + return EXIT_FAILURE; + } + type = field->type; + switch (type) { + case zfp_type_float: + typesize = sizeof(float); + break; + case zfp_type_double: + typesize = sizeof(double); + break; + default: + fprintf(stderr, "unsupported type\n"); + return EXIT_FAILURE; + } + nx = MAX(field->nx, 1u); + ny = MAX(field->ny, 1u); + nz = MAX(field->nz, 1u); + nw = MAX(field->nw, 1u); + } + + /* specify execution policy */ + switch (exec) { + case zfp_exec_omp: + fprintf(stderr, "OpenMP decompression not available\n"); + return EXIT_FAILURE; + case zfp_exec_cuda: + if (!zfp_stream_set_execution(zfp, exec)) { + fprintf(stderr, "cuda execution not available\n"); + return EXIT_FAILURE; + } + case zfp_exec_serial: + default: + if (!zfp_stream_set_execution(zfp, exec)) { + fprintf(stderr, "serial execution not available\n"); + return EXIT_FAILURE; + } + break; + } + + /* allocate memory for decompressed data */ + rawsize = typesize * count; + fo = malloc(rawsize); + if (!fo) { + fprintf(stderr, "cannot 
allocate memory\n"); + return EXIT_FAILURE; + } + zfp_field_set_pointer(field, fo); + + /* decompress data */ + while (!zfp_decompress(zfp, field)) { + /* fall back on serial decompression if execution policy not supported */ + if (inpath && zfp_stream_execution(zfp) != zfp_exec_serial) { + if (!zfp_stream_set_execution(zfp, zfp_exec_serial)) { + fprintf(stderr, "cannot change execution policy\n"); + return EXIT_FAILURE; + } + } + else { + fprintf(stderr, "decompression failed\n"); + return EXIT_FAILURE; + } + } + + /* optionally write reconstructed data */ + if (outpath) { + FILE* file = !strcmp(outpath, "-") ? stdout : fopen(outpath, "wb"); + if (!file) { + fprintf(stderr, "cannot create output file\n"); + return EXIT_FAILURE; + } + if (fwrite(fo, typesize, count, file) != count) { + fprintf(stderr, "cannot write output file\n"); + return EXIT_FAILURE; + } + fclose(file); + } + } + + /* print compression and error statistics */ + if (!quiet) { + const char* type_name[] = { "int32", "int64", "float", "double" }; + fprintf(stderr, "type=%s nx=%u ny=%u nz=%u nw=%u", type_name[type - zfp_type_int32], nx, ny, nz, nw); + fprintf(stderr, " raw=%lu zfp=%lu ratio=%.3g rate=%.4g", (unsigned long)rawsize, (unsigned long)zfpsize, (double)rawsize / zfpsize, CHAR_BIT * (double)zfpsize / count); + if (stats) + print_error(fi, fo, type, count); + fprintf(stderr, "\n"); + } + + /* free allocated storage */ + zfp_field_free(field); + zfp_stream_close(zfp); + stream_close(stream); + free(buffer); + free(fi); + free(fo); + + return EXIT_SUCCESS; +}