diff --git a/zfp/LICENSE b/zfp/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..7945102e870621acb36ddfddec709168f9db5c52
--- /dev/null
+++ b/zfp/LICENSE
@@ -0,0 +1,57 @@
+Copyright (c) 2014-2018, Lawrence Livermore National Security, LLC.
+Produced at the Lawrence Livermore National Laboratory.
+Written by Peter Lindstrom, Markus Salasoo, and Matt Larsen.
+LLNL-CODE-663824.
+All rights reserved.
+
+This file is part of the zfp library.
+For details, see http://computation.llnl.gov/casc/zfp/.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the disclaimer below.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the disclaimer (as noted below) in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the LLNS/LLNL nor the names of its contributors may
+be used to endorse or promote products derived from this software without
+specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.  IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
+LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+Additional BSD Notice
+
+1. This notice is required to be provided under our contract with the U.S.
+Department of Energy (DOE).  This work was produced at Lawrence Livermore
+National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
+
+2. Neither the United States Government nor Lawrence Livermore National
+Security, LLC nor any of their employees, makes any warranty, express or
+implied, or assumes any liability or responsibility for the accuracy,
+completeness, or usefulness of any information, apparatus, product, or
+process disclosed, or represents that its use would not infringe
+privately-owned rights.
+
+3. Also, reference herein to any specific commercial products, process, or
+services by trade name, trademark, manufacturer or otherwise does not
+necessarily constitute or imply its endorsement, recommendation, or
+favoring by the United States Government or Lawrence Livermore National
+Security, LLC.  The views and opinions of authors expressed herein do not
+necessarily state or reflect those of the United States Government or
+Lawrence Livermore National Security, LLC, and shall not be used for
+advertising or product endorsement purposes.
diff --git a/zfp/Makefile b/zfp/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..cc696640144c360aa1126111ed47430a17795dbf
--- /dev/null
+++ b/zfp/Makefile
@@ -0,0 +1,69 @@
+# see Config file for compile-time settings
+#include Config
+
+include ../Make_include
+
+CFLAGS += -std=c99 -I../include
+
+MAKEFLAGS += --no-print-directory
+
+# macOS compiler options (uncomment on macOS) ---------------------------------
+
+# SOFLAGS += -undefined dynamic_lookup
+
+# default targets
+BUILD_CFP = 0
+BUILD_ZFORP = 0
+BUILD_UTILITIES = 0
+BUILD_EXAMPLES = 0
+BUILD_TESTING = 0
+BUILD_SHARED_LIBS = 0
+
+LIBRARY = static
+LIBZFP = libzfp.a
+
+# compiler options ------------------------------------------------------------
+
+# default: build all targets enabled in Config
+all:
+	@echo $(LIBRARY)
+	@cd src; $(MAKE) $(LIBRARY)
+ifneq ($(BUILD_CFP),0)
+	@cd cfp/src; $(MAKE) clean $(LIBRARY)
+endif
+ifneq ($(BUILD_ZFORP),0)
+	@cd fortran; $(MAKE) clean $(LIBRARY)
+endif
+ifneq ($(BUILD_UTILITIES),0)
+	@cd utils; $(MAKE) clean all
+endif
+ifneq ($(BUILD_TESTING),0)
+	@cd tests; $(MAKE) clean all
+endif
+ifneq ($(BUILD_EXAMPLES),0)
+	@cd examples; $(MAKE) clean all
+endif
+
+
+# run basic regression tests
+test:
+	@cd tests; $(MAKE) test
+
+# clean all
+clean:
+	@cd src; $(MAKE) clean
+ifneq ($(BUILD_CFP),0)
+	@cd cfp/src; $(MAKE) clean
+endif
+ifneq ($(BUILD_ZFORP),0)
+	@cd fortran; $(MAKE) clean
+endif
+ifneq ($(BUILD_UTILITIES),0)
+	@cd utils; $(MAKE) clean
+endif
+ifneq ($(BUILD_TESTING),0)
+	@cd tests; $(MAKE) clean
+endif
+ifneq ($(BUILD_EXAMPLES),0)
+	@cd examples; $(MAKE) clean
+endif
diff --git a/zfp/README.md b/zfp/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c2aa80dece7d9d759347ee2dff3b6589f6e0ca5f
--- /dev/null
+++ b/zfp/README.md
@@ -0,0 +1,134 @@
+ZFP
+===
+
+INTRODUCTION
+------------
+
+zfp is an open source C/C++ library for compressed numerical arrays that
+support high throughput read and write random access.  zfp also supports
+streaming compression of integer and floating-point data, e.g., for
+applications that read and write large data sets to and from disk.
+
+zfp was developed at Lawrence Livermore National Laboratory and is loosely
+based on the algorithm described in the following paper:
+
+    Peter Lindstrom
+    "Fixed-Rate Compressed Floating-Point Arrays"
+    IEEE Transactions on Visualization and Computer Graphics
+    20(12):2674-2683, December 2014
+    doi:10.1109/TVCG.2014.2346458
+
+zfp was originally designed for floating-point arrays only, but has been
+extended to also support integer data, and could for instance be used to
+compress images and quantized volumetric data.  To achieve high compression
+ratios, zfp uses lossy but optionally error-bounded compression.  Although
+bit-for-bit lossless compression of floating-point data is not always
+possible, zfp is usually accurate to within machine epsilon in near-lossless
+mode.
+
+zfp works best for 2D and 3D arrays that exhibit spatial correlation, such as
+continuous fields from physics simulations, images, regularly sampled terrain
+surfaces, etc.  Although zfp also provides a 1D array class that can be used
+for 1D signals such as audio, or even unstructured floating-point streams,
+the compression scheme has not been well optimized for this use case, and
+rate and quality may not be competitive with floating-point compressors
+designed specifically for 1D streams.
+
+zfp is freely available as open source under a BSD license, as outlined in
+the file 'LICENSE'.  For more information on zfp and comparisons with other
+compressors, please see the zfp
+[website](https://computation.llnl.gov/projects/floating-point-compression).
+For questions, comments, requests, and bug reports, please contact
+[Peter Lindstrom](mailto:pl@llnl.gov).
+
+
+DOCUMENTATION
+-------------
+
+Full
+[documentation](http://zfp.readthedocs.io/en/release0.5.4/)
+is available online via Read the Docs.  A
+[PDF](http://readthedocs.org/projects/zfp/downloads/pdf/release0.5.4/)
+version is also available.
+
+
+INSTALLATION
+------------
+
+zfp consists of three distinct parts: a compression library written in C;
+a set of C++ header files with C wrappers that implement compressed arrays;
+and a set of C and C++ examples.  The main compression codec is written in
+C and should conform to both the ISO C89 and C99 standards.  The C++ array
+classes are implemented entirely in header files and can be included as is,
+but since they call the compression library, applications must link with
+libzfp.
+
+On Linux, macOS, and MinGW, zfp is easiest compiled using gcc and gmake.
+CMake support is also available, e.g., for Windows builds.  See below for
+instructions on GNU and CMake builds.
+
+zfp has successfully been built and tested using these compilers:
+
+    gcc versions 4.4.7, 4.9.4, 5.5.0, 6.1.0, 6.4.0, 7.1.0, 7.3.0, 8.1.0
+    icc versions 15.0.6, 16.0.4, 17.0.2, 18.0.2, 19.0.0
+    clang versions 3.9.1, 4.0.0, 5.0.0, 6.0.0
+    MinGW version 5.3.0
+    Visual Studio versions 14 (2015), 15 (2017)
+
+zfp conforms to various language standards, including C89, C99, C++98,
+C++11, and C++14.
+
+NOTE: zfp requires 64-bit compiler and operating system support.
+
+## GNU builds
+
+To compile zfp using gcc, type
+
+    make
+
+from this directory.  This builds libzfp as a static library as well as
+utilities and example programs.  See documentation for complete build
+instructions.
+
+## CMake builds
+
+To build zfp using CMake on Linux or macOS, start a Unix shell and type
+
+    mkdir build
+    cd build
+    cmake ..
+    make
+
+To also build the examples, replace the cmake line with
+
+    cmake -DBUILD_EXAMPLES=ON ..
+
+To build zfp using Visual Studio on Windows, start a DOS shell, cd to the
+top-level zfp directory, and type
+
+    mkdir build
+    cd build
+    cmake ..
+    cmake --build . --config Release
+
+This builds zfp in release mode.  Replace 'Release' with 'Debug' to build
+zfp in debug mode.  See the instructions for Linux on how to change the
+cmake line to also build the example programs.
+
+## Testing
+
+To test that zfp is working properly, type
+
+    make test
+
+or using CMake
+
+    ctest
+
+If the compilation or regression tests fail, it is possible that some of the
+macros in the file 'Config' have to be adjusted.  Also, the tests may fail
+due to minute differences in the computed floating-point fields being
+compressed, which will be indicated by checksum errors.  If most tests
+succeed and the failures result in byte sizes and error values reasonably
+close to the expected values, then it is likely that the compressor is
+working correctly.
diff --git a/zfp/VERSIONS.md b/zfp/VERSIONS.md
new file mode 100644
index 0000000000000000000000000000000000000000..1a33ff9aafdc906c46dbf4bd0a6cd79d2d22ce06
--- /dev/null
+++ b/zfp/VERSIONS.md
@@ -0,0 +1,273 @@
+# zfp Release Notes
+
+## 0.5.4 (October 1, 2018)
+
+- Added support for CUDA fixed-rate compression and decompression.
+
+- Added views into compressed arrays for thread safety, nested array
+  indexing, slicing, and array subsetting.
+
+- Added C language bindings for compressed arrays.
+
+- Added support for compressing and decompressing 4D data.
+
+- Changes:
+  - Execution policy now applies to both compression and decompression.
+  - Compressed array accessors now return Scalar type instead of
+    const Scalar& to avoid stale references to evicted cache lines.
+
+- Bug fixes:
+  - Handling of negative strides.
+  - Command line tool handling of arrays with more than 2^32 elements.
+  - bitstream C++ compatibility.
+  - Respect minimum cache size request.
+
+
+## 0.5.3 (March 28, 2018)
+
+- Added support for OpenMP multithreaded compression (but not decompression).
+
+- Added options for OpenMP execution to zfp command-line tool.
+
+- Changed return value of zfp\_decompress to indicate the number of compressed
+  bytes processed so far (now returns same value as zfp\_compress on success).
+
+- Added compressed array support for copy construction and assignment via
+  deep copies.
+
+- Added virtual destructors to enable inheritance from zfp arrays.
+
+
+## 0.5.2 (September 28, 2017)
+
+- Added iterators and proxy objects for pointers and references.
+
+- Added example illustrating how to use iterators and pointers.
+
+- Modified diffusion example to optionally use iterators.
+
+- Moved internal headers under array to array/zfp.
+
+- Modified 64-bit integer typedefs to avoid the C89 non-compliant long long
+  and allow for user-supplied types and literal suffixes.
+
+- Renamed compile-time macros that did not have a ZFP prefix.
+
+- Fixed issue with setting stream word type via CMake.
+
+- Rewrote documentation in reStructuredText and added complete
+  documentation of all public functions, classes, types, and macros.
+  Removed ASCII documentation.
+
+
+## 0.5.1 (March 28, 2017)
+
+- This release primarily fixes a few minor issues but also includes
+  changes in anticipation of a large number of planned future additions
+  to the library.  No changes have been made to the compressed format,
+  which is backwards compatible with version 0.5.0.
+
+- Added high-level API support for integer types.
+
+- Separated library version from CODEC version and added version string.
+
+- Added example that illustrates in-place compression.
+
+- Added support for CMake builds.
+
+- Corrected inconsistent naming of BIT\_STREAM macros in code and
+  documentation.
+
+- Renamed some of the header bit mask macros.
+
+- Added return values to stream\_skip and stream\_flush to indicate the
+  number of bits skipped or output.
+
+- Renamed stream\_block and stream\_delta to make it clear that they refer
+  to strided streams.  Added missing definition of stream\_stride\_block.
+
+- Changed int/uint types in places to use ptrdiff\_t/size\_t where
+  appropriate.
+
+- Changed API for zfp\_set\_precision and zfp\_set\_accuracy to not require
+  the scalar type.
+
+- Added missing static keyword in decode\_block.
+
+- Changed testzfp to allow specifying which tests to perform on the
+  command line.
+
+- Fixed bug that prevented defining uninitialized arrays.
+
+- Fixed incorrect computation of array sizes in zfp\_field\_size.
+
+- Fixed minor issues that prevented code from compiling on Windows.
+
+- Fixed issue with fixed-accuracy headers that caused unnecessary storage.
+
+- Modified directory structure.
+
+- Added documentation that discusses common issues with using zfp.
+
+
+## 0.5.0 (February 29, 2016)
+
+- Modified CODEC to more efficiently encode blocks whose values are all
+  zero or are smaller in magnitude than the absolute error tolerance.
+  This allows representing "empty" blocks using only one bit each.  This
+  version is not backwards compatible with prior zfp versions.
+
+- Changed behavior of zfp\_compress and zfp\_decompress to not automatically
+  rewind the bit stream.  This makes it easier to concatenate multiple
+  compressed bit streams, e.g. when compressing vector fields or multiple
+  scalars together.
+
+- Added functions for compactly encoding the compression parameters
+  and field meta data, e.g. for producing self-contained compressed
+  streams.  Also added functions for reading and writing a header
+  containing these parameters.
+
+- Changed the zfp example program interface to allow reading and writing
+  compressed streams, optionally with a header.  The zfp tool can now be
+  used to compress and decompress files as a stand-alone utility.
+
+
+## 0.4.1 (December 28, 2015)
+
+- Fixed bug that caused segmentation fault when compressing 3D arrays
+  whose dimensions are not multiples of four.  Specifically, arrays of
+  dimensions nx * ny * nz, with ny not a multiple of four, were not
+  handled correctly.
+
+- Modified examples/fields.h to ensure standard compliance.  Previously,
+  C99 support was needed to handle the hex float constants, which are
+  not supported in C++98.
+
+- Added simple.c as a minimal example of how to call the compressor.
+
+- Changed compilation of diffusion example to output two executables:
+  one with and one without compression.
+
+
+## 0.4.0 (December 5, 2015)
+
+- Substantial changes to the compression algorithm that improve PSNR
+  by about 6 dB and speed by a factor of 2-3.  These changes are not
+  backward compatible with previous versions of zfp.
+
+- Added support for 31-bit and 63-bit integer data, as well as shorter
+  integer types.
+
+- Rewrote compression codec entirely in C to make linking and calling
+  easier from other programming languages, and to expose the low-level
+  interface through C instead of C++.  This necessitated significant
+  changes to the API as well.
+
+- Minor changes to the C++ compressed array API, as well as major
+  implementation changes to support the C library.  The namespace and
+  public types are now all in lower case.
+
+- Deprecated support for general fixed-point decorrelating transforms
+  and slimmed down implementation.
+
+- Added new examples for evaluating the throughput of the (de)compressor
+  and for compressing grayscale images in the pgm format.
+
+- Added FAQ.
+
+
+## 0.3.2 (December 3, 2015)
+
+- Fixed bug in Array::get() that caused the wrong cached block to be
+  looked up, thus occasionally copying incorrect values back to parts
+  of the array.
+
+
+## 0.3.1 (May 6, 2015)
+
+- Fixed rare bug caused by exponent underflow in blocks with no normal
+  and some denormal numbers.
+
+
+## 0.3.0 (March 3, 2015)
+
+- Modified the default decorrelating transform to one that uses only
+  additions and bit shifts.  This new transform, in addition to being
+  faster, also has some theoretical optimality properties and tends to
+  improve rate distortion.
+
+- Added compile-time support for parameterized transforms, e.g. to
+  support other popular transforms like DCT, HCT, and Walsh-Hadamard.
+
+- Made forward transform range preserving: (-1, 1) is mapped to (-1, 1).
+  Consequently Q1.62 fixed point can be used throughout.
+
+- Changed the order in which bits are emitted within each bit plane
+  to be more intelligent.  Group tests are now deferred until they
+  are needed, i.e. just before the value bits for the group being
+  tested.  This improves the quality of fixed-rate encodings, but
+  has no impact on compressed size.
+
+- Made several optimizations to improve performance.
+
+- Added floating-point traits to reduce the number of template
+  parameters.  It is now possible to declare a 3D array as
+  Array3<float>, for example.
+
+- Added functions for setting the array scalar type and dimensions.
+
+- Consolidated several header files.
+
+- Added testzfp for regression testing.
+
+
+## 0.2.1 (December 12, 2014)
+
+- Added Win64 support via Microsoft Visual Studio compiler.
+
+- Fixed broken support for IBM's xlc compiler.
+
+- Made several minor changes to suppress compiler warnings.
+
+- Documented expected output for the diffusion example.
+
+
+## 0.2.0 (December 2, 2014)
+
+- The compression interface from zfpcompress was relocated to a
+  separate library, called libzfp, and modified to be callable from C.
+  This API now uses a parameter object (zfp\_params) to specify array
+  type and dimensions as well as compression parameters.
+
+- Several utility functions were added to simplify libzfp usage:
+
+  * Functions for setting the rate, precision, and accuracy.
+    Corresponding functions were also added to the Codec class.
+
+  * A function for estimating the buffer size needed for compression.
+
+- The Array class functionality was expanded:
+
+  * Support for accessing the compressed bit stream stored with an
+    array, e.g. for offline compressed storage and for initializing
+    an already compressed array.
+
+  * Functions for dynamically specifying the cache size.
+
+  * The default cache is now direct-mapped instead of two-way
+    associative.
+
+- Minor bug fixes:
+
+  * Corrected the value of the lowest possible bit plane to account for
+    both the smallest exponent and the number of bits in the significand.
+
+  * Corrected inconsistent use of rate and precision.  The rate refers
+    to the number of compressed bits per floating-point value, while
+    the precision refers to the number of uncompressed bits.  The Array
+    API was changed accordingly.
+
+
+## 0.1.0 (November 12, 2014)
+
+- Initial beta release.
diff --git a/zfp/appveyor.yml b/zfp/appveyor.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c808bfaa25f0681e14a753cf9137a1b3503dac86
--- /dev/null
+++ b/zfp/appveyor.yml
@@ -0,0 +1,96 @@
+version: 0.5.4-{build}
+
+environment:
+  matrix:
+    - COMPILER: mingw
+      GENERATOR: MinGW Makefiles
+      PLATFORM: Win32
+      BUILD_TYPE: Debug
+
+    - COMPILER: mingw
+      GENERATOR: MinGW Makefiles
+      PLATFORM: Win32
+      BUILD_TYPE: Release
+
+    - COMPILER: mingw-w64
+      GENERATOR: MinGW Makefiles
+      PLATFORM: x64
+      BUILD_TYPE: Debug
+
+    - COMPILER: mingw-w64
+      GENERATOR: MinGW Makefiles
+      PLATFORM: x64
+      BUILD_TYPE: Release
+
+    - COMPILER: msvc
+      GENERATOR: Visual Studio 15 2017 Win64
+      APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
+      PLATFORM: x64
+      BUILD_TYPE: Debug
+
+    - COMPILER: msvc
+      GENERATOR: Visual Studio 15 2017 Win64
+      APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
+      PLATFORM: x64
+      BUILD_TYPE: Release
+
+    - COMPILER: msvc
+      GENERATOR: Visual Studio 15 2017
+      APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
+      PLATFORM: Win32
+      BUILD_TYPE: Debug
+
+    - COMPILER: msvc
+      GENERATOR: Visual Studio 15 2017
+      APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
+      PLATFORM: Win32
+      BUILD_TYPE: Release
+
+    - COMPILER: msvc
+      GENERATOR: Visual Studio 14 2015 Win64
+      PLATFORM: x64
+      BUILD_TYPE: Debug
+
+    - COMPILER: msvc
+      GENERATOR: Visual Studio 14 2015 Win64
+      PLATFORM: x64
+      BUILD_TYPE: Release
+
+    - COMPILER: msvc
+      GENERATOR: Visual Studio 14 2015
+      PLATFORM: Win32
+      BUILD_TYPE: Debug
+
+    - COMPILER: msvc
+      GENERATOR: Visual Studio 14 2015
+      PLATFORM: Win32
+      BUILD_TYPE: Release
+
+install:
+  - if "%COMPILER%"=="mingw" set PATH=C:\MinGW\bin;%PATH%
+  - if "%COMPILER%"=="mingw-w64" set PATH=C:\MinGW\bin;%PATH%
+
+build_script:
+  - mkdir build
+  - cd build
+
+  # build/test without OpenMP, with CFP
+  - if "%COMPILER%"=="msvc" cmake -G "%GENERATOR%" -DCMAKE_BUILD_TYPE="%BUILD_TYPE%" -DZFP_WITH_OPENMP=OFF -DBUILD_CFP=ON ..
+  - if not "%COMPILER%"=="msvc" cmake -G "%GENERATOR%" -DCMAKE_BUILD_TYPE="%BUILD_TYPE%" -DCMAKE_SH=CMAKE_SH-NOTFOUND -DZFP_WITH_OPENMP=OFF -DBUILD_CFP=ON ..
+
+  - if "%COMPILER%"=="msvc" cmake --build . --config "%BUILD_TYPE%"
+  - if not "%COMPILER%"=="msvc" cmake --build .
+
+  - ctest -V -C "%BUILD_TYPE%"
+
+  - rm -rf ./*
+
+  # build/test with OpenMP, with CFP custom namespace
+  - if "%COMPILER%"=="msvc" cmake -G "%GENERATOR%" -DCMAKE_BUILD_TYPE="%BUILD_TYPE%" -DBUILD_CFP=ON -DCFP_NAMESPACE=cfp2 ..
+  - if not "%COMPILER%"=="msvc" cmake -G "%GENERATOR%" -DCMAKE_BUILD_TYPE="%BUILD_TYPE%" -DCMAKE_SH=CMAKE_SH-NOTFOUND -DBUILD_CFP=ON -DCFP_NAMESPACE=cfp2 ..
+
+  - if "%COMPILER%"=="msvc" cmake --build . --config "%BUILD_TYPE%"
+  - if not "%COMPILER%"=="msvc" cmake --build .
+
+  - ctest -V -C "%BUILD_TYPE%"
+
diff --git a/zfp/array/zfp/cache.h b/zfp/array/zfp/cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..280ac70812dd1f722fe141f4bc8d7304aad75990
--- /dev/null
+++ b/zfp/array/zfp/cache.h
@@ -0,0 +1,257 @@
+#ifndef ZFP_CACHE_H
+#define ZFP_CACHE_H
+
+#include "memory.h"
+
+#ifdef ZFP_WITH_CACHE_PROFILE
+  // maintain stats on hit and miss rates
+  #include <iostream>
+#endif
+
+// direct-mapped or two-way skew-associative write-back cache
+template <class Line>
+class Cache {
+public:
+  // cache line index (zero is reserved for unused lines)
+  typedef uint Index;
+
+  // cache tag containing line meta data
+  class Tag {
+  public:
+    Tag() : x(0) {}
+
+    Tag(Index x, bool d) : x(2 * x + d) {}
+
+    // cache line index
+    Index index() const { return x >> 1; }
+
+    // is line dirty?
+    bool dirty() const { return x & 1; }
+
+    // is line used?
+    bool used() const { return x != 0; }
+
+    // mark line as dirty
+    void mark() { x |= 1u; }
+
+    // mark line as unused
+    void clear() { x = 0; }
+
+  protected:
+    Index x;
+  };
+
+  // sequential iterator for looping over cache lines
+  class const_iterator {
+  public:
+    friend class Cache;
+    class Pair {
+    public:
+      Pair(Line* l, Tag t) : line(l), tag(t) {}
+      Line* line;
+      Tag tag;
+    };
+    const_iterator& operator++()
+    {
+      advance();
+      return *this;
+    }
+    const_iterator operator++(int)
+    {
+      const_iterator iter = *this;
+      advance();
+      return iter;
+    }
+    const Pair& operator*() const { return pair; }
+    const Pair* operator->() const { return &pair; }
+    operator const void*() const { return pair.line ? this : 0; }
+
+  protected:
+    const_iterator(Cache* cache) : c(cache), pair(cache->line, cache->tag[0])
+    {
+      if (!pair.tag.used())
+        advance();
+    }
+    void advance()
+    {
+      if (pair.line) {
+        uint i;
+        for (i = uint(pair.line - c->line) + 1; i <= c->mask && !c->tag[i].used(); i++);
+        pair = (i <= c->mask ? Pair(c->line + i, c->tag[i]) : Pair(0, Tag()));
+      }
+    }
+    Cache* c;
+    Pair pair;
+  };
+
+  // allocate cache with at least minsize lines
+  Cache(uint minsize = 0) : tag(0), line(0)
+  {
+    resize(minsize);
+#ifdef ZFP_WITH_CACHE_PROFILE
+    std::cerr << "cache lines=" << mask + 1 << std::endl;
+    hit[0][0] = hit[1][0] = miss[0] = back[0] = 0;
+    hit[0][1] = hit[1][1] = miss[1] = back[1] = 0;
+#endif
+  }
+
+  // copy constructor--performs a deep copy
+  Cache(const Cache& c) : tag(0), line(0)
+  {
+    deep_copy(c);
+  }
+
+  // destructor
+  ~Cache()
+  {
+    deallocate(tag);
+    deallocate(line);
+#ifdef ZFP_WITH_CACHE_PROFILE
+    std::cerr << "cache R1=" << hit[0][0] << " R2=" << hit[1][0] << " RM=" << miss[0] << " RB=" << back[0]
+              <<      " W1=" << hit[0][1] << " W2=" << hit[1][1] << " WM=" << miss[1] << " WB=" << back[1] << std::endl;
+#endif
+  }
+
+  // assignment operator--performs a deep copy
+  Cache& operator=(const Cache& c)
+  {
+    if (this != &c)
+      deep_copy(c);
+    return *this;
+  }
+
+  // cache size in number of lines
+  uint size() const { return mask + 1; }
+
+  // change cache size to at least minsize lines (all contents will be lost)
+  void resize(uint minsize)
+  {
+    for (mask = minsize ? minsize - 1 : 1; mask & (mask + 1); mask |= mask + 1);
+    reallocate(tag, ((size_t)mask + 1) * sizeof(Tag), 0x100);
+    reallocate(line, ((size_t)mask + 1) * sizeof(Line), 0x100);
+    clear();
+  }
+
+  // look up cache line #x and return pointer to it if in the cache;
+  // otherwise return null
+  const Line* lookup(Index x) const
+  {
+    uint i = primary(x);
+    if (tag[i].index() == x)
+      return line + i;
+#ifdef ZFP_WITH_CACHE_TWOWAY
+    uint j = secondary(x);
+    if (tag[j].index() == x)
+      return line + j;
+#endif
+    return 0;
+  }
+
+  // look up cache line #x and set ptr to where x is or should be stored;
+  // if the returned tag does not match x, then the caller must implement
+  // write-back (if the line is in use) and then fetch the requested line
+  Tag access(Line*& ptr, Index x, bool write)
+  {
+    uint i = primary(x);
+    if (tag[i].index() == x) {
+      ptr = line + i;
+      if (write)
+        tag[i].mark();
+#ifdef ZFP_WITH_CACHE_PROFILE
+      hit[0][write]++;
+#endif
+      return tag[i];
+    }
+#ifdef ZFP_WITH_CACHE_TWOWAY
+    uint j = secondary(x);
+    if (tag[j].index() == x) {
+      ptr = line + j;
+      if (write)
+        tag[j].mark();
+#ifdef ZFP_WITH_CACHE_PROFILE
+      hit[1][write]++;
+#endif
+      return tag[j];
+    }
+    // cache line not found; prefer primary and not dirty slots
+    i = tag[j].used() && (!tag[i].dirty() || tag[j].dirty()) ? i : j;
+#endif
+    ptr = line + i;
+    Tag t = tag[i];
+    tag[i] = Tag(x, write);
+#ifdef ZFP_WITH_CACHE_PROFILE
+    miss[write]++;
+    if (tag[i].dirty())
+      back[write]++;
+#endif
+    return t;
+  }
+
+  // clear cache without writing back
+  void clear()
+  {
+    for (uint i = 0; i <= mask; i++)
+      tag[i].clear();
+  }
+
+  // flush cache line
+  void flush(const Line* l)
+  {
+    uint i = uint(l - line);
+    tag[i].clear();
+  }
+
+  // return iterator to first cache line
+  const_iterator first() { return const_iterator(this); }
+
+protected:
+  // perform a deep copy
+  void deep_copy(const Cache& c)
+  {
+    mask = c.mask;
+    clone(tag, c.tag, mask + 1, 0x100u);
+    clone(line, c.line, mask + 1, 0x100u);
+#ifdef ZFP_WITH_CACHE_PROFILE
+    hit[0][0] = c.hit[0][0];
+    hit[0][1] = c.hit[0][1];
+    hit[1][0] = c.hit[1][0];
+    hit[1][1] = c.hit[1][1];
+    miss[0] = c.miss[0];
+    miss[1] = c.miss[1];
+    back[0] = c.back[0];
+    back[1] = c.back[1];
+#endif
+  }
+
+  uint primary(Index x) const { return x & mask; }
+  uint secondary(Index x) const
+  {
+#ifdef ZFP_WITH_CACHE_FAST_HASH
+    // max entropy hash for 26- to 16-bit mapping (not full avalanche)
+    x -= x <<  7;
+    x ^= x >> 16;
+    x -= x <<  3;
+#else
+    // Jenkins hash; see http://burtleburtle.net/bob/hash/integer.html
+    x -= x <<  6;
+    x ^= x >> 17;
+    x -= x <<  9;
+    x ^= x <<  4;
+    x -= x <<  3;
+    x ^= x << 10;
+    x ^= x >> 15;
+#endif
+    return x & mask;
+  }
+
+  Index mask; // cache line mask
+  Tag* tag;   // cache line tags
+  Line* line; // actual decompressed cache lines
+#ifdef ZFP_WITH_CACHE_PROFILE
+  uint64 hit[2][2]; // number of primary/secondary read/write hits
+  uint64 miss[2];   // number of read/write misses
+  uint64 back[2];   // number of write-backs due to read/writes
+#endif
+};
+
+#endif
diff --git a/zfp/array/zfp/iterator1.h b/zfp/array/zfp/iterator1.h
new file mode 100644
index 0000000000000000000000000000000000000000..310e8e2dd397072fa2f49a5c7519c23cb49ef625
--- /dev/null
+++ b/zfp/array/zfp/iterator1.h
@@ -0,0 +1,38 @@
+// random access iterator that visits 1D array block by block; this class is nested within zfp::array1
+class iterator {
+public:
+  // typedefs for STL compatibility
+  typedef Scalar value_type;
+  typedef ptrdiff_t difference_type;
+  typedef typename array1::reference reference;
+  typedef typename array1::pointer pointer;
+  typedef std::random_access_iterator_tag iterator_category;
+
+  iterator() : ref(0, 0) {}
+  iterator operator=(const iterator& it) { ref.array = it.ref.array; ref.i = it.ref.i; return *this; }
+  reference operator*() const { return ref; }
+  reference operator[](difference_type d) const { return *operator+(d); }
+  iterator& operator++() { increment(); return *this; }
+  iterator& operator--() { decrement(); return *this; }
+  iterator operator++(int) { iterator it = *this; increment(); return it; }
+  iterator operator--(int) { iterator it = *this; decrement(); return it; }
+  iterator operator+=(difference_type d) { ref.i += d; return *this; }
+  iterator operator-=(difference_type d) { ref.i -= d; return *this; }
+  iterator operator+(difference_type d) const { return iterator(ref.array, ref.i + d); }
+  iterator operator-(difference_type d) const { return iterator(ref.array, ref.i - d); }
+  difference_type operator-(const iterator& it) const { return static_cast<difference_type>(ref.i) - static_cast<difference_type>(it.ref.i); }
+  bool operator==(const iterator& it) const { return ref.array == it.ref.array && ref.i == it.ref.i; }
+  bool operator!=(const iterator& it) const { return !operator==(it); }
+  bool operator<=(const iterator& it) const { return ref.array == it.ref.array && ref.i <= it.ref.i; }
+  bool operator>=(const iterator& it) const { return ref.array == it.ref.array && ref.i >= it.ref.i; }
+  bool operator<(const iterator& it) const { return !operator>=(it); }
+  bool operator>(const iterator& it) const { return !operator<=(it); }
+  uint i() const { return ref.i; }
+
+protected:
+  friend class array1;
+  explicit iterator(array1* array, uint i) : ref(array, i) {}
+  void increment() { ref.i++; }
+  void decrement() { ref.i--; }
+  reference ref;
+};
diff --git a/zfp/array/zfp/iterator2.h b/zfp/array/zfp/iterator2.h
new file mode 100644
index 0000000000000000000000000000000000000000..03052c4e6806a8b2dd09d04dd346f9504a9aeddd
--- /dev/null
+++ b/zfp/array/zfp/iterator2.h
@@ -0,0 +1,42 @@
+// forward iterator that visits 2D array block by block; this class is nested within zfp::array2
+// NOTE(review): traversal order is 4x4 block by block (matching zfp's compressed
+// block layout), not row-major; i()/j() expose the current global indices.
+class iterator {
+public:
+  // typedefs for STL compatibility
+  typedef Scalar value_type;
+  typedef ptrdiff_t difference_type;
+  typedef typename array2::reference reference;
+  typedef typename array2::pointer pointer;
+  typedef std::forward_iterator_tag iterator_category;
+
+  iterator() : ref(0, 0, 0) {}
+  iterator operator=(const iterator& it) { ref.array = it.ref.array; ref.i = it.ref.i; ref.j = it.ref.j; return *this; }
+  reference operator*() const { return ref; }
+  iterator& operator++() { increment(); return *this; }
+  iterator operator++(int) { iterator it = *this; increment(); return it; }
+  // iterators compare equal iff they refer to the same element of the same array
+  bool operator==(const iterator& it) const { return ref.array == it.ref.array && ref.i == it.ref.i && ref.j == it.ref.j; }
+  bool operator!=(const iterator& it) const { return !operator==(it); }
+  uint i() const { return ref.i; }
+  uint j() const { return ref.j; }
+
+protected:
+  friend class array2;
+  explicit iterator(array2* array, uint i, uint j) : ref(array, i, j) {}
+  // advance to the next element in block-by-block order
+  void increment()
+  {
+    ref.i++;
+    // wrap i when crossing a 4-wide block boundary or the array edge
+    if (!(ref.i & 3u) || ref.i == ref.array->nx) {
+      ref.i = (ref.i - 1) & ~3u;  // back to first column of current block
+      ref.j++;
+      // wrap j likewise; when it wraps, the current block is exhausted
+      if (!(ref.j & 3u) || ref.j == ref.array->ny) {
+        ref.j = (ref.j - 1) & ~3u;
+        // done with block; advance to next
+        if ((ref.i += 4) >= ref.array->nx) {
+          ref.i = 0;
+          if ((ref.j += 4) >= ref.array->ny)
+            ref.j = ref.array->ny;  // (0, ny) is the end() position
+        }
+      }
+    }
+  }
+  reference ref;
+};
diff --git a/zfp/array/zfp/iterator3.h b/zfp/array/zfp/iterator3.h
new file mode 100644
index 0000000000000000000000000000000000000000..3889fc1cacdcd4c87d07930e391bc60ba94baa1b
--- /dev/null
+++ b/zfp/array/zfp/iterator3.h
@@ -0,0 +1,50 @@
+// forward iterator that visits 3D array block by block; this class is nested within zfp::array3
+// NOTE(review): traversal order is 4x4x4 block by block (matching zfp's
+// compressed block layout); i()/j()/k() expose the current global indices.
+class iterator {
+public:
+  // typedefs for STL compatibility
+  typedef Scalar value_type;
+  typedef ptrdiff_t difference_type;
+  typedef typename array3::reference reference;
+  typedef typename array3::pointer pointer;
+  typedef std::forward_iterator_tag iterator_category;
+
+  iterator() : ref(0, 0, 0, 0) {}
+  iterator operator=(const iterator& it) { ref.array = it.ref.array; ref.i = it.ref.i; ref.j = it.ref.j; ref.k = it.ref.k; return *this; }
+  reference operator*() const { return ref; }
+  iterator& operator++() { increment(); return *this; }
+  iterator operator++(int) { iterator it = *this; increment(); return it; }
+  // iterators compare equal iff they refer to the same element of the same array
+  bool operator==(const iterator& it) const { return ref.array == it.ref.array && ref.i == it.ref.i && ref.j == it.ref.j && ref.k == it.ref.k; }
+  bool operator!=(const iterator& it) const { return !operator==(it); }
+  uint i() const { return ref.i; }
+  uint j() const { return ref.j; }
+  uint k() const { return ref.k; }
+
+protected:
+  friend class array3;
+  explicit iterator(array3* array, uint i, uint j, uint k) : ref(array, i, j, k) {}
+  // advance to the next element in block-by-block order
+  void increment()
+  {
+    ref.i++;
+    // wrap i at a 4-wide block boundary or the array edge
+    if (!(ref.i & 3u) || ref.i == ref.array->nx) {
+      ref.i = (ref.i - 1) & ~3u;  // first column of current block
+      ref.j++;
+      if (!(ref.j & 3u) || ref.j == ref.array->ny) {
+        ref.j = (ref.j - 1) & ~3u;  // first row of current block
+        ref.k++;
+        if (!(ref.k & 3u) || ref.k == ref.array->nz) {
+          ref.k = (ref.k - 1) & ~3u;
+          // done with block; advance to next
+          if ((ref.i += 4) >= ref.array->nx) {
+            ref.i = 0;
+            if ((ref.j += 4) >= ref.array->ny) {
+              ref.j = 0;
+              if ((ref.k += 4) >= ref.array->nz)
+                ref.k = ref.array->nz;  // (0, 0, nz) marks end()
+            }
+          }
+        }
+      }
+    }
+  }
+  reference ref;
+};
diff --git a/zfp/array/zfp/memory.h b/zfp/array/zfp/memory.h
new file mode 100644
index 0000000000000000000000000000000000000000..ea20b77f594781d002b52e3c6bfd219c3c781fd1
--- /dev/null
+++ b/zfp/array/zfp/memory.h
@@ -0,0 +1,60 @@
+#ifndef ZFP_MEMORY_H
+#define ZFP_MEMORY_H
+
+#include <algorithm>
+#include <cstdlib>
+#include "zfp/types.h"
+
+// allocate size bytes with optional alignment
+// NOTE(review): the posix_memalign return value is not checked; on failure ptr
+// is returned indeterminate -- consider checking and handling the error.
+inline void*
+allocate(size_t size, size_t alignment = 0)
+{
+#if defined(__USE_XOPEN2K) && defined(ZFP_WITH_ALIGNED_ALLOC)
+  void* ptr;
+  if (alignment > 1)
+    posix_memalign(&ptr, alignment, size);
+  else
+    ptr = malloc(size);
+  return ptr;
+#else
+  // fallback path: plain new[]; the alignment request is ignored here
+  return new uchar[size];
+#endif
+}
+
+// deallocate memory pointed to by ptr
+// NOTE(review): the non-POSIX path deletes through T* what allocate() created
+// as uchar[]; this relies on allocate/deallocate being compiled with the same
+// feature macros -- verify the pairing is consistent across translation units.
+template <typename T>
+inline void
+deallocate(T* ptr)
+{
+#if defined(__USE_XOPEN2K) && defined(ZFP_WITH_ALIGNED_ALLOC)
+  if (ptr)
+    free(ptr);
+#else
+  delete[] ptr;
+#endif
+}
+
+// reallocate size bytes with optional alignment
+// (unlike C realloc, the previous contents are NOT preserved)
+template <typename T>
+inline void
+reallocate(T*& ptr, size_t size, size_t alignment = 0)
+{
+  deallocate(ptr);
+  ptr = static_cast<T*>(allocate(size, alignment));
+}
+
+// clone array 'T src[count]' with optional alignment
+// frees whatever dst already points to; dst becomes null when src is null
+template <typename T>
+inline void
+clone(T*& dst, const T* src, size_t count, size_t alignment = 0)
+{
+  deallocate(dst);
+  if (src) {
+    dst = static_cast<T*>(allocate(count * sizeof(T), alignment));
+    std::copy(src, src + count, dst);
+  }
+  else
+    dst = 0;
+}
+
+#endif
diff --git a/zfp/array/zfp/pointer1.h b/zfp/array/zfp/pointer1.h
new file mode 100644
index 0000000000000000000000000000000000000000..f58557c0df368b5672164ad61c268b6bcdc02b21
--- /dev/null
+++ b/zfp/array/zfp/pointer1.h
@@ -0,0 +1,30 @@
+// pointer to a 1D array element; this class is nested within zfp::array1
+// proxy pointer wrapping a proxy reference; supports random-access arithmetic
+class pointer {
+public:
+  pointer() : ref(0, 0) {}
+  pointer operator=(const pointer& p) { ref.array = p.ref.array; ref.i = p.ref.i; return *this; }
+  reference operator*() const { return ref; }
+  reference operator[](ptrdiff_t d) const { return *operator+(d); }
+  pointer& operator++() { increment(); return *this; }
+  pointer& operator--() { decrement(); return *this; }
+  pointer operator++(int) { pointer p = *this; increment(); return p; }
+  pointer operator--(int) { pointer p = *this; decrement(); return p; }
+  pointer operator+=(ptrdiff_t d) { ref.i += d; return *this; }
+  pointer operator-=(ptrdiff_t d) { ref.i -= d; return *this; }
+  pointer operator+(ptrdiff_t d) const { pointer p = *this; p += d; return p; }
+  pointer operator-(ptrdiff_t d) const { pointer p = *this; p -= d; return p; }
+  // distance between pointers (does not check that both refer to the same array)
+  ptrdiff_t operator-(const pointer& p) const { return index() - p.index(); }
+  bool operator==(const pointer& p) const { return ref.array == p.ref.array && ref.i == p.ref.i; }
+  bool operator!=(const pointer& p) const { return !operator==(p); }
+
+protected:
+  friend class array1;
+  friend class reference;
+  explicit pointer(reference r) : ref(r) {}
+  explicit pointer(array1* array, uint i) : ref(array, i) {}
+  ptrdiff_t index() const { return ref.i; }
+  void set(ptrdiff_t index) { ref.i = index; }
+  void increment() { ref.i++; }
+  void decrement() { ref.i--; }
+  reference ref;
+};
diff --git a/zfp/array/zfp/pointer2.h b/zfp/array/zfp/pointer2.h
new file mode 100644
index 0000000000000000000000000000000000000000..dcdb518fb90072a282dccb4f61a36d5d289667b2
--- /dev/null
+++ b/zfp/array/zfp/pointer2.h
@@ -0,0 +1,42 @@
+// pointer to a 2D array element; this class is nested within zfp::array2
+// arithmetic works on a flattened linear index (row-major, i fastest)
+class pointer {
+public:
+  pointer() : ref(0, 0, 0) {}
+  pointer operator=(const pointer& p) { ref.array = p.ref.array; ref.i = p.ref.i; ref.j = p.ref.j; return *this; }
+  reference operator*() const { return ref; }
+  reference operator[](ptrdiff_t d) const { return *operator+(d); }
+  pointer& operator++() { increment(); return *this; }
+  pointer& operator--() { decrement(); return *this; }
+  pointer operator++(int) { pointer p = *this; increment(); return p; }
+  pointer operator--(int) { pointer p = *this; decrement(); return p; }
+  pointer operator+=(ptrdiff_t d) { set(index() + d); return *this; }
+  pointer operator-=(ptrdiff_t d) { set(index() - d); return *this; }
+  pointer operator+(ptrdiff_t d) const { pointer p = *this; p += d; return p; }
+  pointer operator-(ptrdiff_t d) const { pointer p = *this; p -= d; return p; }
+  // distance in flattened index space (assumes both point into the same array)
+  ptrdiff_t operator-(const pointer& p) const { return index() - p.index(); }
+  bool operator==(const pointer& p) const { return ref.array == p.ref.array && ref.i == p.ref.i && ref.j == p.ref.j; }
+  bool operator!=(const pointer& p) const { return !operator==(p); }
+
+protected:
+  friend class array2;
+  friend class reference;
+  explicit pointer(reference r) : ref(r) {}
+  explicit pointer(array2* array, uint i, uint j) : ref(array, i, j) {}
+  // flatten (i, j) to a linear index
+  ptrdiff_t index() const { return ref.i + ref.array->nx * ref.j; }
+  // split a linear index back into (i, j) via the array's ij() helper
+  void set(ptrdiff_t index) { ref.array->ij(ref.i, ref.j, index); }
+  void increment()
+  {
+    if (++ref.i == ref.array->nx) {
+      ref.i = 0;
+      ref.j++;
+    }
+  }
+  void decrement()
+  {
+    if (!ref.i--) {
+      ref.i = ref.array->nx - 1;
+      ref.j--;
+    }
+  }
+  reference ref;
+};
diff --git a/zfp/array/zfp/pointer3.h b/zfp/array/zfp/pointer3.h
new file mode 100644
index 0000000000000000000000000000000000000000..091af6044b45cf96db396edc44a3ee7bd5b2c5c6
--- /dev/null
+++ b/zfp/array/zfp/pointer3.h
@@ -0,0 +1,48 @@
+// pointer to a 3D array element; this class is nested within zfp::array3
+// arithmetic works on a flattened linear index (i fastest, then j, then k)
+class pointer {
+public:
+  pointer() : ref(0, 0, 0, 0) {}
+  pointer operator=(const pointer& p) { ref.array = p.ref.array; ref.i = p.ref.i; ref.j = p.ref.j; ref.k = p.ref.k; return *this; }
+  reference operator*() const { return ref; }
+  reference operator[](ptrdiff_t d) const { return *operator+(d); }
+  pointer& operator++() { increment(); return *this; }
+  pointer& operator--() { decrement(); return *this; }
+  pointer operator++(int) { pointer p = *this; increment(); return p; }
+  pointer operator--(int) { pointer p = *this; decrement(); return p; }
+  pointer operator+=(ptrdiff_t d) { set(index() + d); return *this; }
+  pointer operator-=(ptrdiff_t d) { set(index() - d); return *this; }
+  pointer operator+(ptrdiff_t d) const { pointer p = *this; p += d; return p; }
+  pointer operator-(ptrdiff_t d) const { pointer p = *this; p -= d; return p; }
+  // distance in flattened index space (assumes both point into the same array)
+  ptrdiff_t operator-(const pointer& p) const { return index() - p.index(); }
+  bool operator==(const pointer& p) const { return ref.array == p.ref.array && ref.i == p.ref.i && ref.j == p.ref.j && ref.k == p.ref.k; }
+  bool operator!=(const pointer& p) const { return !operator==(p); }
+
+protected:
+  friend class array3;
+  friend class reference;
+  explicit pointer(reference r) : ref(r) {}
+  explicit pointer(array3* array, uint i, uint j, uint k) : ref(array, i, j, k) {}
+  // flatten (i, j, k) to a linear index
+  ptrdiff_t index() const { return ref.i + ref.array->nx * (ref.j + ref.array->ny * ref.k); }
+  // split a linear index back into (i, j, k) via the array's ijk() helper
+  void set(ptrdiff_t index) { ref.array->ijk(ref.i, ref.j, ref.k, index); }
+  void increment()
+  {
+    if (++ref.i == ref.array->nx) {
+      ref.i = 0;
+      if (++ref.j == ref.array->ny) {
+        ref.j = 0;
+        ref.k++;
+      }
+    }
+  }
+  void decrement()
+  {
+    if (!ref.i--) {
+      ref.i = ref.array->nx - 1;
+      if (!ref.j--) {
+        ref.j = ref.array->ny - 1;
+        ref.k--;
+      }
+    }
+  }
+  reference ref;
+};
diff --git a/zfp/array/zfp/reference1.h b/zfp/array/zfp/reference1.h
new file mode 100644
index 0000000000000000000000000000000000000000..99f2e6a67643d0fb47db27ccaccdd418c71cf301
--- /dev/null
+++ b/zfp/array/zfp/reference1.h
@@ -0,0 +1,27 @@
+// reference to a 1D array element; this class is nested within zfp::array1
+// proxy reference: reads go through array->get, writes through set/add/sub/mul/div
+class reference {
+public:
+  operator Scalar() const { return array->get(i); }
+  reference operator=(const reference& r) { array->set(i, r.operator Scalar()); return *this; }
+  reference operator=(Scalar val) { array->set(i, val); return *this; }
+  reference operator+=(Scalar val) { array->add(i, val); return *this; }
+  reference operator-=(Scalar val) { array->sub(i, val); return *this; }
+  reference operator*=(Scalar val) { array->mul(i, val); return *this; }
+  reference operator/=(Scalar val) { array->div(i, val); return *this; }
+  // address-of yields a proxy pointer, not a raw Scalar*
+  pointer operator&() const { return pointer(*this); }
+  // swap two array elements via proxy references
+  friend void swap(reference a, reference b)
+  {
+    Scalar x = a.operator Scalar();
+    Scalar y = b.operator Scalar();
+    b.operator=(x);
+    a.operator=(y);
+  }
+
+protected:
+  friend class array1;
+  friend class iterator;
+  explicit reference(array1* array, uint i) : array(array), i(i) {}
+  array1* array;
+  uint i;
+};
diff --git a/zfp/array/zfp/reference2.h b/zfp/array/zfp/reference2.h
new file mode 100644
index 0000000000000000000000000000000000000000..76a0bd3b10158e015203e644a74070159703eed9
--- /dev/null
+++ b/zfp/array/zfp/reference2.h
@@ -0,0 +1,27 @@
+// reference to a 2D array element; this class is nested within zfp::array2
+class reference {
+public:
+  operator Scalar() const { return array->get(i, j); }
+  reference operator=(const reference& r) { array->set(i, j, r.operator Scalar()); return *this; }
+  reference operator=(Scalar val) { array->set(i, j, val); return *this; }
+  reference operator+=(Scalar val) { array->add(i, j, val); return *this; }
+  reference operator-=(Scalar val) { array->sub(i, j, val); return *this; }
+  reference operator*=(Scalar val) { array->mul(i, j, val); return *this; }
+  reference operator/=(Scalar val) { array->div(i, j, val); return *this; }
+  pointer operator&() const { return pointer(*this); }
+  // swap two array elements via proxy references
+  friend void swap(reference a, reference b)
+  {
+    Scalar x = a.operator Scalar();
+    Scalar y = b.operator Scalar();
+    b.operator=(x);
+    a.operator=(y);
+  }
+
+protected:
+  friend class array2;
+  friend class iterator;
+  explicit reference(array2* array, uint i, uint j) : array(array), i(i), j(j) {}
+  array2* array;
+  uint i, j;
+};
diff --git a/zfp/array/zfp/reference3.h b/zfp/array/zfp/reference3.h
new file mode 100644
index 0000000000000000000000000000000000000000..91175e18033c1c1a578346bc0c5a0342e646853b
--- /dev/null
+++ b/zfp/array/zfp/reference3.h
@@ -0,0 +1,27 @@
+// reference to a 3D array element; this class is nested within zfp::array3
+// proxy reference: reads go through array->get, writes through set/add/sub/mul/div
+class reference {
+public:
+  operator Scalar() const { return array->get(i, j, k); }
+  reference operator=(const reference& r) { array->set(i, j, k, r.operator Scalar()); return *this; }
+  reference operator=(Scalar val) { array->set(i, j, k, val); return *this; }
+  reference operator+=(Scalar val) { array->add(i, j, k, val); return *this; }
+  reference operator-=(Scalar val) { array->sub(i, j, k, val); return *this; }
+  reference operator*=(Scalar val) { array->mul(i, j, k, val); return *this; }
+  reference operator/=(Scalar val) { array->div(i, j, k, val); return *this; }
+  // address-of yields a proxy pointer, not a raw Scalar*
+  pointer operator&() const { return pointer(*this); }
+  // swap two array elements via proxy references
+  friend void swap(reference a, reference b)
+  {
+    Scalar x = a.operator Scalar();
+    Scalar y = b.operator Scalar();
+    b.operator=(x);
+    a.operator=(y);
+  }
+
+protected:
+  friend class array3;
+  friend class iterator;
+  explicit reference(array3* array, uint i, uint j, uint k) : array(array), i(i), j(j), k(k) {}
+  array3* array;
+  uint i, j, k;
+};
diff --git a/zfp/array/zfp/view1.h b/zfp/array/zfp/view1.h
new file mode 100644
index 0000000000000000000000000000000000000000..6129ae5ee06431720931c27bbd0d7d6726b5dee3
--- /dev/null
+++ b/zfp/array/zfp/view1.h
@@ -0,0 +1,291 @@
+// 1D array views; these classes are nested within zfp::array1
+
+// abstract view of 1D array (base class)
+// holds a shallow pointer to the underlying array plus an [x, x+nx) window
+class preview {
+public:
+  // rate in bits per value
+  double rate() const { return array->rate(); }
+
+  // dimensions of (sub)array
+  size_t size() const { return size_t(nx); }
+
+  // local to global array index
+  uint global_x(uint i) const { return x + i; }
+
+protected:
+  // construction and assignment--perform shallow copy of (sub)array
+  explicit preview(array1* array) : array(array), x(0), nx(array->nx) {}
+  explicit preview(array1* array, uint x, uint nx) : array(array), x(x), nx(nx) {}
+  preview& operator=(array1* a)
+  {
+    array = a;
+    x = 0;
+    nx = a->nx;
+    return *this;
+  }
+
+  array1* array; // underlying container
+  uint x;        // offset into array
+  uint nx;       // dimensions of subarray
+};
+
+// generic read-only view into a rectangular subset of a 1D array
+class const_view : public preview {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::nx;
+public:
+  // construction--perform shallow copy of (sub)array
+  const_view(array1* array) : preview(array) {}
+  const_view(array1* array, uint x, uint nx) : preview(array, x, nx) {}
+
+  // dimensions of (sub)array
+  uint size_x() const { return nx; }
+
+  // [i] accessor (index is local to the view; offset x is added)
+  Scalar operator[](uint index) const { return array->get(x + index); }
+
+  // (i) accessor
+  Scalar operator()(uint i) const { return array->get(x + i); }
+};
+
+// generic read-write view into a rectangular subset of a 1D array
+class view : public const_view {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::nx;
+public:
+  // construction--perform shallow copy of (sub)array
+  view(array1* array) : const_view(array) {}
+  view(array1* array, uint x, uint nx) : const_view(array, x, nx) {}
+
+  // [i] accessor from base class
+  using const_view::operator[];
+
+  // (i) accessor from base class
+  using const_view::operator();
+
+  // [i] mutator (returns a proxy reference into the underlying array)
+  reference operator[](uint index) { return reference(array, x + index); }
+
+  // (i) mutator
+  reference operator()(uint i) { return reference(array, x + i); }
+};
+
+// thread-safe read-only view of 1D (sub)array with private cache
+// each instance clones the compressed bit stream and keeps its own block
+// cache, so concurrent readers do not share mutable state
+class private_const_view : public preview {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::nx;
+public:
+  // construction--perform shallow copy of (sub)array
+  private_const_view(array1* array) :
+    preview(array),
+    cache(array->cache.size())
+  {
+    init();
+  }
+  private_const_view(array1* array, uint x, uint nx) :
+    preview(array, x, nx),
+    cache(array->cache.size())
+  {
+    init();
+  }
+
+  // destructor--release the cloned bit stream and zfp stream
+  ~private_const_view()
+  {
+    stream_close(zfp->stream);
+    zfp_stream_close(zfp);
+  }
+
+  // dimensions of (sub)array
+  uint size_x() const { return nx; }
+
+  // cache size in number of bytes
+  size_t cache_size() const { return cache.size() * sizeof(CacheLine); }
+
+  // set minimum cache size in bytes (array dimensions must be known)
+  void set_cache_size(size_t csize)
+  {
+    cache.resize(array->lines(csize, nx));
+  }
+
+  // empty cache without compressing modified cached blocks
+  void clear_cache() const { cache.clear(); }
+
+  // (i) accessor
+  Scalar operator()(uint i) const { return get(x + i); }
+
+protected:
+  // cache line representing one block of decompressed values
+  class CacheLine {
+  public:
+    const Scalar& operator()(uint i) const { return a[index(i)]; }
+    Scalar& operator()(uint i) { return a[index(i)]; }
+    const Scalar* data() const { return a; }
+    Scalar* data() { return a; }
+  protected:
+    // low two bits of i select the position within the 4-wide block
+    static uint index(uint i) { return i & 3u; }
+    Scalar a[4];
+  };
+
+  // copy private data
+  void init()
+  {
+    // copy compressed stream
+    zfp = zfp_stream_open(0);
+    *zfp = *array->zfp;
+    // copy bit stream
+    zfp->stream = stream_clone(array->zfp->stream);
+  }
+
+  // inspector
+  const Scalar& get(uint i) const
+  {
+    const CacheLine* p = line(i);
+    return (*p)(i);
+  }
+
+  // return cache line for i; may require write-back and fetch
+  // (tags are offset by one; presumably tag 0 marks an empty line -- TODO confirm)
+  CacheLine* line(uint i) const
+  {
+    CacheLine* p = 0;
+    uint b = array->block(i);
+    typename Cache<CacheLine>::Tag t = cache.access(p, b + 1, false);
+    uint c = t.index() - 1;
+    // fetch cache line; no writeback possible since view is read-only
+    if (c != b)
+      decode(b, p->data());
+    return p;
+  }
+
+  // decode block with given index
+  void decode(uint index, Scalar* block) const
+  {
+    stream_rseek(zfp->stream, index * array->blkbits);
+    Codec::decode_block_1(zfp, block, array->shape ? array->shape[index] : 0);
+  }
+
+  zfp_stream* zfp;                // stream of compressed blocks
+  mutable Cache<CacheLine> cache; // cache of decompressed blocks
+};
+
+// thread-safe read-write view of private 1D (sub)array
+// writes stay in the private cache until flush_cache() recompresses them
+class private_view : public private_const_view {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::nx;
+  using private_const_view::zfp;
+  using private_const_view::cache;
+  using private_const_view::init;
+  using private_const_view::decode;
+  class view_reference;
+  typedef typename private_const_view::CacheLine CacheLine;
+public:
+  // construction--perform shallow copy of (sub)array
+  private_view(array1* array) : private_const_view(array) {}
+  private_view(array1* array, uint x, uint nx) : private_const_view(array, x, nx) {}
+
+  // partition view into count block-aligned pieces, with 0 <= index < count
+  void partition(uint index, uint count)
+  {
+    partition(x, nx, index, count);
+  }
+
+  // flush cache by compressing all modified cached blocks
+  void flush_cache() const
+  {
+    for (typename Cache<CacheLine>::const_iterator p = cache.first(); p; p++) {
+      if (p->tag.dirty()) {
+        // cache tags are offset by one relative to block indices
+        uint b = p->tag.index() - 1;
+        encode(b, p->line->data());
+      }
+      cache.flush(p->line);
+    }
+  }
+
+  // (i) accessor from base class
+  using private_const_view::operator();
+
+  // (i) mutator (proxy writes through this view's private cache)
+  view_reference operator()(uint i) { return view_reference(this, x + i); }
+
+protected:
+  // proxy reference that routes reads/writes through the owning private_view
+  class view_reference {
+  public:
+    operator Scalar() const { return view->get(i); }
+    view_reference operator=(const view_reference& r) { view->set(i, r.operator Scalar()); return *this; }
+    view_reference operator=(Scalar val) { view->set(i, val); return *this; }
+    view_reference operator+=(Scalar val) { view->add(i, val); return *this; }
+    view_reference operator-=(Scalar val) { view->sub(i, val); return *this; }
+    view_reference operator*=(Scalar val) { view->mul(i, val); return *this; }
+    view_reference operator/=(Scalar val) { view->div(i, val); return *this; }
+    // swap two array elements via proxy references
+    friend void swap(view_reference a, view_reference b)
+    {
+      Scalar x = a.operator Scalar();
+      Scalar y = b.operator Scalar();
+      b.operator=(x);
+      a.operator=(y);
+    }
+
+  protected:
+    friend class private_view;
+    explicit view_reference(private_view* view, uint i) : view(view), i(i) {}
+    private_view* view;
+    uint i;
+  };
+
+  // block-aligned partition of [offset, offset + size): index out of count
+  // splits the covered 4-wide block range evenly, then clamps the piece back
+  // to the original element range
+  static void partition(uint& offset, uint& size, uint index, uint count)
+  {
+    uint bmin = offset / 4;
+    uint bmax = (offset + size + 3) / 4;
+    uint xmin = std::max(offset +    0, 4 * (bmin + (bmax - bmin) * (index + 0) / count));
+    uint xmax = std::min(offset + size, 4 * (bmin + (bmax - bmin) * (index + 1) / count));
+    offset = xmin;
+    size = xmax - xmin;
+  }
+
+  // mutator
+  void set(uint i, Scalar val)
+  {
+    CacheLine* p = line(i, true);
+    (*p)(i) = val;
+  }
+
+  // in-place updates
+  void add(uint i, Scalar val) { (*line(i, true))(i) += val; }
+  void sub(uint i, Scalar val) { (*line(i, true))(i) -= val; }
+  void mul(uint i, Scalar val) { (*line(i, true))(i) *= val; }
+  void div(uint i, Scalar val) { (*line(i, true))(i) /= val; }
+
+  // return cache line for i; may require write-back and fetch
+  CacheLine* line(uint i, bool write) const
+  {
+    CacheLine* p = 0;
+    uint b = array->block(i);
+    typename Cache<CacheLine>::Tag t = cache.access(p, b + 1, write);
+    uint c = t.index() - 1;
+    if (c != b) {
+      // write back occupied cache line if it is dirty
+      if (t.dirty())
+        encode(c, p->data());
+      decode(b, p->data());
+    }
+    return p;
+  }
+
+  // encode block with given index
+  void encode(uint index, const Scalar* block) const
+  {
+    stream_wseek(zfp->stream, index * array->blkbits);
+    Codec::encode_block_1(zfp, block, array->shape ? array->shape[index] : 0);
+    stream_flush(zfp->stream);
+  }
+};
diff --git a/zfp/array/zfp/view2.h b/zfp/array/zfp/view2.h
new file mode 100644
index 0000000000000000000000000000000000000000..fcfdf8cad8f05594ca3963541590eec81f305888
--- /dev/null
+++ b/zfp/array/zfp/view2.h
@@ -0,0 +1,393 @@
+// 2D array views; these classes are nested within zfp::array2
+
+// abstract view of 2D array (base class)
+// holds a shallow pointer to the underlying array plus an (x, y) offset
+// and (nx, ny) extent describing the viewed rectangle
+class preview {
+public:
+  // rate in bits per value
+  double rate() const { return array->rate(); }
+
+  // dimensions of (sub)array
+  size_t size() const { return size_t(nx) * size_t(ny); }
+
+  // local to global array indices
+  uint global_x(uint i) const { return x + i; }
+  uint global_y(uint j) const { return y + j; }
+
+protected:
+  // construction and assignment--perform shallow copy of (sub)array
+  explicit preview(array2* array) : array(array), x(0), y(0), nx(array->nx), ny(array->ny) {}
+  explicit preview(array2* array, uint x, uint y, uint nx, uint ny) : array(array), x(x), y(y), nx(nx), ny(ny) {}
+  preview& operator=(array2* a)
+  {
+    array = a;
+    x = y = 0;
+    nx = a->nx;
+    ny = a->ny;
+    return *this;
+  }
+
+  array2* array; // underlying container
+  uint x, y;     // offset into array
+  uint nx, ny;   // dimensions of subarray
+};
+
+// generic read-only view into a rectangular subset of a 2D array
+class const_view : public preview {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::y;
+  using preview::nx;
+  using preview::ny;
+public:
+  // construction--perform shallow copy of (sub)array
+  const_view(array2* array) : preview(array) {}
+  const_view(array2* array, uint x, uint y, uint nx, uint ny) : preview(array, x, y, nx, ny) {}
+
+  // dimensions of (sub)array
+  uint size_x() const { return nx; }
+  uint size_y() const { return ny; }
+
+  // (i, j) accessor (indices are local to the view; offsets x, y are added)
+  Scalar operator()(uint i, uint j) const { return array->get(x + i, y + j); }
+};
+
+// generic read-write view into a rectangular subset of a 2D array
+class view : public const_view {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::y;
+  using preview::nx;
+  using preview::ny;
+public:
+  // construction--perform shallow copy of (sub)array
+  view(array2* array) : const_view(array) {}
+  view(array2* array, uint x, uint y, uint nx, uint ny) : const_view(array, x, y, nx, ny) {}
+
+  // (i, j) accessor from base class
+  using const_view::operator();
+
+  // (i, j) mutator (returns a proxy reference into the underlying array)
+  reference operator()(uint i, uint j) { return reference(array, x + i, y + j); }
+};
+
+// flat view of 2D array (operator[] returns scalar)
+// exposes the 2D window through a single flattened index
+class flat_view : public view {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::y;
+  using preview::nx;
+  using preview::ny;
+public:
+  // construction--perform shallow copy of (sub)array
+  flat_view(array2* array) : view(array) {}
+  flat_view(array2* array, uint x, uint y, uint nx, uint ny) : view(array, x, y, nx, ny) {}
+
+  // convert (i, j) index to flat index (row-major, i fastest)
+  uint index(uint i, uint j) const { return i + nx * j; }
+
+  // convert flat index to (i, j) index
+  void ij(uint& i, uint& j, uint index) const
+  {
+    i = index % nx; index /= nx;
+    j = index;
+  }
+
+  // flat index accessors
+  Scalar operator[](uint index) const
+  {
+    uint i, j;
+    ij(i, j, index);
+    return array->get(x + i, y + j);
+  }
+  reference operator[](uint index)
+  {
+    uint i, j;
+    ij(i, j, index);
+    return reference(array, x + i, y + j);
+  }
+};
+
+// forward declaration of friends
+class nested_view1;
+class nested_view2;
+
+// nested view into a 1D rectangular subset of a 2D array
+// (a single row segment; constructed only by nested_view2::operator[])
+class nested_view1 : public preview {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::y;
+  using preview::nx;
+  using preview::ny;
+public:
+  // dimensions of (sub)array
+  uint size_x() const { return nx; }
+
+  // [i] accessor and mutator
+  Scalar operator[](uint index) const { return array->get(x + index, y); }
+  reference operator[](uint index) { return reference(array, x + index, y); }
+
+  // (i) accessor and mutator
+  Scalar operator()(uint i) const { return array->get(x + i, y); }
+  reference operator()(uint i) { return reference(array, x + i, y); }
+
+protected:
+  // construction--perform shallow copy of (sub)array
+  friend class nested_view2;
+  explicit nested_view1(array2* array) : preview(array) {}
+  explicit nested_view1(array2* array, uint x, uint y, uint nx, uint ny) : preview(array, x, y, nx, ny) {}
+};
+
+// nested view into a 2D rectangular subset of a 2D array
+// supports v[j][i] style access via nested_view1
+class nested_view2 : public preview {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::y;
+  using preview::nx;
+  using preview::ny;
+public:
+  // construction--perform shallow copy of (sub)array
+  nested_view2(array2* array) : preview(array) {}
+  nested_view2(array2* array, uint x, uint y, uint nx, uint ny) : preview(array, x, y, nx, ny) {}
+
+  // dimensions of (sub)array
+  uint size_x() const { return nx; }
+  uint size_y() const { return ny; }
+
+  // 1D view (row `index` of this view, one element tall)
+  nested_view1 operator[](uint index) const { return nested_view1(array, x, y + index, nx, 1); }
+
+  // (i, j) accessor and mutator
+  Scalar operator()(uint i, uint j) const { return array->get(x + i, y + j); }
+  reference operator()(uint i, uint j) { return reference(array, x + i, y + j); }
+};
+
+// default nested view is the full-rank one
+typedef nested_view2 nested_view;
+
+// thread-safe read-only view of 2D (sub)array with private cache
+// each instance clones the compressed bit stream and keeps its own block
+// cache, so concurrent readers do not share mutable state
+class private_const_view : public preview {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::y;
+  using preview::nx;
+  using preview::ny;
+public:
+  // construction--perform shallow copy of (sub)array
+  private_const_view(array2* array) :
+    preview(array),
+    cache(array->cache.size())
+  {
+    init();
+  }
+  private_const_view(array2* array, uint x, uint y, uint nx, uint ny) :
+    preview(array, x, y, nx, ny),
+    cache(array->cache.size())
+  {
+    init();
+  }
+
+  // destructor--release the cloned bit stream and zfp stream
+  ~private_const_view()
+  {
+    stream_close(zfp->stream);
+    zfp_stream_close(zfp);
+  }
+
+  // dimensions of (sub)array
+  uint size_x() const { return nx; }
+  uint size_y() const { return ny; }
+
+  // cache size in number of bytes
+  size_t cache_size() const { return cache.size() * sizeof(CacheLine); }
+
+  // set minimum cache size in bytes (array dimensions must be known)
+  void set_cache_size(size_t csize)
+  {
+    cache.resize(array->lines(csize, nx, ny));
+  }
+
+  // empty cache without compressing modified cached blocks
+  void clear_cache() const { cache.clear(); }
+
+  // (i, j) accessor
+  Scalar operator()(uint i, uint j) const { return get(x + i, y + j); }
+
+protected:
+  // cache line representing one block of decompressed values
+  class CacheLine {
+  public:
+    const Scalar& operator()(uint i, uint j) const { return a[index(i, j)]; }
+    Scalar& operator()(uint i, uint j) { return a[index(i, j)]; }
+    const Scalar* data() const { return a; }
+    Scalar* data() { return a; }
+  protected:
+    // low two bits of i and j select the position within the 4x4 block
+    static uint index(uint i, uint j) { return (i & 3u) + 4 * (j & 3u); }
+    Scalar a[16];
+  };
+
+  // copy private data
+  void init()
+  {
+    // copy compressed stream
+    zfp = zfp_stream_open(0);
+    *zfp = *array->zfp;
+    // copy bit stream
+    zfp->stream = stream_clone(array->zfp->stream);
+  }
+
+  // inspector
+  const Scalar& get(uint i, uint j) const
+  {
+    const CacheLine* p = line(i, j);
+    return (*p)(i, j);
+  }
+
+  // return cache line for (i, j); may require write-back and fetch
+  // (tags are offset by one; presumably tag 0 marks an empty line -- TODO confirm)
+  CacheLine* line(uint i, uint j) const
+  {
+    CacheLine* p = 0;
+    uint b = array->block(i, j);
+    typename Cache<CacheLine>::Tag t = cache.access(p, b + 1, false);
+    uint c = t.index() - 1;
+    // fetch cache line; no writeback possible since view is read-only
+    if (c != b)
+      decode(b, p->data());
+    return p;
+  }
+
+  // decode block with given index
+  void decode(uint index, Scalar* block) const
+  {
+    stream_rseek(zfp->stream, index * array->blkbits);
+    Codec::decode_block_2(zfp, block, array->shape ? array->shape[index] : 0);
+  }
+
+  zfp_stream* zfp;                // stream of compressed blocks
+  mutable Cache<CacheLine> cache; // cache of decompressed blocks
+};
+
+// thread-safe read-write view of private 2D (sub)array
+class private_view : public private_const_view {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::y;
+  using preview::nx;
+  using preview::ny;
+  using private_const_view::zfp;
+  using private_const_view::cache;
+  using private_const_view::init;
+  using private_const_view::decode;
+  class view_reference;
+  typedef typename private_const_view::CacheLine CacheLine;
+public:
+  // construction--perform shallow copy of (sub)array
+  private_view(array2* array) : private_const_view(array) {}
+  private_view(array2* array, uint x, uint y, uint nx, uint ny) : private_const_view(array, x, y, nx, ny) {}
+
+  // partition view into count block-aligned pieces, with 0 <= index < count
+  void partition(uint index, uint count)
+  {
+    if (nx > ny)
+      partition(x, nx, index, count);
+    else
+      partition(y, ny, index, count);
+  }
+
+  // flush cache by compressing all modified cached blocks
+  void flush_cache() const
+  {
+    for (typename Cache<CacheLine>::const_iterator p = cache.first(); p; p++) {
+      if (p->tag.dirty()) {
+        uint b = p->tag.index() - 1;
+        encode(b, p->line->data());
+      }
+      cache.flush(p->line);
+    }
+  }
+
+  // (i, j) accessor from base class
+  using private_const_view::operator();
+
+  // (i, j) mutator
+  view_reference operator()(uint i, uint j) { return view_reference(this, x + i, y + j); }
+
+protected:
+  class view_reference {
+  public:
+    operator Scalar() const { return view->get(i, j); }
+    view_reference operator=(const view_reference& r) { view->set(i, j, r.operator Scalar()); return *this; }
+    view_reference operator=(Scalar val) { view->set(i, j, val); return *this; }
+    view_reference operator+=(Scalar val) { view->add(i, j, val); return *this; }
+    view_reference operator-=(Scalar val) { view->sub(i, j, val); return *this; }
+    view_reference operator*=(Scalar val) { view->mul(i, j, val); return *this; }
+    view_reference operator/=(Scalar val) { view->div(i, j, val); return *this; }
+    // swap two array elements via proxy references
+    friend void swap(view_reference a, view_reference b)
+    {
+      Scalar x = a.operator Scalar();
+      Scalar y = b.operator Scalar();
+      b.operator=(x);
+      a.operator=(y);
+    }
+
+  protected:
+    friend class private_view;
+    explicit view_reference(private_view* view, uint i, uint j) : view(view), i(i), j(j) {}
+    private_view* view;
+    uint i, j;
+  };
+
+  // block-aligned partition of [offset, offset + size): index out of count
+  static void partition(uint& offset, uint& size, uint index, uint count)
+  {
+    uint bmin = offset / 4;
+    uint bmax = (offset + size + 3) / 4;
+    uint xmin = std::max(offset +    0, 4 * (bmin + (bmax - bmin) * (index + 0) / count));
+    uint xmax = std::min(offset + size, 4 * (bmin + (bmax - bmin) * (index + 1) / count));
+    offset = xmin;
+    size = xmax - xmin;
+  }
+
+  // mutator
+  void set(uint i, uint j, Scalar val)
+  {
+    CacheLine* p = line(i, j, true);
+    (*p)(i, j) = val;
+  }
+
+  // in-place updates
+  void add(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) += val; }
+  void sub(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) -= val; }
+  void mul(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) *= val; }
+  void div(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) /= val; }
+
+  // return cache line for (i, j); may require write-back and fetch
+  CacheLine* line(uint i, uint j, bool write) const
+  {
+    CacheLine* p = 0;
+    uint b = array->block(i, j);
+    typename Cache<CacheLine>::Tag t = cache.access(p, b + 1, write);
+    uint c = t.index() - 1;
+    if (c != b) {
+      // write back occupied cache line if it is dirty
+      if (t.dirty())
+        encode(c, p->data());
+      decode(b, p->data());
+    }
+    return p;
+  }
+
+  // encode block with given index
+  void encode(uint index, const Scalar* block) const
+  {
+    stream_wseek(zfp->stream, index * array->blkbits);
+    Codec::encode_block_2(zfp, block, array->shape ? array->shape[index] : 0);
+    stream_flush(zfp->stream);
+  }
+};
diff --git a/zfp/array/zfp/view3.h b/zfp/array/zfp/view3.h
new file mode 100644
index 0000000000000000000000000000000000000000..b1bf457fc8a1f144ab67e4fc9f0caf1221eb494a
--- /dev/null
+++ b/zfp/array/zfp/view3.h
@@ -0,0 +1,445 @@
+// 3D array views; these classes are nested within zfp::array3
+
+// abstract view of 3D array (base class)
+class preview {
+public:
+  // rate in bits per value
+  double rate() const { return array->rate(); }
+
+  // dimensions of (sub)array
+  size_t size() const { return size_t(nx) * size_t(ny) * size_t(nz); }
+
+  // local to global array indices
+  uint global_x(uint i) const { return x + i; }
+  uint global_y(uint j) const { return y + j; }
+  uint global_z(uint k) const { return z + k; }
+
+protected:
+  // construction and assignment--perform shallow copy of (sub)array
+  explicit preview(array3* array) : array(array), x(0), y(0), z(0), nx(array->nx), ny(array->ny), nz(array->nz) {}
+  explicit preview(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : array(array), x(x), y(y), z(z), nx(nx), ny(ny), nz(nz) {}
+  preview& operator=(array3* a)
+  {
+    array = a;
+    x = y = z = 0;
+    nx = a->nx;
+    ny = a->ny;
+    nz = a->nz;
+    return *this;
+  }
+
+  array3* array;   // underlying container
+  uint x, y, z;    // offset into array
+  uint nx, ny, nz; // dimensions of subarray
+};
+
+// generic read-only view into a rectangular subset of a 3D array
+class const_view : public preview {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::y;
+  using preview::z;
+  using preview::nx;
+  using preview::ny;
+  using preview::nz;
+public:
+  // construction--perform shallow copy of (sub)array
+  const_view(array3* array) : preview(array) {}
+  const_view(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : preview(array, x, y, z, nx, ny, nz) {}
+
+  // dimensions of (sub)array
+  uint size_x() const { return nx; }
+  uint size_y() const { return ny; }
+  uint size_z() const { return nz; }
+
+  // (i, j, k) accessor
+  Scalar operator()(uint i, uint j, uint k) const { return array->get(x + i, y + j, z + k); }
+};
+
+// generic read-write view into a rectangular subset of a 3D array
+class view : public const_view {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::y;
+  using preview::z;
+  using preview::nx;
+  using preview::ny;
+  using preview::nz;
+public:
+  // construction--perform shallow copy of (sub)array
+  view(array3* array) : const_view(array) {}
+  view(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : const_view(array, x, y, z, nx, ny, nz) {}
+
+  // (i, j, k) accessor from base class
+  using const_view::operator();
+
+  // (i, j, k) mutator
+  reference operator()(uint i, uint j, uint k) { return reference(array, x + i, y + j, z + k); }
+};
+
+// flat view of 3D array (operator[] returns scalar)
+class flat_view : public view {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::y;
+  using preview::z;
+  using preview::nx;
+  using preview::ny;
+  using preview::nz;
+public:
+  // construction--perform shallow copy of (sub)array
+  flat_view(array3* array) : view(array) {}
+  flat_view(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : view(array, x, y, z, nx, ny, nz) {}
+
+  // convert (i, j, k) index to flat index
+  uint index(uint i, uint j, uint k) const { return i + nx * (j + ny * k); }
+
+  // convert flat index to (i, j, k) index
+  void ijk(uint& i, uint& j, uint& k, uint index) const
+  {
+    i = index % nx; index /= nx;
+    j = index % ny; index /= ny;
+    k = index;
+  }
+
+  // flat index accessors
+  Scalar operator[](uint index) const
+  {
+    uint i, j, k;
+    ijk(i, j, k, index);
+    return array->get(x + i, y + j, z + k);
+  }
+  reference operator[](uint index)
+  {
+    uint i, j, k;
+    ijk(i, j, k, index);
+    return reference(array, x + i, y + j, z + k);
+  }
+};
+
+// forward declaration of friends
+class nested_view1;
+class nested_view2;
+class nested_view3;
+
+// nested view into a 1D rectangular subset of a 3D array
+class nested_view1 : public preview {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::y;
+  using preview::z;
+  using preview::nx;
+  using preview::ny;
+  using preview::nz;
+public:
+  // dimensions of (sub)array
+  uint size_x() const { return nx; }
+
+  // [i] accessor and mutator
+  Scalar operator[](uint index) const { return array->get(x + index, y, z); }
+  reference operator[](uint index) { return reference(array, x + index, y, z); }
+
+  // (i) accessor and mutator
+  Scalar operator()(uint i) const { return array->get(x + i, y, z); }
+  reference operator()(uint i) { return reference(array, x + i, y, z); }
+
+protected:
+  // construction--perform shallow copy of (sub)array
+  friend class nested_view2;
+  explicit nested_view1(array3* array) : preview(array) {}
+  explicit nested_view1(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : preview(array, x, y, z, nx, ny, nz) {}
+};
+
+// nested view into a 2D rectangular subset of a 3D array
+class nested_view2 : public preview {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::y;
+  using preview::z;
+  using preview::nx;
+  using preview::ny;
+  using preview::nz;
+public:
+  // dimensions of (sub)array
+  uint size_x() const { return nx; }
+  uint size_y() const { return ny; }
+
+  // 1D view
+  nested_view1 operator[](uint index) const { return nested_view1(array, x, y + index, z, nx, 1, 1); }
+
+  // (i, j) accessor and mutator
+  Scalar operator()(uint i, uint j) const { return array->get(x + i, y + j, z); }
+  reference operator()(uint i, uint j) { return reference(array, x + i, y + j, z); }
+
+protected:
+  // construction--perform shallow copy of (sub)array
+  friend class nested_view3;
+  explicit nested_view2(array3* array) : preview(array) {}
+  explicit nested_view2(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : preview(array, x, y, z, nx, ny, nz) {}
+};
+
+// nested view into a 3D rectangular subset of a 3D array
+class nested_view3 : public preview {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::y;
+  using preview::z;
+  using preview::nx;
+  using preview::ny;
+  using preview::nz;
+public:
+  // construction--perform shallow copy of (sub)array
+  nested_view3(array3* array) : preview(array) {}
+  nested_view3(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : preview(array, x, y, z, nx, ny, nz) {}
+
+  // dimensions of (sub)array
+  uint size_x() const { return nx; }
+  uint size_y() const { return ny; }
+  uint size_z() const { return nz; }
+
+  // 2D view
+  nested_view2 operator[](uint index) const { return nested_view2(array, x, y, z + index, nx, ny, 1); }
+
+  // (i, j, k) accessor and mutator
+  Scalar operator()(uint i, uint j, uint k) const { return array->get(x + i, y + j, z + k); }
+  reference operator()(uint i, uint j, uint k) { return reference(array, x + i, y + j, z + k); }
+};
+
+typedef nested_view3 nested_view;
+
+// thread-safe read-only view of 3D (sub)array with private cache
+class private_const_view : public preview {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::y;
+  using preview::z;
+  using preview::nx;
+  using preview::ny;
+  using preview::nz;
+public:
+  // construction--perform shallow copy of (sub)array
+  private_const_view(array3* array) :
+    preview(array),
+    cache(array->cache.size())
+  {
+    init();
+  }
+  private_const_view(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) :
+    preview(array, x, y, z, nx, ny, nz),
+    cache(array->cache.size())
+  {
+    init();
+  }
+
+  // destructor
+  ~private_const_view()
+  {
+    stream_close(zfp->stream);
+    zfp_stream_close(zfp);
+  }
+
+  // dimensions of (sub)array
+  uint size_x() const { return nx; }
+  uint size_y() const { return ny; }
+  uint size_z() const { return nz; }
+
+  // cache size in number of bytes
+  size_t cache_size() const { return cache.size() * sizeof(CacheLine); }
+
+  // set minimum cache size in bytes (array dimensions must be known)
+  void set_cache_size(size_t csize)
+  {
+    cache.resize(array->lines(csize, nx, ny, nz));
+  }
+
+  // empty cache without compressing modified cached blocks
+  void clear_cache() const { cache.clear(); }
+
+  // (i, j, k) accessor
+  Scalar operator()(uint i, uint j, uint k) const { return get(x + i, y + j, z + k); }
+
+protected:
+  // cache line representing one block of decompressed values
+  class CacheLine {
+  public:
+    const Scalar& operator()(uint i, uint j, uint k) const { return a[index(i, j, k)]; }
+    Scalar& operator()(uint i, uint j, uint k) { return a[index(i, j, k)]; }
+    const Scalar* data() const { return a; }
+    Scalar* data() { return a; }
+  protected:
+    static uint index(uint i, uint j, uint k) { return (i & 3u) + 4 * ((j & 3u) + 4 * (k & 3u)); }
+    Scalar a[64];
+  };
+
+  // copy private data
+  void init()
+  {
+    // copy compressed stream
+    zfp = zfp_stream_open(0);
+    *zfp = *array->zfp;
+    // copy bit stream
+    zfp->stream = stream_clone(array->zfp->stream);
+  }
+
+  // inspector
+  const Scalar& get(uint i, uint j, uint k) const
+  {
+    const CacheLine* p = line(i, j, k);
+    return (*p)(i, j, k);
+  }
+
+  // return cache line for (i, j, k); may require write-back and fetch
+  CacheLine* line(uint i, uint j, uint k) const
+  {
+    CacheLine* p = 0;
+    uint b = array->block(i, j, k);
+    typename Cache<CacheLine>::Tag t = cache.access(p, b + 1, false);
+    uint c = t.index() - 1;
+    // fetch cache line; no writeback possible since view is read-only
+    if (c != b)
+      decode(b, p->data());
+    return p;
+  }
+
+  // decode block with given index
+  void decode(uint index, Scalar* block) const
+  {
+    stream_rseek(zfp->stream, index * array->blkbits);
+    Codec::decode_block_3(zfp, block, array->shape ? array->shape[index] : 0);
+  }
+
+  zfp_stream* zfp;                // stream of compressed blocks
+  mutable Cache<CacheLine> cache; // cache of decompressed blocks
+};
+
+// thread-safe read-write view of private 3D (sub)array
+class private_view : public private_const_view {
+protected:
+  using preview::array;
+  using preview::x;
+  using preview::y;
+  using preview::z;
+  using preview::nx;
+  using preview::ny;
+  using preview::nz;
+  using private_const_view::zfp;
+  using private_const_view::cache;
+  using private_const_view::init;
+  using private_const_view::decode;
+  class view_reference;
+  typedef typename private_const_view::CacheLine CacheLine;
+public:
+  // construction--perform shallow copy of (sub)array
+  private_view(array3* array) : private_const_view(array) {}
+  private_view(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : private_const_view(array, x, y, z, nx, ny, nz) {}
+
+  // partition view into count block-aligned pieces, with 0 <= index < count
+  void partition(uint index, uint count)
+  {
+    if (nx > std::max(ny, nz))
+      partition(x, nx, index, count);
+    else if (ny > std::max(nx, nz))
+      partition(y, ny, index, count);
+    else
+      partition(z, nz, index, count);
+  }
+
+  // flush cache by compressing all modified cached blocks
+  void flush_cache() const
+  {
+    for (typename Cache<CacheLine>::const_iterator p = cache.first(); p; p++) {
+      if (p->tag.dirty()) {
+        uint b = p->tag.index() - 1;
+        encode(b, p->line->data());
+      }
+      cache.flush(p->line);
+    }
+  }
+
+  // (i, j, k) accessor from base class
+  using private_const_view::operator();
+
+  // (i, j, k) mutator
+  view_reference operator()(uint i, uint j, uint k) { return view_reference(this, x + i, y + j, z + k); }
+
+protected:
+  class view_reference {
+  public:
+    operator Scalar() const { return view->get(i, j, k); }
+    view_reference operator=(const view_reference& r) { view->set(i, j, k, r.operator Scalar()); return *this; }
+    view_reference operator=(Scalar val) { view->set(i, j, k, val); return *this; }
+    view_reference operator+=(Scalar val) { view->add(i, j, k, val); return *this; }
+    view_reference operator-=(Scalar val) { view->sub(i, j, k, val); return *this; }
+    view_reference operator*=(Scalar val) { view->mul(i, j, k, val); return *this; }
+    view_reference operator/=(Scalar val) { view->div(i, j, k, val); return *this; }
+    // swap two array elements via proxy references
+    friend void swap(view_reference a, view_reference b)
+    {
+      Scalar x = a.operator Scalar();
+      Scalar y = b.operator Scalar();
+      b.operator=(x);
+      a.operator=(y);
+    }
+
+  protected:
+    friend class private_view;
+    explicit view_reference(private_view* view, uint i, uint j, uint k) : view(view), i(i), j(j), k(k) {}
+    private_view* view;
+    uint i, j, k;
+  };
+
+  // block-aligned partition of [offset, offset + size): index out of count
+  static void partition(uint& offset, uint& size, uint index, uint count)
+  {
+    uint bmin = offset / 4;
+    uint bmax = (offset + size + 3) / 4;
+    uint xmin = std::max(offset +    0, 4 * (bmin + (bmax - bmin) * (index + 0) / count));
+    uint xmax = std::min(offset + size, 4 * (bmin + (bmax - bmin) * (index + 1) / count));
+    offset = xmin;
+    size = xmax - xmin;
+  }
+
+  // mutator
+  void set(uint i, uint j, uint k, Scalar val)
+  {
+    CacheLine* p = line(i, j, k, true);
+    (*p)(i, j, k) = val;
+  }
+
+  // in-place updates
+  void add(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) += val; }
+  void sub(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) -= val; }
+  void mul(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) *= val; }
+  void div(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) /= val; }
+
+  // return cache line for (i, j, k); may require write-back and fetch
+  CacheLine* line(uint i, uint j, uint k, bool write) const
+  {
+    CacheLine* p = 0;
+    uint b = array->block(i, j, k);
+    typename Cache<CacheLine>::Tag t = cache.access(p, b + 1, write);
+    uint c = t.index() - 1;
+    if (c != b) {
+      // write back occupied cache line if it is dirty
+      if (t.dirty())
+        encode(c, p->data());
+      decode(b, p->data());
+    }
+    return p;
+  }
+
+  // encode block with given index
+  void encode(uint index, const Scalar* block) const
+  {
+    stream_wseek(zfp->stream, index * array->blkbits);
+    Codec::encode_block_3(zfp, block, array->shape ? array->shape[index] : 0);
+    stream_flush(zfp->stream);
+  }
+};
diff --git a/zfp/array/zfparray.h b/zfp/array/zfparray.h
new file mode 100644
index 0000000000000000000000000000000000000000..5d5e65f6d133b5a96700849583a9d41051003376
--- /dev/null
+++ b/zfp/array/zfparray.h
@@ -0,0 +1,163 @@
+#ifndef ZFP_ARRAY_H
+#define ZFP_ARRAY_H
+
+#include <algorithm>
+#include <climits>
+#include "zfp.h"
+#include "zfp/memory.h"
+
+namespace zfp {
+
+// abstract base class for compressed array of scalars
+class array {
+protected:
+  // default constructor
+  array() :
+    dims(0), type(zfp_type_none),
+    nx(0), ny(0), nz(0),
+    bx(0), by(0), bz(0),
+    blocks(0), blkbits(0),
+    bytes(0), data(0),
+    zfp(0),
+    shape(0)
+  {}
+
+  // generic array with 'dims' dimensions and scalar type 'type'
+  array(uint dims, zfp_type type) :
+    dims(dims), type(type),
+    nx(0), ny(0), nz(0),
+    bx(0), by(0), bz(0),
+    blocks(0), blkbits(0),
+    bytes(0), data(0),
+    zfp(zfp_stream_open(0)),
+    shape(0)
+  {}
+
+  // copy constructor--performs a deep copy
+  array(const array& a) :
+    data(0),
+    zfp(0),
+    shape(0)
+  {
+    deep_copy(a);
+  }
+
+  // protected destructor (cannot delete array through base class pointer)
+  ~array()
+  {
+    free();
+    zfp_stream_close(zfp);
+  }
+
+  // assignment operator--performs a deep copy
+  array& operator=(const array& a)
+  {
+    deep_copy(a);
+    return *this;
+  }
+
+public:
+  // rate in bits per value
+  double rate() const { return double(blkbits) / block_size(); }
+
+  // set compression rate in bits per value
+  double set_rate(double rate)
+  {
+    rate = zfp_stream_set_rate(zfp, rate, type, dims, 1);
+    blkbits = zfp->maxbits;
+    alloc();
+    return rate;
+  }
+
+  // empty cache without compressing modified cached blocks
+  virtual void clear_cache() const = 0;
+
+  // flush cache by compressing all modified cached blocks
+  virtual void flush_cache() const = 0;
+
+  // number of bytes of compressed data
+  size_t compressed_size() const { return bytes; }
+
+  // pointer to compressed data for read or write access
+  uchar* compressed_data() const
+  {
+    // first write back any modified cached data
+    flush_cache();
+    return data;
+  }
+
+protected:
+  // number of values per block
+  uint block_size() const { return 1u << (2 * dims); }
+
+  // allocate memory for compressed data
+  void alloc(bool clear = true)
+  {
+    bytes = blocks * blkbits / CHAR_BIT;
+    reallocate(data, bytes, 0x100u);
+    if (clear)
+      std::fill(data, data + bytes, 0);
+    stream_close(zfp->stream);
+    zfp_stream_set_bit_stream(zfp, stream_open(data, bytes));
+    clear_cache();
+  }
+
+  // free memory associated with compressed data
+  void free()
+  {
+    nx = ny = nz = 0;
+    bx = by = bz = 0;
+    blocks = 0;
+    stream_close(zfp->stream);
+    zfp_stream_set_bit_stream(zfp, 0);
+    bytes = 0;
+    deallocate(data);
+    data = 0;
+    deallocate(shape);
+    shape = 0;
+  }
+
+  // perform a deep copy
+  void deep_copy(const array& a)
+  {
+    // copy metadata
+    dims = a.dims;
+    type = a.type;
+    nx = a.nx;
+    ny = a.ny;
+    nz = a.nz;
+    bx = a.bx;
+    by = a.by;
+    bz = a.bz;
+    blocks = a.blocks;
+    blkbits = a.blkbits;
+    bytes = a.bytes;
+
+    // copy dynamically allocated data
+    clone(data, a.data, bytes, 0x100u);
+    if (zfp) {
+      if (zfp->stream)
+        stream_close(zfp->stream);
+      zfp_stream_close(zfp);
+    }
+    zfp = zfp_stream_open(0);
+    *zfp = *a.zfp;
+    zfp_stream_set_bit_stream(zfp, stream_open(data, bytes));
+    clone(shape, a.shape, blocks);
+  }
+
+  uint dims;           // array dimensionality (1, 2, or 3)
+  zfp_type type;       // scalar type
+  uint nx, ny, nz;     // array dimensions
+  uint bx, by, bz;     // array dimensions in number of blocks
+  uint blocks;         // number of blocks
+  size_t blkbits;      // number of bits per compressed block
+  size_t bytes;        // total bytes of compressed data
+  mutable uchar* data; // pointer to compressed data
+  zfp_stream* zfp;     // compressed stream of blocks
+  uchar* shape;        // precomputed block dimensions (or null if uniform)
+};
+
+}
+
+#endif
diff --git a/zfp/array/zfparray1.h b/zfp/array/zfparray1.h
new file mode 100644
index 0000000000000000000000000000000000000000..7949d83bcb3db6d560a0d177e97f3750d1ce35fb
--- /dev/null
+++ b/zfp/array/zfparray1.h
@@ -0,0 +1,286 @@
+#ifndef ZFP_ARRAY1_H
+#define ZFP_ARRAY1_H
+
+#include <cstddef>
+#include <iterator>
+#include "zfparray.h"
+#include "zfpcodec.h"
+#include "zfp/cache.h"
+
+namespace zfp {
+
+// compressed 1D array of scalars
+template < typename Scalar, class Codec = zfp::codec<Scalar> >
+class array1 : public array {
+public:
+  // forward declarations
+  class reference;
+  class pointer;
+  class iterator;
+  class view;
+  #include "zfp/reference1.h"
+  #include "zfp/pointer1.h"
+  #include "zfp/iterator1.h"
+  #include "zfp/view1.h"
+
+  // default constructor
+  array1() : array(1, Codec::type) {}
+
+  // constructor of n-sample array using rate bits per value, at least
+  // csize bytes of cache, and optionally initialized from flat array p
+  array1(uint n, double rate, const Scalar* p = 0, size_t csize = 0) :
+    array(1, Codec::type),
+    cache(lines(csize, n))
+  {
+    set_rate(rate);
+    resize(n, p == 0);
+    if (p)
+      set(p);
+  }
+
+  // copy constructor--performs a deep copy
+  array1(const array1& a)
+  {
+    deep_copy(a);
+  }
+
+  // construction from view--perform deep copy of (sub)array
+  template <class View>
+  array1(const View& v) :
+    array(1, Codec::type),
+    cache(lines(0, v.size_x()))
+  {
+    set_rate(v.rate());
+    resize(v.size_x(), true);
+    // initialize array in its preferred order
+    for (iterator it = begin(); it != end(); ++it)
+      *it = v(it.i());
+  }
+
+  // virtual destructor
+  virtual ~array1() {}
+
+  // assignment operator--performs a deep copy
+  array1& operator=(const array1& a)
+  {
+    if (this != &a)
+      deep_copy(a);
+    return *this;
+  }
+
+  // total number of elements in array
+  size_t size() const { return size_t(nx); }
+
+  // array dimensions
+  uint size_x() const { return nx; }
+
+  // resize the array (all previously stored data will be lost)
+  void resize(uint n, bool clear = true)
+  {
+    if (n == 0)
+      free();
+    else {
+      nx = n;
+      bx = (nx + 3) / 4;
+      blocks = bx;
+      alloc(clear);
+
+      // precompute block dimensions
+      deallocate(shape);
+      if (nx & 3u) {
+        shape = (uchar*)allocate(blocks);
+        uchar* p = shape;
+        for (uint i = 0; i < bx; i++)
+          *p++ = (i == bx - 1 ? -nx & 3u : 0);
+      }
+      else
+        shape = 0;
+    }
+  }
+
+  // cache size in number of bytes
+  size_t cache_size() const { return cache.size() * sizeof(CacheLine); }
+
+  // set minimum cache size in bytes (array dimensions must be known)
+  void set_cache_size(size_t csize)
+  {
+    flush_cache();
+    cache.resize(lines(csize, nx));
+  }
+
+  // empty cache without compressing modified cached blocks
+  void clear_cache() const { cache.clear(); }
+
+  // flush cache by compressing all modified cached blocks
+  void flush_cache() const
+  {
+    for (typename Cache<CacheLine>::const_iterator p = cache.first(); p; p++) {
+      if (p->tag.dirty()) {
+        uint b = p->tag.index() - 1;
+        encode(b, p->line->data());
+      }
+      cache.flush(p->line);
+    }
+  }
+
+  // decompress array and store at p
+  void get(Scalar* p) const
+  {
+    uint b = 0;
+    for (uint i = 0; i < bx; i++, p += 4, b++) {
+      const CacheLine* line = cache.lookup(b + 1);
+      if (line)
+        line->get(p, 1, shape ? shape[b] : 0);
+      else
+        decode(b, p, 1);
+    }
+  }
+
+  // initialize array by copying and compressing data stored at p
+  void set(const Scalar* p)
+  {
+    uint b = 0;
+    for (uint i = 0; i < bx; i++, b++, p += 4)
+      encode(b, p, 1);
+    cache.clear();
+  }
+
+  // (i) accessors
+  Scalar operator()(uint i) const { return get(i); }
+  reference operator()(uint i) { return reference(this, i); }
+
+  // flat index accessors
+  Scalar operator[](uint index) const { return get(index); }
+  reference operator[](uint index) { return reference(this, index); }
+
+  // random access iterators
+  iterator begin() { return iterator(this, 0); }
+  iterator end() { return iterator(this, nx); }
+
+protected:
+  // cache line representing one block of decompressed values
+  class CacheLine {
+  public:
+    Scalar operator()(uint i) const { return a[index(i)]; }
+    Scalar& operator()(uint i) { return a[index(i)]; }
+    const Scalar* data() const { return a; }
+    Scalar* data() { return a; }
+    // copy cache line
+    void get(Scalar* p, int sx) const
+    {
+      const Scalar* q = a;
+      for (uint x = 0; x < 4; x++, p += sx, q++)
+        *p = *q;
+    }
+    void get(Scalar* p, int sx, uint shape) const
+    {
+      if (!shape)
+        get(p, sx);
+      else {
+        // determine block dimensions
+        uint nx = 4 - (shape & 3u); shape >>= 2;
+        const Scalar* q = a;
+        for (uint x = 0; x < nx; x++, p += sx, q++)
+          *p = *q;
+      }
+    }
+  protected:
+    static uint index(uint i) { return i & 3u; }
+    Scalar a[4];
+  };
+
+  // perform a deep copy
+  void deep_copy(const array1& a)
+  {
+    // copy base class members
+    array::deep_copy(a);
+    // copy cache
+    cache = a.cache;
+  }
+
+  // inspector
+  Scalar get(uint i) const
+  {
+    const CacheLine* p = line(i, false);
+    return (*p)(i);
+  }
+
+  // mutator
+  void set(uint i, Scalar val)
+  {
+    CacheLine* p = line(i, true);
+    (*p)(i) = val;
+  }
+
+  // in-place updates
+  void add(uint i, Scalar val) { (*line(i, true))(i) += val; }
+  void sub(uint i, Scalar val) { (*line(i, true))(i) -= val; }
+  void mul(uint i, Scalar val) { (*line(i, true))(i) *= val; }
+  void div(uint i, Scalar val) { (*line(i, true))(i) /= val; }
+
+  // return cache line for i; may require write-back and fetch
+  CacheLine* line(uint i, bool write) const
+  {
+    CacheLine* p = 0;
+    uint b = block(i);
+    typename Cache<CacheLine>::Tag t = cache.access(p, b + 1, write);
+    uint c = t.index() - 1;
+    if (c != b) {
+      // write back occupied cache line if it is dirty
+      if (t.dirty())
+        encode(c, p->data());
+      // fetch cache line
+      decode(b, p->data());
+    }
+    return p;
+  }
+
+  // encode block with given index
+  void encode(uint index, const Scalar* block) const
+  {
+    stream_wseek(zfp->stream, index * blkbits);
+    Codec::encode_block_1(zfp, block, shape ? shape[index] : 0);
+    stream_flush(zfp->stream);
+  }
+
+  // encode block with given index from strided array
+  void encode(uint index, const Scalar* p, int sx) const
+  {
+    stream_wseek(zfp->stream, index * blkbits);
+    Codec::encode_block_strided_1(zfp, p, shape ? shape[index] : 0, sx);
+    stream_flush(zfp->stream);
+  }
+
+  // decode block with given index
+  void decode(uint index, Scalar* block) const
+  {
+    stream_rseek(zfp->stream, index * blkbits);
+    Codec::decode_block_1(zfp, block, shape ? shape[index] : 0);
+  }
+
+  // decode block with given index to strided array
+  void decode(uint index, Scalar* p, int sx) const
+  {
+    stream_rseek(zfp->stream, index * blkbits);
+    Codec::decode_block_strided_1(zfp, p, shape ? shape[index] : 0, sx);
+  }
+
+  // block index for i
+  static uint block(uint i) { return i / 4; }
+
+  // number of cache lines corresponding to size (or suggested size if zero)
+  static uint lines(size_t size, uint n)
+  {
+    n = uint(((size ? size : 8 * sizeof(Scalar)) + sizeof(CacheLine) - 1) / sizeof(CacheLine));
+    return std::max(n, 1u);
+  }
+
+  mutable Cache<CacheLine> cache; // cache of decompressed blocks
+};
+
+typedef array1<float> array1f;
+typedef array1<double> array1d;
+
+}
+
+#endif
diff --git a/zfp/array/zfparray2.h b/zfp/array/zfparray2.h
new file mode 100644
index 0000000000000000000000000000000000000000..152b06698a1380c7f3d67fc2e7a249f01e32aecd
--- /dev/null
+++ b/zfp/array/zfparray2.h
@@ -0,0 +1,313 @@
+#ifndef ZFP_ARRAY2_H
+#define ZFP_ARRAY2_H
+
+#include <cstddef>
+#include <iterator>
+#include "zfparray.h"
+#include "zfpcodec.h"
+#include "zfp/cache.h"
+
+namespace zfp {
+
+// compressed 2D array of scalars
+template < typename Scalar, class Codec = zfp::codec<Scalar> >
+class array2 : public array {
+public:
+  // forward declarations
+  class reference;
+  class pointer;
+  class iterator;
+  class view;
+  #include "zfp/reference2.h"
+  #include "zfp/pointer2.h"
+  #include "zfp/iterator2.h"
+  #include "zfp/view2.h"
+
+  // default constructor
+  array2() : array(2, Codec::type) {}
+
+  // constructor of nx * ny array using rate bits per value, at least
+  // csize bytes of cache, and optionally initialized from flat array p
+  array2(uint nx, uint ny, double rate, const Scalar* p = 0, size_t csize = 0) :
+    array(2, Codec::type),
+    cache(lines(csize, nx, ny))
+  {
+    set_rate(rate);
+    resize(nx, ny, p == 0);
+    if (p)
+      set(p);
+  }
+
+  // copy constructor--performs a deep copy
+  array2(const array2& a)
+  {
+    deep_copy(a);
+  }
+
+  // construction from view--performs a deep copy of (sub)array
+  template <class View>
+  array2(const View& v) :
+    array(2, Codec::type),
+    cache(lines(0, v.size_x(), v.size_y()))
+  {
+    set_rate(v.rate());
+    resize(v.size_x(), v.size_y(), true);
+    // initialize array in its preferred order
+    for (iterator it = begin(); it != end(); ++it)
+      *it = v(it.i(), it.j());
+  }
+
+  // virtual destructor
+  virtual ~array2() {}
+
+  // assignment operator--performs a deep copy
+  array2& operator=(const array2& a)
+  {
+    if (this != &a)
+      deep_copy(a);
+    return *this;
+  }
+
+  // total number of elements in array
+  size_t size() const { return size_t(nx) * size_t(ny); }
+
+  // array dimensions
+  uint size_x() const { return nx; }
+  uint size_y() const { return ny; }
+
+  // resize the array (all previously stored data will be lost)
+  void resize(uint nx, uint ny, bool clear = true)
+  {
+    if (nx == 0 || ny == 0)
+      free();
+    else {
+      this->nx = nx;
+      this->ny = ny;
+      bx = (nx + 3) / 4;
+      by = (ny + 3) / 4;
+      blocks = bx * by;
+      alloc(clear);
+
+      // precompute block dimensions
+      deallocate(shape);
+      if ((nx | ny) & 3u) {
+        shape = (uchar*)allocate(blocks);
+        uchar* p = shape;
+        for (uint j = 0; j < by; j++)
+          for (uint i = 0; i < bx; i++)
+            *p++ = (i == bx - 1 ? -nx & 3u : 0) + 4 * (j == by - 1 ? -ny & 3u : 0);
+      }
+      else
+        shape = 0;
+    }
+  }
+
+  // cache size in number of bytes
+  size_t cache_size() const { return cache.size() * sizeof(CacheLine); }
+
+  // set minimum cache size in bytes (array dimensions must be known)
+  void set_cache_size(size_t csize)
+  {
+    flush_cache();
+    cache.resize(lines(csize, nx, ny));
+  }
+
+  // empty cache without compressing modified cached blocks
+  void clear_cache() const { cache.clear(); }
+
+  // flush cache by compressing all modified cached blocks
+  void flush_cache() const
+  {
+    for (typename Cache<CacheLine>::const_iterator p = cache.first(); p; p++) {
+      if (p->tag.dirty()) {
+        uint b = p->tag.index() - 1;
+        encode(b, p->line->data());
+      }
+      cache.flush(p->line);
+    }
+  }
+
+  // decompress array and store at p
+  void get(Scalar* p) const
+  {
+    uint b = 0;
+    for (uint j = 0; j < by; j++, p += 4 * (nx - bx))
+      for (uint i = 0; i < bx; i++, p += 4, b++) {
+        const CacheLine* line = cache.lookup(b + 1);
+        if (line)
+          line->get(p, 1, nx, shape ? shape[b] : 0);
+        else
+          decode(b, p, 1, nx);
+      }
+  }
+
+  // initialize array by copying and compressing data stored at p
+  void set(const Scalar* p)
+  {
+    uint b = 0;
+    for (uint j = 0; j < by; j++, p += 4 * (nx - bx))
+      for (uint i = 0; i < bx; i++, p += 4, b++)
+        encode(b, p, 1, nx);
+    cache.clear();
+  }
+
+  // (i, j) accessors
+  Scalar operator()(uint i, uint j) const { return get(i, j); }
+  reference operator()(uint i, uint j) { return reference(this, i, j); }
+
+  // flat index accessors
+  Scalar operator[](uint index) const
+  {
+    uint i, j;
+    ij(i, j, index);
+    return get(i, j);
+  }
+  reference operator[](uint index)
+  {
+    uint i, j;
+    ij(i, j, index);
+    return reference(this, i, j);
+  }
+
+  // sequential iterators
+  iterator begin() { return iterator(this, 0, 0); }
+  iterator end() { return iterator(this, 0, ny); }
+
+protected:
+  // cache line representing one block of decompressed values
+  class CacheLine {
+  public:
+    Scalar operator()(uint i, uint j) const { return a[index(i, j)]; }
+    Scalar& operator()(uint i, uint j) { return a[index(i, j)]; }
+    const Scalar* data() const { return a; }
+    Scalar* data() { return a; }
+    // copy cache line
+    void get(Scalar* p, int sx, int sy) const
+    {
+      const Scalar* q = a;
+      for (uint y = 0; y < 4; y++, p += sy - 4 * sx)
+        for (uint x = 0; x < 4; x++, p += sx, q++)
+          *p = *q;
+    }
+    void get(Scalar* p, int sx, int sy, uint shape) const
+    {
+      if (!shape)
+        get(p, sx, sy);
+      else {
+        // determine block dimensions
+        uint nx = 4 - (shape & 3u); shape >>= 2;
+        uint ny = 4 - (shape & 3u); shape >>= 2;
+        const Scalar* q = a;
+        for (uint y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx)
+          for (uint x = 0; x < nx; x++, p += sx, q++)
+            *p = *q;
+      }
+    }
+  protected:
+    static uint index(uint i, uint j) { return (i & 3u) + 4 * (j & 3u); }
+    Scalar a[16];
+  };
+
+  // perform a deep copy
+  void deep_copy(const array2& a)
+  {
+    // copy base class members
+    array::deep_copy(a);
+    // copy cache
+    cache = a.cache;
+  }
+
+  // inspector
+  Scalar get(uint i, uint j) const
+  {
+    const CacheLine* p = line(i, j, false);
+    return (*p)(i, j);
+  }
+
+  // mutator
+  void set(uint i, uint j, Scalar val)
+  {
+    CacheLine* p = line(i, j, true);
+    (*p)(i, j) = val;
+  }
+
+  // in-place updates
+  void add(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) += val; }
+  void sub(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) -= val; }
+  void mul(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) *= val; }
+  void div(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) /= val; }
+
+  // return cache line for (i, j); may require write-back and fetch
+  CacheLine* line(uint i, uint j, bool write) const
+  {
+    CacheLine* p = 0;
+    uint b = block(i, j);
+    typename Cache<CacheLine>::Tag t = cache.access(p, b + 1, write);
+    uint c = t.index() - 1;
+    if (c != b) {
+      // write back occupied cache line if it is dirty
+      if (t.dirty())
+        encode(c, p->data());
+      // fetch cache line
+      decode(b, p->data());
+    }
+    return p;
+  }
+
+  // encode block with given index
+  void encode(uint index, const Scalar* block) const
+  {
+    stream_wseek(zfp->stream, index * blkbits);
+    Codec::encode_block_2(zfp, block, shape ? shape[index] : 0);
+    stream_flush(zfp->stream);
+  }
+
+  // encode block with given index from strided array
+  void encode(uint index, const Scalar* p, int sx, int sy) const
+  {
+    stream_wseek(zfp->stream, index * blkbits);
+    Codec::encode_block_strided_2(zfp, p, shape ? shape[index] : 0, sx, sy);
+    stream_flush(zfp->stream);
+  }
+
+  // decode block with given index
+  void decode(uint index, Scalar* block) const
+  {
+    stream_rseek(zfp->stream, index * blkbits);
+    Codec::decode_block_2(zfp, block, shape ? shape[index] : 0);
+  }
+
+  // decode block with given index to strided array
+  void decode(uint index, Scalar* p, int sx, int sy) const
+  {
+    stream_rseek(zfp->stream, index * blkbits);
+    Codec::decode_block_strided_2(zfp, p, shape ? shape[index] : 0, sx, sy);
+  }
+
+  // block index for (i, j)
+  uint block(uint i, uint j) const { return (i / 4) + bx * (j / 4); }
+
+  // convert flat index to (i, j)
+  void ij(uint& i, uint& j, uint index) const
+  {
+    i = index % nx;
+    index /= nx;
+    j = index;
+  }
+
+  // number of cache lines corresponding to size (or suggested size if zero)
+  static uint lines(size_t size, uint nx, uint ny)
+  {
+    uint n = uint(((size ? size : 8 * nx * sizeof(Scalar)) + sizeof(CacheLine) - 1) / sizeof(CacheLine));
+    return std::max(n, 1u);
+  }
+
+  mutable Cache<CacheLine> cache; // cache of decompressed blocks
+};
+
+typedef array2<float> array2f;
+typedef array2<double> array2d;
+
+}
+
+#endif
diff --git a/zfp/array/zfparray3.h b/zfp/array/zfparray3.h
new file mode 100644
index 0000000000000000000000000000000000000000..c4fd7614a6fb24b884b4f4dff9b47b78c548b556
--- /dev/null
+++ b/zfp/array/zfparray3.h
@@ -0,0 +1,327 @@
+#ifndef ZFP_ARRAY3_H
+#define ZFP_ARRAY3_H
+
+#include <cstddef>
+#include <iterator>
+#include "zfparray.h"
+#include "zfpcodec.h"
+#include "zfp/cache.h"
+
+namespace zfp {
+
+// compressed 3D array of scalars
+template < typename Scalar, class Codec = zfp::codec<Scalar> >
+class array3 : public array {
+public:
+  // forward declarations
+  class reference;
+  class pointer;
+  class iterator;
+  class view;
+  #include "zfp/reference3.h"
+  #include "zfp/pointer3.h"
+  #include "zfp/iterator3.h"
+  #include "zfp/view3.h"
+
+  // default constructor
+  array3() : array(3, Codec::type) {}
+
+  // constructor of nx * ny * nz array using rate bits per value, at least
+  // csize bytes of cache, and optionally initialized from flat array p
+  array3(uint nx, uint ny, uint nz, double rate, const Scalar* p = 0, size_t csize = 0) :
+    array(3, Codec::type),
+    cache(lines(csize, nx, ny, nz))
+  {
+    set_rate(rate);
+    resize(nx, ny, nz, p == 0);
+    if (p)
+      set(p);
+  }
+
+  // copy constructor--performs a deep copy
+  array3(const array3& a)
+  {
+    deep_copy(a);
+  }
+
+  // construction from view--performs a deep copy of (sub)array
+  template <class View>
+  array3(const View& v) :
+    array(3, Codec::type),
+    cache(lines(0, v.size_x(), v.size_y(), v.size_z()))
+  {
+    set_rate(v.rate());
+    resize(v.size_x(), v.size_y(), v.size_z(), true);
+    // initialize array in its preferred order
+    for (iterator it = begin(); it != end(); ++it)
+      *it = v(it.i(), it.j(), it.k());
+  }
+
+  // virtual destructor
+  virtual ~array3() {}
+
+  // assignment operator--performs a deep copy
+  array3& operator=(const array3& a)
+  {
+    if (this != &a)
+      deep_copy(a);
+    return *this;
+  }
+
+  // total number of elements in array
+  size_t size() const { return size_t(nx) * size_t(ny) * size_t(nz); }
+
+  // array dimensions
+  uint size_x() const { return nx; }
+  uint size_y() const { return ny; }
+  uint size_z() const { return nz; }
+
+  // resize the array (all previously stored data will be lost)
+  void resize(uint nx, uint ny, uint nz, bool clear = true)
+  {
+    if (nx == 0 || ny == 0 || nz == 0)
+      free();
+    else {
+      this->nx = nx;
+      this->ny = ny;
+      this->nz = nz;
+      bx = (nx + 3) / 4;
+      by = (ny + 3) / 4;
+      bz = (nz + 3) / 4;
+      blocks = bx * by * bz;
+      alloc(clear);
+
+      // precompute block dimensions
+      deallocate(shape);
+      if ((nx | ny | nz) & 3u) {
+        shape = (uchar*)allocate(blocks);
+        uchar* p = shape;
+        for (uint k = 0; k < bz; k++)
+          for (uint j = 0; j < by; j++)
+            for (uint i = 0; i < bx; i++)
+              *p++ = (i == bx - 1 ? -nx & 3u : 0) + 4 * ((j == by - 1 ? -ny & 3u : 0) + 4 * (k == bz - 1 ? -nz & 3u : 0));
+      }
+      else
+        shape = 0;
+    }
+  }
+
+  // cache size in number of bytes
+  size_t cache_size() const { return cache.size() * sizeof(CacheLine); }
+
+  // set minimum cache size in bytes (array dimensions must be known)
+  void set_cache_size(size_t csize)
+  {
+    flush_cache();
+    cache.resize(lines(csize, nx, ny, nz));
+  }
+
+  // empty cache without compressing modified cached blocks
+  void clear_cache() const { cache.clear(); }
+
+  // flush cache by compressing all modified cached blocks
+  void flush_cache() const
+  {
+    for (typename Cache<CacheLine>::const_iterator p = cache.first(); p; p++) {
+      if (p->tag.dirty()) {
+        uint b = p->tag.index() - 1;
+        encode(b, p->line->data());
+      }
+      cache.flush(p->line);
+    }
+  }
+
+  // decompress array and store at p
+  void get(Scalar* p) const
+  {
+    uint b = 0;
+    for (uint k = 0; k < bz; k++, p += 4 * nx * (ny - by))
+      for (uint j = 0; j < by; j++, p += 4 * (nx - bx))
+        for (uint i = 0; i < bx; i++, p += 4, b++) {
+          const CacheLine* line = cache.lookup(b + 1);
+          if (line)
+            line->get(p, 1, nx, nx * ny, shape ? shape[b] : 0);
+          else
+            decode(b, p, 1, nx, nx * ny);
+        }
+  }
+
+  // initialize array by copying and compressing data stored at p
+  void set(const Scalar* p)
+  {
+    uint b = 0;
+    for (uint k = 0; k < bz; k++, p += 4 * nx * (ny - by))
+      for (uint j = 0; j < by; j++, p += 4 * (nx - bx))
+        for (uint i = 0; i < bx; i++, p += 4, b++)
+          encode(b, p, 1, nx, nx * ny);
+    cache.clear();
+  }
+
+  // (i, j, k) accessors
+  Scalar operator()(uint i, uint j, uint k) const { return get(i, j, k); }
+  reference operator()(uint i, uint j, uint k) { return reference(this, i, j, k); }
+
+  // flat index corresponding to (i, j, k)
+  uint index(uint i, uint j, uint k) const { return i + nx * (j + ny * k); }
+
+  // flat index accessors
+  Scalar operator[](uint index) const
+  {
+    uint i, j, k;
+    ijk(i, j, k, index);
+    return get(i, j, k);
+  }
+  reference operator[](uint index)
+  {
+    uint i, j, k;
+    ijk(i, j, k, index);
+    return reference(this, i, j, k);
+  }
+
+  // sequential iterators
+  iterator begin() { return iterator(this, 0, 0, 0); }
+  iterator end() { return iterator(this, 0, 0, nz); }
+
+protected:
+  // cache line representing one block of decompressed values
+  class CacheLine {
+  public:
+    Scalar operator()(uint i, uint j, uint k) const { return a[index(i, j, k)]; }
+    Scalar& operator()(uint i, uint j, uint k) { return a[index(i, j, k)]; }
+    const Scalar* data() const { return a; }
+    Scalar* data() { return a; }
+    // copy cache line
+    void get(Scalar* p, int sx, int sy, int sz) const
+    {
+      const Scalar* q = a;
+      for (uint z = 0; z < 4; z++, p += sz - 4 * sy)
+        for (uint y = 0; y < 4; y++, p += sy - 4 * sx)
+          for (uint x = 0; x < 4; x++, p += sx, q++)
+            *p = *q;
+    }
+    void get(Scalar* p, int sx, int sy, int sz, uint shape) const
+    {
+      if (!shape)
+        get(p, sx, sy, sz);
+      else {
+        // determine block dimensions
+        uint nx = 4 - (shape & 3u); shape >>= 2;
+        uint ny = 4 - (shape & 3u); shape >>= 2;
+        uint nz = 4 - (shape & 3u); shape >>= 2;
+        const Scalar* q = a;
+        for (uint z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 16 - 4 * ny)
+          for (uint y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx)
+            for (uint x = 0; x < nx; x++, p += sx, q++)
+              *p = *q;
+      }
+    }
+  protected:
+    static uint index(uint i, uint j, uint k) { return (i & 3u) + 4 * ((j & 3u) + 4 * (k & 3u)); }
+    Scalar a[64];
+  };
+
+  // perform a deep copy
+  void deep_copy(const array3& a)
+  {
+    // copy base class members
+    array::deep_copy(a);
+    // copy cache
+    cache = a.cache;
+  }
+
+  // inspector
+  Scalar get(uint i, uint j, uint k) const
+  {
+    const CacheLine* p = line(i, j, k, false);
+    return (*p)(i, j, k);
+  }
+
+  // mutator
+  void set(uint i, uint j, uint k, Scalar val)
+  {
+    CacheLine* p = line(i, j, k, true);
+    (*p)(i, j, k) = val;
+  }
+
+  // in-place updates
+  void add(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) += val; }
+  void sub(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) -= val; }
+  void mul(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) *= val; }
+  void div(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) /= val; }
+
+  // return cache line for (i, j, k); may require write-back and fetch
+  CacheLine* line(uint i, uint j, uint k, bool write) const
+  {
+    CacheLine* p = 0;
+    uint b = block(i, j, k);
+    typename Cache<CacheLine>::Tag t = cache.access(p, b + 1, write);
+    uint c = t.index() - 1;
+    if (c != b) {
+      // write back occupied cache line if it is dirty
+      if (t.dirty())
+        encode(c, p->data());
+      // fetch cache line
+      decode(b, p->data());
+    }
+    return p;
+  }
+
+  // encode block with given index
+  void encode(uint index, const Scalar* block) const
+  {
+    stream_wseek(zfp->stream, index * blkbits);
+    Codec::encode_block_3(zfp, block, shape ? shape[index] : 0);
+    stream_flush(zfp->stream);
+  }
+
+  // encode block with given index from strided array
+  void encode(uint index, const Scalar* p, int sx, int sy, int sz) const
+  {
+    stream_wseek(zfp->stream, index * blkbits);
+    Codec::encode_block_strided_3(zfp, p, shape ? shape[index] : 0, sx, sy, sz);
+    stream_flush(zfp->stream);
+  }
+
+  // decode block with given index
+  void decode(uint index, Scalar* block) const
+  {
+    stream_rseek(zfp->stream, index * blkbits);
+    Codec::decode_block_3(zfp, block, shape ? shape[index] : 0);
+  }
+
+  // decode block with given index to strided array
+  void decode(uint index, Scalar* p, int sx, int sy, int sz) const
+  {
+    stream_rseek(zfp->stream, index * blkbits);
+    Codec::decode_block_strided_3(zfp, p, shape ? shape[index] : 0, sx, sy, sz);
+  }
+
+  // block index for (i, j, k)
+  uint block(uint i, uint j, uint k) const { return (i / 4) + bx * ((j / 4) + by * (k / 4)); }
+
+  // convert flat index to (i, j, k)
+  void ijk(uint& i, uint& j, uint& k, uint index) const
+  {
+    i = index % nx;
+    index /= nx;
+    j = index % ny;
+    index /= ny;
+    k = index;
+  }
+
+  // number of cache lines corresponding to size (or suggested size if zero)
+  static uint lines(size_t size, uint nx, uint ny, uint nz)
+  {
+    uint n = uint(((size ? size : 8 * nx * ny * sizeof(Scalar)) + sizeof(CacheLine) - 1) / sizeof(CacheLine));
+    return std::max(n, 1u);
+  }
+
+  mutable Cache<CacheLine> cache; // cache of decompressed blocks
+};
+
+typedef array3<float> array3f;
+typedef array3<double> array3d;
+
+}
+
+#endif
diff --git a/zfp/array/zfpcodec.h b/zfp/array/zfpcodec.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d4674444e07f3262943df3c7e543ce14e3d66e0
--- /dev/null
+++ b/zfp/array/zfpcodec.h
@@ -0,0 +1,17 @@
+#ifndef ZFP_CODEC_H
+#define ZFP_CODEC_H
+
+#include "zfp.h"
+
+namespace zfp {
+
+// C++ wrappers around libzfp C functions
+template <typename Scalar>
+struct codec {};
+
+#include "zfpcodecf.h"
+#include "zfpcodecd.h"
+
+}
+
+#endif
diff --git a/zfp/array/zfpcodecd.h b/zfp/array/zfpcodecd.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e7d893234062afb0801573c01f2100c73848183
--- /dev/null
+++ b/zfp/array/zfpcodecd.h
@@ -0,0 +1,149 @@
+// double-precision codec
+template <>
+struct codec<double> {
+  // encode contiguous 1D block
+  static void encode_block_1(zfp_stream* zfp, const double* block, uint shape)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      zfp_encode_partial_block_strided_double_1(zfp, block, nx, 1);
+    }
+    else
+      zfp_encode_block_double_1(zfp, block);
+  }
+
+  // encode 1D block from strided storage
+  static void encode_block_strided_1(zfp_stream* zfp, const double* p, uint shape, int sx)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      zfp_encode_partial_block_strided_double_1(zfp, p, nx, sx);
+    }
+    else
+      zfp_encode_block_strided_double_1(zfp, p, sx);
+  }
+
+  // encode contiguous 2D block
+  static void encode_block_2(zfp_stream* zfp, const double* block, uint shape)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      zfp_encode_partial_block_strided_double_2(zfp, block, nx, ny, 1, 4);
+    }
+    else
+      zfp_encode_block_double_2(zfp, block);
+  }
+
+  // encode 2D block from strided storage
+  static void encode_block_strided_2(zfp_stream* zfp, const double* p, uint shape, int sx, int sy)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      zfp_encode_partial_block_strided_double_2(zfp, p, nx, ny, sx, sy);
+    }
+    else
+      zfp_encode_block_strided_double_2(zfp, p, sx, sy);
+  }
+
+  // encode contiguous 3D block
+  static void encode_block_3(zfp_stream* zfp, const double* block, uint shape)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      uint nz = 4 - (shape & 3u); shape >>= 2;
+      zfp_encode_partial_block_strided_double_3(zfp, block, nx, ny, nz, 1, 4, 16);
+    }
+    else
+      zfp_encode_block_double_3(zfp, block);
+  }
+
+  // encode 3D block from strided storage
+  static void encode_block_strided_3(zfp_stream* zfp, const double* p, uint shape, int sx, int sy, int sz)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      uint nz = 4 - (shape & 3u); shape >>= 2;
+      zfp_encode_partial_block_strided_double_3(zfp, p, nx, ny, nz, sx, sy, sz);
+    }
+    else
+      zfp_encode_block_strided_double_3(zfp, p, sx, sy, sz);
+  }
+
+  // decode contiguous 1D block
+  static void decode_block_1(zfp_stream* zfp, double* block, uint shape)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      zfp_decode_partial_block_strided_double_1(zfp, block, nx, 1);
+    }
+    else
+      zfp_decode_block_double_1(zfp, block);
+  }
+
+  // decode 1D block to strided storage
+  static void decode_block_strided_1(zfp_stream* zfp, double* p, uint shape, int sx)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      zfp_decode_partial_block_strided_double_1(zfp, p, nx, sx);
+    }
+    else
+      zfp_decode_block_strided_double_1(zfp, p, sx);
+  }
+
+  // decode contiguous 2D block
+  static void decode_block_2(zfp_stream* zfp, double* block, uint shape)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      zfp_decode_partial_block_strided_double_2(zfp, block, nx, ny, 1, 4);
+    }
+    else
+      zfp_decode_block_double_2(zfp, block);
+  }
+
+  // decode 2D block to strided storage
+  static void decode_block_strided_2(zfp_stream* zfp, double* p, uint shape, int sx, int sy)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      zfp_decode_partial_block_strided_double_2(zfp, p, nx, ny, sx, sy);
+    }
+    else
+      zfp_decode_block_strided_double_2(zfp, p, sx, sy);
+  }
+
+  // decode contiguous 3D block
+  static void decode_block_3(zfp_stream* zfp, double* block, uint shape)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      uint nz = 4 - (shape & 3u); shape >>= 2;
+      zfp_decode_partial_block_strided_double_3(zfp, block, nx, ny, nz, 1, 4, 16);
+    }
+    else
+      zfp_decode_block_double_3(zfp, block);
+  }
+
+  // decode 3D block to strided storage
+  static void decode_block_strided_3(zfp_stream* zfp, double* p, uint shape, int sx, int sy, int sz)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      uint nz = 4 - (shape & 3u); shape >>= 2;
+      zfp_decode_partial_block_strided_double_3(zfp, p, nx, ny, nz, sx, sy, sz);
+    }
+    else
+      zfp_decode_block_strided_double_3(zfp, p, sx, sy, sz);
+  }
+
+  static const zfp_type type = zfp_type_double;
+};
diff --git a/zfp/array/zfpcodecf.h b/zfp/array/zfpcodecf.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ec74a60990281d545886ad5f49a699d18c14a10
--- /dev/null
+++ b/zfp/array/zfpcodecf.h
@@ -0,0 +1,149 @@
+// single-precision codec
+template <>
+struct codec<float> {
+  // encode contiguous 1D block
+  static void encode_block_1(zfp_stream* zfp, const float* block, uint shape)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      zfp_encode_partial_block_strided_float_1(zfp, block, nx, 1);
+    }
+    else
+      zfp_encode_block_float_1(zfp, block);
+  }
+
+  // encode 1D block from strided storage
+  static void encode_block_strided_1(zfp_stream* zfp, const float* p, uint shape, int sx)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      zfp_encode_partial_block_strided_float_1(zfp, p, nx, sx);
+    }
+    else
+      zfp_encode_block_strided_float_1(zfp, p, sx);
+  }
+
+  // encode contiguous 2D block
+  static void encode_block_2(zfp_stream* zfp, const float* block, uint shape)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      zfp_encode_partial_block_strided_float_2(zfp, block, nx, ny, 1, 4);
+    }
+    else
+      zfp_encode_block_float_2(zfp, block);
+  }
+
+  // encode 2D block from strided storage
+  static void encode_block_strided_2(zfp_stream* zfp, const float* p, uint shape, int sx, int sy)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      zfp_encode_partial_block_strided_float_2(zfp, p, nx, ny, sx, sy);
+    }
+    else
+      zfp_encode_block_strided_float_2(zfp, p, sx, sy);
+  }
+
+  // encode contiguous 3D block
+  static void encode_block_3(zfp_stream* zfp, const float* block, uint shape)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      uint nz = 4 - (shape & 3u); shape >>= 2;
+      zfp_encode_partial_block_strided_float_3(zfp, block, nx, ny, nz, 1, 4, 16);
+    }
+    else
+      zfp_encode_block_float_3(zfp, block);
+  }
+
+  // encode 3D block from strided storage
+  static void encode_block_strided_3(zfp_stream* zfp, const float* p, uint shape, int sx, int sy, int sz)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      uint nz = 4 - (shape & 3u); shape >>= 2;
+      zfp_encode_partial_block_strided_float_3(zfp, p, nx, ny, nz, sx, sy, sz);
+    }
+    else
+      zfp_encode_block_strided_float_3(zfp, p, sx, sy, sz);
+  }
+
+  // decode contiguous 1D block
+  static void decode_block_1(zfp_stream* zfp, float* block, uint shape)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      zfp_decode_partial_block_strided_float_1(zfp, block, nx, 1);
+    }
+    else
+      zfp_decode_block_float_1(zfp, block);
+  }
+
+  // decode 1D block to strided storage
+  static void decode_block_strided_1(zfp_stream* zfp, float* p, uint shape, int sx)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      zfp_decode_partial_block_strided_float_1(zfp, p, nx, sx);
+    }
+    else
+      zfp_decode_block_strided_float_1(zfp, p, sx);
+  }
+
+  // decode contiguous 2D block
+  static void decode_block_2(zfp_stream* zfp, float* block, uint shape)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      zfp_decode_partial_block_strided_float_2(zfp, block, nx, ny, 1, 4);
+    }
+    else
+      zfp_decode_block_float_2(zfp, block);
+  }
+
+  // decode 2D block to strided storage
+  static void decode_block_strided_2(zfp_stream* zfp, float* p, uint shape, int sx, int sy)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      zfp_decode_partial_block_strided_float_2(zfp, p, nx, ny, sx, sy);
+    }
+    else
+      zfp_decode_block_strided_float_2(zfp, p, sx, sy);
+  }
+
+  // decode contiguous 3D block
+  static void decode_block_3(zfp_stream* zfp, float* block, uint shape)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      uint nz = 4 - (shape & 3u); shape >>= 2;
+      zfp_decode_partial_block_strided_float_3(zfp, block, nx, ny, nz, 1, 4, 16);
+    }
+    else
+      zfp_decode_block_float_3(zfp, block);
+  }
+
+  // decode 3D block to strided storage
+  static void decode_block_strided_3(zfp_stream* zfp, float* p, uint shape, int sx, int sy, int sz)
+  {
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      uint nz = 4 - (shape & 3u); shape >>= 2;
+      zfp_decode_partial_block_strided_float_3(zfp, p, nx, ny, nz, sx, sy, sz);
+    }
+    else
+      zfp_decode_block_strided_float_3(zfp, p, sx, sy, sz);
+  }
+
+  static const zfp_type type = zfp_type_float;
+};
diff --git a/zfp/cfp/CMakeLists.txt b/zfp/cfp/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..febd4f0ab6f826fc669a9047b2c86fd7dc8c351d
--- /dev/null
+++ b/zfp/cfp/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(src)
diff --git a/zfp/cfp/include/cfparray1d.h b/zfp/cfp/include/cfparray1d.h
new file mode 100644
index 0000000000000000000000000000000000000000..1be2729566a3f867018fe3f2861b0267f7a47ce8
--- /dev/null
+++ b/zfp/cfp/include/cfparray1d.h
@@ -0,0 +1,37 @@
+#ifndef CFP_ARRAY_1D
+#define CFP_ARRAY_1D
+
+#include <stddef.h>
+#include "zfp/types.h"
+
+struct cfp_array1d;
+typedef struct cfp_array1d cfp_array1d;
+
+typedef struct {
+  cfp_array1d* (*ctor_default)();
+  cfp_array1d* (*ctor)(uint n, double rate, const double* p, size_t csize);
+  cfp_array1d* (*ctor_copy)(const cfp_array1d* src);
+  void (*dtor)(cfp_array1d* self);
+
+  void (*deep_copy)(cfp_array1d* self, const cfp_array1d* src);
+
+  double (*rate)(const cfp_array1d* self);
+  double (*set_rate)(cfp_array1d* self, double rate);
+  size_t (*cache_size)(const cfp_array1d* self);
+  void (*set_cache_size)(cfp_array1d* self, size_t csize);
+  void (*clear_cache)(const cfp_array1d* self);
+  void (*flush_cache)(const cfp_array1d* self);
+  size_t (*compressed_size)(const cfp_array1d* self);
+  uchar* (*compressed_data)(const cfp_array1d* self);
+  size_t (*size)(const cfp_array1d* self);
+  void (*resize)(cfp_array1d* self, uint n, int clear);
+
+  void (*get_array)(const cfp_array1d* self, double* p);
+  void (*set_array)(cfp_array1d* self, const double* p);
+  double (*get_flat)(const cfp_array1d* self, uint i);
+  void (*set_flat)(cfp_array1d* self, uint i, double val);
+  double (*get)(const cfp_array1d* self, uint i);
+  void (*set)(cfp_array1d* self, uint i, double val);
+} cfp_array1d_api;
+
+#endif
diff --git a/zfp/cfp/include/cfparray1f.h b/zfp/cfp/include/cfparray1f.h
new file mode 100644
index 0000000000000000000000000000000000000000..90d52391e8925803f27a67d1a68928589e4abe8f
--- /dev/null
+++ b/zfp/cfp/include/cfparray1f.h
@@ -0,0 +1,37 @@
+#ifndef CFP_ARRAY_1F
+#define CFP_ARRAY_1F
+
+#include <stddef.h>
+#include "zfp/types.h"
+
+struct cfp_array1f;
+typedef struct cfp_array1f cfp_array1f;
+
+typedef struct {
+  cfp_array1f* (*ctor_default)();
+  cfp_array1f* (*ctor)(uint n, double rate, const float* p, size_t csize);
+  cfp_array1f* (*ctor_copy)(const cfp_array1f* src);
+  void (*dtor)(cfp_array1f* self);
+
+  void (*deep_copy)(cfp_array1f* self, const cfp_array1f* src);
+
+  double (*rate)(const cfp_array1f* self);
+  double (*set_rate)(cfp_array1f* self, double rate);
+  size_t (*cache_size)(const cfp_array1f* self);
+  void (*set_cache_size)(cfp_array1f* self, size_t csize);
+  void (*clear_cache)(const cfp_array1f* self);
+  void (*flush_cache)(const cfp_array1f* self);
+  size_t (*compressed_size)(const cfp_array1f* self);
+  uchar* (*compressed_data)(const cfp_array1f* self);
+  size_t (*size)(const cfp_array1f* self);
+  void (*resize)(cfp_array1f* self, uint n, int clear);
+
+  void (*get_array)(const cfp_array1f* self, float* p);
+  void (*set_array)(cfp_array1f* self, const float* p);
+  float (*get_flat)(const cfp_array1f* self, uint i);
+  void (*set_flat)(cfp_array1f* self, uint i, float val);
+  float (*get)(const cfp_array1f* self, uint i);
+  void (*set)(cfp_array1f* self, uint i, float val);
+} cfp_array1f_api;
+
+#endif
diff --git a/zfp/cfp/include/cfparray2d.h b/zfp/cfp/include/cfparray2d.h
new file mode 100644
index 0000000000000000000000000000000000000000..b8d4c2a849d47078b971f00d31333ca23e4f1af1
--- /dev/null
+++ b/zfp/cfp/include/cfparray2d.h
@@ -0,0 +1,39 @@
+#ifndef CFP_ARRAY_2D
+#define CFP_ARRAY_2D
+
+#include <stddef.h>
+#include "zfp/types.h"
+
+struct cfp_array2d;
+typedef struct cfp_array2d cfp_array2d;
+
+typedef struct {
+  cfp_array2d* (*ctor_default)();
+  cfp_array2d* (*ctor)(uint nx, uint ny, double rate, const double* p, size_t csize);
+  cfp_array2d* (*ctor_copy)(const cfp_array2d* src);
+  void (*dtor)(cfp_array2d* self);
+
+  void (*deep_copy)(cfp_array2d* self, const cfp_array2d* src);
+
+  double (*rate)(const cfp_array2d* self);
+  double (*set_rate)(cfp_array2d* self, double rate);
+  size_t (*cache_size)(const cfp_array2d* self);
+  void (*set_cache_size)(cfp_array2d* self, size_t csize);
+  void (*clear_cache)(const cfp_array2d* self);
+  void (*flush_cache)(const cfp_array2d* self);
+  size_t (*compressed_size)(const cfp_array2d* self);
+  uchar* (*compressed_data)(const cfp_array2d* self);
+  size_t (*size)(const cfp_array2d* self);
+  uint (*size_x)(const cfp_array2d* self);
+  uint (*size_y)(const cfp_array2d* self);
+  void (*resize)(cfp_array2d* self, uint nx, uint ny, int clear);
+
+  void (*get_array)(const cfp_array2d* self, double* p);
+  void (*set_array)(cfp_array2d* self, const double* p);
+  double (*get_flat)(const cfp_array2d* self, uint i);
+  void (*set_flat)(cfp_array2d* self, uint i, double val);
+  double (*get)(const cfp_array2d* self, uint i, uint j);
+  void (*set)(cfp_array2d* self, uint i, uint j, double val);
+} cfp_array2d_api;
+
+#endif
diff --git a/zfp/cfp/include/cfparray2f.h b/zfp/cfp/include/cfparray2f.h
new file mode 100644
index 0000000000000000000000000000000000000000..a531ac2403e9347bd6ff200a6e614bf3da325dce
--- /dev/null
+++ b/zfp/cfp/include/cfparray2f.h
@@ -0,0 +1,39 @@
+#ifndef CFP_ARRAY_2F
+#define CFP_ARRAY_2F
+
+#include <stddef.h>
+#include "zfp/types.h"
+
+struct cfp_array2f;
+typedef struct cfp_array2f cfp_array2f;
+
+typedef struct {
+  cfp_array2f* (*ctor_default)();
+  cfp_array2f* (*ctor)(uint nx, uint ny, double rate, const float* p, size_t csize);
+  cfp_array2f* (*ctor_copy)(const cfp_array2f* src);
+  void (*dtor)(cfp_array2f* self);
+
+  void (*deep_copy)(cfp_array2f* self, const cfp_array2f* src);
+
+  double (*rate)(const cfp_array2f* self);
+  double (*set_rate)(cfp_array2f* self, double rate);
+  size_t (*cache_size)(const cfp_array2f* self);
+  void (*set_cache_size)(cfp_array2f* self, size_t csize);
+  void (*clear_cache)(const cfp_array2f* self);
+  void (*flush_cache)(const cfp_array2f* self);
+  size_t (*compressed_size)(const cfp_array2f* self);
+  uchar* (*compressed_data)(const cfp_array2f* self);
+  size_t (*size)(const cfp_array2f* self);
+  uint (*size_x)(const cfp_array2f* self);
+  uint (*size_y)(const cfp_array2f* self);
+  void (*resize)(cfp_array2f* self, uint nx, uint ny, int clear);
+
+  void (*get_array)(const cfp_array2f* self, float* p);
+  void (*set_array)(cfp_array2f* self, const float* p);
+  float (*get_flat)(const cfp_array2f* self, uint i);
+  void (*set_flat)(cfp_array2f* self, uint i, float val);
+  float (*get)(const cfp_array2f* self, uint i, uint j);
+  void (*set)(cfp_array2f* self, uint i, uint j, float val);
+} cfp_array2f_api;
+
+#endif
diff --git a/zfp/cfp/include/cfparray3d.h b/zfp/cfp/include/cfparray3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..8390a61949d2bb4840fe85b63637ee731b6f1b7a
--- /dev/null
+++ b/zfp/cfp/include/cfparray3d.h
@@ -0,0 +1,40 @@
+#ifndef CFP_ARRAY_3D
+#define CFP_ARRAY_3D
+
+#include <stddef.h>
+#include "zfp/types.h"
+
+struct cfp_array3d;
+typedef struct cfp_array3d cfp_array3d;
+
+typedef struct {
+  cfp_array3d* (*ctor_default)();
+  cfp_array3d* (*ctor)(uint nx, uint ny, uint nz, double rate, const double* p, size_t csize);
+  cfp_array3d* (*ctor_copy)(const cfp_array3d* src);
+  void (*dtor)(cfp_array3d* self);
+
+  void (*deep_copy)(cfp_array3d* self, const cfp_array3d* src);
+
+  double (*rate)(const cfp_array3d* self);
+  double (*set_rate)(cfp_array3d* self, double rate);
+  size_t (*cache_size)(const cfp_array3d* self);
+  void (*set_cache_size)(cfp_array3d* self, size_t csize);
+  void (*clear_cache)(const cfp_array3d* self);
+  void (*flush_cache)(const cfp_array3d* self);
+  size_t (*compressed_size)(const cfp_array3d* self);
+  uchar* (*compressed_data)(const cfp_array3d* self);
+  size_t (*size)(const cfp_array3d* self);
+  uint (*size_x)(const cfp_array3d* self);
+  uint (*size_y)(const cfp_array3d* self);
+  uint (*size_z)(const cfp_array3d* self);
+  void (*resize)(cfp_array3d* self, uint nx, uint ny, uint nz, int clear);
+
+  void (*get_array)(const cfp_array3d* self, double* p);
+  void (*set_array)(cfp_array3d* self, const double* p);
+  double (*get_flat)(const cfp_array3d* self, uint i);
+  void (*set_flat)(cfp_array3d* self, uint i, double val);
+  double (*get)(const cfp_array3d* self, uint i, uint j, uint k);
+  void (*set)(cfp_array3d* self, uint i, uint j, uint k, double val);
+} cfp_array3d_api;
+
+#endif
diff --git a/zfp/cfp/include/cfparray3f.h b/zfp/cfp/include/cfparray3f.h
new file mode 100644
index 0000000000000000000000000000000000000000..0261df3132814c9e2645153ee47388341e3b01c6
--- /dev/null
+++ b/zfp/cfp/include/cfparray3f.h
@@ -0,0 +1,40 @@
+#ifndef CFP_ARRAY_3F
+#define CFP_ARRAY_3F
+
+#include <stddef.h>
+#include "zfp/types.h"
+
+struct cfp_array3f;
+typedef struct cfp_array3f cfp_array3f;
+
+typedef struct {
+  cfp_array3f* (*ctor_default)();
+  cfp_array3f* (*ctor)(uint nx, uint ny, uint nz, double rate, const float* p, size_t csize);
+  cfp_array3f* (*ctor_copy)(const cfp_array3f* src);
+  void (*dtor)(cfp_array3f* self);
+
+  void (*deep_copy)(cfp_array3f* self, const cfp_array3f* src);
+
+  double (*rate)(const cfp_array3f* self);
+  double (*set_rate)(cfp_array3f* self, double rate);
+  size_t (*cache_size)(const cfp_array3f* self);
+  void (*set_cache_size)(cfp_array3f* self, size_t csize);
+  void (*clear_cache)(const cfp_array3f* self);
+  void (*flush_cache)(const cfp_array3f* self);
+  size_t (*compressed_size)(const cfp_array3f* self);
+  uchar* (*compressed_data)(const cfp_array3f* self);
+  size_t (*size)(const cfp_array3f* self);
+  uint (*size_x)(const cfp_array3f* self);
+  uint (*size_y)(const cfp_array3f* self);
+  uint (*size_z)(const cfp_array3f* self);
+  void (*resize)(cfp_array3f* self, uint nx, uint ny, uint nz, int clear);
+
+  void (*get_array)(const cfp_array3f* self, float* p);
+  void (*set_array)(cfp_array3f* self, const float* p);
+  float (*get_flat)(const cfp_array3f* self, uint i);
+  void (*set_flat)(cfp_array3f* self, uint i, float val);
+  float (*get)(const cfp_array3f* self, uint i, uint j, uint k);
+  void (*set)(cfp_array3f* self, uint i, uint j, uint k, float val);
+} cfp_array3f_api;
+
+#endif
diff --git a/zfp/cfp/include/cfparrays.h b/zfp/cfp/include/cfparrays.h
new file mode 100644
index 0000000000000000000000000000000000000000..f716d8283eca427e58f34cd72bc178a85cd1e198
--- /dev/null
+++ b/zfp/cfp/include/cfparrays.h
@@ -0,0 +1,28 @@
+#ifndef CFP_ARRAYS
+#define CFP_ARRAYS
+
+#include "cfparray1f.h"
+#include "cfparray1d.h"
+#include "cfparray2f.h"
+#include "cfparray2d.h"
+#include "cfparray3f.h"
+#include "cfparray3d.h"
+
+#include "zfp/system.h"
+
+typedef struct {
+  cfp_array1f_api array1f;
+  cfp_array1d_api array1d;
+  cfp_array2f_api array2f;
+  cfp_array2d_api array2d;
+  cfp_array3f_api array3f;
+  cfp_array3d_api array3d;
+} cfp_api;
+
+#ifndef CFP_NAMESPACE
+  #define CFP_NAMESPACE cfp
+#endif
+
+extern_ const cfp_api CFP_NAMESPACE;
+
+#endif
diff --git a/zfp/cfp/src/CMakeLists.txt b/zfp/cfp/src/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d2e8c680dfb4f658689663799469e0299f63c0be
--- /dev/null
+++ b/zfp/cfp/src/CMakeLists.txt
@@ -0,0 +1,25 @@
+add_library(cfp cfparrays.cpp)
+
+if(DEFINED CFP_NAMESPACE)
+  list(APPEND cfp_public_defs "CFP_NAMESPACE=${CFP_NAMESPACE}")
+endif()
+
+if(WIN32)
+  # define ZFP_SOURCE when compiling libcfp to export symbols to Windows DLL
+  list(APPEND cfp_private_defs ZFP_SOURCE)
+endif()
+
+target_compile_definitions(cfp
+  PUBLIC ${cfp_public_defs}
+  PRIVATE ${cfp_private_defs})
+
+target_include_directories(cfp
+  PUBLIC
+    ${ZFP_SOURCE_DIR}/include
+    ${ZFP_SOURCE_DIR}/cfp/include
+  PRIVATE
+    ${ZFP_SOURCE_DIR}/array
+    ${ZFP_SOURCE_DIR}/src
+)
+
+target_link_libraries(cfp zfp)
diff --git a/zfp/cfp/src/Makefile b/zfp/cfp/src/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..eef12ffc96e4deea4e43de88be54dcaeed012aa7
--- /dev/null
+++ b/zfp/cfp/src/Makefile
@@ -0,0 +1,25 @@
+include ../../Config
+
+CXXFLAGS += -I../../include -I../../src -I../../array
+LIBDIR = ../../lib
+TARGETS = $(LIBDIR)/libcfp.a $(LIBDIR)/libcfp.so
+OBJECTS = cfparrays.o
+
+static: $(LIBDIR)/libcfp.a
+
+shared: $(LIBDIR)/libcfp.so
+
+clean:
+	rm -f $(TARGETS) $(OBJECTS)
+
+$(LIBDIR)/libcfp.a: $(OBJECTS)
+	mkdir -p $(LIBDIR)
+	rm -f $@
+	ar rc $@ $^
+
+$(LIBDIR)/libcfp.so: $(OBJECTS)
+	mkdir -p $(LIBDIR)
+	$(CXX) $(CXXFLAGS) -shared $(SOFLAGS) $^ -o $@
+
+.cpp.o:
+	$(CXX) $(CXXFLAGS) -c $<
diff --git a/zfp/cfp/src/cfparray1_source.cpp b/zfp/cfp/src/cfparray1_source.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bdab414d4b9e7ba61349d577194af47ab57c2487
--- /dev/null
+++ b/zfp/cfp/src/cfparray1_source.cpp
@@ -0,0 +1,23 @@
+static CFP_ARRAY_TYPE *
+_t1(CFP_ARRAY_TYPE, ctor)(uint n, double rate, const ZFP_SCALAR_TYPE * p, size_t csize)
+{
+  return reinterpret_cast<CFP_ARRAY_TYPE *>(new ZFP_ARRAY_TYPE(n, rate, p, csize));
+}
+
+static void
+_t1(CFP_ARRAY_TYPE, resize)(CFP_ARRAY_TYPE * self, uint n, int clear)
+{
+  reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->resize(n, clear);
+}
+
+static ZFP_SCALAR_TYPE
+_t1(CFP_ARRAY_TYPE, get)(const CFP_ARRAY_TYPE * self, uint i)
+{
+  return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->operator()(i);
+}
+
+static void
+_t1(CFP_ARRAY_TYPE, set)(CFP_ARRAY_TYPE * self, uint i, ZFP_SCALAR_TYPE val)
+{
+  reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->operator()(i) = val;
+}
diff --git a/zfp/cfp/src/cfparray1d.cpp b/zfp/cfp/src/cfparray1d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1e71b0d2d745ecb7da11ddc95e71d5f1576de4f5
--- /dev/null
+++ b/zfp/cfp/src/cfparray1d.cpp
@@ -0,0 +1,15 @@
+#include "cfparray1d.h"
+#include "zfparray1.h"
+
+#include "template/template.h"
+
+#define CFP_ARRAY_TYPE cfp_array1d
+#define ZFP_ARRAY_TYPE zfp::array1d
+#define ZFP_SCALAR_TYPE double
+
+#include "cfparray_source.cpp"
+#include "cfparray1_source.cpp"
+
+#undef CFP_ARRAY_TYPE
+#undef ZFP_ARRAY_TYPE
+#undef ZFP_SCALAR_TYPE
diff --git a/zfp/cfp/src/cfparray1f.cpp b/zfp/cfp/src/cfparray1f.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..56ecda58030d1fe528e8dce4a817df0208e1f318
--- /dev/null
+++ b/zfp/cfp/src/cfparray1f.cpp
@@ -0,0 +1,15 @@
+#include "cfparray1f.h"
+#include "zfparray1.h"
+
+#include "template/template.h"
+
+#define CFP_ARRAY_TYPE cfp_array1f
+#define ZFP_ARRAY_TYPE zfp::array1f
+#define ZFP_SCALAR_TYPE float
+
+#include "cfparray_source.cpp"
+#include "cfparray1_source.cpp"
+
+#undef CFP_ARRAY_TYPE
+#undef ZFP_ARRAY_TYPE
+#undef ZFP_SCALAR_TYPE
diff --git a/zfp/cfp/src/cfparray2_source.cpp b/zfp/cfp/src/cfparray2_source.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6135ae40eb70b31521a4a60633fea7f2faaf5800
--- /dev/null
+++ b/zfp/cfp/src/cfparray2_source.cpp
@@ -0,0 +1,35 @@
+static CFP_ARRAY_TYPE *
+_t1(CFP_ARRAY_TYPE, ctor)(uint nx, uint ny, double rate, const ZFP_SCALAR_TYPE * p, size_t csize)
+{
+  return reinterpret_cast<CFP_ARRAY_TYPE *>(new ZFP_ARRAY_TYPE(nx, ny, rate, p, csize));
+}
+
+static uint
+_t1(CFP_ARRAY_TYPE, size_x)(const CFP_ARRAY_TYPE * self)
+{
+  return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->size_x();
+}
+
+static uint
+_t1(CFP_ARRAY_TYPE, size_y)(const CFP_ARRAY_TYPE * self)
+{
+  return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->size_y();
+}
+
+static void
+_t1(CFP_ARRAY_TYPE, resize)(CFP_ARRAY_TYPE * self, uint nx, uint ny, int clear)
+{
+  reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->resize(nx, ny, clear);
+}
+
+static ZFP_SCALAR_TYPE
+_t1(CFP_ARRAY_TYPE, get)(const CFP_ARRAY_TYPE * self, uint i, uint j)
+{
+  return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->operator()(i, j);
+}
+
+static void
+_t1(CFP_ARRAY_TYPE, set)(CFP_ARRAY_TYPE * self, uint i, uint j, ZFP_SCALAR_TYPE val)
+{
+  reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->operator()(i, j) = val;
+}
diff --git a/zfp/cfp/src/cfparray2d.cpp b/zfp/cfp/src/cfparray2d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3debb2b8044dd8bd9bbce4213fcf0fd30c380121
--- /dev/null
+++ b/zfp/cfp/src/cfparray2d.cpp
@@ -0,0 +1,15 @@
+#include "cfparray2d.h"
+#include "zfparray2.h"
+
+#include "template/template.h"
+
+#define CFP_ARRAY_TYPE cfp_array2d
+#define ZFP_ARRAY_TYPE zfp::array2d
+#define ZFP_SCALAR_TYPE double
+
+#include "cfparray_source.cpp"
+#include "cfparray2_source.cpp"
+
+#undef CFP_ARRAY_TYPE
+#undef ZFP_ARRAY_TYPE
+#undef ZFP_SCALAR_TYPE
diff --git a/zfp/cfp/src/cfparray2f.cpp b/zfp/cfp/src/cfparray2f.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..37407cc8a75277b0129c50b71ef36ede82fbc557
--- /dev/null
+++ b/zfp/cfp/src/cfparray2f.cpp
@@ -0,0 +1,15 @@
+#include "cfparray2f.h"
+#include "zfparray2.h"
+
+#include "template/template.h"
+
+#define CFP_ARRAY_TYPE cfp_array2f
+#define ZFP_ARRAY_TYPE zfp::array2f
+#define ZFP_SCALAR_TYPE float
+
+#include "cfparray_source.cpp"
+#include "cfparray2_source.cpp"
+
+#undef CFP_ARRAY_TYPE
+#undef ZFP_ARRAY_TYPE
+#undef ZFP_SCALAR_TYPE
diff --git a/zfp/cfp/src/cfparray3_source.cpp b/zfp/cfp/src/cfparray3_source.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ae2ebf6da29835cd2a348ac4de392ab722bec44f
--- /dev/null
+++ b/zfp/cfp/src/cfparray3_source.cpp
@@ -0,0 +1,41 @@
+static CFP_ARRAY_TYPE *
+_t1(CFP_ARRAY_TYPE, ctor)(uint nx, uint ny, uint nz, double rate, const ZFP_SCALAR_TYPE * p, size_t csize)
+{
+  return reinterpret_cast<CFP_ARRAY_TYPE *>(new ZFP_ARRAY_TYPE(nx, ny, nz, rate, p, csize));
+}
+
+static uint
+_t1(CFP_ARRAY_TYPE, size_x)(const CFP_ARRAY_TYPE * self)
+{
+  return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->size_x();
+}
+
+static uint
+_t1(CFP_ARRAY_TYPE, size_y)(const CFP_ARRAY_TYPE * self)
+{
+  return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->size_y();
+}
+
+static uint
+_t1(CFP_ARRAY_TYPE, size_z)(const CFP_ARRAY_TYPE * self)
+{
+  return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->size_z();
+}
+
+static void
+_t1(CFP_ARRAY_TYPE, resize)(CFP_ARRAY_TYPE * self, uint nx, uint ny, uint nz, int clear)
+{
+  reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->resize(nx, ny, nz, clear);
+}
+
+static ZFP_SCALAR_TYPE
+_t1(CFP_ARRAY_TYPE, get)(const CFP_ARRAY_TYPE * self, uint i, uint j, uint k)
+{
+  return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->operator()(i, j, k);
+}
+
+static void
+_t1(CFP_ARRAY_TYPE, set)(CFP_ARRAY_TYPE * self, uint i, uint j, uint k, ZFP_SCALAR_TYPE val)
+{
+  reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->operator()(i, j, k) = val;
+}
diff --git a/zfp/cfp/src/cfparray3d.cpp b/zfp/cfp/src/cfparray3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fb5cc2e248148639d07186ed65778d62c1e696ef
--- /dev/null
+++ b/zfp/cfp/src/cfparray3d.cpp
@@ -0,0 +1,15 @@
+#include "cfparray3d.h"
+#include "zfparray3.h"
+
+#include "template/template.h"
+
+#define CFP_ARRAY_TYPE cfp_array3d
+#define ZFP_ARRAY_TYPE zfp::array3d
+#define ZFP_SCALAR_TYPE double
+
+#include "cfparray_source.cpp"
+#include "cfparray3_source.cpp"
+
+#undef CFP_ARRAY_TYPE
+#undef ZFP_ARRAY_TYPE
+#undef ZFP_SCALAR_TYPE
diff --git a/zfp/cfp/src/cfparray3f.cpp b/zfp/cfp/src/cfparray3f.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..69331b1c1a0463ff9b2db941ac1a938a6ade5ce8
--- /dev/null
+++ b/zfp/cfp/src/cfparray3f.cpp
@@ -0,0 +1,15 @@
+#include "cfparray3f.h"
+#include "zfparray3.h"
+
+#include "template/template.h"
+
+#define CFP_ARRAY_TYPE cfp_array3f
+#define ZFP_ARRAY_TYPE zfp::array3f
+#define ZFP_SCALAR_TYPE float
+
+#include "cfparray_source.cpp"
+#include "cfparray3_source.cpp"
+
+#undef CFP_ARRAY_TYPE
+#undef ZFP_ARRAY_TYPE
+#undef ZFP_SCALAR_TYPE
diff --git a/zfp/cfp/src/cfparray_source.cpp b/zfp/cfp/src/cfparray_source.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d94e1a4979534f03b0ef278d4b0e2671bc642766
--- /dev/null
+++ b/zfp/cfp/src/cfparray_source.cpp
@@ -0,0 +1,106 @@
+// common constructor, destructor
+static CFP_ARRAY_TYPE *
+_t1(CFP_ARRAY_TYPE, ctor_default)()
+{
+  return reinterpret_cast<CFP_ARRAY_TYPE *>(new ZFP_ARRAY_TYPE());
+}
+
+static CFP_ARRAY_TYPE *
+_t1(CFP_ARRAY_TYPE, ctor_copy)(const CFP_ARRAY_TYPE * src)
+{
+  return reinterpret_cast<CFP_ARRAY_TYPE *>(
+    new ZFP_ARRAY_TYPE(*reinterpret_cast<const ZFP_ARRAY_TYPE *>(src))
+  );
+}
+
+static void
+_t1(CFP_ARRAY_TYPE, dtor)(CFP_ARRAY_TYPE * self)
+{
+  delete reinterpret_cast<ZFP_ARRAY_TYPE *>(self);
+}
+
+// functions defined in zfparray.h (base class)
+static double
+_t1(CFP_ARRAY_TYPE, rate)(const CFP_ARRAY_TYPE * self)
+{
+  return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->rate();
+}
+
+static double
+_t1(CFP_ARRAY_TYPE, set_rate)(CFP_ARRAY_TYPE * self, double rate)
+{
+  return reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->set_rate(rate);
+}
+
+static size_t
+_t1(CFP_ARRAY_TYPE, compressed_size)(const CFP_ARRAY_TYPE * self)
+{
+  return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->compressed_size();
+}
+
+static uchar*
+_t1(CFP_ARRAY_TYPE, compressed_data)(const CFP_ARRAY_TYPE * self)
+{
+  return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->compressed_data();
+}
+
+static void
+_t1(CFP_ARRAY_TYPE, deep_copy)(CFP_ARRAY_TYPE * self, const CFP_ARRAY_TYPE * src)
+{
+  *reinterpret_cast<ZFP_ARRAY_TYPE *>(self) = *reinterpret_cast<const ZFP_ARRAY_TYPE *>(src);
+}
+
+// functions defined in subclasses
+static size_t
+_t1(CFP_ARRAY_TYPE, size)(const CFP_ARRAY_TYPE * self)
+{
+  return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->size();
+}
+
+static size_t
+_t1(CFP_ARRAY_TYPE, cache_size)(const CFP_ARRAY_TYPE * self)
+{
+  return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->cache_size();
+}
+
+static void
+_t1(CFP_ARRAY_TYPE, set_cache_size)(CFP_ARRAY_TYPE * self, size_t csize)
+{
+  reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->set_cache_size(csize);
+}
+
+static void
+_t1(CFP_ARRAY_TYPE, clear_cache)(const CFP_ARRAY_TYPE * self)
+{
+  reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->clear_cache();
+}
+
+static void
+_t1(CFP_ARRAY_TYPE, flush_cache)(const CFP_ARRAY_TYPE * self)
+{
+  reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->flush_cache();
+}
+
+static void
+_t1(CFP_ARRAY_TYPE, get_array)(const CFP_ARRAY_TYPE * self, ZFP_SCALAR_TYPE * p)
+{
+  reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->get(p);
+}
+
+static void
+_t1(CFP_ARRAY_TYPE, set_array)(CFP_ARRAY_TYPE * self, const ZFP_SCALAR_TYPE * p)
+{
+  reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->set(p);
+}
+
+static ZFP_SCALAR_TYPE
+_t1(CFP_ARRAY_TYPE, get_flat)(const CFP_ARRAY_TYPE * self, uint i)
+{
+  return reinterpret_cast<const ZFP_ARRAY_TYPE *>(self)->operator[](i);
+}
+
+static void
+_t1(CFP_ARRAY_TYPE, set_flat)(CFP_ARRAY_TYPE * self, uint i, ZFP_SCALAR_TYPE val)
+{
+  reinterpret_cast<ZFP_ARRAY_TYPE *>(self)->operator[](i) = val;
+}
diff --git a/zfp/cfp/src/cfparrays.cpp b/zfp/cfp/src/cfparrays.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bcd886868a2cd7510664183b3587eeca44bd1591
--- /dev/null
+++ b/zfp/cfp/src/cfparrays.cpp
@@ -0,0 +1,183 @@
+#include "cfparrays.h"
+
+#include "cfparray1f.cpp"
+#include "cfparray1d.cpp"
+#include "cfparray2f.cpp"
+#include "cfparray2d.cpp"
+#include "cfparray3f.cpp"
+#include "cfparray3d.cpp"
+
+export_ const cfp_api CFP_NAMESPACE = {
+  // array1f
+  {
+    cfp_array1f_ctor_default,
+    cfp_array1f_ctor,
+    cfp_array1f_ctor_copy,
+    cfp_array1f_dtor,
+
+    cfp_array1f_deep_copy,
+
+    cfp_array1f_rate,
+    cfp_array1f_set_rate,
+    cfp_array1f_cache_size,
+    cfp_array1f_set_cache_size,
+    cfp_array1f_clear_cache,
+    cfp_array1f_flush_cache,
+    cfp_array1f_compressed_size,
+    cfp_array1f_compressed_data,
+    cfp_array1f_size,
+    cfp_array1f_resize,
+
+    cfp_array1f_get_array,
+    cfp_array1f_set_array,
+    cfp_array1f_get_flat,
+    cfp_array1f_set_flat,
+    cfp_array1f_get,
+    cfp_array1f_set,
+  },
+  // array1d
+  {
+    cfp_array1d_ctor_default,
+    cfp_array1d_ctor,
+    cfp_array1d_ctor_copy,
+    cfp_array1d_dtor,
+
+    cfp_array1d_deep_copy,
+
+    cfp_array1d_rate,
+    cfp_array1d_set_rate,
+    cfp_array1d_cache_size,
+    cfp_array1d_set_cache_size,
+    cfp_array1d_clear_cache,
+    cfp_array1d_flush_cache,
+    cfp_array1d_compressed_size,
+    cfp_array1d_compressed_data,
+    cfp_array1d_size,
+    cfp_array1d_resize,
+
+    cfp_array1d_get_array,
+    cfp_array1d_set_array,
+    cfp_array1d_get_flat,
+    cfp_array1d_set_flat,
+    cfp_array1d_get,
+    cfp_array1d_set,
+  },
+  // array2f
+  {
+    cfp_array2f_ctor_default,
+    cfp_array2f_ctor,
+    cfp_array2f_ctor_copy,
+    cfp_array2f_dtor,
+
+    cfp_array2f_deep_copy,
+
+    cfp_array2f_rate,
+    cfp_array2f_set_rate,
+    cfp_array2f_cache_size,
+    cfp_array2f_set_cache_size,
+    cfp_array2f_clear_cache,
+    cfp_array2f_flush_cache,
+    cfp_array2f_compressed_size,
+    cfp_array2f_compressed_data,
+    cfp_array2f_size,
+    cfp_array2f_size_x,
+    cfp_array2f_size_y,
+    cfp_array2f_resize,
+
+    cfp_array2f_get_array,
+    cfp_array2f_set_array,
+    cfp_array2f_get_flat,
+    cfp_array2f_set_flat,
+    cfp_array2f_get,
+    cfp_array2f_set,
+  },
+  // array2d
+  {
+    cfp_array2d_ctor_default,
+    cfp_array2d_ctor,
+    cfp_array2d_ctor_copy,
+    cfp_array2d_dtor,
+
+    cfp_array2d_deep_copy,
+
+    cfp_array2d_rate,
+    cfp_array2d_set_rate,
+    cfp_array2d_cache_size,
+    cfp_array2d_set_cache_size,
+    cfp_array2d_clear_cache,
+    cfp_array2d_flush_cache,
+    cfp_array2d_compressed_size,
+    cfp_array2d_compressed_data,
+    cfp_array2d_size,
+    cfp_array2d_size_x,
+    cfp_array2d_size_y,
+    cfp_array2d_resize,
+
+    cfp_array2d_get_array,
+    cfp_array2d_set_array,
+    cfp_array2d_get_flat,
+    cfp_array2d_set_flat,
+    cfp_array2d_get,
+    cfp_array2d_set,
+  },
+  // array3f
+  {
+    cfp_array3f_ctor_default,
+    cfp_array3f_ctor,
+    cfp_array3f_ctor_copy,
+    cfp_array3f_dtor,
+
+    cfp_array3f_deep_copy,
+
+    cfp_array3f_rate,
+    cfp_array3f_set_rate,
+    cfp_array3f_cache_size,
+    cfp_array3f_set_cache_size,
+    cfp_array3f_clear_cache,
+    cfp_array3f_flush_cache,
+    cfp_array3f_compressed_size,
+    cfp_array3f_compressed_data,
+    cfp_array3f_size,
+    cfp_array3f_size_x,
+    cfp_array3f_size_y,
+    cfp_array3f_size_z,
+    cfp_array3f_resize,
+
+    cfp_array3f_get_array,
+    cfp_array3f_set_array,
+    cfp_array3f_get_flat,
+    cfp_array3f_set_flat,
+    cfp_array3f_get,
+    cfp_array3f_set,
+  },
+  // array3d
+  {
+    cfp_array3d_ctor_default,
+    cfp_array3d_ctor,
+    cfp_array3d_ctor_copy,
+    cfp_array3d_dtor,
+
+    cfp_array3d_deep_copy,
+
+    cfp_array3d_rate,
+    cfp_array3d_set_rate,
+    cfp_array3d_cache_size,
+    cfp_array3d_set_cache_size,
+    cfp_array3d_clear_cache,
+    cfp_array3d_flush_cache,
+    cfp_array3d_compressed_size,
+    cfp_array3d_compressed_data,
+    cfp_array3d_size,
+    cfp_array3d_size_x,
+    cfp_array3d_size_y,
+    cfp_array3d_size_z,
+    cfp_array3d_resize,
+
+    cfp_array3d_get_array,
+    cfp_array3d_set_array,
+    cfp_array3d_get_flat,
+    cfp_array3d_set_flat,
+    cfp_array3d_get,
+    cfp_array3d_set,
+  },
+};
diff --git a/zfp/examples/CMakeLists.txt b/zfp/examples/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c879beab60b10aeb05e085a3f7a8d33333725c2d
--- /dev/null
+++ b/zfp/examples/CMakeLists.txt
@@ -0,0 +1,35 @@
+add_executable(diffusion diffusion.cpp)
+target_link_libraries(diffusion zfp)
+target_compile_definitions(diffusion PRIVATE ${zfp_defs})
+
+add_executable(diffusionC diffusionC.c)
+target_link_libraries(diffusionC cfp)
+target_compile_definitions(diffusionC PRIVATE ${zfp_defs})
+
+add_executable(inplace inplace.c)
+target_link_libraries(inplace zfp)
+target_compile_definitions(inplace PRIVATE ${zfp_defs})
+
+add_executable(iterator iterator.cpp)
+target_link_libraries(iterator zfp)
+target_compile_definitions(iterator PRIVATE ${zfp_defs})
+
+add_executable(pgm pgm.c)
+target_link_libraries(pgm zfp)
+target_compile_definitions(pgm PRIVATE ${zfp_defs})
+
+add_executable(simple simple.c)
+target_link_libraries(simple zfp)
+target_compile_definitions(simple PRIVATE ${zfp_defs})
+
+add_executable(speed speed.c)
+target_link_libraries(speed zfp)
+target_compile_definitions(speed PRIVATE ${zfp_defs})
+
+if(HAVE_LIBM_MATH)
+  target_link_libraries(diffusion m)
+  target_link_libraries(diffusionC m)
+  target_link_libraries(inplace m)
+  target_link_libraries(pgm m)
+  target_link_libraries(simple m)
+endif()
diff --git a/zfp/examples/Makefile b/zfp/examples/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..bb44b1e17ac76820d7f39b0e12ea1c5aaa11afc0
--- /dev/null
+++ b/zfp/examples/Makefile
@@ -0,0 +1,35 @@
+include ../Config
+
+BINDIR = ../bin
+TARGETS = $(BINDIR)/diffusion\
+	  $(BINDIR)/inplace\
+	  $(BINDIR)/iterator\
+	  $(BINDIR)/pgm\
+	  $(BINDIR)/simple\
+	  $(BINDIR)/speed
+LIBS = -L../lib -lzfp
+CLIBS = $(LIBS) -lm
+CXXLIBS = $(LIBS)
+
+all: $(TARGETS)
+
+$(BINDIR)/diffusion: diffusion.cpp ../lib/$(LIBZFP)
+	$(CXX) $(CXXFLAGS) -I../array diffusion.cpp $(CXXLIBS) -o $@
+
+$(BINDIR)/inplace: inplace.c ../lib/$(LIBZFP)
+	$(CC) $(CFLAGS) inplace.c $(CLIBS) -o $@
+
+$(BINDIR)/iterator: iterator.cpp ../lib/$(LIBZFP)
+	$(CXX) $(CXXFLAGS) -I../array iterator.cpp $(CXXLIBS) -o $@
+
+$(BINDIR)/pgm: pgm.c ../lib/$(LIBZFP)
+	$(CC) $(CFLAGS) pgm.c $(CLIBS) -o $@
+
+$(BINDIR)/simple: simple.c ../lib/$(LIBZFP)
+	$(CC) $(CFLAGS) simple.c $(CLIBS) -o $@
+
+$(BINDIR)/speed: speed.c ../lib/$(LIBZFP)
+	$(CC) $(CFLAGS) speed.c $(CLIBS) -o $@
+
+clean:
+	rm -f $(TARGETS)
diff --git a/zfp/examples/array2d.h b/zfp/examples/array2d.h
new file mode 100644
index 0000000000000000000000000000000000000000..861fa25a9a27d72864b5a763c20a975b8cb104a3
--- /dev/null
+++ b/zfp/examples/array2d.h
@@ -0,0 +1,49 @@
+#ifndef ARRAY2D_H
+#define ARRAY2D_H
+
+#include <climits>
+#include <vector>
+
+typedef unsigned int uint;
+
+// uncompressed 2D double-precision array (for comparison)
+namespace raw {
+class array2d {
+public:
+  array2d() : nx(0), ny(0) {}
+  array2d(uint nx, uint ny, double rate = 0.0, const double* p = 0, size_t csize = 0) : nx(nx), ny(ny), data(nx * ny, 0.0) {}
+  void resize(uint nx, uint ny) { this->nx = nx; this->ny = ny; data.resize(nx * ny, 0.0); }
+  size_t size() const { return data.size(); }
+  size_t size_x() const { return nx; }
+  size_t size_y() const { return ny; }
+  double rate() const { return CHAR_BIT * sizeof(double); }
+  size_t cache_size() const { return 0; }
+  double& operator()(uint x, uint y) { return data[x + nx * y]; }
+  const double& operator()(uint x, uint y) const { return data[x + nx * y]; }
+  double& operator[](uint i) { return data[i]; }
+  const double& operator[](uint i) const { return data[i]; }
+  class iterator {
+  public:
+    double& operator*() const { return array->operator[](index); }
+    iterator& operator++() { index++; return *this; }
+    iterator operator++(int) { iterator p = *this; index++; return p; }
+    bool operator==(const iterator& it) const { return array == it.array && index == it.index; }
+    bool operator!=(const iterator& it) const { return !operator==(it); }
+    uint i() const { return index % array->nx; }
+    uint j() const { return index / array->nx; }
+  protected:
+    friend class array2d;
+    iterator(array2d* array, uint index) : array(array), index(index) {}
+    array2d* array;
+    uint index;
+  };
+  iterator begin() { return iterator(this, 0); }
+  iterator end() { return iterator(this, nx * ny); }
+protected:
+  uint nx;
+  uint ny;
+  std::vector<double> data;
+};
+}
+
+#endif
diff --git a/zfp/examples/diffusion.cpp b/zfp/examples/diffusion.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3512bff428c34891d155cd9f7768369e4e57f137
--- /dev/null
+++ b/zfp/examples/diffusion.cpp
@@ -0,0 +1,281 @@
+// forward Euler finite difference solution to the heat equation on a 2D grid
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include "zfparray2.h"
+#include "array2d.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+// constants used in the solution; all values are derived once from the
+// grid dimensions (nx, ny) and requested number of time steps nt
+class Constants {
+public:
+  Constants(int nx, int ny, int nt) :
+    nx(nx),
+    ny(ny),
+    nt(nt),
+    x0((nx - 1) / 2),
+    y0((ny - 1) / 2),
+    k(0.04),
+    dx(2.0 / (std::max(nx, ny) - 1)),
+    dy(2.0 / (std::max(nx, ny) - 1)),
+    dt(0.5 * (dx * dx + dy * dy) / (8 * k)), // time step sized for forward-Euler stability
+    tfinal(nt ? nt * dt : 1.0),              // nt == 0 selects default final time 1.0
+    pi(3.14159265358979323846)
+  {}
+
+  int nx;        // grid points in x
+  int ny;        // grid points in y
+  int nt;        // number of time steps (0 for default)
+  int x0;        // x location of heat source
+  int y0;        // y location of heat source
+  double k;      // diffusion constant
+  double dx;     // grid spacing in x
+  double dy;     // grid spacing in y
+  double dt;     // time step
+  double tfinal; // minimum time to run solution to
+  double pi;     // 3.141...
+};
+
+// advance solution one time step in parallel; specialized per array type below
+template <class array2d>
+inline void
+time_step_parallel(array2d& u, const Constants& c);
+
+// advance solution in parallel via thread-safe views
+template <>
+inline void
+time_step_parallel(zfp::array2d& u, const Constants& c)
+{
+#ifdef _OPENMP
+  // flush shared cache to ensure cache consistency across threads
+  u.flush_cache();
+  // compute du/dt in parallel
+  zfp::array2d du(c.nx, c.ny, u.rate(), 0, u.cache_size());
+  #pragma omp parallel
+  {
+    // create read-only private view of entire array u
+    zfp::array2d::private_const_view myu(&u);
+    // create read-write private view into rectangular subset of du
+    zfp::array2d::private_view mydu(&du);
+    mydu.partition(omp_get_thread_num(), omp_get_num_threads());
+    // process rectangular region owned by this thread; the boundary of
+    // the global domain is skipped (du stays zero there)
+    for (uint j = 0; j < mydu.size_y(); j++) {
+      int y = mydu.global_y(j);
+      if (1 <= y && y <= c.ny - 2)
+        for (uint i = 0; i < mydu.size_x(); i++) {
+          int x = mydu.global_x(i);
+          if (1 <= x && x <= c.nx - 2) {
+            // centered second differences of u in x and y
+            double uxx = (myu(x - 1, y) - 2 * myu(x, y) + myu(x + 1, y)) / (c.dx * c.dx);
+            double uyy = (myu(x, y - 1) - 2 * myu(x, y) + myu(x, y + 1)) / (c.dy * c.dy);
+            mydu(i, j) = c.dt * c.k * (uxx + uyy);
+          }
+        }
+    }
+    // compress all private cached blocks to shared storage
+    mydu.flush_cache();
+  }
+  // take forward Euler step in serial
+  for (uint i = 0; i < u.size(); i++)
+    u[i] += du[i];
+#endif
+}
+
+// dummy template instantiation; never executed (main() rejects -p without
+// compressed arrays, so the raw::array2d path cannot reach this function)
+template <>
+inline void
+time_step_parallel(raw::array2d& u, const Constants& c)
+{
+}
+
+// advance solution one time step using integer array indices
+template <class array2d>
+inline void
+time_step_indexed(array2d& u, const Constants& c)
+{
+  // compute du/dt over the interior (boundary values of du remain zero)
+  array2d du(c.nx, c.ny, u.rate(), 0, u.cache_size());
+  for (int y = 1; y < c.ny - 1; y++) {
+    for (int x = 1; x < c.nx - 1; x++) {
+      // centered second differences of u in x and y
+      double uxx = (u(x - 1, y) - 2 * u(x, y) + u(x + 1, y)) / (c.dx * c.dx);
+      double uyy = (u(x, y - 1) - 2 * u(x, y) + u(x, y + 1)) / (c.dy * c.dy);
+      du(x, y) = c.dt * c.k * (uxx + uyy);
+    }
+  }
+  // take forward Euler step
+  for (uint i = 0; i < u.size(); i++)
+    u[i] += du[i];
+}
+
+// advance solution one time step using array iterators (block-order for
+// zfp arrays, row-major for raw arrays; result is order-independent)
+template <class array2d>
+inline void
+time_step_iterated(array2d& u, const Constants& c)
+{
+  // compute du/dt over the interior (boundary values of du remain zero)
+  array2d du(c.nx, c.ny, u.rate(), 0, u.cache_size());
+  for (typename array2d::iterator p = du.begin(); p != du.end(); p++) {
+    int x = p.i();
+    int y = p.j();
+    if (1 <= x && x <= c.nx - 2 &&
+        1 <= y && y <= c.ny - 2) {
+      // centered second differences of u in x and y
+      double uxx = (u(x - 1, y) - 2 * u(x, y) + u(x + 1, y)) / (c.dx * c.dx);
+      double uyy = (u(x, y - 1) - 2 * u(x, y) + u(x, y + 1)) / (c.dy * c.dy);
+      *p = c.dt * c.k * (uxx + uyy);
+    }
+  }
+  // take forward Euler step
+  for (typename array2d::iterator p = u.begin(), q = du.begin(); p != u.end(); p++, q++)
+    *p += *q;
+}
+
+// solve heat equation using the time-stepping method selected by the
+// iterator/parallel flags; returns the simulation time actually reached
+template <class array2d>
+inline double
+solve(array2d& u, const Constants& c, bool iterator, bool parallel)
+{
+  // initialize u with point heat source (u is assumed to be zero initialized)
+  u(c.x0, c.y0) = 1;
+
+  // iterate until final time
+  double t;
+  for (t = 0; t < c.tfinal; t += c.dt) {
+    std::cerr << "t=" << std::setprecision(6) << std::fixed << t << std::endl;
+    if (parallel)
+      time_step_parallel(u, c);
+    else if (iterator)
+      time_step_iterated(u, c);
+    else
+      time_step_indexed(u, c);
+  }
+
+  return t;
+}
+
+// compute sum of array values over the interior (boundary excluded)
+template <class array2d>
+inline double
+total(const array2d& u)
+{
+  double s = 0;
+  const int nx = u.size_x();
+  const int ny = u.size_y();
+  for (int y = 1; y < ny - 1; y++)
+    for (int x = 1; x < nx - 1; x++)
+      s += u(x, y);
+  return s;
+}
+
+// compute root mean square error with respect to exact solution over the
+// interior grid points
+template <class array2d>
+inline double
+error(const array2d& u, const Constants& c, double t)
+{
+  double e = 0;
+  for (int y = 1; y < c.ny - 1; y++) {
+    double py = c.dy * (y - c.y0);
+    for (int x = 1; x < c.nx - 1; x++) {
+      double px = c.dx * (x - c.x0);
+      double f = u(x, y);
+      // exact solution: 2D heat kernel centered at (x0, y0), scaled by cell area
+      double g = c.dx * c.dy * std::exp(-(px * px + py * py) / (4 * c.k * t)) / (4 * c.pi * c.k * t);
+      e += (f - g) * (f - g);
+    }
+  }
+  return std::sqrt(e / ((c.nx - 2) * (c.ny - 2)));
+}
+
+// print command-line help; returns EXIT_FAILURE for main() to propagate
+inline int
+usage()
+{
+  std::cerr << "Usage: diffusion [options]" << std::endl;
+  std::cerr << "Options:" << std::endl;
+  std::cerr << "-i : traverse arrays using iterators" << std::endl;
+  std::cerr << "-n <nx> <ny> : number of grid points" << std::endl;
+#ifdef _OPENMP
+  std::cerr << "-p : use multithreading (only with compressed arrays)" << std::endl;
+#endif
+  std::cerr << "-t <nt> : number of time steps" << std::endl;
+  std::cerr << "-r <rate> : use compressed arrays with 'rate' bits/value" << std::endl;
+  std::cerr << "-c <blocks> : use 'blocks' 4x4 blocks of cache" << std::endl;
+  return EXIT_FAILURE;
+}
+
+int main(int argc, char* argv[])
+{
+  // defaults: 100x100 grid, default time horizon, 64 bits/value
+  int nx = 100;
+  int ny = 100;
+  int nt = 0;
+  double rate = 64;
+  bool iterator = false;
+  bool compression = false;
+  bool parallel = false;
+  int cache = 0;
+
+  // parse command-line options; any unrecognized option prints usage
+  for (int i = 1; i < argc; i++)
+    if (std::string(argv[i]) == "-i")
+      iterator = true;
+    else if (std::string(argv[i]) == "-n") {
+      if (++i == argc || sscanf(argv[i], "%i", &nx) != 1 ||
+          ++i == argc || sscanf(argv[i], "%i", &ny) != 1)
+        return usage();
+    }
+#ifdef _OPENMP
+    else if (std::string(argv[i]) == "-p")
+      parallel = true;
+#endif
+    else if (std::string(argv[i]) == "-t") {
+      if (++i == argc || sscanf(argv[i], "%i", &nt) != 1)
+        return usage();
+    }
+    else if (std::string(argv[i]) == "-r") {
+      if (++i == argc || sscanf(argv[i], "%lf", &rate) != 1)
+        return usage();
+      compression = true;
+    }
+    else if (std::string(argv[i]) == "-c") {
+      if (++i == argc || sscanf(argv[i], "%i", &cache) != 1)
+        return usage();
+    }
+    else
+      return usage();
+
+  // -p is only supported with compressed arrays and indexed traversal
+  if (parallel && !compression) {
+    fprintf(stderr, "multithreading requires compressed arrays\n");
+    return EXIT_FAILURE;
+  }
+  if (parallel && iterator) {
+    fprintf(stderr, "multithreading does not support iterators\n");
+    return EXIT_FAILURE;
+  }
+
+  Constants c(nx, ny, nt);
+
+  double sum;
+  double err;
+  if (compression) {
+    // solve problem using compressed arrays; cache argument converts
+    // 4x4-block count to bytes
+    zfp::array2d u(nx, ny, rate, 0, cache * 4 * 4 * sizeof(double));
+    rate = u.rate();
+    double t = solve(u, c, iterator, parallel);
+    sum = total(u);
+    err = error(u, c, t);
+  }
+  else {
+    // solve problem using uncompressed arrays
+    raw::array2d u(nx, ny);
+    double t = solve(u, c, iterator, parallel);
+    sum = total(u);
+    err = error(u, c, t);
+  }
+
+  // report achieved rate, interior sum, and RMS error vs. exact solution
+  std::cerr.unsetf(std::ios::fixed);
+  std::cerr << "rate=" << rate << " sum=" << std::fixed << sum << " error=" << std::setprecision(6) << std::scientific << err << std::endl;
+
+  return 0;
+}
diff --git a/zfp/examples/diffusionC.c b/zfp/examples/diffusionC.c
new file mode 100644
index 0000000000000000000000000000000000000000..99a5c3db0de3bfee6726b41c7e8cd769d483aa80
--- /dev/null
+++ b/zfp/examples/diffusionC.c
@@ -0,0 +1,267 @@
+// forward Euler finite difference solution to the heat equation on a 2D grid
+// (ported to C, from diffusion.cpp)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "cfparrays.h"
+#define _ (CFP_NAMESPACE.array2d)
+
+#define MAX(x, y) (((nx) > (ny)) ? (nx) : (ny))
+
+// constants used in the solution (C mirror of the Constants class in
+// diffusion.cpp); filled in by init_constants()
+typedef struct {
+  int nx;        // grid points in x
+  int ny;        // grid points in y
+  int nt;        // number of time steps (0 for default)
+  int x0;        // x location of heat source
+  int y0;        // y location of heat source
+  double k;      // diffusion constant
+  double dx;     // grid spacing in x
+  double dy;     // grid spacing in y
+  double dt;     // time step
+  double tfinal; // minimum time to run solution to
+  double pi;     // 3.141...
+} constants;
+
+// derive all solution constants for an nx * ny grid and nt time steps
+// (nt == 0 selects the default final time of 1.0)
+void
+init_constants(constants* c, int nx, int ny, int nt)
+{
+  c->nx = nx;
+  c->ny = ny;
+  c->nt = nt;
+  c->x0 = (nx - 1) / 2;
+  c->y0 = (ny - 1) / 2;
+  c->k = 0.04;
+  c->dx = 2.0 / (MAX(nx, ny) - 1);
+  c->dy = 2.0 / (MAX(nx, ny) - 1);
+  // time step sized for forward-Euler stability
+  c->dt = 0.5 * (c->dx * c->dx + c->dy * c->dy) / (8 * c->k);
+  c->tfinal = nt ? nt * c->dt : 1.0;
+  c->pi = 3.14159265358979323846;
+}
+
+// advance solution one time step using integer array indices (compressed arrays)
+static void
+time_step_indexed_compressed(cfp_array2d* u, const constants* c)
+{
+  // compute du/dt over the interior (boundary values of du remain zero)
+  cfp_array2d* du = _.ctor(c->nx, c->ny, _.rate(u), 0, _.cache_size(u));
+  int x, y;
+  for (y = 1; y < c->ny - 1; y++) {
+    for (x = 1; x < c->nx - 1; x++) {
+      // centered second differences of u in x and y
+      double uxx = (_.get(u, x - 1, y) - 2 * _.get(u, x, y) + _.get(u, x + 1, y)) / (c->dx * c->dx);
+      double uyy = (_.get(u, x, y - 1) - 2 * _.get(u, x, y) + _.get(u, x, y + 1)) / (c->dy * c->dy);
+      _.set(du, x, y, c->dt * c->k * (uxx + uyy));
+    }
+  }
+  // take forward Euler step
+  uint i;
+  for (i = 0; i < _.size(u); i++) {
+    // u[i] += du[i]
+    double val = _.get_flat(u, i) + _.get_flat(du, i);
+    _.set_flat(u, i, val);
+  }
+
+  _.dtor(du);
+}
+
+// advance solution one time step using integer array indices (plain C array)
+static void
+time_step_indexed(double* u, const constants* c)
+{
+  // compute du/dt; calloc zero-initializes, so boundary values remain zero
+  // (NOTE(review): allocation result is not checked for NULL)
+  double* du = calloc(c->nx * c->ny, sizeof(double));
+  int x, y;
+  for (y = 1; y < c->ny - 1; y++) {
+    for (x = 1; x < c->nx - 1; x++) {
+      // centered second differences of u in x and y (row-major indexing)
+      double uxx = (u[y*c->nx + (x - 1)] - 2 * u[y*c->nx + x] + u[y*c->nx + (x + 1)]) / (c->dx * c->dx);
+      double uyy = (u[(y - 1)*c->nx + x] - 2 * u[y*c->nx + x] + u[(y + 1)*c->nx + x]) / (c->dy * c->dy);
+      du[y*c->nx + x] = c->dt * c->k * (uxx + uyy);
+    }
+  }
+  // take forward Euler step
+  uint i;
+  for (i = 0; i < (c->nx * c->ny); i++) {
+    // u[i] += du[i]
+    u[i] += du[i];
+  }
+
+  free(du);
+}
+
+// solve heat equation on a compressed array using indexed time stepping;
+// returns the simulation time actually reached
+static double
+solve_compressed(cfp_array2d* u, const constants* c)
+{
+  // initialize u with point heat source (u is assumed to be zero initialized)
+  _.set(u, c->x0, c->y0, 1);
+
+  // iterate until final time
+  double t;
+  for (t = 0; t < c->tfinal; t += c->dt) {
+    fprintf(stderr, "t=%lf\n", t);
+    time_step_indexed_compressed(u, c);
+  }
+
+  return t;
+}
+
+// solve heat equation on a plain C array using indexed time stepping;
+// returns the simulation time actually reached
+static double
+solve(double* u, const constants* c)
+{
+  // initialize u with point heat source (u is assumed to be zero initialized)
+  u[c->y0*c->nx + c->x0] = 1;
+
+  // iterate until final time
+  double t;
+  for (t = 0; t < c->tfinal; t += c->dt) {
+    fprintf(stderr, "t=%lf\n", t);
+    time_step_indexed(u, c);
+  }
+
+  return t;
+}
+
+// compute sum of array values over the interior (boundary excluded)
+static double
+total_compressed(const cfp_array2d* u)
+{
+  double s = 0;
+  const int nx = _.size_x(u);
+  const int ny = _.size_y(u);
+  int x, y;
+  for (y = 1; y < ny - 1; y++)
+    for (x = 1; x < nx - 1; x++)
+      s += _.get(u, x, y);
+  return s;
+}
+
+// compute sum of array values over the interior (boundary excluded)
+static double
+total(const double* u, const int nx, const int ny)
+{
+  double s = 0;
+  int x, y;
+  for (y = 1; y < ny - 1; y++)
+    for (x = 1; x < nx - 1; x++)
+      s += u[y*nx + x];
+  return s;
+}
+
+// compute root mean square error with respect to exact solution over the
+// interior grid points (compressed arrays)
+static double
+error_compressed(const cfp_array2d* u, const constants* c, double t)
+{
+  double e = 0;
+  int x, y;
+  for (y = 1; y < c->ny - 1; y++) {
+    double py = c->dy * (y - c->y0);
+    for (x = 1; x < c->nx - 1; x++) {
+      double px = c->dx * (x - c->x0);
+      double f = _.get(u, x, y);
+      // exact solution: 2D heat kernel centered at (x0, y0), scaled by cell area
+      double g = c->dx * c->dy * exp(-(px * px + py * py) / (4 * c->k * t)) / (4 * c->pi * c->k * t);
+      e += (f - g) * (f - g);
+    }
+  }
+  return sqrt(e / ((c->nx - 2) * (c->ny - 2)));
+}
+
+// compute root mean square error with respect to exact solution over the
+// interior grid points (plain C arrays)
+static double
+error(const double* u, const constants* c, double t)
+{
+  double e = 0;
+  int x, y;
+  for (y = 1; y < c->ny - 1; y++) {
+    double py = c->dy * (y - c->y0);
+    for (x = 1; x < c->nx - 1; x++) {
+      double px = c->dx * (x - c->x0);
+      double f = u[y*c->nx + x];
+      // exact solution: 2D heat kernel centered at (x0, y0), scaled by cell area
+      double g = c->dx * c->dy * exp(-(px * px + py * py) / (4 * c->k * t)) / (4 * c->pi * c->k * t);
+      e += (f - g) * (f - g);
+    }
+  }
+  return sqrt(e / ((c->nx - 2) * (c->ny - 2)));
+}
+
+// print command-line help; returns EXIT_FAILURE for main() to propagate
+static int
+usage()
+{
+  fprintf(stderr, "Usage: diffusionC [options]\n");
+  fprintf(stderr, "Options:\n");
+  fprintf(stderr, "-n <nx> <ny> : number of grid points\n");
+  fprintf(stderr, "-t <nt> : number of time steps\n");
+  fprintf(stderr, "-r <rate> : use compressed arrays with 'rate' bits/value\n");
+  fprintf(stderr, "-c <blocks> : use 'blocks' 4x4 blocks of cache\n");
+  return EXIT_FAILURE;
+}
+
+int main(int argc, char* argv[])
+{
+  // defaults: 100x100 grid, default time horizon, 64 bits/value
+  int nx = 100;
+  int ny = 100;
+  int nt = 0;
+  double rate = 64;
+  int compression = 0;
+  int cache = 0;
+
+  // parse command-line options; each option is a single-letter '-X' flag
+  int i;
+  for (i = 1; i < argc; i++) {
+    if (argv[i][0] != '-' || argv[i][2])
+      return usage();
+    switch(argv[i][1]) {
+      case 'n':
+        if (++i == argc || sscanf(argv[i], "%d", &nx) != 1 ||
+            ++i == argc || sscanf(argv[i], "%d", &ny) != 1)
+          return usage();
+        break;
+      case 't':
+        if (++i == argc || sscanf(argv[i], "%d", &nt) != 1)
+          return usage();
+        break;
+      case 'r':
+        if (++i == argc || sscanf(argv[i], "%lf", &rate) != 1)
+          return usage();
+        compression = 1;
+        break;
+      case 'c':
+        if (++i == argc || sscanf(argv[i], "%d", &cache) != 1)
+          return usage();
+        break;
+      default:
+        // reject unknown flags (previously they fell through the switch
+        // silently), matching the behavior of diffusion.cpp
+        return usage();
+    }
+  }
+
+  constants* c = malloc(sizeof(constants));
+  init_constants(c, nx, ny, nt);
+
+  double sum;
+  double err;
+  if (compression) {
+    // solve problem using compressed arrays; cache argument converts
+    // 4x4-block count to bytes
+    cfp_array2d* u = _.ctor(nx, ny, rate, 0, cache * 4 * 4 * sizeof(double));
+
+    rate = _.rate(u);
+    double t = solve_compressed(u, c);
+    sum = total_compressed(u);
+    err = error_compressed(u, c, t);
+
+    _.dtor(u);
+  }
+  else {
+    // solve problem using primitive arrays
+    double* u = calloc(nx * ny, sizeof(double));
+
+    double t = solve(u, c);
+    sum = total(u, nx, ny);
+    err = error(u, c, t);
+
+    free(u);
+  }
+
+  // report achieved rate, interior sum, and RMS error vs. exact solution
+  fprintf(stderr, "rate=%g sum=%g error=%.6e\n", rate, sum, err);
+
+  free(c);
+
+  return 0;
+}
diff --git a/zfp/examples/inplace.c b/zfp/examples/inplace.c
new file mode 100644
index 0000000000000000000000000000000000000000..3764166b58c6653cfa4f0f087da29d7c08ef8088
--- /dev/null
+++ b/zfp/examples/inplace.c
@@ -0,0 +1,156 @@
+/* example illustrating in-place compression and decompression */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "zfp.h"
+
+/* compress and decompress contiguous 4x4 blocks of doubles in place;
+   returns nonzero on success, 0 on failure */
+static int
+process(double* buffer, uint blocks, double tolerance)
+{
+  zfp_stream* zfp;   /* compressed stream */
+  bitstream* stream; /* bit stream to write to or read from */
+  size_t* offset;    /* per-block bit offset in compressed stream */
+  double* ptr;       /* pointer to block being processed */
+  size_t bufsize;    /* byte size of uncompressed storage */
+  size_t zfpsize;    /* byte size of compressed stream */
+  uint minbits;      /* min bits per block */
+  uint maxbits;      /* max bits per block */
+  uint maxprec;      /* max precision */
+  int minexp;        /* min bit plane encoded */
+  uint bits;         /* size of compressed block */
+  uint i;
+
+  /* maintain offset to beginning of each variable-length block */
+  offset = malloc(blocks * sizeof(size_t));
+
+  /* associate bit stream with same storage as input */
+  bufsize = blocks * 4 * 4 * sizeof(*buffer);
+  stream = stream_open(buffer, bufsize);
+
+  /* allocate meta data for a compressed stream */
+  zfp = zfp_stream_open(stream);
+
+  /* set tolerance for fixed-accuracy mode */
+  zfp_stream_set_accuracy(zfp, tolerance);
+
+  /* set maxbits to guard against prematurely overwriting the input:
+     no block may compress to more bits than its uncompressed size */
+  zfp_stream_params(zfp, &minbits, &maxbits, &maxprec, &minexp);
+  maxbits = 4 * 4 * sizeof(*buffer) * CHAR_BIT;
+  zfp_stream_set_params(zfp, minbits, maxbits, maxprec, minexp);
+
+  /* compress one block at a time in sequential order */
+  ptr = buffer;
+  for (i = 0; i < blocks; i++) {
+    offset[i] = stream_wtell(stream);
+    bits = zfp_encode_block_double_2(zfp, ptr);
+    if (!bits) {
+      /* NOTE(review): early returns leak offset/zfp/stream; harmless here
+         since the caller exits, but worth tidying */
+      fprintf(stderr, "compression failed\n");
+      return 0;
+    }
+    printf("block #%u offset=%4u size=%4u\n", i, (uint)offset[i], bits);
+    ptr += 4 * 4;
+  }
+  /* important: flush any buffered compressed bits */
+  stream_flush(stream);
+
+  /* print out size */
+  zfpsize = stream_size(stream);
+  printf("compressed %u bytes to %u bytes\n", (uint)bufsize, (uint)zfpsize);
+
+  /* decompress one block at a time in reverse order so decompressed
+     output does not overwrite compressed bits that are still unread */
+  for (i = blocks; i--;) {
+    ptr -= 4 * 4;
+    stream_rseek(stream, offset[i]);
+    if (!zfp_decode_block_double_2(zfp, ptr)) {
+      fprintf(stderr, "decompression failed\n");
+      return 0;
+    }
+  }
+
+  /* clean up */
+  zfp_stream_close(zfp);
+  stream_close(stream);
+  free(offset);
+
+  return 1;
+}
+
+int main(int argc, char* argv[])
+{
+  double tolerance = 1e-6;
+  double* array;   /* input/output data in row-major layout */
+  double* buffer;  /* same data reorganized into contiguous 4x4 blocks */
+  uint bx = 2;     /* number of blocks in x (the k & 1 / k / 2 indexing below assumes bx == 2) */
+  uint by = 4;     /* number of blocks in y */
+  uint nx = 4 * bx;
+  uint ny = 4 * by;
+  uint blocks = bx * by;
+  uint x, y;
+  uint i, j, k;
+  int status;
+
+  /* optional single argument overrides the default tolerance */
+  switch (argc) {
+    case 2:
+      if (sscanf(argv[1], "%lf", &tolerance) != 1)
+        goto usage;
+      /* FALLTHROUGH */
+    case 1:
+      break;
+    default:
+    usage:
+      /* fixed: message previously said "inline" instead of "inplace" */
+      fprintf(stderr, "Usage: inplace [tolerance]\n");
+      return EXIT_FAILURE;
+  }
+
+  printf("tolerance=%g\n", tolerance);
+
+  /* initialize array to be compressed */
+  printf("original %ux%u array:\n", nx, ny);
+  array = malloc(nx * ny * sizeof(double));
+  for (y = 0; y < ny; y++) {
+    for (x = 0; x < nx; x++) {
+      double u = 2 * (x + 0.5) / nx;
+      double v = asin(1.0) * (y + 0.5);
+      double f = exp(-u * u) * sin(v) / v;
+      printf("%9.6f%c", f, x == nx - 1 ? '\n' : ' ');
+      array[x + nx * y] = f;
+    }
+  }
+
+  /* reorganize array into 4x4 blocks */
+  buffer = malloc(blocks * 4 * 4 * sizeof(double));
+  for (k = 0; k < blocks; k++)
+    for (j = 0; j < 4; j++)
+      for (i = 0; i < 4; i++) {
+        uint x = 4 * (k & 1) + i;
+        uint y = 4 * (k / 2) + j;
+        buffer[i + 4 * (j + 4 * k)] = array[x + nx * y];
+      }
+
+  /* compress and decompress in place */
+  status = process(buffer, blocks, tolerance);
+  if (status) {
+    /* reorganize blocks into array */
+    for (k = 0; k < blocks; k++)
+      for (j = 0; j < 4; j++)
+        for (i = 0; i < 4; i++) {
+          uint x = 4 * (k & 1) + i;
+          uint y = 4 * (k / 2) + j;
+          array[x + nx * y] = buffer[i + 4 * (j + 4 * k)];
+        }
+
+    /* print out modified array*/
+    printf("decompressed %ux%u array:\n", nx, ny);
+    for (y = 0; y < ny; y++)
+      for (x = 0; x < nx; x++)
+        printf("%9.6f%c", array[x + nx * y], x == nx - 1 ? '\n' : ' ');
+  }
+
+  free(buffer);
+  free(array);
+
+  return status ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/zfp/examples/iterator.cpp b/zfp/examples/iterator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..698692ff2a13b82265a1ebb93c9523bb4537e30b
--- /dev/null
+++ b/zfp/examples/iterator.cpp
@@ -0,0 +1,74 @@
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+#include "zfparray1.h"
+#include "zfparray2.h"
+#include "zfparray3.h"
+
+// print n values starting at proxy pointer p, one per line, using indexing
+void print1(zfp::array1<double>::pointer p, size_t n)
+{
+  size_t k = 0;
+  while (k < n) {
+    std::cout << p[k] << std::endl;
+    ++k;
+  }
+}
+
+// print n values starting at proxy pointer p, one per line, by advancing p
+void print2(zfp::array2<double>::pointer p, size_t n)
+{
+  for (size_t k = 0; k < n; k++, p++)
+    std::cout << *p << std::endl;
+}
+
+// print all values in [begin, end), one per line, via iterator traversal
+void print3(zfp::array1<double>::iterator begin, zfp::array1<double>::iterator end)
+{
+  zfp::array1<double>::iterator p = begin;
+  while (p != end) {
+    std::cout << *p << std::endl;
+    ++p;
+  }
+}
+
+int main()
+{
+  // some fun with 1D arrays: 10 values at 64 bits/value
+  zfp::array1<double> v(10, 64.0);
+  // initialize and print array of random values
+  for (zfp::array1<double>::iterator p = v.begin(); p != v.end(); p++)
+    *p = rand();
+  std::cout << "random array" << std::endl;
+  print1(&v[0], v.size());
+  std::cout << std::endl;
+  // sorting is possible via random access iterators (1D arrays only)
+  std::sort(v.begin(), v.end());
+  // print array using iteration
+  std::cout << "sorted array" << std::endl;
+  print3(v.begin(), v.end());
+  std::cout << std::endl;
+
+  // some fun with 2D arrays: 5x7 values at 64 bits/value
+  zfp::array2<double> a(5, 7, 64.0);
+  // print array indices visited in block-order traversal
+  std::cout << "block order (x, y) indices" << std::endl;
+  for (zfp::array2<double>::iterator p = a.begin(); p != a.end(); p++) {
+    std::cout << "(" << p.i() << ", " << p.j() << ")" << std::endl;
+    // store value yx so row-major output below is easy to eyeball
+    *p = p.i() + 10 * p.j();
+  }
+  std::cout << std::endl;
+  // print array contents in row-major order
+  std::cout << "row-major order yx indices" << std::endl;
+  print2(&a[0], a.size());
+  std::cout << std::endl;
+  // pointer arithmetic: distance between end and begin equals element count
+  std::cout << a.size_x() << " * " << a.size_y() << " = " << (&*a.end() - &*a.begin()) << std::endl;
+  // min and max values
+  std::cout << "min = " << *std::min_element(a.begin(), a.end()) << std::endl;
+  std::cout << "max = " << *std::max_element(a.begin(), a.end()) << std::endl;
+  std::cout << std::endl;
+
+  // some fun with 3D arrays: 7x2x5 values at 64 bits/value
+  zfp::array3<double> b(7, 2, 5, 64.0);
+  // print array indices visited in block-order traversal
+  std::cout << "block order (x, y, z) indices" << std::endl;
+  for (zfp::array3<double>::iterator p = b.begin(); p != b.end(); p++)
+    std::cout << "(" << p.i() << ", " << p.j() << ", " << p.k() << ")" << std::endl;
+  std::cout << std::endl;
+  // pointer arithmetic
+  std::cout << b.size_x() << " * " << b.size_y() << " * " << b.size_z() << " = " << (&*b.end() - &*b.begin()) << std::endl;
+
+  return 0;
+}
diff --git a/zfp/examples/pgm.c b/zfp/examples/pgm.c
new file mode 100644
index 0000000000000000000000000000000000000000..c23ecb2d5ef714fc8e8342754e738e80e6e30291
--- /dev/null
+++ b/zfp/examples/pgm.c
@@ -0,0 +1,112 @@
+/* simple example that shows how zfp can be used to compress pgm images */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "zfp.h"
+
+int main(int argc, char* argv[])
+{
+  double rate = 0;
+  uint nx, ny;
+  uint x, y;
+  char line[0x100];
+  uchar* image;
+  zfp_field* field;
+  zfp_stream* zfp;
+  bitstream* stream;
+  void* buffer;
+  size_t bytes;
+  size_t size;
+
+  /* a positive argument selects fixed-rate mode; a negative argument
+     selects fixed-precision mode with precision -rate */
+  switch (argc) {
+    case 2:
+      if (sscanf(argv[1], "%lf", &rate) != 1)
+        goto usage;
+      break;
+    default:
+    usage:
+      fprintf(stderr, "Usage: pgm <rate|-precision> <input.pgm >output.pgm\n");
+      return EXIT_FAILURE;
+  }
+
+  /* read pgm header (binary "P5" format, 8-bit max value) */
+  if (!fgets(line, sizeof(line), stdin) || strcmp(line, "P5\n") ||
+      !fgets(line, sizeof(line), stdin) || sscanf(line, "%u%u", &nx, &ny) != 2 ||
+      !fgets(line, sizeof(line), stdin) || strcmp(line, "255\n")) {
+    fprintf(stderr, "error opening image\n");
+    return EXIT_FAILURE;
+  }
+
+  /* block-based coding below requires whole 4x4 blocks */
+  if ((nx & 3u) || (ny & 3u)) {
+    fprintf(stderr, "image dimensions must be multiples of four\n");
+    return EXIT_FAILURE;
+  }
+
+  /* read image data */
+  image = malloc(nx * ny);
+  if (fread(image, sizeof(*image), nx * ny, stdin) != nx * ny) {
+    fprintf(stderr, "error reading image\n");
+    return EXIT_FAILURE;
+  }
+
+  /* create input array; typed int32 because blocks are promoted to int32
+     before encoding, and the field is used only to size the buffer below */
+  field = zfp_field_2d(image, zfp_type_int32, nx, ny);
+
+  /* initialize compressed stream */
+  zfp = zfp_stream_open(NULL);
+  if (rate < 0)
+    zfp_stream_set_precision(zfp, (uint)floor(0.5 - rate));
+  else
+    zfp_stream_set_rate(zfp, rate, zfp_type_int32, 2, 0);
+  bytes = zfp_stream_maximum_size(zfp, field);
+  buffer = malloc(bytes);
+  stream = stream_open(buffer, bytes);
+  zfp_stream_set_bit_stream(zfp, stream);
+  zfp_field_free(field);
+
+  /* compress: gather each 4x4 tile, promote 8-bit samples to int32, encode */
+  for (y = 0; y < ny; y += 4)
+    for (x = 0; x < nx; x += 4) {
+      uchar ublock[16];
+      int32 iblock[16];
+      uint i, j;
+      for (j = 0; j < 4; j++)
+        for (i = 0; i < 4; i++)
+          ublock[i + 4 * j] = image[x + i + nx * (y + j)];
+      zfp_promote_uint8_to_int32(iblock, ublock, 2);
+      zfp_encode_block_int32_2(zfp, iblock);
+    }
+
+  zfp_stream_flush(zfp);
+  size = zfp_stream_compressed_size(zfp);
+  fprintf(stderr, "%u compressed bytes (%.2f bps)\n", (uint)size, (double)size * CHAR_BIT / (nx * ny));
+
+  /* decompress: decode each tile, demote back to 8 bits, scatter into image */
+  zfp_stream_rewind(zfp);
+  for (y = 0; y < ny; y += 4)
+    for (x = 0; x < nx; x += 4) {
+      int32 iblock[16];
+      uchar ublock[16];
+      uint i, j;
+      zfp_decode_block_int32_2(zfp, iblock);
+      zfp_demote_int32_to_uint8(ublock, iblock, 2);
+      for (j = 0; j < 4; j++)
+        for (i = 0; i < 4; i++)
+          image[x + i + nx * (y + j)] = ublock[i + 4 * j];
+    }
+  zfp_stream_close(zfp);
+  stream_close(stream);
+  free(buffer);
+
+  /* output reconstructed image */
+  printf("P5\n");
+  printf("%u %u\n", nx, ny);
+  printf("255\n");
+  fwrite(image, sizeof(*image), nx * ny, stdout);
+  free(image);
+
+  return 0;
+}
diff --git a/zfp/examples/simple.c b/zfp/examples/simple.c
new file mode 100644
index 0000000000000000000000000000000000000000..2ccb597756a9031848e37b9bbe49582364a23456
--- /dev/null
+++ b/zfp/examples/simple.c
@@ -0,0 +1,99 @@
+/* minimal code example showing how to call the zfp (de)compressor */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "zfp.h"
+
+/* compress (decompress == 0) or decompress (decompress != 0) the given
+   nx*ny*nz array via stdout/stdin; takes ownership of array and frees it;
+   returns 0 on success, nonzero on failure */
+static int
+compress(double* array, int nx, int ny, int nz, double tolerance, int decompress)
+{
+  int status = 0;    /* return value: 0 = success */
+  zfp_type type;     /* array scalar type */
+  zfp_field* field;  /* array meta data */
+  zfp_stream* zfp;   /* compressed stream */
+  void* buffer;      /* storage for compressed stream */
+  size_t bufsize;    /* byte size of compressed buffer */
+  bitstream* stream; /* bit stream to write to or read from */
+  size_t zfpsize;    /* byte size of compressed stream */
+
+  /* allocate meta data for the 3D array a[nz][ny][nx] */
+  type = zfp_type_double;
+  field = zfp_field_3d(array, type, nx, ny, nz);
+
+  /* allocate meta data for a compressed stream */
+  zfp = zfp_stream_open(NULL);
+
+  /* set compression mode and parameters via one of three functions */
+/*  zfp_stream_set_rate(zfp, rate, type, 3, 0); */
+/*  zfp_stream_set_precision(zfp, precision); */
+  zfp_stream_set_accuracy(zfp, tolerance);
+
+  /* allocate buffer for compressed data */
+  bufsize = zfp_stream_maximum_size(zfp, field);
+  buffer = malloc(bufsize);
+
+  /* associate bit stream with allocated buffer */
+  stream = stream_open(buffer, bufsize);
+  zfp_stream_set_bit_stream(zfp, stream);
+  zfp_stream_rewind(zfp);
+
+  /* compress or decompress entire array */
+  if (decompress) {
+    /* read compressed stream and decompress array */
+    /* NOTE(review): the byte count returned by fread is stored but never
+       validated; a short read is only caught by zfp_decompress failing */
+    zfpsize = fread(buffer, 1, bufsize, stdin);
+    if (!zfp_decompress(zfp, field)) {
+      fprintf(stderr, "decompression failed\n");
+      status = 1;
+    }
+  }
+  else {
+    /* compress array and output compressed stream */
+    zfpsize = zfp_compress(zfp, field);
+    if (!zfpsize) {
+      fprintf(stderr, "compression failed\n");
+      status = 1;
+    }
+    else
+      fwrite(buffer, 1, zfpsize, stdout);
+  }
+
+  /* clean up (including the caller's array) */
+  zfp_field_free(field);
+  zfp_stream_close(zfp);
+  stream_close(stream);
+  free(buffer);
+  free(array);
+
+  return status;
+}
+
+int main(int argc, char* argv[])
+{
+  /* use -d to decompress rather than compress data */
+  int decompress = (argc == 2 && !strcmp(argv[1], "-d"));
+
+  /* allocate 100x100x100 array of doubles */
+  int nx = 100;
+  int ny = 100;
+  int nz = 100;
+  double* array = malloc(nx * ny * nz * sizeof(double));
+
+  if (!decompress) {
+    /* initialize array to be compressed: smooth 3D Gaussian-like field */
+    int i, j, k;
+    for (k = 0; k < nz; k++)
+      for (j = 0; j < ny; j++)
+        for (i = 0; i < nx; i++) {
+          double x = 2.0 * i / nx;
+          double y = 2.0 * j / ny;
+          double z = 2.0 * k / nz;
+          array[i + nx * (j + ny * k)] = exp(-(x * x + y * y + z * z));
+        }
+  }
+
+  /* compress or decompress array; compress() frees the array */
+  return compress(array, nx, ny, nz, 1e-3, decompress);
+}
diff --git a/zfp/examples/speed.c b/zfp/examples/speed.c
new file mode 100644
index 0000000000000000000000000000000000000000..9332605d58eacfbc0709a51bfdbc2cd8a6de9226
--- /dev/null
+++ b/zfp/examples/speed.c
@@ -0,0 +1,136 @@
+/* measure encode/decode throughput by repeatedly coding one fixed 4x4x4 block of doubles */
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include "zfp.h"
+
+/* one 4x4x4 block of doubles given as raw IEEE-754 bit patterns (64 values) */
+static const uint64 block[] = {
+UINT64C(0xbf7c3a7bb8495ca9),
+UINT64C(0xbf79f9d9058ffdaf),
+UINT64C(0xbf77c7abd0b61999),
+UINT64C(0xbf75a42c806bd1da),
+UINT64C(0xbf738f8f740b8ea8),
+UINT64C(0xbf718a050399fef8),
+UINT64C(0xbf6f2772ff8c30fe),
+UINT64C(0xbf6b59aa63d22f68),
+UINT64C(0xbf67aaf8b80cff9e),
+UINT64C(0xbf641b9e71983592),
+UINT64C(0xbf60abd3f723f2b7),
+UINT64C(0xbf5ab7934169cc04),
+UINT64C(0xbf54574f6f4897d3),
+UINT64C(0xbf4c6e39da7fb99b),
+UINT64C(0xbf40ae5826a893d1),
+UINT64C(0xbf25bce8e19d48e1),
+UINT64C(0x3f253bfed65904d7),
+UINT64C(0x3f3f18ab46a04cf3),
+UINT64C(0x3f4948e7cb74278b),
+UINT64C(0x3f51427b51aeec2e),
+UINT64C(0x3f55a0716d8b4b6b),
+UINT64C(0x3f59be96aeaac56f),
+UINT64C(0x3f5d9d3ba7bfd327),
+UINT64C(0x3f609e608469e93e),
+UINT64C(0x3f624ecbcfa3832c),
+UINT64C(0x3f63e0202ae84b4d),
+UINT64C(0x3f6552a61a3f4812),
+UINT64C(0x3f66a6ae305af268),
+UINT64C(0x3f67dc910e9935bc),
+UINT64C(0x3f68f4af65036ff7),
+UINT64C(0x3f69ef71f24e7182),
+UINT64C(0x3f6acd4983da7d43),
+UINT64C(0x3f6b8eaef5b348a0),
+UINT64C(0x3f6c3423328ffb7a),
+UINT64C(0x3f6cbe2f33d33034),
+UINT64C(0x3f6d2d64018af3ac),
+UINT64C(0x3f6d825ab270c540),
+UINT64C(0x3f6dbdb46be996cc),
+UINT64C(0x3f6de01a6205cca9),
+UINT64C(0x3f6dea3dd7813daf),
+UINT64C(0x3f6ddcd81dc33335),
+UINT64C(0x3f6db8aa94de690f),
+UINT64C(0x3f6d7e7eab910d8f),
+UINT64C(0x3f6d2f25df44c187),
+UINT64C(0x3f6ccb79bc0e9844),
+UINT64C(0x3f6c545bdcaf1795),
+UINT64C(0x3f6bcab5ea9237c4),
+UINT64C(0x3f6b2f799dcf639b),
+UINT64C(0x3f6a83a0bd297862),
+UINT64C(0x3f69c82d1e0ec5de),
+UINT64C(0x3f68fe28a4990e53),
+UINT64C(0x3f6826a5438d8685),
+UINT64C(0x3f6742bcfc5cd5b2),
+UINT64C(0x3f665391df231599),
+UINT64C(0x3f655a4e0aa7d278),
+UINT64C(0x3f645823ac5e0b09),
+UINT64C(0x3f634e4d00643085),
+UINT64C(0x3f623e0c518426a3),
+UINT64C(0x3f6128abf933439a),
+UINT64C(0x3f600f7e5f92501c),
+UINT64C(0x3f5de7bbf6db0eb7),
+UINT64C(0x3f5bae5aa4792e11),
+UINT64C(0x3f5975adf0453ea2),
+UINT64C(0x3f57409b1fdc65c4),
+};
+
+int main(int argc, char* argv[])
+{
+  uint blocks = 0x200000;
+  double rate = 1;
+  zfp_field* field;
+  uint insize;
+  zfp_stream* zfp;
+  bitstream* stream;
+  void* buffer;
+  size_t bytes;
+  clock_t c;
+  double time;
+  uint i;
+
+  switch (argc) {
+    case 3:
+      sscanf(argv[2], "%u", &blocks);
+      /* FALLTHROUGH */
+    case 2:
+      sscanf(argv[1], "%lf", &rate);
+      break;
+  }
+
+  /* describe a 4x4x(4*blocks) field; data pointer is NULL (used for sizing only) */
+  field = zfp_field_3d(NULL, zfp_type_double, 4, 4, 4 * blocks);
+  insize = blocks * sizeof(block);
+
+  /* size buffer for the worst case at the requested rate and attach the bit stream */
+  zfp = zfp_stream_open(NULL);
+  zfp_stream_set_rate(zfp, rate, zfp_field_type(field), zfp_field_dimensionality(field), 0);
+  bytes = zfp_stream_maximum_size(zfp, field);
+  buffer = malloc(bytes);
+  stream = stream_open(buffer, bytes);
+  zfp_stream_set_bit_stream(zfp, stream);
+  zfp_field_free(field);
+
+  /* encode the same block repeatedly; NOTE(review): time may round to 0 -> inf MB/s */
+  c = clock();
+  for (i = 0; i < blocks; i++)
+    zfp_encode_block_double_3(zfp, (const double*)block);
+  zfp_stream_flush(zfp);
+  time = (double)(clock() - c) / CLOCKS_PER_SEC;
+  printf("encode in=%u out=%u %.0f MB/s\n", insize, (uint)stream_size(stream), insize / (1024 * 1024 * time));
+
+  /* decode every block back into a 64-entry scratch buffer */
+  zfp_stream_rewind(zfp);
+  c = clock();
+  for (i = 0; i < blocks; i++) {
+    double a[64];
+    zfp_decode_block_double_3(zfp, a);
+  }
+  time = (double)(clock() - c) / CLOCKS_PER_SEC;
+  printf("decode in=%u out=%u %.0f MB/s\n", (uint)stream_size(stream), insize, insize / (1024 * 1024 * time));
+
+  zfp_stream_close(zfp);
+  stream_close(stream);
+  free(buffer);
+
+  return 0;
+}
diff --git a/zfp/include/bitstream.h b/zfp/include/bitstream.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad5475fe6a719b24def0f2bc6dfe0a54bfba5108
--- /dev/null
+++ b/zfp/include/bitstream.h
@@ -0,0 +1,94 @@
+#ifndef ZFP_BITSTREAM_H
+#define ZFP_BITSTREAM_H
+
+#include <stddef.h>
+#include "zfp/types.h"
+#include "zfp/system.h"
+
+/* bitstream is an opaque type; its definition is private to the implementation */
+typedef struct bitstream bitstream;
+
+extern_ const size_t stream_word_bits; /* bit stream granularity */
+
+#ifndef inline_
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* allocate and initialize bit stream */
+bitstream* stream_open(void* buffer, size_t bytes);
+
+/* close and deallocate bit stream */
+void stream_close(bitstream* stream);
+
+/* make a copy of bit stream to shared memory buffer */
+bitstream* stream_clone(const bitstream* stream);
+
+/* pointer to beginning of stream */
+void* stream_data(const bitstream* stream);
+
+/* current byte size of stream (if flushed) */
+size_t stream_size(const bitstream* stream);
+
+/* byte capacity of stream */
+size_t stream_capacity(const bitstream* stream);
+
+/* number of words per strided block (see stream_set_stride) */
+size_t stream_stride_block(const bitstream* stream);
+
+/* spacing between consecutive blocks, in number of blocks (see stream_set_stride) */
+ptrdiff_t stream_stride_delta(const bitstream* stream);
+
+/* read single bit (0 or 1) */
+uint stream_read_bit(bitstream* stream);
+
+/* write single bit */
+uint stream_write_bit(bitstream* stream, uint bit);
+
+/* read 0 <= n <= 64 bits */
+uint64 stream_read_bits(bitstream* stream, uint n);
+
+/* write 0 <= n <= 64 low bits of value and return remaining bits */
+uint64 stream_write_bits(bitstream* stream, uint64 value, uint n);
+
+/* return bit offset to next bit to be read */
+size_t stream_rtell(const bitstream* stream);
+
+/* return bit offset to next bit to be written */
+size_t stream_wtell(const bitstream* stream);
+
+/* rewind stream to beginning */
+void stream_rewind(bitstream* stream);
+
+/* position stream for reading at given bit offset */
+void stream_rseek(bitstream* stream, size_t offset);
+
+/* position stream for writing at given bit offset */
+void stream_wseek(bitstream* stream, size_t offset);
+
+/* skip over the next n bits */
+void stream_skip(bitstream* stream, uint n);
+
+/* append n zero-bits to stream */
+void stream_pad(bitstream* stream, uint n);
+
+/* align stream on next word boundary */
+size_t stream_align(bitstream* stream);
+
+/* flush out any remaining buffered bits */
+size_t stream_flush(bitstream* stream);
+
+/* copy n bits from one bit stream to another */
+void stream_copy(bitstream* dst, bitstream* src, size_t n);
+
+#ifdef BIT_STREAM_STRIDED
+/* set block size in number of words and spacing in number of blocks */
+int stream_set_stride(bitstream* stream, size_t block, ptrdiff_t delta);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !inline_ */
+
+#endif
diff --git a/zfp/include/zfp.h b/zfp/include/zfp.h
new file mode 100644
index 0000000000000000000000000000000000000000..2faca1c291e3c631b5d100a4abe64999334555a2
--- /dev/null
+++ b/zfp/include/zfp.h
@@ -0,0 +1,747 @@
+/*
+** Copyright (c) 2014-2018, Lawrence Livermore National Security, LLC.
+** Produced at the Lawrence Livermore National Laboratory.
+** Authors: Peter Lindstrom, Markus Salasoo, Matt Larsen.
+** LLNL-CODE-663824.
+** All rights reserved.
+**
+** This file is part of the zfp library.
+** For details, see http://computation.llnl.gov/casc/zfp/.
+**
+** Redistribution and use in source and binary forms, with or without
+** modification, are permitted provided that the following conditions are met:
+**
+** 1. Redistributions of source code must retain the above copyright notice,
+** this list of conditions and the disclaimer below.
+**
+** 2. Redistributions in binary form must reproduce the above copyright notice,
+** this list of conditions and the disclaimer (as noted below) in the
+** documentation and/or other materials provided with the distribution.
+**
+** 3. Neither the name of the LLNS/LLNL nor the names of its contributors may
+** be used to endorse or promote products derived from this software without
+** specific prior written permission.
+**
+** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+** AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+** ARE DISCLAIMED.  IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
+** LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+** INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+** (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+** LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+** ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+** THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**
+**
+** Additional BSD Notice
+**
+** 1. This notice is required to be provided under our contract with the U.S.
+** Department of Energy (DOE).  This work was produced at Lawrence Livermore
+** National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
+**
+** 2. Neither the United States Government nor Lawrence Livermore National
+** Security, LLC nor any of their employees, makes any warranty, express or
+** implied, or assumes any liability or responsibility for the accuracy,
+** completeness, or usefulness of any information, apparatus, product, or
+** process disclosed, or represents that its use would not infringe
+** privately-owned rights.
+**
+** 3. Also, reference herein to any specific commercial products, process, or
+** services by trade name, trademark, manufacturer or otherwise does not
+** necessarily constitute or imply its endorsement, recommendation, or
+** favoring by the United States Government or Lawrence Livermore National
+** Security, LLC.  The views and opinions of authors expressed herein do not
+** necessarily state or reflect those of the United States Government or
+** Lawrence Livermore National Security, LLC, and shall not be used for
+** advertising or product endorsement purposes.
+*/
+
+#ifndef ZFP_H
+#define ZFP_H
+
+#include "zfp/types.h"
+#include "zfp/system.h"
+#include "bitstream.h"
+
+/* macros ------------------------------------------------------------------ */
+
+/* stringification */
+#define _zfp_str_(x) # x
+#define _zfp_str(x) _zfp_str_(x)
+
+/* library version information */
+#define ZFP_VERSION_MAJOR 0 /* library major version number */
+#define ZFP_VERSION_MINOR 5 /* library minor version number */
+#define ZFP_VERSION_PATCH 4 /* library patch version number */
+#define ZFP_VERSION_RELEASE ZFP_VERSION_PATCH
+
+/* codec version number (see also zfp_codec_version) */
+#define ZFP_CODEC 5
+
+/* library version number (see also zfp_library_version) */
+#define ZFP_VERSION \
+  ((ZFP_VERSION_MAJOR << 8) + \
+   (ZFP_VERSION_MINOR << 4) + \
+   (ZFP_VERSION_PATCH << 0))
+
+/* library version string (see also zfp_version_string) */
+#define ZFP_VERSION_STRING \
+  _zfp_str(ZFP_VERSION_MAJOR) "." \
+  _zfp_str(ZFP_VERSION_MINOR) "." \
+  _zfp_str(ZFP_VERSION_PATCH)
+
+/* default compression parameters */
+#define ZFP_MIN_BITS     1 /* minimum number of bits per block */
+#define ZFP_MAX_BITS 16651 /* maximum number of bits per block */
+#define ZFP_MAX_PREC    64 /* maximum precision supported */
+#define ZFP_MIN_EXP  -1074 /* minimum floating-point base-2 exponent */
+
+/* header masks (enable via bitwise or; reader must use same mask) */
+#define ZFP_HEADER_MAGIC  0x1u /* embed 64-bit magic */
+#define ZFP_HEADER_META   0x2u /* embed 52-bit field metadata */
+#define ZFP_HEADER_MODE   0x4u /* embed 12- or 64-bit compression mode */
+#define ZFP_HEADER_FULL   0x7u /* embed all of the above */
+
+/* number of bits per header entry */
+#define ZFP_MAGIC_BITS       32 /* number of magic word bits */
+#define ZFP_META_BITS        52 /* number of field metadata bits */
+#define ZFP_MODE_SHORT_BITS  12 /* number of mode bits in short format */
+#define ZFP_MODE_LONG_BITS   64 /* number of mode bits in long format */
+#define ZFP_HEADER_MAX_BITS 148 /* max number of header bits */
+#define ZFP_MODE_SHORT_MAX  ((1u << ZFP_MODE_SHORT_BITS) - 2)
+
+/* types ------------------------------------------------------------------- */
+
+/* execution policy */
+typedef enum {
+  zfp_exec_serial = 0, /* serial execution (default) */
+  zfp_exec_omp    = 1, /* OpenMP multi-threaded execution */
+  zfp_exec_cuda   = 2  /* CUDA parallel execution */
+} zfp_exec_policy;
+
+/* OpenMP execution parameters */
+typedef struct {
+  uint threads;    /* number of requested threads */
+  uint chunk_size; /* number of blocks per chunk (1D only) */
+} zfp_exec_params_omp;
+
+/* execution parameters */
+typedef union {
+  zfp_exec_params_omp omp; /* OpenMP parameters */
+} zfp_exec_params;
+
+typedef struct {
+  zfp_exec_policy policy; /* execution policy (serial, omp, ...) */
+  zfp_exec_params params; /* execution parameters */
+} zfp_execution;
+
+/* compressed stream; use accessors to get/set members */
+typedef struct {
+  uint minbits;       /* minimum number of bits to store per block */
+  uint maxbits;       /* maximum number of bits to store per block */
+  uint maxprec;       /* maximum number of bit planes to store */
+  int minexp;         /* minimum floating point bit plane number to store */
+  bitstream* stream;  /* compressed bit stream */
+  zfp_execution exec; /* execution policy and parameters */
+} zfp_stream;
+
+/* compression mode */
+typedef enum {
+  zfp_mode_null            = 0, /* an invalid configuration of the 4 params */
+  zfp_mode_expert          = 1, /* expert mode (4 params set manually) */
+  zfp_mode_fixed_rate      = 2, /* fixed rate mode */
+  zfp_mode_fixed_precision = 3, /* fixed precision mode */
+  zfp_mode_fixed_accuracy  = 4  /* fixed accuracy mode */
+} zfp_mode;
+
+/* scalar type */
+typedef enum {
+  zfp_type_none   = 0, /* unspecified type */
+  zfp_type_int32  = 1, /* 32-bit signed integer */
+  zfp_type_int64  = 2, /* 64-bit signed integer */
+  zfp_type_float  = 3, /* single precision floating point */
+  zfp_type_double = 4  /* double precision floating point */
+} zfp_type;
+
+/* uncompressed array; use accessors to get/set members */
+typedef struct {
+  zfp_type type;       /* scalar type (e.g. int32, double) */
+  uint nx, ny, nz, nw; /* sizes (zero for unused dimensions) */
+  int sx, sy, sz, sw;  /* strides (zero for contiguous array a[nw][nz][ny][nx]) */
+  void* data;          /* pointer to array data */
+} zfp_field;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* public data ------------------------------------------------------------- */
+
+extern_ const uint zfp_codec_version;         /* codec version ZFP_CODEC */
+extern_ const uint zfp_library_version;       /* library version ZFP_VERSION */
+extern_ const char* const zfp_version_string; /* verbose version string */
+
+/* high-level API: utility functions --------------------------------------- */
+
+size_t          /* byte size of scalar type */
+zfp_type_size(
+  zfp_type type /* scalar type */
+);
+
+/* high-level API: compressed stream construction/destruction -------------- */
+
+/* open compressed stream and associate with bit stream */
+zfp_stream*         /* allocated compressed stream */
+zfp_stream_open(
+  bitstream* stream /* bit stream to read from and write to (may be NULL) */
+);
+
+/* close and deallocate compressed stream (does not affect bit stream) */
+void
+zfp_stream_close(
+  zfp_stream* stream /* compressed stream */
+);
+
+/* high-level API: compressed stream inspectors ---------------------------- */
+
+/* bit stream associated with compressed stream */
+bitstream*                 /* bit stream associated with compressed stream */
+zfp_stream_bit_stream(
+  const zfp_stream* stream /* compressed stream */
+);
+
+/* returns enum of compression mode */
+zfp_mode                     /* enum for compression mode */
+zfp_stream_compression_mode(
+  const zfp_stream* zfp      /* compressed stream */
+);
+
+/* get all compression parameters in a compact representation */
+uint64                     /* 12- or 64-bit encoding of parameters */
+zfp_stream_mode(
+  const zfp_stream* zfp    /* compressed stream */
+);
+
+/* get all compression parameters (pointers may be NULL) */
+void
+zfp_stream_params(
+  const zfp_stream* stream, /* compressed stream */
+  uint* minbits,            /* minimum number of bits per 4^d block */
+  uint* maxbits,            /* maximum number of bits per 4^d block */
+  uint* maxprec,            /* maximum precision (# bit planes coded) */
+  int* minexp               /* minimum base-2 exponent; error <= 2^minexp */
+);
+
+/* byte size of sequentially compressed stream (call after compression) */
+size_t                     /* actual number of bytes of compressed storage */
+zfp_stream_compressed_size(
+  const zfp_stream* stream /* compressed stream */
+);
+
+/* conservative estimate of compressed size in bytes */
+size_t                      /* maximum number of bytes of compressed storage */
+zfp_stream_maximum_size(
+  const zfp_stream* stream, /* compressed stream */
+  const zfp_field* field    /* array to compress */
+);
+
+/* high-level API: initialization of compressed stream parameters ---------- */
+
+/* associate bit stream with compressed stream */
+void
+zfp_stream_set_bit_stream(
+  zfp_stream* stream, /* compressed stream */
+  bitstream* bs       /* bit stream to read from and write to */
+);
+
+/* set size in compressed bits/scalar (fixed-rate mode) */
+double                /* actual rate in compressed bits/scalar */
+zfp_stream_set_rate(
+  zfp_stream* stream, /* compressed stream */
+  double rate,        /* desired rate in compressed bits/scalar */
+  zfp_type type,      /* scalar type to compress */
+  uint dims,          /* array dimensionality (1, 2, 3, or 4) */
+  int wra             /* nonzero if write random access is needed */
+);
+
+/* set precision in uncompressed bits/scalar (fixed-precision mode) */
+uint                  /* actual precision */
+zfp_stream_set_precision(
+  zfp_stream* stream, /* compressed stream */
+  uint precision      /* desired precision in uncompressed bits/scalar */
+);
+
+/* set accuracy as absolute error tolerance (fixed-accuracy mode) */
+double                /* actual error tolerance */
+zfp_stream_set_accuracy(
+  zfp_stream* stream, /* compressed stream */
+  double tolerance    /* desired error tolerance */
+);
+
+/* set all compression parameters from compact representation */
+/* compression params are only set on stream upon success */
+zfp_mode              /* mode other than zfp_mode_null upon success */
+zfp_stream_set_mode(
+  zfp_stream* stream, /* compressed stream */
+  uint64 mode         /* 12- or 64-bit encoding of parameters */
+);
+
+/* set all compression parameters (expert mode) */
+int                   /* nonzero upon success */
+zfp_stream_set_params(
+  zfp_stream* stream, /* compressed stream */
+  uint minbits,       /* minimum number of bits per 4^d block */
+  uint maxbits,       /* maximum number of bits per 4^d block */
+  uint maxprec,       /* maximum precision (# bit planes coded) */
+  int minexp          /* minimum base-2 exponent; error <= 2^minexp */
+);
+
+/* high-level API: execution policy ---------------------------------------- */
+
+/* current execution policy */
+zfp_exec_policy
+zfp_stream_execution(
+  const zfp_stream* stream /* compressed stream */
+);
+
+/* number of OpenMP threads to use */
+uint                       /* number of threads (0 for default) */
+zfp_stream_omp_threads(
+  const zfp_stream* stream /* compressed stream */
+);
+
+/* number of blocks per OpenMP chunk (1D only) */
+uint                       /* number of blocks per chunk (0 for default) */
+zfp_stream_omp_chunk_size(
+  const zfp_stream* stream /* compressed stream */
+);
+
+/* set execution policy */
+int                      /* nonzero upon success */
+zfp_stream_set_execution(
+  zfp_stream* stream,    /* compressed stream */
+  zfp_exec_policy policy /* execution policy */
+);
+
+/* set OpenMP execution policy and number of threads */
+int                   /* nonzero upon success */
+zfp_stream_set_omp_threads(
+  zfp_stream* stream, /* compressed stream */
+  uint threads        /* number of OpenMP threads to use (0 for default) */
+);
+
+/* set OpenMP execution policy and number of blocks per chunk (1D only) */
+int                   /* nonzero upon success */
+zfp_stream_set_omp_chunk_size(
+  zfp_stream* stream, /* compressed stream */
+  uint chunk_size     /* number of blocks per chunk (0 for default) */
+);
+
+/* high-level API: uncompressed array construction/destruction ------------- */
+
+/* allocate field struct */
+zfp_field* /* pointer to default initialized field */
+zfp_field_alloc();
+
+/* allocate metadata for 1D field f[nx] */
+zfp_field*       /* allocated field metadata */
+zfp_field_1d(
+  void* pointer, /* pointer to uncompressed scalars (may be NULL) */
+  zfp_type type, /* scalar type */
+  uint nx        /* number of scalars */
+);
+
+/* allocate metadata for 2D field f[ny][nx] */
+zfp_field*       /* allocated field metadata */
+zfp_field_2d(
+  void* pointer, /* pointer to uncompressed scalars (may be NULL) */
+  zfp_type type, /* scalar type */
+  uint nx,       /* number of scalars in x dimension */
+  uint ny        /* number of scalars in y dimension */
+);
+
+/* allocate metadata for 3D field f[nz][ny][nx] */
+zfp_field*       /* allocated field metadata */
+zfp_field_3d(
+  void* pointer, /* pointer to uncompressed scalars (may be NULL) */
+  zfp_type type, /* scalar type */
+  uint nx,       /* number of scalars in x dimension */
+  uint ny,       /* number of scalars in y dimension */
+  uint nz        /* number of scalars in z dimension */
+);
+
+/* allocate metadata for 4D field f[nw][nz][ny][nx] */
+zfp_field*       /* allocated field metadata */
+zfp_field_4d(
+  void* pointer, /* pointer to uncompressed scalars (may be NULL) */
+  zfp_type type, /* scalar type */
+  uint nx,       /* number of scalars in x dimension */
+  uint ny,       /* number of scalars in y dimension */
+  uint nz,       /* number of scalars in z dimension */
+  uint nw        /* number of scalars in w dimension */
+);
+
+/* deallocate field metadata */
+void
+zfp_field_free(
+  zfp_field* field /* field metadata */
+);
+
+/* high-level API: uncompressed array inspectors --------------------------- */
+
+/* pointer to first scalar in field */
+void*                    /* array pointer */
+zfp_field_pointer(
+  const zfp_field* field /* field metadata */
+);
+
+/* field scalar type */
+zfp_type                 /* scalar type */
+zfp_field_type(
+  const zfp_field* field /* field metadata */
+);
+
+/* precision of field scalar type */
+uint                     /* scalar type precision in number of bits */
+zfp_field_precision(
+  const zfp_field* field /* field metadata */
+);
+
+/* field dimensionality (1, 2, 3, or 4) */
+uint                     /* number of dimensions */
+zfp_field_dimensionality(
+  const zfp_field* field /* field metadata */
+);
+
+/* field size in number of scalars */
+size_t                    /* total number of scalars */
+zfp_field_size(
+  const zfp_field* field, /* field metadata */
+  uint* size              /* number of scalars per dimension (may be NULL) */
+);
+
+/* field strides per dimension */
+int                       /* zero if array is contiguous */
+zfp_field_stride(
+  const zfp_field* field, /* field metadata */
+  int* stride             /* stride in scalars per dimension (may be NULL) */
+);
+
+/* field scalar type and dimensions */
+uint64                   /* compact 52-bit encoding of metadata */
+zfp_field_metadata(
+  const zfp_field* field /* field metadata */
+);
+
+/* high-level API: uncompressed array specification ------------------------ */
+
+/* set pointer to first scalar in field */
+void
+zfp_field_set_pointer(
+  zfp_field* field, /* field metadata */
+  void* pointer     /* pointer to first scalar */
+);
+
+/* set field scalar type */
+zfp_type            /* actual scalar type */
+zfp_field_set_type(
+  zfp_field* field, /* field metadata */
+  zfp_type type     /* desired scalar type */
+);
+
+/* set 1D field size */
+void
+zfp_field_set_size_1d(
+  zfp_field* field, /* field metadata */
+  uint nx           /* number of scalars */
+);
+
+/* set 2D field size */
+void
+zfp_field_set_size_2d(
+  zfp_field* field, /* field metadata */
+  uint nx,          /* number of scalars in x dimension */
+  uint ny           /* number of scalars in y dimension */
+);
+
+/* set 3D field size */
+void
+zfp_field_set_size_3d(
+  zfp_field* field, /* field metadata */
+  uint nx,          /* number of scalars in x dimension */
+  uint ny,          /* number of scalars in y dimension */
+  uint nz           /* number of scalars in z dimension */
+);
+
+/* set 4D field size */
+void
+zfp_field_set_size_4d(
+  zfp_field* field, /* field metadata */
+  uint nx,          /* number of scalars in x dimension */
+  uint ny,          /* number of scalars in y dimension */
+  uint nz,          /* number of scalars in z dimension */
+  uint nw           /* number of scalars in w dimension */
+);
+
+/* set 1D field stride in number of scalars */
+void
+zfp_field_set_stride_1d(
+  zfp_field* field, /* field metadata */
+  int sx            /* stride in number of scalars: &f[1] - &f[0] */
+);
+
+/* set 2D field strides in number of scalars */
+void
+zfp_field_set_stride_2d(
+  zfp_field* field, /* field metadata */
+  int sx,           /* stride in x dimension: &f[0][1] - &f[0][0] */
+  int sy            /* stride in y dimension: &f[1][0] - &f[0][0] */
+);
+
+/* set 3D field strides in number of scalars */
+void
+zfp_field_set_stride_3d(
+  zfp_field* field, /* field metadata */
+  int sx,           /* stride in x dimension: &f[0][0][1] - &f[0][0][0] */
+  int sy,           /* stride in y dimension: &f[0][1][0] - &f[0][0][0] */
+  int sz            /* stride in z dimension: &f[1][0][0] - &f[0][0][0] */
+);
+
+/* set 4D field strides in number of scalars */
+void
+zfp_field_set_stride_4d(
+  zfp_field* field, /* field metadata */
+  int sx,           /* stride in x dimension: &f[0][0][0][1] - &f[0][0][0][0] */
+  int sy,           /* stride in y dimension: &f[0][0][1][0] - &f[0][0][0][0] */
+  int sz,           /* stride in z dimension: &f[0][1][0][0] - &f[0][0][0][0] */
+  int sw            /* stride in w dimension: &f[1][0][0][0] - &f[0][0][0][0] */
+);
+
+/* set field scalar type and dimensions */
+int                 /* nonzero upon success */
+zfp_field_set_metadata(
+  zfp_field* field, /* field metadata */
+  uint64 meta       /* compact 52-bit encoding of metadata */
+);
+
+/* high-level API: compression and decompression --------------------------- */
+
+/* compress entire field (nonzero return value upon success) */
+size_t                   /* cumulative number of bytes of compressed storage */
+zfp_compress(
+  zfp_stream* stream,    /* compressed stream */
+  const zfp_field* field /* field metadata */
+);
+
+/* decompress entire field (nonzero return value upon success) */
+size_t                /* cumulative number of bytes of compressed storage */
+zfp_decompress(
+  zfp_stream* stream, /* compressed stream */
+  zfp_field* field    /* field metadata */
+);
+
+/* write compression parameters and field metadata (optional) */
+size_t                    /* number of bits written or zero upon failure */
+zfp_write_header(
+  zfp_stream* stream,     /* compressed stream */
+  const zfp_field* field, /* field metadata */
+  uint mask               /* information to write */
+);
+
+/* read compression parameters and field metadata when previously written */
+size_t                /* number of bits read or zero upon failure */
+zfp_read_header(
+  zfp_stream* stream, /* compressed stream */
+  zfp_field* field,   /* field metadata */
+  uint mask           /* information to read */
+);
+
+/* low-level API: stream manipulation -------------------------------------- */
+
+/* flush bit stream--must be called after last encode call or between seeks */
+size_t
+zfp_stream_flush(
+  zfp_stream* stream /* compressed bit stream */
+);
+
+/* align bit stream on next word boundary (decoding analogy to flush) */
+size_t
+zfp_stream_align(
+  zfp_stream* stream /* compressed bit stream */
+);
+
+/* rewind bit stream to beginning for compression or decompression */
+void
+zfp_stream_rewind(
+  zfp_stream* stream /* compressed bit stream */
+);
+
+/* low-level API: encoder -------------------------------------------------- */
+
+/*
+The functions below all compress either a complete contiguous d-dimensional
+block of 4^d scalars or a complete or partial block assembled from a strided
+array.  In the latter case, p points to the first scalar; (nx, ny, nz) specify
+the size of the block, with 1 <= nx, ny, nz <= 4; and (sx, sy, sz) specify the
+strides, i.e. the number of scalars to advance to get to the next scalar along
+each dimension.  The functions return the number of bits of compressed storage
+needed for the compressed block.
+*/
+
+/* encode 1D contiguous block of 4 values */
+uint zfp_encode_block_int32_1(zfp_stream* stream, const int32* block);
+uint zfp_encode_block_int64_1(zfp_stream* stream, const int64* block);
+uint zfp_encode_block_float_1(zfp_stream* stream, const float* block);
+uint zfp_encode_block_double_1(zfp_stream* stream, const double* block);
+
+/* encode 1D complete or partial block from strided array */
+uint zfp_encode_block_strided_int32_1(zfp_stream* stream, const int32* p, int sx);
+uint zfp_encode_block_strided_int64_1(zfp_stream* stream, const int64* p, int sx);
+uint zfp_encode_block_strided_float_1(zfp_stream* stream, const float* p, int sx);
+uint zfp_encode_block_strided_double_1(zfp_stream* stream, const double* p, int sx);
+uint zfp_encode_partial_block_strided_int32_1(zfp_stream* stream, const int32* p, uint nx, int sx);
+uint zfp_encode_partial_block_strided_int64_1(zfp_stream* stream, const int64* p, uint nx, int sx);
+uint zfp_encode_partial_block_strided_float_1(zfp_stream* stream, const float* p, uint nx, int sx);
+uint zfp_encode_partial_block_strided_double_1(zfp_stream* stream, const double* p, uint nx, int sx);
+
+/* encode 2D contiguous block of 4x4 values */
+uint zfp_encode_block_int32_2(zfp_stream* stream, const int32* block);
+uint zfp_encode_block_int64_2(zfp_stream* stream, const int64* block);
+uint zfp_encode_block_float_2(zfp_stream* stream, const float* block);
+uint zfp_encode_block_double_2(zfp_stream* stream, const double* block);
+
+/* encode 2D complete or partial block from strided array */
+uint zfp_encode_partial_block_strided_int32_2(zfp_stream* stream, const int32* p, uint nx, uint ny, int sx, int sy);
+uint zfp_encode_partial_block_strided_int64_2(zfp_stream* stream, const int64* p, uint nx, uint ny, int sx, int sy);
+uint zfp_encode_partial_block_strided_float_2(zfp_stream* stream, const float* p, uint nx, uint ny, int sx, int sy);
+uint zfp_encode_partial_block_strided_double_2(zfp_stream* stream, const double* p, uint nx, uint ny, int sx, int sy);
+uint zfp_encode_block_strided_int32_2(zfp_stream* stream, const int32* p, int sx, int sy);
+uint zfp_encode_block_strided_int64_2(zfp_stream* stream, const int64* p, int sx, int sy);
+uint zfp_encode_block_strided_float_2(zfp_stream* stream, const float* p, int sx, int sy);
+uint zfp_encode_block_strided_double_2(zfp_stream* stream, const double* p, int sx, int sy);
+
+/* encode 3D contiguous block of 4x4x4 values */
+uint zfp_encode_block_int32_3(zfp_stream* stream, const int32* block);
+uint zfp_encode_block_int64_3(zfp_stream* stream, const int64* block);
+uint zfp_encode_block_float_3(zfp_stream* stream, const float* block);
+uint zfp_encode_block_double_3(zfp_stream* stream, const double* block);
+
+/* encode 3D complete or partial block from strided array */
+uint zfp_encode_block_strided_int32_3(zfp_stream* stream, const int32* p, int sx, int sy, int sz);
+uint zfp_encode_block_strided_int64_3(zfp_stream* stream, const int64* p, int sx, int sy, int sz);
+uint zfp_encode_block_strided_float_3(zfp_stream* stream, const float* p, int sx, int sy, int sz);
+uint zfp_encode_block_strided_double_3(zfp_stream* stream, const double* p, int sx, int sy, int sz);
+uint zfp_encode_partial_block_strided_int32_3(zfp_stream* stream, const int32* p, uint nx, uint ny, uint nz, int sx, int sy, int sz);
+uint zfp_encode_partial_block_strided_int64_3(zfp_stream* stream, const int64* p, uint nx, uint ny, uint nz, int sx, int sy, int sz);
+uint zfp_encode_partial_block_strided_float_3(zfp_stream* stream, const float* p, uint nx, uint ny, uint nz, int sx, int sy, int sz);
+uint zfp_encode_partial_block_strided_double_3(zfp_stream* stream, const double* p, uint nx, uint ny, uint nz, int sx, int sy, int sz);
+
+/* encode 4D contiguous block of 4x4x4x4 values */
+uint zfp_encode_block_int32_4(zfp_stream* stream, const int32* block);
+uint zfp_encode_block_int64_4(zfp_stream* stream, const int64* block);
+uint zfp_encode_block_float_4(zfp_stream* stream, const float* block);
+uint zfp_encode_block_double_4(zfp_stream* stream, const double* block);
+
+/* encode 4D complete or partial block from strided array */
+uint zfp_encode_block_strided_int32_4(zfp_stream* stream, const int32* p, int sx, int sy, int sz, int sw);
+uint zfp_encode_block_strided_int64_4(zfp_stream* stream, const int64* p, int sx, int sy, int sz, int sw);
+uint zfp_encode_block_strided_float_4(zfp_stream* stream, const float* p, int sx, int sy, int sz, int sw);
+uint zfp_encode_block_strided_double_4(zfp_stream* stream, const double* p, int sx, int sy, int sz, int sw);
+uint zfp_encode_partial_block_strided_int32_4(zfp_stream* stream, const int32* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw);
+uint zfp_encode_partial_block_strided_int64_4(zfp_stream* stream, const int64* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw);
+uint zfp_encode_partial_block_strided_float_4(zfp_stream* stream, const float* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw);
+uint zfp_encode_partial_block_strided_double_4(zfp_stream* stream, const double* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw);
+
+/* low-level API: decoder -------------------------------------------------- */
+
+/*
+Each function below decompresses a single block and returns the number of bits
+of compressed storage consumed.  See corresponding encoder functions above for
+further details.
+*/
+
+/* decode 1D contiguous block of 4 values */
+uint zfp_decode_block_int32_1(zfp_stream* stream, int32* block);
+uint zfp_decode_block_int64_1(zfp_stream* stream, int64* block);
+uint zfp_decode_block_float_1(zfp_stream* stream, float* block);
+uint zfp_decode_block_double_1(zfp_stream* stream, double* block);
+
+/* decode 1D complete or partial block from strided array */
+uint zfp_decode_block_strided_int32_1(zfp_stream* stream, int32* p, int sx);
+uint zfp_decode_block_strided_int64_1(zfp_stream* stream, int64* p, int sx);
+uint zfp_decode_block_strided_float_1(zfp_stream* stream, float* p, int sx);
+uint zfp_decode_block_strided_double_1(zfp_stream* stream, double* p, int sx);
+uint zfp_decode_partial_block_strided_int32_1(zfp_stream* stream, int32* p, uint nx, int sx);
+uint zfp_decode_partial_block_strided_int64_1(zfp_stream* stream, int64* p, uint nx, int sx);
+uint zfp_decode_partial_block_strided_float_1(zfp_stream* stream, float* p, uint nx, int sx);
+uint zfp_decode_partial_block_strided_double_1(zfp_stream* stream, double* p, uint nx, int sx);
+
+/* decode 2D contiguous block of 4x4 values */
+uint zfp_decode_block_int32_2(zfp_stream* stream, int32* block);
+uint zfp_decode_block_int64_2(zfp_stream* stream, int64* block);
+uint zfp_decode_block_float_2(zfp_stream* stream, float* block);
+uint zfp_decode_block_double_2(zfp_stream* stream, double* block);
+
+/* decode 2D complete or partial block from strided array */
+uint zfp_decode_block_strided_int32_2(zfp_stream* stream, int32* p, int sx, int sy);
+uint zfp_decode_block_strided_int64_2(zfp_stream* stream, int64* p, int sx, int sy);
+uint zfp_decode_block_strided_float_2(zfp_stream* stream, float* p, int sx, int sy);
+uint zfp_decode_block_strided_double_2(zfp_stream* stream, double* p, int sx, int sy);
+uint zfp_decode_partial_block_strided_int32_2(zfp_stream* stream, int32* p, uint nx, uint ny, int sx, int sy);
+uint zfp_decode_partial_block_strided_int64_2(zfp_stream* stream, int64* p, uint nx, uint ny, int sx, int sy);
+uint zfp_decode_partial_block_strided_float_2(zfp_stream* stream, float* p, uint nx, uint ny, int sx, int sy);
+uint zfp_decode_partial_block_strided_double_2(zfp_stream* stream, double* p, uint nx, uint ny, int sx, int sy);
+
+/* decode 3D contiguous block of 4x4x4 values */
+uint zfp_decode_block_int32_3(zfp_stream* stream, int32* block);
+uint zfp_decode_block_int64_3(zfp_stream* stream, int64* block);
+uint zfp_decode_block_float_3(zfp_stream* stream, float* block);
+uint zfp_decode_block_double_3(zfp_stream* stream, double* block);
+
+/* decode 3D complete or partial block from strided array */
+uint zfp_decode_block_strided_int32_3(zfp_stream* stream, int32* p, int sx, int sy, int sz);
+uint zfp_decode_block_strided_int64_3(zfp_stream* stream, int64* p, int sx, int sy, int sz);
+uint zfp_decode_block_strided_float_3(zfp_stream* stream, float* p, int sx, int sy, int sz);
+uint zfp_decode_block_strided_double_3(zfp_stream* stream, double* p, int sx, int sy, int sz);
+uint zfp_decode_partial_block_strided_int32_3(zfp_stream* stream, int32* p, uint nx, uint ny, uint nz, int sx, int sy, int sz);
+uint zfp_decode_partial_block_strided_int64_3(zfp_stream* stream, int64* p, uint nx, uint ny, uint nz, int sx, int sy, int sz);
+uint zfp_decode_partial_block_strided_float_3(zfp_stream* stream, float* p, uint nx, uint ny, uint nz, int sx, int sy, int sz);
+uint zfp_decode_partial_block_strided_double_3(zfp_stream* stream, double* p, uint nx, uint ny, uint nz, int sx, int sy, int sz);
+
+/* decode 4D contiguous block of 4x4x4x4 values */
+uint zfp_decode_block_int32_4(zfp_stream* stream, int32* block);
+uint zfp_decode_block_int64_4(zfp_stream* stream, int64* block);
+uint zfp_decode_block_float_4(zfp_stream* stream, float* block);
+uint zfp_decode_block_double_4(zfp_stream* stream, double* block);
+
+/* decode 4D complete or partial block from strided array */
+uint zfp_decode_block_strided_int32_4(zfp_stream* stream, int32* p, int sx, int sy, int sz, int sw);
+uint zfp_decode_block_strided_int64_4(zfp_stream* stream, int64* p, int sx, int sy, int sz, int sw);
+uint zfp_decode_block_strided_float_4(zfp_stream* stream, float* p, int sx, int sy, int sz, int sw);
+uint zfp_decode_block_strided_double_4(zfp_stream* stream, double* p, int sx, int sy, int sz, int sw);
+uint zfp_decode_partial_block_strided_int32_4(zfp_stream* stream, int32* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw);
+uint zfp_decode_partial_block_strided_int64_4(zfp_stream* stream, int64* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw);
+uint zfp_decode_partial_block_strided_float_4(zfp_stream* stream, float* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw);
+uint zfp_decode_partial_block_strided_double_4(zfp_stream* stream, double* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw);
+
+/* low-level API: utility functions ---------------------------------------- */
+
+/* convert dims-dimensional contiguous block to 32-bit integer type */
+void zfp_promote_int8_to_int32(int32* oblock, const int8* iblock, uint dims);
+void zfp_promote_uint8_to_int32(int32* oblock, const uint8* iblock, uint dims);
+void zfp_promote_int16_to_int32(int32* oblock, const int16* iblock, uint dims);
+void zfp_promote_uint16_to_int32(int32* oblock, const uint16* iblock, uint dims);
+
+/* convert dims-dimensional contiguous block from 32-bit integer type */
+void zfp_demote_int32_to_int8(int8* oblock, const int32* iblock, uint dims);
+void zfp_demote_int32_to_uint8(uint8* oblock, const int32* iblock, uint dims);
+void zfp_demote_int32_to_int16(int16* oblock, const int32* iblock, uint dims);
+void zfp_demote_int32_to_uint16(uint16* oblock, const int32* iblock, uint dims);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/zfp/include/zfp/macros.h b/zfp/include/zfp/macros.h
new file mode 100644
index 0000000000000000000000000000000000000000..be3655c701356db28da52af3ee63bf2a13615b7e
--- /dev/null
+++ b/zfp/include/zfp/macros.h
@@ -0,0 +1,7 @@
+#ifndef ZFP_MACROS_H
+#define ZFP_MACROS_H
+
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
+
+#endif
diff --git a/zfp/include/zfp/system.h b/zfp/include/zfp/system.h
new file mode 100644
index 0000000000000000000000000000000000000000..5394196482551e342c9f673d2f398469a487cd59
--- /dev/null
+++ b/zfp/include/zfp/system.h
@@ -0,0 +1,47 @@
+#ifndef ZFP_SYSTEM_H
+#define ZFP_SYSTEM_H
+
+#if __STDC_VERSION__ >= 199901L
+  #define restrict_ restrict
+#else
+  #define restrict_
+#endif
+
+/* macros for exporting and importing symbols */
+#ifdef _MSC_VER
+  #define export_ __declspec(dllexport)
+  /* export (import) symbols when ZFP_SOURCE is (is not) defined */
+  #ifdef ZFP_SOURCE
+    #ifdef __cplusplus
+      #define extern_ extern "C" __declspec(dllexport)
+    #else
+      #define extern_ extern     __declspec(dllexport)
+    #endif
+  #else
+    #ifdef __cplusplus
+      #define extern_ extern "C" __declspec(dllimport)
+    #else
+      #define extern_ extern     __declspec(dllimport)
+    #endif
+  #endif
+#else /* !_MSC_VER */
+  #define export_
+  #ifdef __cplusplus
+    #define extern_ extern "C"
+  #else
+    #define extern_ extern
+  #endif
+#endif
+
+#ifdef __GNUC__
+  /* L1 cache line size for alignment purposes */
+  #ifndef ZFP_CACHE_LINE_SIZE
+    #define ZFP_CACHE_LINE_SIZE 0x100
+  #endif
+  #define align_(n) __attribute__((aligned(n)))
+  #define cache_align_(x) x align_(ZFP_CACHE_LINE_SIZE)
+#else
+  #define cache_align_(x) x
+#endif
+
+#endif
diff --git a/zfp/include/zfp/types.h b/zfp/include/zfp/types.h
new file mode 100644
index 0000000000000000000000000000000000000000..b501ca2932d958023d75faf32e994593114b41de
--- /dev/null
+++ b/zfp/include/zfp/types.h
@@ -0,0 +1,74 @@
+#ifndef ZFP_TYPES_H
+#define ZFP_TYPES_H
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+
+#if __STDC_VERSION__ >= 199901L
+  /* C99: use standard integer types */
+  #include <stdint.h>
+  #define INT64C(x) INT64_C(x)
+  #define UINT64C(x) UINT64_C(x)
+  typedef int8_t int8;
+  typedef uint8_t uint8;
+  typedef int16_t int16;
+  typedef uint16_t uint16;
+  typedef int32_t int32;
+  typedef uint32_t uint32;
+  typedef int64_t int64;
+  typedef uint64_t uint64;
+#else
+  /* C89: assume common integer types */
+  typedef signed char int8;
+  typedef unsigned char uint8;
+  typedef signed short int16;
+  typedef unsigned short uint16;
+
+  /* assume 32-bit integers (LP64, LLP64) */
+  typedef signed int int32;
+  typedef unsigned int uint32;
+
+  /* determine 64-bit data model */
+  #if defined(_WIN32) || defined(_WIN64)
+    /* assume ILP32 or LLP64 (MSVC, MinGW) */
+    #define ZFP_LLP64 1
+  #else
+    /* assume LP64 (Linux, macOS, ...) */
+    #define ZFP_LP64 1
+  #endif
+
+  /* concatenation for literal suffixes */
+  #define _zfp_cat_(x, y) x ## y
+  #define _zfp_cat(x, y) _zfp_cat_(x, y)
+
+  /* signed 64-bit integers */
+  #if defined(ZFP_INT64) && defined(ZFP_INT64_SUFFIX)
+    #define INT64C(x) _zfp_cat(x, ZFP_INT64_SUFFIX)
+    typedef ZFP_INT64 int64;
+  #elif ZFP_LP64
+    #define INT64C(x) x ## l
+    typedef signed long int64;
+  #elif ZFP_LLP64
+    #define INT64C(x) x ## ll
+    typedef signed long long int64;
+  #else
+    #error "unknown 64-bit signed integer type"
+  #endif
+
+  /* unsigned 64-bit integers */
+  #if defined(ZFP_UINT64) && defined(ZFP_UINT64_SUFFIX)
+    #define UINT64C(x) _zfp_cat(x, ZFP_UINT64_SUFFIX)
+    typedef ZFP_UINT64 uint64;
+  #elif ZFP_LP64
+    #define UINT64C(x) x ## ul
+    typedef unsigned long uint64;
+  #elif ZFP_LLP64
+    #define UINT64C(x) x ## ull
+    typedef unsigned long long uint64;
+  #else
+    #error "unknown 64-bit unsigned integer type"
+  #endif
+#endif
+
+#endif
diff --git a/zfp/src/CMakeLists.txt b/zfp/src/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4119245c0482c94a7fd4be15e76bfaf3266ffe26
--- /dev/null
+++ b/zfp/src/CMakeLists.txt
@@ -0,0 +1,63 @@
+if(ZFP_WITH_CUDA)
+  SET(CMAKE_CXX_FLAGS_PREVIOUS ${CMAKE_CXX_FLAGS})
+  SET(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -fPIC" )
+
+  add_subdirectory(cuda_zfp)
+  cuda_wrap_srcs(zfp OBJ zfp_cuda_backend_obj cuda_zfp/cuZFP.cu)
+  SET(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS_PREVIOUS})
+  add_definitions(-DZFP_WITH_CUDA)
+endif()
+
+
+set(zfp_source
+  zfp.c
+  bitstream.c
+  traitsf.h traitsd.h block1.h block2.h block3.h block4.h
+  encode1f.c encode1d.c encode1i.c encode1l.c
+  decode1f.c decode1d.c decode1i.c decode1l.c
+  encode2f.c encode2d.c encode2i.c encode2l.c
+  decode2f.c decode2d.c decode2i.c decode2l.c
+  encode3f.c encode3d.c encode3i.c encode3l.c
+  decode3f.c decode3d.c decode3i.c decode3l.c
+  encode4f.c encode4d.c encode4i.c encode4l.c
+  decode4f.c decode4d.c decode4i.c decode4l.c)
+
+add_library(zfp ${zfp_source}
+                ${zfp_cuda_backend_obj})
+add_library(zfp::zfp ALIAS zfp)
+
+if(ZFP_WITH_OPENMP)
+  target_compile_options(zfp PRIVATE ${OpenMP_C_FLAGS})
+  target_link_libraries(zfp PRIVATE ${OpenMP_C_LIBRARIES})
+endif()
+
+if(HAVE_LIBM_MATH)
+  target_link_libraries(zfp PRIVATE m)
+endif()
+
+if(WIN32)
+  # Define ZFP_SOURCE when compiling libzfp to export symbols to Windows DLL
+  list(APPEND zfp_defs ZFP_SOURCE)
+endif()
+
+if(ZFP_WITH_CUDA)
+  target_link_libraries(zfp PRIVATE ${CUDA_CUDART_LIBRARY} stdc++)
+endif()
+
+target_compile_definitions(zfp PRIVATE ${zfp_defs})
+
+target_include_directories(zfp
+  PUBLIC
+    $<BUILD_INTERFACE:${ZFP_SOURCE_DIR}/include>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
+  INTERFACE
+    $<BUILD_INTERFACE:${ZFP_SOURCE_DIR}/array>)
+
+set_property(TARGET zfp PROPERTY VERSION ${ZFP_VERSION})
+set_property(TARGET zfp PROPERTY SOVERSION ${ZFP_VERSION_MAJOR})
+set_property(TARGET zfp PROPERTY OUTPUT_NAME ${ZFP_LIBRARY_PREFIX}zfp)
+
+install(TARGETS zfp EXPORT zfp-targets
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
diff --git a/zfp/src/Makefile b/zfp/src/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..e4347096d196ab851958b521afdb699be475af16
--- /dev/null
+++ b/zfp/src/Makefile
@@ -0,0 +1,31 @@
+#include ../Config
+include ../../Make_include
+
+CFLAGS += -std=c99 -I../include
+
+LIBDIR = ../lib
+TARGETS = $(LIBDIR)/libzfp.a $(LIBDIR)/libzfp.so
+OBJECTS = bitstream.o decode1i.o decode1l.o decode1f.o decode1d.o encode1i.o encode1l.o encode1f.o encode1d.o decode2i.o decode2l.o decode2f.o decode2d.o encode2i.o encode2l.o encode2f.o encode2d.o decode3i.o decode3l.o decode3f.o decode3d.o encode3i.o encode3l.o encode3f.o encode3d.o decode4i.o decode4l.o decode4f.o decode4d.o encode4i.o encode4l.o encode4f.o encode4d.o zfp.o
+
+static: $(LIBDIR)/libzfp.a
+
+shared: $(LIBDIR)/libzfp.so
+
+clean:
+	rm -f $(OBJECTS)
+
+realclean:
+	rm -f $(TARGETS) $(OBJECTS)
+
+$(LIBDIR)/libzfp.a: $(OBJECTS)
+	mkdir -p $(LIBDIR)
+	ar rc $@ $^
+
+#	rm -f $@
+#
+$(LIBDIR)/libzfp.so: $(OBJECTS)
+	mkdir -p $(LIBDIR)
+	$(CC) $(CFLAGS) -shared $^ -o $@
+
+#.c.o:
+#	$(CC) $(CFLAGS) -c $<
diff --git a/zfp/src/bitstream.c b/zfp/src/bitstream.c
new file mode 100644
index 0000000000000000000000000000000000000000..05094c6d31befd5947e7b49164cc7eeb315fa95c
--- /dev/null
+++ b/zfp/src/bitstream.c
@@ -0,0 +1,4 @@
+#include "bitstream.h"
+#include "inline/bitstream.c"
+
+export_ const size_t stream_word_bits = wsize;
diff --git a/zfp/src/block1.h b/zfp/src/block1.h
new file mode 100644
index 0000000000000000000000000000000000000000..035d9c9523d63ff46b76dd648bf99b5503781016
--- /dev/null
+++ b/zfp/src/block1.h
@@ -0,0 +1 @@
+#define DIMS 1
diff --git a/zfp/src/block2.h b/zfp/src/block2.h
new file mode 100644
index 0000000000000000000000000000000000000000..e87ab62995ad655de67ed64a1e466e7622a3a758
--- /dev/null
+++ b/zfp/src/block2.h
@@ -0,0 +1 @@
+#define DIMS 2
diff --git a/zfp/src/block3.h b/zfp/src/block3.h
new file mode 100644
index 0000000000000000000000000000000000000000..a683568673a3f619d23ece9a3abd92b965a6aea6
--- /dev/null
+++ b/zfp/src/block3.h
@@ -0,0 +1 @@
+#define DIMS 3
diff --git a/zfp/src/block4.h b/zfp/src/block4.h
new file mode 100644
index 0000000000000000000000000000000000000000..6737fb25eaf4ed27653c2fc737d6d73ee058ffe7
--- /dev/null
+++ b/zfp/src/block4.h
@@ -0,0 +1 @@
+#define DIMS 4
diff --git a/zfp/src/cuda_zfp/CMakeLists.txt b/zfp/src/cuda_zfp/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2fe402fa060f8fba83064356715fb1725fb9416c
--- /dev/null
+++ b/zfp/src/cuda_zfp/CMakeLists.txt
@@ -0,0 +1,25 @@
+###############################################################################
+#
+#  file: src/cuZFP/CMakeLists.txt 
+#
+###############################################################################
+
+set(cuZFP_sources
+    cuZFP.cu         # main entry point
+    decode.cuh
+    decode1.cuh
+    decode2.cuh
+    decode3.cuh
+    encode.cuh
+    encode1.cuh
+    encode2.cuh
+    encode3.cuh
+    pointers.cuh
+    type_info.cuh)
+
+set(cuZFP_headers
+    constant_setup.cuh
+    shared.h
+    cuZFP.h
+    ErrorCheck.h)
+
diff --git a/zfp/src/cuda_zfp/ErrorCheck.h b/zfp/src/cuda_zfp/ErrorCheck.h
new file mode 100644
index 0000000000000000000000000000000000000000..90a7ac47f7609034429ef1462ff6c065a4b32027
--- /dev/null
+++ b/zfp/src/cuda_zfp/ErrorCheck.h
@@ -0,0 +1,35 @@
+#ifndef ERRORCHECK_H
+#define ERRORCHECK_H
+#include <iostream>
+#include <string>
+#include <sstream>
+
+using std::stringstream;
+class ErrorCheck
+{
+public:
+  ErrorCheck()
+  {
+
+  }
+
+  void chk(std::string msg)
+  {
+    error = cudaGetLastError();
+    if (error != cudaSuccess)
+    {
+      std::cout << msg << " : " << error;
+      std::cout << " " << cudaGetErrorString(error) << std::endl;
+    }
+  }
+
+  void chk()
+  {
+    chk(str.str());
+    str.str("");
+  }
+  cudaError error;
+  stringstream str;
+};
+
+#endif // ERRORCHECK_H
diff --git a/zfp/src/cuda_zfp/constant_setup.cuh b/zfp/src/cuda_zfp/constant_setup.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1c1221adda9bf2edf2dfc1b1a849682b6d20f29a
--- /dev/null
+++ b/zfp/src/cuda_zfp/constant_setup.cuh
@@ -0,0 +1,39 @@
+#ifndef cuZFP_CONSTANT_SETUP
+#define cuZFP_CONSTANT_SETUP
+
+#include "constants.h"
+#include "shared.h"
+#include "ErrorCheck.h"
+#include "type_info.cuh"
+
+namespace cuZFP {
+
+class ConstantSetup
+{
+public:
+  static void setup_3d()
+  { 
+    ErrorCheck ec;
+    cudaMemcpyToSymbol(c_perm, perm_3d, sizeof(unsigned char) * 64, 0); 
+    ec.chk("setupConst: c_perm");
+  }
+
+  static void setup_2d()
+  {
+    ErrorCheck ec;
+    cudaMemcpyToSymbol(c_perm_2, perm_2, sizeof(unsigned char) * 16, 0); 
+    ec.chk("setupConst: c_perm_2");
+  }
+
+  static void setup_1d()
+  {
+    ErrorCheck ec;
+    cudaMemcpyToSymbol(c_perm_1, perm_1, sizeof(unsigned char) * 4, 0); 
+    ec.chk("setupConst: c_perm_1");
+  }
+};
+
+
+} //namespace 
+
+#endif
diff --git a/zfp/src/cuda_zfp/constants.h b/zfp/src/cuda_zfp/constants.h
new file mode 100644
index 0000000000000000000000000000000000000000..423ac91cc60018ce7b4e810f7984c2208e33171e
--- /dev/null
+++ b/zfp/src/cuda_zfp/constants.h
@@ -0,0 +1,136 @@
+#ifndef cuZFP_CONSTANTS_H
+#define cuZFP_CONSTANTS_H
+
+namespace cuZFP {
+
+#define index_3d(x, y, z) ((x) + 4 * ((y) + 4 * (z)))
+
+static const unsigned char
+perm_3d[64] = {
+	index_3d(0, 0, 0), //  0 : 0
+
+	index_3d(1, 0, 0), //  1 : 1
+	index_3d(0, 1, 0), //  2 : 1
+	index_3d(0, 0, 1), //  3 : 1
+
+	index_3d(0, 1, 1), //  4 : 2
+	index_3d(1, 0, 1), //  5 : 2
+	index_3d(1, 1, 0), //  6 : 2
+
+	index_3d(2, 0, 0), //  7 : 2
+	index_3d(0, 2, 0), //  8 : 2
+	index_3d(0, 0, 2), //  9 : 2
+
+	index_3d(1, 1, 1), // 10 : 3
+
+	index_3d(2, 1, 0), // 11 : 3
+	index_3d(2, 0, 1), // 12 : 3
+	index_3d(0, 2, 1), // 13 : 3
+	index_3d(1, 2, 0), // 14 : 3
+	index_3d(1, 0, 2), // 15 : 3
+	index_3d(0, 1, 2), // 16 : 3
+
+	index_3d(3, 0, 0), // 17 : 3
+	index_3d(0, 3, 0), // 18 : 3
+	index_3d(0, 0, 3), // 19 : 3
+
+	index_3d(2, 1, 1), // 20 : 4
+	index_3d(1, 2, 1), // 21 : 4
+	index_3d(1, 1, 2), // 22 : 4
+
+	index_3d(0, 2, 2), // 23 : 4
+	index_3d(2, 0, 2), // 24 : 4
+	index_3d(2, 2, 0), // 25 : 4
+
+	index_3d(3, 1, 0), // 26 : 4
+	index_3d(3, 0, 1), // 27 : 4
+	index_3d(0, 3, 1), // 28 : 4
+	index_3d(1, 3, 0), // 29 : 4
+	index_3d(1, 0, 3), // 30 : 4
+	index_3d(0, 1, 3), // 31 : 4
+
+	index_3d(1, 2, 2), // 32 : 5
+	index_3d(2, 1, 2), // 33 : 5
+	index_3d(2, 2, 1), // 34 : 5
+
+	index_3d(3, 1, 1), // 35 : 5
+	index_3d(1, 3, 1), // 36 : 5
+	index_3d(1, 1, 3), // 37 : 5
+
+	index_3d(3, 2, 0), // 38 : 5
+	index_3d(3, 0, 2), // 39 : 5
+	index_3d(0, 3, 2), // 40 : 5
+	index_3d(2, 3, 0), // 41 : 5
+	index_3d(2, 0, 3), // 42 : 5
+	index_3d(0, 2, 3), // 43 : 5
+
+	index_3d(2, 2, 2), // 44 : 6
+
+	index_3d(3, 2, 1), // 45 : 6
+	index_3d(3, 1, 2), // 46 : 6
+	index_3d(1, 3, 2), // 47 : 6
+	index_3d(2, 3, 1), // 48 : 6
+	index_3d(2, 1, 3), // 49 : 6
+	index_3d(1, 2, 3), // 50 : 6
+
+	index_3d(0, 3, 3), // 51 : 6
+	index_3d(3, 0, 3), // 52 : 6
+	index_3d(3, 3, 0), // 53 : 6
+
+	index_3d(3, 2, 2), // 54 : 7
+	index_3d(2, 3, 2), // 55 : 7
+	index_3d(2, 2, 3), // 56 : 7
+
+	index_3d(1, 3, 3), // 57 : 7
+	index_3d(3, 1, 3), // 58 : 7
+	index_3d(3, 3, 1), // 59 : 7
+
+	index_3d(2, 3, 3), // 60 : 8
+	index_3d(3, 2, 3), // 61 : 8
+	index_3d(3, 3, 2), // 62 : 8
+
+	index_3d(3, 3, 3), // 63 : 9
+};
+
+#undef index_3d
+
+static const unsigned char perm_1[4] = 
+{
+  0, 1, 2, 3
+};
+
+#define index(i, j) ((i) + 4 * (j))
+
+/* order coefficients (i, j) by i + j, then i^2 + j^2 */
+static const unsigned char perm_2[16] = {
+  index(0, 0), /*  0 : 0 */
+
+  index(1, 0), /*  1 : 1 */
+  index(0, 1), /*  2 : 1 */
+
+  index(1, 1), /*  3 : 2 */
+
+  index(2, 0), /*  4 : 2 */
+  index(0, 2), /*  5 : 2 */
+
+  index(2, 1), /*  6 : 3 */
+  index(1, 2), /*  7 : 3 */
+
+  index(3, 0), /*  8 : 3 */
+  index(0, 3), /*  9 : 3 */
+
+  index(2, 2), /* 10 : 4 */
+
+  index(3, 1), /* 11 : 4 */
+  index(1, 3), /* 12 : 4 */
+
+  index(3, 2), /* 13 : 5 */
+  index(2, 3), /* 14 : 5 */
+
+  index(3, 3), /* 15 : 6 */
+};
+
+#undef index
+
+} // namespace cuZFP
+#endif
diff --git a/zfp/src/cuda_zfp/cuZFP.cu b/zfp/src/cuda_zfp/cuZFP.cu
new file mode 100644
index 0000000000000000000000000000000000000000..46815a6ac0a25f2828bba07f1dfb700d39777b94
--- /dev/null
+++ b/zfp/src/cuda_zfp/cuZFP.cu
@@ -0,0 +1,447 @@
+#include <assert.h>
+
+#include "cuZFP.h"
+
+#include "encode1.cuh"
+#include "encode2.cuh"
+#include "encode3.cuh"
+
+#include "decode1.cuh"
+#include "decode2.cuh"
+#include "decode3.cuh"
+
+#include "ErrorCheck.h"
+
+#include "constant_setup.cuh"
+#include "pointers.cuh"
+#include "type_info.cuh"
+#include <iostream>
+#include <assert.h>
+
+// we need to know about bitstream, but we don't 
+// want duplicate symbols.
+#ifndef inline_
+  #define inline_ inline
+#endif
+
+#include "../inline/bitstream.c"
+namespace internal 
+{ 
+  
+bool is_contigous3d(const uint dims[3], const int3 &stride, long long int &offset)
+{
+  typedef long long int int64;
+  int64 idims[3];
+  idims[0] = dims[0];
+  idims[1] = dims[1];
+  idims[2] = dims[2];
+
+  int64 imin = std::min(stride.x,0) * (idims[0] - 1) + 
+               std::min(stride.y,0) * (idims[1] - 1) + 
+               std::min(stride.z,0) * (idims[2] - 1);
+
+  int64 imax = std::max(stride.x,0) * (idims[0] - 1) + 
+               std::max(stride.y,0) * (idims[1] - 1) + 
+               std::max(stride.z,0) * (idims[2] - 1);
+  offset = imin;
+  int64 ns = idims[0] * idims[1] * idims[2];
+
+  return (imax - imin + 1 == ns);
+}
+
+bool is_contigous2d(const uint dims[3], const int3 &stride, long long int &offset)
+{
+  typedef long long int int64;
+  int64 idims[2];
+  idims[0] = dims[0];
+  idims[1] = dims[1];
+
+  int64 imin = std::min(stride.x,0) * (idims[0] - 1) + 
+               std::min(stride.y,0) * (idims[1] - 1);
+
+  int64  imax = std::max(stride.x,0) * (idims[0] - 1) + 
+                std::max(stride.y,0) * (idims[1] - 1); 
+
+  offset = imin;
+  return (imax - imin + 1) == (idims[0] * idims[1]);
+}
+
+bool is_contigous1d(uint dim, const int &stride, long long int &offset)
+{
+  offset = 0;
+  if(stride < 0) offset = stride * (int(dim) - 1);
+  return std::abs(stride) == 1;
+}
+
+bool is_contigous(const uint dims[3], const int3 &stride, long long int &offset)
+{
+  int d = 0;
+  
+  if(dims[0] != 0) d++;
+  if(dims[1] != 0) d++;
+  if(dims[2] != 0) d++;
+
+  if(d == 3)
+  {
+    return is_contigous3d(dims, stride, offset);
+  }
+  else if(d == 2)
+  {
+   return is_contigous2d(dims, stride, offset);
+  }
+  else
+  {
+    return is_contigous1d(dims[0], stride.x, offset);
+  } 
+
+}
+//
+// encode expects device pointers
+//
+template<typename T>
+size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_stream)
+{
+
+  int d = 0;
+  size_t len = 1;
+  for(int i = 0; i < 3; ++i)
+  {
+    if(dims[i] != 0)
+    {
+      d++;
+      len *= dims[i];
+    }
+  }
+
+  ErrorCheck errors;
+  size_t stream_size = 0;
+  if(d == 1)
+  {
+    int dim = dims[0];
+    int sx = stride.x;
+    cuZFP::ConstantSetup::setup_1d();
+    stream_size = cuZFP::encode1<T>(dim, sx, d_data, d_stream, bits_per_block); 
+  }
+  else if(d == 2)
+  {
+    uint2 ndims = make_uint2(dims[0], dims[1]);
+    int2 s;
+    s.x = stride.x; 
+    s.y = stride.y; 
+    cuZFP::ConstantSetup::setup_2d();
+    stream_size = cuZFP::encode2<T>(ndims, s, d_data, d_stream, bits_per_block); 
+  }
+  else if(d == 3)
+  {
+    int3 s;
+    s.x = stride.x; 
+    s.y = stride.y; 
+    s.z = stride.z; 
+    uint3 ndims = make_uint3(dims[0], dims[1], dims[2]);
+    cuZFP::ConstantSetup::setup_3d();
+    stream_size = cuZFP::encode<T>(ndims, s, d_data, d_stream, bits_per_block); 
+  }
+
+  errors.chk("Encode");
+  
+  return stream_size; 
+}
+
+template<typename T>
+size_t decode(uint ndims[3], int3 stride, int bits_per_block, Word *stream, T *out)
+{
+
+  int d = 0;
+  size_t out_size = 1;
+  size_t stream_bytes = 0;
+  for(int i = 0; i < 3; ++i)
+  {
+    if(ndims[i] != 0)
+    {
+      d++;
+      out_size *= ndims[i];
+    }
+  }
+
+  if(d == 3)
+  {
+    uint3 dims = make_uint3(ndims[0], ndims[1], ndims[2]);
+
+    int3 s;
+    s.x = stride.x; 
+    s.y = stride.y; 
+    s.z = stride.z; 
+
+    cuZFP::ConstantSetup::setup_3d();
+    stream_bytes = cuZFP::decode3<T>(dims, s, stream, out, bits_per_block); 
+  }
+  else if(d == 1)
+  {
+    uint dim = ndims[0];
+    int sx = stride.x;
+
+    cuZFP::ConstantSetup::setup_1d();
+    stream_bytes = cuZFP::decode1<T>(dim, sx, stream, out, bits_per_block); 
+
+  }
+  else if(d == 2)
+  {
+    uint2 dims;
+    dims.x = ndims[0];
+    dims.y = ndims[1];
+
+    int2 s;
+    s.x = stride.x; 
+    s.y = stride.y; 
+
+    cuZFP::ConstantSetup::setup_2d();
+    stream_bytes = cuZFP::decode2<T>(dims, s, stream, out, bits_per_block); 
+  }
+  else std::cerr<<" d ==  "<<d<<" not implemented\n";
+ 
+  return stream_bytes;
+}
+
+Word *setup_device_stream(zfp_stream *stream,const zfp_field *field)
+{
+  bool stream_device = cuZFP::is_gpu_ptr(stream->stream->begin);
+  assert(sizeof(word) == sizeof(Word)); // "CUDA version currently only supports 64bit words");
+
+  if(stream_device)
+  {
+    return (Word*) stream->stream->begin;
+  } 
+
+  Word *d_stream = NULL;
+  // TODO: we we have a real stream we can just ask it how big it is
+  size_t max_size = zfp_stream_maximum_size(stream, field);
+  cudaMalloc(&d_stream, max_size);
+  cudaMemcpy(d_stream, stream->stream->begin, max_size, cudaMemcpyHostToDevice);
+  return d_stream;
+}
+
+void * offset_void(zfp_type type, void *ptr, long long int offset)
+{
+  void * offset_ptr = NULL;
+  if(type == zfp_type_float)
+  {
+    float* data = (float*) ptr;
+    offset_ptr = (void*)(&data[offset]);
+  }
+  else if(type == zfp_type_double)
+  {
+    double* data = (double*) ptr;
+    offset_ptr = (void*)(&data[offset]);
+  }
+  else if(type == zfp_type_int32)
+  {
+    int * data = (int*) ptr;
+    offset_ptr = (void*)(&data[offset]);
+  }
+  else if(type == zfp_type_int64)
+  {
+    long long int * data = (long long int*) ptr;
+    offset_ptr = (void*)(&data[offset]);
+  }
+  return offset_ptr;
+}
+
+void *setup_device_field(const zfp_field *field, const int3 &stride, long long int &offset)
+{
+  bool field_device = cuZFP::is_gpu_ptr(field->data);
+
+  if(field_device)
+  {
+    offset = 0;
+    return field->data;
+  }
+  
+  uint dims[3];
+  dims[0] = field->nx;
+  dims[1] = field->ny;
+  dims[2] = field->nz;
+
+  size_t type_size = zfp_type_size(field->type);
+
+  size_t field_size = 1;
+  for(int i = 0; i < 3; ++i)
+  {
+    if(dims[i] != 0)
+    {
+      field_size *= dims[i];
+    }
+  }
+
+  bool contig = internal::is_contigous(dims, stride, offset);
+  
+  void * host_ptr = offset_void(field->type, field->data, offset);;
+
+  void *d_data = NULL;
+  if(contig)
+  {
+    size_t field_bytes = type_size * field_size;
+    cudaMalloc(&d_data, field_bytes);
+
+    cudaMemcpy(d_data, host_ptr, field_bytes, cudaMemcpyHostToDevice);
+  }
+  return offset_void(field->type, d_data, -offset);
+}
+
+void cleanup_device_ptr(void *orig_ptr, void *d_ptr, size_t bytes, long long int offset, zfp_type type)
+{
+  bool device = cuZFP::is_gpu_ptr(orig_ptr);
+  if(device)
+  {
+    return;
+  }
+  // from whence it came
+  void *d_offset_ptr = offset_void(type, d_ptr, offset);
+  void *h_offset_ptr = offset_void(type, orig_ptr, offset);
+
+  if(bytes > 0)
+  {
+    cudaMemcpy(h_offset_ptr, d_offset_ptr, bytes, cudaMemcpyDeviceToHost);
+  }
+
+  cudaFree(d_offset_ptr);
+}
+
+} // namespace internal
+
+size_t
+cuda_compress(zfp_stream *stream, const zfp_field *field)
+{
+  uint dims[3];
+  dims[0] = field->nx;
+  dims[1] = field->ny;
+  dims[2] = field->nz;
+
+  int3 stride;  
+  stride.x = field->sx ? field->sx : 1;
+  stride.y = field->sy ? field->sy : field->nx;
+  stride.z = field->sz ? field->sz : field->nx * field->ny;
+  
+  size_t stream_bytes = 0;
+  long long int offset = 0; 
+  void *d_data = internal::setup_device_field(field, stride, offset);
+
+  if(d_data == NULL)
+  {
+    // null means the array is non-contiguous host mem which is not supported
+    return 0;
+  }
+
+  Word *d_stream = internal::setup_device_stream(stream, field);
+
+  if(field->type == zfp_type_float)
+  {
+    float* data = (float*) d_data;
+    stream_bytes = internal::encode<float>(dims, stride, (int)stream->maxbits, data, d_stream);
+  }
+  else if(field->type == zfp_type_double)
+  {
+    double* data = (double*) d_data;
+    stream_bytes = internal::encode<double>(dims, stride, (int)stream->maxbits, data, d_stream);
+  }
+  else if(field->type == zfp_type_int32)
+  {
+    int * data = (int*) d_data;
+    stream_bytes = internal::encode<int>(dims, stride, (int)stream->maxbits, data, d_stream);
+  }
+  else if(field->type == zfp_type_int64)
+  {
+    long long int * data = (long long int*) d_data;
+    stream_bytes = internal::encode<long long int>(dims, stride, (int)stream->maxbits, data, d_stream);
+  }
+
+  internal::cleanup_device_ptr(stream->stream->begin, d_stream, stream_bytes, 0, field->type);
+  internal::cleanup_device_ptr(field->data, d_data, 0, offset, field->type);
+
+  // zfp wants to flush the stream.
+  // set bits to wsize because we already did that.
+  size_t compressed_size = stream_bytes / sizeof(Word);
+  stream->stream->bits = wsize;
+  // set stream pointer to end of stream
+  stream->stream->ptr = stream->stream->begin + compressed_size;
+
+  return stream_bytes;
+}
+  
+void 
+cuda_decompress(zfp_stream *stream, zfp_field *field)
+{
+  uint dims[3];
+  dims[0] = field->nx;
+  dims[1] = field->ny;
+  dims[2] = field->nz;
+   
+  int3 stride;  
+  stride.x = field->sx ? field->sx : 1;
+  stride.y = field->sy ? field->sy : field->nx;
+  stride.z = field->sz ? field->sz : field->nx * field->ny;
+
+  size_t decoded_bytes = 0;
+  long long int offset = 0;
+  void *d_data = internal::setup_device_field(field, stride, offset);
+  
+  if(d_data == NULL)
+  {
+    // null means the array is non-contiguous host mem which is not supported
+    return;
+  }
+
+  Word *d_stream = internal::setup_device_stream(stream, field);
+
+  if(field->type == zfp_type_float)
+  {
+    float *data = (float*) d_data;
+    decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data);
+    d_data = (void*) data;
+  }
+  else if(field->type == zfp_type_double)
+  {
+    double *data = (double*) d_data;
+    decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data);
+    d_data = (void*) data;
+  }
+  else if(field->type == zfp_type_int32)
+  {
+    int *data = (int*) d_data;
+    decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data);
+    d_data = (void*) data;
+  }
+  else if(field->type == zfp_type_int64)
+  {
+    long long int *data = (long long int*) d_data;
+    decoded_bytes = internal::decode(dims, stride, (int)stream->maxbits, d_stream, data);
+    d_data = (void*) data;
+  }
+  else
+  {
+    std::cerr<<"Cannot decompress: type unknown\n";
+  }
+
+   
+  size_t type_size = zfp_type_size(field->type);
+
+  size_t field_size = 1;
+  for(int i = 0; i < 3; ++i)
+  {
+    if(dims[i] != 0)
+    {
+      field_size *= dims[i];
+    }
+  }
+  
+  size_t bytes = type_size * field_size;
+  internal::cleanup_device_ptr(stream->stream, d_stream,0, 0, field->type);
+  internal::cleanup_device_ptr(field->data, d_data, bytes, offset, field->type);
+  
+  // this is how zfp determins if this was a success
+  size_t words_read = decoded_bytes / sizeof(Word);
+  stream->stream->bits = wsize;
+  // set stream pointer to end of stream
+  stream->stream->ptr = stream->stream->begin + words_read;
+
+}
+
diff --git a/zfp/src/cuda_zfp/cuZFP.h b/zfp/src/cuda_zfp/cuZFP.h
new file mode 100644
index 0000000000000000000000000000000000000000..c88fe1e4aa6383c048bc7306a020590af2ea4749
--- /dev/null
+++ b/zfp/src/cuda_zfp/cuZFP.h
@@ -0,0 +1,15 @@
+#ifndef cuZFP_h
+#define cuZFP_h
+
+#include "zfp.h"
+
+/* C-linkage entry points into the CUDA backend of zfp.  Both take a
+ * configured zfp_stream and a zfp_field describing the array data. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+  /* Compress 'field' into the stream; returns a byte count (see .cu file). */
+  size_t cuda_compress(zfp_stream *stream, const zfp_field *field);
+  /* Decompress the stream into field->data. */
+  void cuda_decompress(zfp_stream *stream, zfp_field *field);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/zfp/src/cuda_zfp/decode.cuh b/zfp/src/cuda_zfp/decode.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..656cf0b36196a3aca1e2654d65976b9576e4eef6
--- /dev/null
+++ b/zfp/src/cuda_zfp/decode.cuh
@@ -0,0 +1,264 @@
+#ifndef CU_ZFP_DECODE_CUH
+#define CU_ZFP_DECODE_CUH
+
+#include "shared.h"
+
+namespace cuZFP
+{
+
+/* map negabinary unsigned integer to two's complement signed integer
+   (inverse of int2uint; e.g. negabinary 10 -> -2) */
+inline __device__
+long long int uint2int(unsigned long long int x)
+{
+	return (x ^0xaaaaaaaaaaaaaaaaull) - 0xaaaaaaaaaaaaaaaaull;
+}
+
+/* 32-bit overload of the negabinary -> two's complement conversion */
+inline __device__
+int uint2int(unsigned int x)
+{
+	return (x ^0xaaaaaaaau) - 0xaaaaaaaau;
+}
+
+// Sequential bit reader over the compressed stream.  Each thread owns one
+// BlockReader positioned at the first bit of its zfp block; since every
+// block occupies exactly maxbits bits, that position is block_idx * maxbits.
+template<int block_size>
+class BlockReader
+{
+private:
+  const int m_maxbits;   // fixed encoded size of one block, in bits
+  int m_current_bit;     // bit offset consumed within the current word
+  Word *m_words;         // pointer to the current stream word
+  Word m_buffer;         // current word, shifted so the next bit is bit 0
+  bool m_valid_block;    // false when block_idx is out of range
+  int m_block_idx;       // index of the block being read
+
+  // default construction is not meaningful; keep it private
+  __device__ BlockReader()
+    : m_maxbits(0)
+  {
+  }
+
+public:
+  // Position the reader at the first bit of block 'block_idx'.
+  __device__ BlockReader(Word *b, const int &maxbits, const int &block_idx, const int &num_blocks)
+    :  m_maxbits(maxbits), m_valid_block(true)
+  {
+    if(block_idx >= num_blocks) m_valid_block = false;
+    // word index and in-word bit offset of the block's first bit
+    int word_index = (block_idx * maxbits)  / (sizeof(Word) * 8); 
+    m_words = b + word_index;
+    m_buffer = *m_words;
+    m_current_bit = (block_idx * maxbits) % (sizeof(Word) * 8); 
+
+    // discard bits belonging to the preceding block
+    m_buffer >>= m_current_bit;
+    m_block_idx = block_idx;
+   
+  }
+  // debugging aid: dump the current buffer bits
+  inline __device__
+  void print()
+  {
+    print_bits(m_buffer);
+  }
+
+  // Read one bit and advance, reloading the buffer at word boundaries.
+  inline __device__ 
+  uint read_bit()
+  {
+    uint bit = m_buffer & 1;
+    ++m_current_bit;
+    m_buffer >>= 1;
+    // handle moving into next word
+    if(m_current_bit >= sizeof(Word) * 8) 
+    {
+      m_current_bit = 0;
+      ++m_words;
+      m_buffer = *m_words;
+    }
+    return bit; 
+  }
+
+
+  // Read n_bits (crossing at most one word boundary) into the low bits
+  // of the result.
+  // note this assumes that n_bits is <= 64
+  // NOTE(review): the shifts below use counts derived from n_bits that can
+  // reach the word width (UB in C++ when equal to it) -- confirm callers
+  // keep n_bits strictly below sizeof(Word) * 8 or rely on hardware behavior.
+  inline __device__ 
+  uint64 read_bits(const uint &n_bits)
+  {
+    uint64 bits; 
+    // rem bits will always be positive
+    int rem_bits = sizeof(Word) * 8 - m_current_bit;
+     
+    int first_read = min(rem_bits, n_bits);
+    // first mask 
+    Word mask = ((Word)1<<((first_read)))-1;
+    bits = m_buffer & mask;
+    m_buffer >>= n_bits;
+    m_current_bit += first_read;
+    int next_read = 0;
+    if(n_bits >= rem_bits) 
+    {
+      // request crosses into the next word; reload and read the remainder
+      ++m_words;
+      m_buffer = *m_words;
+      m_current_bit = 0;
+      next_read = n_bits - first_read; 
+    }
+   
+    // this is basically a no-op when first read contained 
+    // all the bits. TODO: if we have aligned reads, this could 
+    // be a conditional without divergence
+    mask = ((Word)1<<((next_read)))-1;
+    bits += (m_buffer & mask) << first_read;
+    m_buffer >>= next_read;
+    m_current_bit += next_read; 
+    return bits;
+  }
+
+}; // block reader
+
+// Decode Size negabinary coefficients from the stream into 'data',
+// processing bit planes from most to least significant and undoing the
+// unary run-length (group-test) coding, mirroring the serial zfp decoder.
+template<typename Scalar, int Size, typename UInt>
+inline __device__
+void decode_ints(BlockReader<Size> &reader, uint &max_bits, UInt *data)
+{
+  const int intprec = get_precision<Scalar>();
+  memset(data, 0, sizeof(UInt) * Size);
+  uint64 x; 
+  // maxprec = 64;
+  const uint kmin = 0; //= intprec > maxprec ? intprec - maxprec : 0;
+  int bits = max_bits;
+  // k: current bit plane (msb first); n: count of coefficients already
+  // known to be significant; 'bits' is the remaining bit budget
+  for (uint k = intprec, n = 0; bits && k-- > kmin;)
+  {
+    // read bit plane
+    uint m = MIN(n, bits);
+    bits -= m;
+    x = reader.read_bits(m);
+    // unary run-length decode: grow n while group tests signal more values
+    for (; n < Size && bits && (bits--, reader.read_bit()); x += (Word) 1 << n++)
+      for (; n < (Size - 1) && bits && (bits--, !reader.read_bit()); n++);
+    
+    // deposit bit plane
+    #pragma unroll
+    for (int i = 0; x; i++, x >>= 1)
+    {
+      data[i] += (UInt)(x & 1u) << k;
+    }
+  } 
+}
+
+
+// Inverse decorrelating transform, specialized on block size 4^d.
+// Each specialization applies the 4-point inverse lift along every line
+// of the block, one dimension at a time (reverse order of the forward
+// transform).
+template<int BlockSize>
+struct inv_transform;
+
+// 3D (4x4x4) inverse transform
+template<>
+struct inv_transform<64>
+{
+  template<typename Int>
+  __device__ void inv_xform(Int *p)
+  {
+    uint x, y, z;
+    /* transform along z */
+    for (y = 0; y < 4; y++)
+      for (x = 0; x < 4; x++)
+        inv_lift<Int,16>(p + 1 * x + 4 * y);
+    /* transform along y */
+    for (x = 0; x < 4; x++)
+      for (z = 0; z < 4; z++)
+        inv_lift<Int,4>(p + 16 * z + 1 * x);
+    /* transform along x */
+    for (z = 0; z < 4; z++)
+      for (y = 0; y < 4; y++)
+        inv_lift<Int,1>(p + 4 * y + 16 * z); 
+  }
+
+};
+
+// 2D (4x4) inverse transform
+template<>
+struct inv_transform<16>
+{
+  template<typename Int>
+  __device__ void inv_xform(Int *p)
+  {
+
+    /* transform along y */
+    for(int x = 0; x < 4; ++x)
+    {
+      inv_lift<Int,4>(p + 1 * x);
+    }
+    /* transform along x */
+    for(int y = 0; y < 4; ++y)
+    {
+      inv_lift<Int,1>(p + 4 * y);
+    }
+  }
+
+};
+
+// 1D (4-value) inverse transform
+template<>
+struct inv_transform<4>
+{
+  template<typename Int>
+  __device__ void inv_xform(Int *p)
+  {
+    inv_lift<Int,1>(p);
+  }
+
+};
+
+// Decode one zfp block into fblock: read the block-empty flag and shared
+// exponent (floating types only), decode the negabinary coefficients,
+// apply the inverse transform, and dequantize back to Scalar.
+template<typename Scalar, int BlockSize>
+__device__ void zfp_decode(BlockReader<BlockSize> &reader, Scalar *fblock, uint maxbits)
+{
+  typedef typename zfp_traits<Scalar>::UInt UInt;
+  typedef typename zfp_traits<Scalar>::Int Int;
+
+  uint s_cont = 1;
+  //
+  // there is no skip path for integers so just continue
+  //
+  if(!is_int<Scalar>())
+  {
+    s_cont = reader.read_bit();
+  }
+
+  // s_cont == 0 means the block was encoded as all-zero; leave fblock as-is
+  if(s_cont)
+  {
+    uint ebits = get_ebits<Scalar>() + 1;
+
+    // NOTE(review): emax stays uninitialized for integer types; presumably
+    // the dequantize specialization for ints ignores it -- confirm.
+    uint emax;
+    if(!is_int<Scalar>())
+    {
+      // read in the shared exponent
+      emax = reader.read_bits(ebits - 1) - get_ebias<Scalar>();
+    }
+    else
+    {
+      // no exponent bits
+      ebits = 0;
+    }
+
+    // remaining bit budget for the coefficient planes
+	  maxbits -= ebits;
+    
+    UInt ublock[BlockSize];
+
+    decode_ints<Scalar, BlockSize, UInt>(reader, maxbits, ublock);
+
+    // undo the sequency permutation and the negabinary mapping
+    Int iblock[BlockSize];
+    unsigned char *perm = get_perm<BlockSize>();
+#if (CUDART_VERSION < 8000)
+    #pragma unroll 
+#else
+    #pragma unroll BlockSize
+#endif
+    for(int i = 0; i < BlockSize; ++i)
+    {
+		  iblock[perm[i]] = uint2int(ublock[i]);
+    }
+    
+    inv_transform<BlockSize> trans;
+    trans.inv_xform(iblock);
+
+    // scale factor mapping fixed-point integers back to Scalar
+		Scalar inv_w = dequantize<Int, Scalar>(1, emax);
+
+#if (CUDART_VERSION < 8000)
+    #pragma unroll 
+#else
+    #pragma unroll BlockSize
+#endif
+    for(int i = 0; i < BlockSize; ++i)
+    {
+		  fblock[i] = inv_w * (Scalar)iblock[i];
+    }
+     
+  }
+}
+
+
+}  // namespace cuZFP
+#endif
diff --git a/zfp/src/cuda_zfp/decode1.cuh b/zfp/src/cuda_zfp/decode1.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..996d9ed1ef17af1ff5dbf6c2e9fa7ee5a96bb484
--- /dev/null
+++ b/zfp/src/cuda_zfp/decode1.cuh
@@ -0,0 +1,155 @@
+#ifndef CUZFP_DECODE1_CUH
+#define CUZFP_DECODE1_CUH
+
+#include "shared.h"
+#include "decode.cuh"
+#include "type_info.cuh"
+
+namespace cuZFP {
+
+
+template<typename Scalar> 
+__device__ __host__ inline 
+void scatter_partial1(const Scalar* q, Scalar* p, int nx, int sx)
+{
+  // Copy only the nx valid values of a (partial, edge-overhanging) block
+  // into the strided destination array.
+  for (int i = 0; i < nx; i++)
+    p[i * sx] = q[i];
+}
+
+template<typename Scalar> 
+__device__ __host__ inline 
+void scatter1(const Scalar* q, Scalar* p, int sx)
+{
+  // Copy all four values of a full block into the strided destination.
+  for (int i = 0; i < 4; i++)
+    p[i * sx] = q[i];
+}
+
+// 1D decode kernel: one thread decompresses one 4-value zfp block and
+// scatters the result into the strided output array.
+//   blocks       - compressed stream (device memory)
+//   out          - destination array (device memory)
+//   dim          - logical array length
+//   stride       - element stride of the output
+//   padded_dim   - dim rounded up to a multiple of 4
+//   total_blocks - number of real zfp blocks (excess threads exit)
+//   maxbits      - fixed bit budget per block
+template<class Scalar>
+__global__
+void
+cudaDecode1(Word *blocks,
+            Scalar *out,
+            const uint dim,
+            const int stride,
+            const uint padded_dim,
+            const uint total_blocks,
+            uint maxbits)
+{
+  typedef unsigned long long int ull;
+  typedef long long int ll;
+  typedef typename zfp_traits<Scalar>::UInt UInt;
+  typedef typename zfp_traits<Scalar>::Int Int;
+
+  const int intprec = get_precision<Scalar>();
+
+  // flatten the (possibly 3D) launch grid into a linear block id
+  const ull blockId = blockIdx.x +
+                      blockIdx.y * gridDim.x +
+                      gridDim.x  * gridDim.y * blockIdx.z;
+
+  // each thread gets a block so the block index is 
+  // the global thread index
+  const ull block_idx = blockId * blockDim.x + threadIdx.x;
+
+  if(block_idx >= total_blocks) return;
+
+  BlockReader<4> reader(blocks, maxbits, block_idx, total_blocks);
+  Scalar result[4] = {0,0,0,0};
+
+  zfp_decode(reader, result, maxbits);
+
+  // index of the first array element covered by this block
+  uint block;
+  block = block_idx * 4ull; 
+  const ll offset = (ll)block * stride; 
+  
+  // only the final block can overhang the array (padded_dim - dim < 4)
+  bool partial = false;
+  if(block + 4 > dim) partial = true;
+  if(partial)
+  {
+    // number of valid values in the last block
+    const uint nx = 4u - (padded_dim - dim);
+    scatter_partial1(result, out + offset, nx, stride);
+  }
+  else
+  {
+    scatter1(result, out + offset, stride);
+  }
+}
+
+// Configure and launch the 1D decode kernel.
+//   dim     - logical array length
+//   stride  - element stride of the output
+//   stream  - compressed stream (device memory)
+//   d_data  - destination array (device memory)
+//   maxbits - fixed bit budget per block
+// Returns the compressed-stream size in bytes (calc_device_mem1d).
+template<class Scalar>
+size_t decode1launch(uint dim, 
+                     int stride,
+                     Word *stream,
+                     Scalar *d_data,
+                     uint maxbits)
+{
+  const int cuda_block_size = 128;
+
+  // pad the dimension to a multiple of 4 (the zfp block width)
+  uint zfp_pad(dim); 
+  if(zfp_pad % 4 != 0) zfp_pad += 4 - dim % 4;
+
+  // one zfp block per 4 values; the padded dimension alone determines
+  // the block count (the original redundant recomputation for
+  // dim % 4 != 0 produced this same value)
+  const uint zfp_blocks = zfp_pad / 4; 
+
+  // round the thread count up to a multiple of the CUDA block size;
+  // excess threads exit immediately inside the kernel
+  int block_pad = 0;
+  if(zfp_blocks % cuda_block_size != 0) 
+  {
+    block_pad = cuda_block_size - zfp_blocks % cuda_block_size; 
+  }
+
+  size_t total_blocks = block_pad + zfp_blocks;
+  size_t stream_bytes = calc_device_mem1d(zfp_pad, maxbits);
+
+  dim3 block_size = dim3(cuda_block_size, 1, 1);
+  dim3 grid_size = calculate_grid_size(total_blocks, cuda_block_size);
+
+#ifdef CUDA_ZFP_RATE_PRINT
+  // time the kernel with CUDA events
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  cudaEventRecord(start);
+#endif
+
+  cudaDecode1<Scalar> << < grid_size, block_size >> >
+    (stream,
+     d_data,
+     dim,
+     stride,
+     zfp_pad,
+     zfp_blocks, // total blocks to decode
+     maxbits);
+
+#ifdef CUDA_ZFP_RATE_PRINT
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaStreamSynchronize(0);
+
+  float milliseconds = 0;
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  float seconds = milliseconds / 1000.f;
+  // effective decode bandwidth in GB/s
+  float rate = (float(dim) * sizeof(Scalar)) / seconds;
+  rate /= 1024.f;
+  rate /= 1024.f;
+  rate /= 1024.f;
+  printf("Decode elapsed time: %.5f (s)\n", seconds);
+  printf("# decode1 rate: %.2f (GB / sec) %d\n", rate, maxbits);
+#endif
+  return stream_bytes;
+}
+
+// Public 1D decode entry point; forwards to the launch helper and returns
+// the compressed-stream size in bytes (calc_device_mem1d).
+template<class Scalar>
+size_t decode1(int dim, 
+               int stride,
+               Word *stream,
+               Scalar *d_data,
+               uint maxbits)
+{
+	return decode1launch<Scalar>(dim, stride, stream, d_data, maxbits);
+}
+
+} // namespace cuZFP
+
+#endif
diff --git a/zfp/src/cuda_zfp/decode2.cuh b/zfp/src/cuda_zfp/decode2.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..41e112b5a90a6ecb46c4cac4603fd6bd8e3ab172
--- /dev/null
+++ b/zfp/src/cuda_zfp/decode2.cuh
@@ -0,0 +1,172 @@
+#ifndef CUZFP_DECODE2_CUH
+#define CUZFP_DECODE2_CUH
+
+#include "shared.h"
+#include "decode.cuh"
+#include "type_info.cuh"
+
+namespace cuZFP {
+
+template<typename Scalar> 
+__device__ __host__ inline 
+void scatter_partial2(const Scalar* q, Scalar* p, int nx, int ny, int sx, int sy)
+{
+  // Copy the nx-by-ny valid corner of a row-major 4x4 block into the
+  // destination strided by (sx, sy).
+  for (int j = 0; j < ny; j++)
+    for (int i = 0; i < nx; i++)
+      p[j * sy + i * sx] = q[4 * j + i];
+}
+
+template<typename Scalar> 
+__device__ __host__ inline 
+void scatter2(const Scalar* q, Scalar* p, int sx, int sy)
+{
+  // Copy a full row-major 4x4 block into the destination strided by (sx, sy).
+  for (int j = 0; j < 4; j++)
+    for (int i = 0; i < 4; i++)
+      p[j * sy + i * sx] = q[4 * j + i];
+}
+
+
+// 2D decode kernel: one thread decompresses one 4x4 zfp block and scatters
+// it into the strided output array; partial blocks at the array edge copy
+// only their valid values.
+template<class Scalar, int BlockSize>
+__global__
+void
+cudaDecode2(Word *blocks,
+            Scalar *out,
+            const uint2 dims,
+            const int2 stride,
+            const uint2 padded_dims,
+            uint maxbits)
+{
+  typedef unsigned long long int ull;
+  typedef long long int ll;
+  // flatten the launch grid into a linear block id
+  const ull blockId = blockIdx.x +
+                      blockIdx.y * gridDim.x +
+                      gridDim.x * gridDim.y * blockIdx.z;
+
+  // each thread gets a block so the block index is 
+  // the global thread index
+  const ull block_idx = blockId * blockDim.x + threadIdx.x;
+  
+  // 16 values per 4x4 block
+  const int total_blocks = (padded_dims.x * padded_dims.y) / 16; 
+  
+  if(block_idx >= total_blocks) 
+  {
+    return;
+  }
+
+  BlockReader<BlockSize> reader(blocks, maxbits, block_idx, total_blocks);
+ 
+  Scalar result[BlockSize];
+  memset(result, 0, sizeof(Scalar) * BlockSize);
+
+  zfp_decode(reader, result, maxbits);
+
+  // logical block dims
+  uint2 block_dims;
+  block_dims.x = padded_dims.x >> 2; 
+  block_dims.y = padded_dims.y >> 2; 
+  // logical pos in 3d array
+  uint2 block;
+  block.x = (block_idx % block_dims.x) * 4; 
+  block.y = ((block_idx/ block_dims.x) % block_dims.y) * 4; 
+  
+  // element offset of the block's origin in the output array
+  const ll offset = (ll)block.x * stride.x + (ll)block.y * stride.y; 
+
+  // blocks overhanging either array edge scatter only their valid values
+  bool partial = false;
+  if(block.x + 4 > dims.x) partial = true;
+  if(block.y + 4 > dims.y) partial = true;
+  if(partial)
+  {
+    const uint nx = block.x + 4 > dims.x ? dims.x - block.x : 4;
+    const uint ny = block.y + 4 > dims.y ? dims.y - block.y : 4;
+    scatter_partial2(result, out + offset, nx, ny, stride.x, stride.y);
+  }
+  else
+  {
+    scatter2(result, out + offset, stride.x, stride.y);
+  }
+}
+
+// Configure and launch the 2D decode kernel; returns the compressed-stream
+// size in bytes (calc_device_mem2d).
+template<class Scalar>
+size_t decode2launch(uint2 dims, 
+                     int2 stride,
+                     Word *stream,
+                     Scalar *d_data,
+                     uint maxbits)
+{
+  const int cuda_block_size = 128;
+  dim3 block_size;
+  block_size = dim3(cuda_block_size, 1, 1);
+  
+  uint2 zfp_pad(dims); 
+  // ensure that we have block sizes
+  // that are a multiple of 4
+  if(zfp_pad.x % 4 != 0) zfp_pad.x += 4 - dims.x % 4;
+  if(zfp_pad.y % 4 != 0) zfp_pad.y += 4 - dims.y % 4;
+
+  // one zfp block per 4x4 tile of the padded array
+  const int zfp_blocks = (zfp_pad.x * zfp_pad.y) / 16; 
+
+  
+  //
+  // we need to ensure that we launch a multiple of the 
+  // cuda block size
+  //
+  int block_pad = 0; 
+  if(zfp_blocks % cuda_block_size != 0)
+  {
+    block_pad = cuda_block_size - zfp_blocks % cuda_block_size; 
+  }
+
+
+  size_t stream_bytes = calc_device_mem2d(zfp_pad, maxbits);
+  size_t total_blocks = block_pad + zfp_blocks;
+  dim3 grid_size = calculate_grid_size(total_blocks, cuda_block_size);
+
+#ifdef CUDA_ZFP_RATE_PRINT
+  // setup some timing code
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start);
+#endif
+
+  cudaDecode2<Scalar, 16> << < grid_size, block_size >> >
+    (stream,
+		 d_data,
+     dims,
+     stride,
+     zfp_pad,
+     maxbits);
+
+#ifdef CUDA_ZFP_RATE_PRINT
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+	cudaStreamSynchronize(0);
+
+  float miliseconds = 0;
+  cudaEventElapsedTime(&miliseconds, start, stop);
+  float seconds = miliseconds / 1000.f;
+  // effective decode bandwidth in GB/s
+  float rate = (float(dims.x * dims.y) * sizeof(Scalar) ) / seconds;
+  rate /= 1024.f;
+  rate /= 1024.f;
+  rate /= 1024.f;
+  printf("Decode elapsed time: %.5f (s)\n", seconds);
+  printf("# decode2 rate: %.2f (GB / sec) %d\n", rate, maxbits);
+#endif
+  return stream_bytes;
+}
+
+// Public 2D decode entry point; forwards to the launch helper and returns
+// the compressed-stream size in bytes (calc_device_mem2d).
+template<class Scalar>
+size_t decode2(uint2 dims, 
+               int2 stride,
+               Word *stream,
+               Scalar *d_data,
+               uint maxbits)
+{
+	return decode2launch<Scalar>(dims, stride, stream, d_data, maxbits);
+}
+
+} // namespace cuZFP
+
+#endif
diff --git a/zfp/src/cuda_zfp/decode3.cuh b/zfp/src/cuda_zfp/decode3.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2a3ef00804de5261588622fa89d567da2f456f4c
--- /dev/null
+++ b/zfp/src/cuda_zfp/decode3.cuh
@@ -0,0 +1,183 @@
+#ifndef CUZFP_DECODE3_CUH
+#define CUZFP_DECODE3_CUH
+
+#include "shared.h"
+#include "decode.cuh"
+#include "type_info.cuh"
+
+namespace cuZFP {
+
+template<typename Scalar> 
+__device__ __host__ inline 
+void scatter_partial3(const Scalar* q, Scalar* p, int nx, int ny, int nz, int sx, int sy, int sz)
+{
+  // Copy the nx-by-ny-by-nz valid corner of a row-major 4x4x4 block into
+  // the destination strided by (sx, sy, sz).
+  for (int k = 0; k < nz; k++)
+    for (int j = 0; j < ny; j++)
+      for (int i = 0; i < nx; i++)
+        p[k * sz + j * sy + i * sx] = q[16 * k + 4 * j + i];
+}
+
+template<typename Scalar> 
+__device__ __host__ inline 
+void scatter3(const Scalar* q, Scalar* p, int sx, int sy, int sz)
+{
+  // Copy a full row-major 4x4x4 block into the destination strided by
+  // (sx, sy, sz).
+  for (int k = 0; k < 4; k++)
+    for (int j = 0; j < 4; j++)
+      for (int i = 0; i < 4; i++)
+        p[k * sz + j * sy + i * sx] = q[16 * k + 4 * j + i];
+}
+
+
+// 3D decode kernel: one thread decompresses one 4x4x4 zfp block and
+// scatters it into the strided output array; partial blocks at the array
+// edges copy only their valid values.
+template<class Scalar, int BlockSize>
+__global__
+void
+cudaDecode3(Word *blocks,
+            Scalar *out,
+            const uint3 dims,
+            const int3 stride,
+            const uint3 padded_dims,
+            uint maxbits)
+{
+  
+  typedef unsigned long long int ull;
+  typedef long long int ll;
+
+  // flatten the launch grid into a linear block id
+  const ull blockId = blockIdx.x +
+                      blockIdx.y * gridDim.x +
+                      gridDim.x * gridDim.y * blockIdx.z;
+  // each thread gets a block so the block index is 
+  // the global thread index
+  const ull block_idx = blockId * blockDim.x + threadIdx.x;
+  
+  // 64 values per 4x4x4 block
+  const int total_blocks = (padded_dims.x * padded_dims.y * padded_dims.z) / 64; 
+  
+  if(block_idx >= total_blocks) 
+  {
+    return;
+  }
+
+  BlockReader<BlockSize> reader(blocks, maxbits, block_idx, total_blocks);
+ 
+  Scalar result[BlockSize];
+  memset(result, 0, sizeof(Scalar) * BlockSize);
+
+  zfp_decode<Scalar,BlockSize>(reader, result, maxbits);
+
+  // logical block dims
+  uint3 block_dims;
+  block_dims.x = padded_dims.x >> 2; 
+  block_dims.y = padded_dims.y >> 2; 
+  block_dims.z = padded_dims.z >> 2; 
+  // logical pos in 3d array
+  uint3 block;
+  block.x = (block_idx % block_dims.x) * 4; 
+  block.y = ((block_idx/ block_dims.x) % block_dims.y) * 4; 
+  block.z = (block_idx/ (block_dims.x * block_dims.y)) * 4; 
+  
+  // default strides
+  const ll offset = (ll)block.x * stride.x + (ll)block.y * stride.y + (ll)block.z * stride.z; 
+
+  // blocks overhanging any array edge scatter only their valid values
+  bool partial = false;
+  if(block.x + 4 > dims.x) partial = true;
+  if(block.y + 4 > dims.y) partial = true;
+  if(block.z + 4 > dims.z) partial = true;
+  if(partial)
+  {
+    const uint nx = block.x + 4u > dims.x ? dims.x - block.x : 4;
+    const uint ny = block.y + 4u > dims.y ? dims.y - block.y : 4;
+    const uint nz = block.z + 4u > dims.z ? dims.z - block.z : 4;
+
+    scatter_partial3(result, out + offset, nx, ny, nz, stride.x, stride.y, stride.z);
+  }
+  else
+  {
+    scatter3(result, out + offset, stride.x, stride.y, stride.z);
+  }
+}
+// Configure and launch the 3D decode kernel; returns the compressed-stream
+// size in bytes (calc_device_mem3d).
+template<class Scalar>
+size_t decode3launch(uint3 dims, 
+                     int3 stride,
+                     Word *stream,
+                     Scalar *d_data,
+                     uint maxbits)
+{
+  const int cuda_block_size = 128;
+  dim3 block_size;
+  block_size = dim3(cuda_block_size, 1, 1);
+
+  uint3 zfp_pad(dims); 
+  // ensure that we have block sizes
+  // that are a multiple of 4
+  if(zfp_pad.x % 4 != 0) zfp_pad.x += 4 - dims.x % 4;
+  if(zfp_pad.y % 4 != 0) zfp_pad.y += 4 - dims.y % 4;
+  if(zfp_pad.z % 4 != 0) zfp_pad.z += 4 - dims.z % 4;
+
+  // one zfp block per 4x4x4 tile of the padded array
+  const int zfp_blocks = (zfp_pad.x * zfp_pad.y * zfp_pad.z) / 64; 
+
+  
+  //
+  // we need to ensure that we launch a multiple of the 
+  // cuda block size
+  //
+  int block_pad = 0; 
+  if(zfp_blocks % cuda_block_size != 0)
+  {
+    block_pad = cuda_block_size - zfp_blocks % cuda_block_size; 
+  }
+
+  size_t total_blocks = block_pad + zfp_blocks;
+  size_t stream_bytes = calc_device_mem3d(zfp_pad, maxbits);
+
+  dim3 grid_size = calculate_grid_size(total_blocks, cuda_block_size);
+
+#ifdef CUDA_ZFP_RATE_PRINT
+  // setup some timing code
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  cudaEventRecord(start);
+#endif
+
+  cudaDecode3<Scalar, 64> << < grid_size, block_size >> >
+    (stream,
+		 d_data,
+     dims,
+     stride,
+     zfp_pad,
+     maxbits);
+
+#ifdef CUDA_ZFP_RATE_PRINT
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+	cudaStreamSynchronize(0);
+
+  float miliseconds = 0;
+  cudaEventElapsedTime(&miliseconds, start, stop);
+  float seconds = miliseconds / 1000.f;
+  // effective decode bandwidth in GB/s
+  float rate = (float(dims.x * dims.y * dims.z) * sizeof(Scalar) ) / seconds;
+  rate /= 1024.f;
+  rate /= 1024.f;
+  rate /= 1024.f;
+  printf("Decode elapsed time: %.5f (s)\n", seconds);
+  printf("# decode3 rate: %.2f (GB / sec) %d\n", rate, maxbits);
+#endif
+
+  return stream_bytes;
+}
+
+// Public 3D decode entry point; forwards to the launch helper and returns
+// the compressed-stream size in bytes (calc_device_mem3d).
+template<class Scalar>
+size_t decode3(uint3 dims, 
+               int3 stride,
+               Word  *stream,
+               Scalar *d_data,
+               uint maxbits)
+{
+	return decode3launch<Scalar>(dims, stride, stream, d_data, maxbits);
+}
+
+} // namespace cuZFP
+
+#endif
diff --git a/zfp/src/cuda_zfp/encode.cuh b/zfp/src/cuda_zfp/encode.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..c65bd356a2f93f4d5627aa1523c9acabecc0b6e1
--- /dev/null
+++ b/zfp/src/cuda_zfp/encode.cuh
@@ -0,0 +1,419 @@
+#ifndef CU_ZFP_ENCODE_CUH
+#define CU_ZFP_ENCODE_CUH
+
+#include "shared.h"
+
+namespace cuZFP
+{
+
+// maximum number of bit planes to encode
+// NOTE(review): the +8 term matches 2 * (dims + 1) with dims == 3; this
+// CUDA path appears to hard-code the 3D case -- confirm for 1D/2D streams.
+__device__ 
+static int
+precision(int maxexp, int maxprec, int minexp)
+{
+  return MIN(maxprec, MAX(0, maxexp - minexp + 8));
+}
+
+// Pad a partial block line: given n (< 4) valid values at stride s, fill
+// the remaining entries so a full 4-value line can be encoded.  The cases
+// fall through intentionally, so every index from n upward is written.
+// Note the last slot deliberately copies p[0] (not p[2]) -- this is zfp's
+// padding scheme, not a typo.
+template<typename Scalar>
+inline __device__
+void pad_block(Scalar *p, uint n, uint s)
+{
+  switch (n) 
+  {
+    case 0:
+      p[0 * s] = 0;
+      /* FALLTHROUGH */
+    case 1:
+      p[1 * s] = p[0 * s];
+      /* FALLTHROUGH */
+    case 2:
+      p[2 * s] = p[1 * s];
+      /* FALLTHROUGH */
+    case 3:
+      p[3 * s] = p[0 * s];
+      /* FALLTHROUGH */
+    default:
+      break;
+  }
+}
+
+// Binary exponent of x as reported by frexp, clamped so denormals map to
+// the smallest normal exponent; non-positive x yields the minimum exponent.
+template<class Scalar>
+__device__
+static int
+exponent(Scalar x)
+{
+  if (x > 0) {
+    int e;
+    frexp(x, &e);
+    // clamp exponent in case x is denormalized
+    return max(e, 1 - get_ebias<Scalar>());
+  }
+  return -get_ebias<Scalar>();
+}
+
+// Exponent of the largest-magnitude value in the block; this becomes the
+// block's shared exponent for quantization.
+template<class Scalar, int BlockSize>
+__device__
+static int
+max_exponent(const Scalar* p)
+{
+  Scalar largest = 0;
+  for(int i = 0; i < BlockSize; ++i)
+  {
+    const Scalar magnitude = fabs(p[i]);
+    if(magnitude > largest)
+    {
+      largest = magnitude;
+    }
+  }
+  return exponent<Scalar>(largest);
+}
+
+// lifting transform of 4-vector
+// Forward decorrelating lift on 4 values at stride s, done in-place with
+// integer shifts/adds; inverted exactly by inv_lift.
+template <class Int, uint s>
+__device__ 
+static void
+fwd_lift(Int* p)
+{
+  Int x = *p; p += s;
+  Int y = *p; p += s;
+  Int z = *p; p += s;
+  Int w = *p; p += s;
+
+  // default, non-orthogonal transform (preferred due to speed and quality)
+  //        ( 4  4  4  4) (x)
+  // 1/16 * ( 5  1 -1 -5) (y)
+  //        (-4  4  4 -4) (z)
+  //        (-2  6 -6  2) (w)
+  x += w; x >>= 1; w -= x;
+  z += y; z >>= 1; y -= z;
+  x += z; x >>= 1; z -= x;
+  w += y; w >>= 1; y -= w;
+  w += y >> 1; y -= w >> 1;
+
+  // write results back in reverse order
+  p -= s; *p = w;
+  p -= s; *p = z;
+  p -= s; *p = y;
+  p -= s; *p = x;
+}
+
+// Scale factor 2^(precision - 2 - exponent) used to map floating-point
+// values into the fixed-point integer range for a block with the given
+// shared exponent; specialized per scalar type for the correct precision.
+template<typename Scalar>
+Scalar
+inline __device__
+quantize_factor(const int &exponent, Scalar);
+
+// single-precision scale factor
+template<>
+float
+inline __device__
+quantize_factor<float>(const int &exponent, float)
+{
+	return  LDEXP(1.0, get_precision<float>() - 2 - exponent);
+}
+
+// double-precision scale factor
+template<>
+double
+inline __device__
+quantize_factor<double>(const int &exponent, double)
+{
+	return  LDEXP(1.0, get_precision<double>() - 2 - exponent);
+}
+
+// Quantize each floating-point value in fblock to a fixed-point integer
+// in iblock, using the block-wide shared exponent emax.
+template<typename Scalar, typename Int, int BlockSize>
+void __device__ fwd_cast(Int *iblock, const Scalar *fblock, int emax)
+{
+  const Scalar scale = quantize_factor(emax, Scalar());
+  int idx = 0;
+  while(idx < BlockSize)
+  {
+    iblock[idx] = (Int) (scale * fblock[idx]);
+    ++idx;
+  }
+}
+
+// Forward decorrelating transform, specialized on block size 4^d.
+// Each specialization applies the 4-point forward lift along every line
+// of the block, one dimension at a time (reversed by inv_transform).
+template<int BlockSize>
+struct transform;
+
+// 3D (4x4x4) forward transform
+template<>
+struct transform<64>
+{
+  template<typename Int>
+  __device__ void fwd_xform(Int *p)
+  {
+
+    uint x, y, z;
+    /* transform along x */
+    for (z = 0; z < 4; z++)
+      for (y = 0; y < 4; y++)
+        fwd_lift<Int,1>(p + 4 * y + 16 * z);
+    /* transform along y */
+    for (x = 0; x < 4; x++)
+      for (z = 0; z < 4; z++)
+        fwd_lift<Int,4>(p + 16 * z + 1 * x);
+    /* transform along z */
+    for (y = 0; y < 4; y++)
+      for (x = 0; x < 4; x++)
+        fwd_lift<Int,16>(p + 1 * x + 4 * y);
+
+   }
+
+};
+
+// 2D (4x4) forward transform
+template<>
+struct transform<16>
+{
+  template<typename Int>
+  __device__ void fwd_xform(Int *p)
+  {
+
+    uint x, y;
+    /* transform along x */
+    for (y = 0; y < 4; y++)
+     fwd_lift<Int,1>(p + 4 * y);
+    /* transform along y */
+    for (x = 0; x < 4; x++)
+      fwd_lift<Int,4>(p + 1 * x);
+    }
+
+};
+
+// 1D (4-value) forward transform
+template<>
+struct transform<4>
+{
+  template<typename Int>
+  __device__ void fwd_xform(Int *p)
+  {
+    fwd_lift<Int,1>(p);
+  }
+
+};
+
+// Reorder signed coefficients by zfp's sequency permutation (get_perm)
+// and convert each to negabinary (int2uint) for bit-plane coding.
+template<typename Int, typename UInt, int BlockSize>
+__device__ void fwd_order(UInt *ublock, const Int *iblock)
+{
+  unsigned char *perm = get_perm<BlockSize>();
+  for(int i = 0; i < BlockSize; ++i)
+  {
+    ublock[i] = int2uint(iblock[perm[i]]);
+  }
+}
+
+// Writes variable-length bit strings into the shared compressed stream.
+// Each block occupies exactly maxbits bits starting at block_idx * maxbits.
+// NOTE(review): bits are merged with atomicAdd rather than OR; this is only
+// correct if the stream words start zeroed and no two writers ever set the
+// same bit -- verify the caller zero-initializes the stream.
+template<int block_size>
+struct BlockWriter
+{
+
+  uint m_word_index;    // stream word containing the block's first bit
+  uint m_start_bit;     // bit offset of the block's first bit in that word
+  uint m_current_bit;   // bits written so far for this block
+  const int m_maxbits;  // fixed encoded size of one block, in bits
+  Word *m_stream;       // output stream (device memory)
+
+  // Position the writer at the first bit of block 'block_idx'.
+  __device__ BlockWriter(Word *stream, const int &maxbits, const uint &block_idx)
+   :  m_current_bit(0),
+      m_maxbits(maxbits),
+      m_stream(stream)
+  {
+    m_word_index = (block_idx * maxbits)  / (sizeof(Word) * 8); 
+    m_start_bit = uint((block_idx * maxbits) % (sizeof(Word) * 8)); 
+  }
+
+  // debugging aid: print a value's bits, msb first
+  template<typename T>
+  __device__
+  void print_bits(T bits)
+  {
+    const int bit_size = sizeof(T) * 8;
+    for(int i = bit_size - 1; i >=0; --i)
+    {
+      T one = 1;
+      T mask = one << i;
+      int val = (bits & mask) >> i;
+      printf("%d", val);
+    }
+    printf("\n");
+  }
+  // debugging aid: print one stream word
+  __device__
+  void print(int index)
+  {
+    print_bits(m_stream[index]);
+  }
+
+
+  // Write the low n_bits of 'bits' (possibly straddling one word boundary)
+  // and return the unwritten remainder shifted down by n_bits.
+  __device__
+  long long unsigned int
+  write_bits(const long long unsigned int &bits, const uint &n_bits)
+  {
+    const uint wbits = sizeof(Word) * 8;
+    uint seg_start = (m_start_bit + m_current_bit) % wbits;
+    uint write_index = m_word_index + uint((m_start_bit + m_current_bit) / wbits);
+    uint seg_end = seg_start + n_bits - 1;
+    uint shift = seg_start; 
+    // we may be asked to write less bits than exist in 'bits'
+    // so we have to make sure that anything after n is zero.
+    // If this does not happen, then we may write into a zfp
+    // block not at the specified index
+    // uint zero_shift = sizeof(Word) * 8 - n_bits;
+    Word left = (bits >> n_bits) << n_bits;
+    
+    Word b = bits - left;
+    Word add = b << shift;
+    atomicAdd(&m_stream[write_index], add); 
+    // n_bits straddles the word boundary
+    bool straddle = seg_start < sizeof(Word) * 8 && seg_end >= sizeof(Word) * 8;
+    if(straddle)
+    {
+      // deposit the overflow bits into the next word
+      Word rem = b >> (sizeof(Word) * 8 - shift);
+      atomicAdd(&m_stream[write_index + 1], rem); 
+    }
+    m_current_bit += n_bits;
+    return bits >> (Word)n_bits;
+  }
+
+  // Write a single bit at the current position; returns the bit written.
+  __device__
+  uint write_bit(const unsigned int &bit)
+  {
+    const uint wbits = sizeof(Word) * 8;
+    uint seg_start = (m_start_bit + m_current_bit) % wbits;
+    uint write_index = m_word_index + uint((m_start_bit + m_current_bit) / wbits);
+    uint shift = seg_start; 
+    // we may be asked to write less bits than exist in 'bits'
+    // so we have to make sure that anything after n is zero.
+    // If this does not happen, then we may write into a zfp
+    // block not at the specified index
+    // uint zero_shift = sizeof(Word) * 8 - n_bits;
+    
+    Word add = (Word)bit << shift;
+    atomicAdd(&m_stream[write_index], add); 
+    m_current_bit += 1;
+
+    return bit;
+  }
+
+};
+
+// Bit-plane encode one block of quantized integers: apply the forward
+// transform, permute/convert to negabinary, then emit bit planes msb-first
+// with unary run-length (group-test) coding until the bit budget runs out.
+template<typename Int, int BlockSize> 
+void inline __device__ encode_block(BlockWriter<BlockSize> &stream,
+                                    int maxbits,
+                                    int maxprec,
+                                    Int *iblock)
+{
+  transform<BlockSize> tform;
+  tform.fwd_xform(iblock);
+
+  typedef typename zfp_traits<Int>::UInt UInt;
+  UInt ublock[BlockSize]; 
+  fwd_order<Int, UInt, BlockSize>(ublock, iblock);
+
+  uint intprec = CHAR_BIT * (uint)sizeof(UInt);
+  // skip bit planes that maxprec says carry no information
+  uint kmin = intprec > maxprec ? intprec - maxprec : 0;
+  uint bits = maxbits;
+  uint i, k, m, n;
+  uint64 x;
+
+  // k: bit plane (msb first); n: coefficients already known significant
+  for (k = intprec, n = 0; bits && k-- > kmin;) {
+    /* step 1: extract bit plane #k to x */
+    x = 0;
+    for (i = 0; i < BlockSize; i++)
+    {
+      x += (uint64)((ublock[i] >> k) & 1u) << i;
+    }
+    /* step 2: encode first n bits of bit plane */
+    m = min(n, bits);
+    //uint temp  = bits;
+    bits -= m;
+    x = stream.write_bits(x, m);
+    
+    /* step 3: unary run-length encode remainder of bit plane */
+    for (; n < BlockSize && bits && (bits--, stream.write_bit(!!x)); x >>= 1, n++)
+    {
+      for (; n < BlockSize - 1 && bits && (bits--, !stream.write_bit(x & 1u)); x >>= 1, n++)
+      {  
+      }
+    }
+  }
+  
+}
+
+// Encode one block of floating-point values: compute the shared exponent,
+// write the header (2*e + 1: the low bit marks a non-empty block, matching
+// the s_cont bit the decoder reads first), quantize to fixed-point, and
+// bit-plane code the coefficients.  An all-zero block (e == 0) writes nothing.
+template<typename Scalar, int BlockSize>
+void inline __device__ zfp_encode_block(Scalar *fblock,
+                                        const int maxbits,
+                                        const uint block_idx,
+                                        Word *stream)
+{
+  BlockWriter<BlockSize> block_writer(stream, maxbits, block_idx);
+  int emax = max_exponent<Scalar, BlockSize>(fblock);
+  int maxprec = precision(emax, get_precision<Scalar>(), get_min_exp<Scalar>());
+  // biased exponent; 0 marks an empty (all-zero) block
+  uint e = maxprec ? emax + get_ebias<Scalar>() : 0;
+  if(e)
+  {
+    const uint ebits = get_ebits<Scalar>()+1;
+    block_writer.write_bits(2 * e + 1, ebits);
+    typedef typename zfp_traits<Scalar>::Int Int;
+    Int iblock[BlockSize];
+    fwd_cast<Scalar, Int, BlockSize>(iblock, fblock, emax);
+
+
+    encode_block<Int, BlockSize>(block_writer, maxbits - ebits, maxprec, iblock);
+  }
+}
+
+// Integer specializations (int32/int64 for 1D/2D/3D block sizes): integer
+// data carries no exponent header, so the block goes straight to
+// encode_block with the type's full integer precision.
+template<>
+void inline __device__ zfp_encode_block<int, 64>(int *fblock,
+                                             const int maxbits,
+                                             const uint block_idx,
+                                             Word *stream)
+{
+  BlockWriter<64> block_writer(stream, maxbits, block_idx);
+  const int intprec = get_precision<int>();
+  encode_block<int, 64>(block_writer, maxbits, intprec, fblock);
+}
+
+template<>
+void inline __device__ zfp_encode_block<long long int, 64>(long long int *fblock,
+                                                       const int maxbits,
+                                                       const uint block_idx,
+                                                       Word *stream)
+{
+  BlockWriter<64> block_writer(stream, maxbits, block_idx);
+  const int intprec = get_precision<long long int>();
+  encode_block<long long int, 64>(block_writer, maxbits, intprec, fblock);
+}
+
+template<>
+void inline __device__ zfp_encode_block<int, 16>(int *fblock,
+                                             const int maxbits,
+                                             const uint block_idx,
+                                             Word *stream)
+{
+  BlockWriter<16> block_writer(stream, maxbits, block_idx);
+  const int intprec = get_precision<int>();
+  encode_block<int, 16>(block_writer, maxbits, intprec, fblock);
+}
+
+template<>
+void inline __device__ zfp_encode_block<long long int, 16>(long long int *fblock,
+                                                       const int maxbits,
+                                                       const uint block_idx,
+                                                       Word *stream)
+{
+  BlockWriter<16> block_writer(stream, maxbits, block_idx);
+  const int intprec = get_precision<long long int>();
+  encode_block<long long int, 16>(block_writer, maxbits, intprec, fblock);
+}
+
+template<>
+void inline __device__ zfp_encode_block<int, 4>(int *fblock,
+                                             const int maxbits,
+                                             const uint block_idx,
+                                             Word *stream)
+{
+  BlockWriter<4> block_writer(stream, maxbits, block_idx);
+  const int intprec = get_precision<int>();
+  encode_block<int, 4>(block_writer, maxbits, intprec, fblock);
+}
+
+template<>
+void inline __device__ zfp_encode_block<long long int, 4>(long long int *fblock,
+                                                       const int maxbits,
+                                                       const uint block_idx,
+                                                       Word *stream)
+{
+  BlockWriter<4> block_writer(stream, maxbits, block_idx);
+  const int intprec = get_precision<long long int>();
+  encode_block<long long int, 4>(block_writer, maxbits, intprec, fblock);
+}
+
+}  // namespace cuZFP
+#endif
diff --git a/zfp/src/cuda_zfp/encode1.cuh b/zfp/src/cuda_zfp/encode1.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..9353f8c02073fec44a7030d5ed39b66f3a62ff32
--- /dev/null
+++ b/zfp/src/cuda_zfp/encode1.cuh
@@ -0,0 +1,174 @@
+#ifndef CUZFP_ENCODE1_CUH
+#define CUZFP_ENCODE1_CUH
+
+#include "cuZFP.h"
+#include "shared.h"
+#include "encode.cuh"
+#include "type_info.cuh"
+
+#include <iostream>
+#define ZFP_1D_BLOCK_SIZE 4 
+
+namespace cuZFP
+{
+
+// Gather a partial 1D block: copy the nx (< 4) valid samples, spaced
+// sx elements apart, into q, then pad q out to a full 4-value block.
+template<typename Scalar>
+__device__ __host__ inline
+void gather_partial1(Scalar* q, const Scalar* p, int nx, int sx)
+{
+  for (int i = 0; i < nx; i++)
+    q[i] = p[i * sx];
+  pad_block(q, nx, 1);
+}
+
+// Gather a complete 1D block: four samples spaced sx elements apart.
+template<typename Scalar>
+__device__ __host__ inline
+void gather1(Scalar* q, const Scalar* p, int sx)
+{
+  for (int i = 0; i < 4; i++)
+    q[i] = p[i * sx];
+}
+
+// Kernel: each CUDA thread encodes one 4-value zfp block of the 1D
+// array 'scalars' (element stride sx) into the bit stream 'stream'.
+// padded_dim is dim rounded up to a multiple of 4; tot_blocks is the
+// number of real zfp blocks (the launch may over-provision threads).
+template<class Scalar>
+__global__
+void 
+cudaEncode1(const uint maxbits,
+           const Scalar* scalars,
+           Word *stream,
+           const uint dim,
+           const int sx,
+           const uint padded_dim,
+           const uint tot_blocks)
+{
+
+  typedef unsigned long long int ull;
+  typedef long long int ll;
+  // flatten the (possibly multi-dimensional) launch grid to a linear id
+  const ull blockId = blockIdx.x +
+                      blockIdx.y * gridDim.x +
+                      gridDim.x * gridDim.y * blockIdx.z;
+
+  // each thread gets a block so the block index is 
+  // the global thread index
+  const uint block_idx = blockId * blockDim.x + threadIdx.x;
+
+  if(block_idx >= tot_blocks)
+  {
+    // we can't launch the exact number of blocks
+    // so just exit if this isn't real
+    return;
+  }
+
+  // number of zfp blocks along the (padded) axis
+  uint block_dim;
+  block_dim = padded_dim >> 2; 
+
+  // logical position (element index) of this block in the 1d array
+  uint block;
+  block = (block_idx % block_dim) * 4; 
+
+  const ll offset = (ll)block * sx; 
+
+  Scalar fblock[ZFP_1D_BLOCK_SIZE]; 
+
+  // only the final block along the axis can extend past the real extent
+  bool partial = false;
+  if(block + 4 > dim) partial = true;
+ 
+  if(partial) 
+  {
+    // valid sample count = 4 - pad amount (holds because only the last
+    // block can be partial in 1D)
+    uint nx = 4 - (padded_dim - dim);
+    gather_partial1(fblock, scalars + offset, nx, sx);
+  }
+  else
+  {
+    gather1(fblock, scalars + offset, sx);
+  }
+
+  zfp_encode_block<Scalar, ZFP_1D_BLOCK_SIZE>(fblock, maxbits, block_idx, stream);  
+
+}
+//
+// Launch the encode kernel
+//
+// Pads dim up to whole 4-value zfp blocks, rounds the block count up to
+// a multiple of the CUDA block size, zeroes the output stream (the
+// encoder ORs bits into it), and launches cudaEncode1.  Returns the
+// allocated stream size in bytes.
+template<class Scalar>
+size_t encode1launch(uint dim, 
+                     int sx,
+                     const Scalar *d_data,
+                     Word *stream,
+                     const int maxbits)
+{
+  const int cuda_block_size = 128;
+  dim3 block_size = dim3(cuda_block_size, 1, 1);
+
+  // pad dimension to a multiple of 4 (zfp block width)
+  uint zfp_pad(dim); 
+  if(zfp_pad % 4 != 0) zfp_pad += 4 - dim % 4;
+
+  const uint zfp_blocks = (zfp_pad) / 4; 
+  //
+  // we need to ensure that we launch a multiple of the 
+  // cuda block size
+  //
+  int block_pad = 0; 
+  if(zfp_blocks % cuda_block_size != 0)
+  {
+    block_pad = cuda_block_size - zfp_blocks % cuda_block_size; 
+  }
+
+  size_t total_blocks = block_pad + zfp_blocks;
+ 
+  dim3 grid_size = calculate_grid_size(total_blocks, cuda_block_size);
+
+  //
+  size_t stream_bytes = calc_device_mem1d(zfp_pad, maxbits);
+  // ensure we have zeros
+  cudaMemset(stream, 0, stream_bytes);
+
+#ifdef CUDA_ZFP_RATE_PRINT
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  cudaEventRecord(start);
+#endif
+
+	cudaEncode1<Scalar> << <grid_size, block_size>> >
+    (maxbits,
+     d_data,
+     stream,
+     dim,
+     sx,
+     zfp_pad,
+     zfp_blocks);
+
+#ifdef CUDA_ZFP_RATE_PRINT
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaStreamSynchronize(0);
+
+  float miliseconds = 0.f;
+  cudaEventElapsedTime(&miliseconds, start, stop);
+  float seconds = miliseconds / 1000.f;
+  // uncompressed data volume in GiB
+  float gb = (float(dim) * float(sizeof(Scalar))) / (1024.f * 1024.f * 1024.f);
+  float rate = gb / seconds;
+  printf("Encode elapsed time: %.5f (s)\n", seconds);
+  printf("# encode1 rate: %.2f (GB / sec) %d\n", rate, maxbits);
+#endif
+  return stream_bytes;
+}
+
+//
+// Public 1D entry point: forwards to the launch routine and reports the
+// number of stream bytes reserved for the encoding.
+//
+template<class Scalar>
+size_t encode1(int dim,
+               int sx,
+               Scalar *d_data,
+               Word *stream,
+               const int maxbits)
+{
+  const size_t stream_bytes =
+    encode1launch<Scalar>(dim, sx, d_data, stream, maxbits);
+  return stream_bytes;
+}
+
+}
+
+#endif
diff --git a/zfp/src/cuda_zfp/encode2.cuh b/zfp/src/cuda_zfp/encode2.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..7d9ebfe07ddd1b78f876609a1d6da8f61f0a544f
--- /dev/null
+++ b/zfp/src/cuda_zfp/encode2.cuh
@@ -0,0 +1,184 @@
+#ifndef CUZFP_ENCODE2_CUH
+#define CUZFP_ENCODE2_CUH
+
+#include "cuZFP.h"
+#include "shared.h"
+#include "encode.cuh"
+#include "ErrorCheck.h"
+#include "type_info.cuh"
+
+#define ZFP_2D_BLOCK_SIZE 16 
+
+namespace cuZFP
+{
+
+// Gather a partial 2D block: copy the nx-by-ny valid samples (strides
+// sx, sy) into q, then pad q out to a full 4x4 block along each axis.
+template<typename Scalar>
+__device__ __host__ inline
+void gather_partial2(Scalar* q, const Scalar* p, int nx, int ny, int sx, int sy)
+{
+  uint x, y;
+  for (y = 0; y < ny; y++, p += sy - nx * sx) {
+    for (x = 0; x < nx; x++, p += sx)
+      q[4 * y + x] = *p;
+    // runs once per row: the braceless x loop above covers only the
+    // assignment.  The original indentation misleadingly placed this
+    // call inside the x loop; re-indented to match actual semantics.
+    pad_block(q + 4 * y, nx, 1);
+  }
+  // pad the remaining rows by replicating along y (stride 4 within q)
+  for (x = 0; x < 4; x++)
+    pad_block(q + x, ny, 4);
+}
+
+// Gather a complete 4x4 block (strides sx, sy); q is filled in
+// row-major block order.
+template<typename Scalar>
+__device__ __host__ inline
+void gather2(Scalar* q, const Scalar* p, int sx, int sy)
+{
+  for (int y = 0; y < 4; y++)
+    for (int x = 0; x < 4; x++)
+      q[4 * y + x] = p[y * sy + x * sx];
+}
+
+// Kernel: each CUDA thread encodes one 4x4 zfp block of the 2D array
+// 'scalars' (element strides in 'stride') into the bit stream 'stream'.
+// padded_dims are dims rounded up to multiples of 4; tot_blocks is the
+// real zfp block count (the launch may over-provision threads).
+template<class Scalar>
+__global__
+void 
+cudaEncode2(const uint maxbits,
+           const Scalar* scalars,
+           Word *stream,
+           const uint2 dims,
+           const int2 stride,
+           const uint2 padded_dims,
+           const uint tot_blocks)
+{
+
+  typedef unsigned long long int ull;
+  typedef long long int ll;
+  // flatten the launch grid to a linear CUDA block id
+  const ull blockId = blockIdx.x +
+                      blockIdx.y * gridDim.x +
+                      gridDim.x * gridDim.y * blockIdx.z;
+
+  // each thread gets a block so the block index is 
+  // the global thread index
+  const uint block_idx = blockId * blockDim.x + threadIdx.x;
+
+  if(block_idx >= tot_blocks)
+  {
+    // we can't launch the exact number of blocks
+    // so just exit if this isn't real
+    return;
+  }
+
+  // zfp block counts along each (padded) axis
+  uint2 block_dims;
+  block_dims.x = padded_dims.x >> 2; 
+  block_dims.y = padded_dims.y >> 2; 
+
+  // logical position of this block in the 2d array (element coordinates)
+  uint2 block;
+  block.x = (block_idx % block_dims.x) * 4; 
+  block.y = ((block_idx/ block_dims.x) % block_dims.y) * 4; 
+
+  const ll offset = (ll)block.x * stride.x + (ll)block.y * stride.y; 
+
+  Scalar fblock[ZFP_2D_BLOCK_SIZE]; 
+
+  // edge blocks may extend past the real extents and need padding
+  bool partial = false;
+  if(block.x + 4 > dims.x) partial = true;
+  if(block.y + 4 > dims.y) partial = true;
+ 
+  if(partial) 
+  {
+    // clamp to the number of valid samples along each axis
+    const uint nx = block.x + 4 > dims.x ? dims.x - block.x : 4;
+    const uint ny = block.y + 4 > dims.y ? dims.y - block.y : 4;
+    gather_partial2(fblock, scalars + offset, nx, ny, stride.x, stride.y);
+
+  }
+  else
+  {
+    gather2(fblock, scalars + offset, stride.x, stride.y);
+  }
+
+  zfp_encode_block<Scalar, ZFP_2D_BLOCK_SIZE>(fblock, maxbits, block_idx, stream);  
+
+}
+
+//
+// Launch the encode kernel
+//
+// Pads dims up to whole 4x4 zfp blocks, rounds the block count up to a
+// multiple of the CUDA block size, zeroes the output stream (the
+// encoder ORs bits into it), and launches cudaEncode2.  Returns the
+// allocated stream size in bytes.
+template<class Scalar>
+size_t encode2launch(uint2 dims, 
+                     int2 stride,
+                     const Scalar *d_data,
+                     Word *stream,
+                     const int maxbits)
+{
+  const int cuda_block_size = 128;
+  dim3 block_size = dim3(cuda_block_size, 1, 1);
+
+  // pad each dimension to a multiple of 4 (zfp block width)
+  uint2 zfp_pad(dims); 
+  if(zfp_pad.x % 4 != 0) zfp_pad.x += 4 - dims.x % 4;
+  if(zfp_pad.y % 4 != 0) zfp_pad.y += 4 - dims.y % 4;
+
+  const uint zfp_blocks = (zfp_pad.x * zfp_pad.y) / 16; 
+
+  //
+  // we need to ensure that we launch a multiple of the 
+  // cuda block size
+  //
+  int block_pad = 0; 
+  if(zfp_blocks % cuda_block_size != 0)
+  {
+    block_pad = cuda_block_size - zfp_blocks % cuda_block_size; 
+  }
+
+  size_t total_blocks = block_pad + zfp_blocks;
+
+  dim3 grid_size = calculate_grid_size(total_blocks, cuda_block_size);
+
+  //
+  size_t stream_bytes = calc_device_mem2d(zfp_pad, maxbits);
+  // ensure we have zeros
+  cudaMemset(stream, 0, stream_bytes);
+
+#ifdef CUDA_ZFP_RATE_PRINT
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start);
+#endif
+
+	cudaEncode2<Scalar> << <grid_size, block_size>> >
+    (maxbits,
+     d_data,
+     stream,
+     dims,
+     stride,
+     zfp_pad,
+     zfp_blocks);
+
+#ifdef CUDA_ZFP_RATE_PRINT
+  cudaDeviceSynchronize();
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaStreamSynchronize(0);
+
+  float miliseconds = 0.f;
+  cudaEventElapsedTime(&miliseconds, start, stop);
+  float seconds = miliseconds / 1000.f;
+  // NOTE(review): despite the name, 'mb' holds GiB (divided by 1024^3)
+  float mb = (float(dims.x * dims.y) * sizeof(Scalar)) / (1024.f * 1024.f *1024.f);
+  float rate = mb / seconds;
+  printf("Encode elapsed time: %.5f (s)\n", seconds);
+  printf("# encode2 rate: %.2f (GB / sec) %d\n", rate, maxbits);
+#endif
+  return stream_bytes;
+}
+
+// Public 2D entry point: forwards to the launch routine and reports the
+// number of stream bytes reserved for the encoding.
+template<class Scalar>
+size_t encode2(uint2 dims,
+               int2 stride,
+               Scalar *d_data,
+               Word *stream,
+               const int maxbits)
+{
+  const size_t stream_bytes =
+    encode2launch<Scalar>(dims, stride, d_data, stream, maxbits);
+  return stream_bytes;
+}
+
+}
+
+#endif
diff --git a/zfp/src/cuda_zfp/encode3.cuh b/zfp/src/cuda_zfp/encode3.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..9fe7ddd24df1244db414a901cf55ef951e27e84a
--- /dev/null
+++ b/zfp/src/cuda_zfp/encode3.cuh
@@ -0,0 +1,194 @@
+#ifndef CUZFP_ENCODE3_CUH
+#define CUZFP_ENCODE3_CUH
+
+#include "cuZFP.h"
+#include "shared.h"
+#include "encode.cuh"
+#include "type_info.cuh"
+
+#define ZFP_3D_BLOCK_SIZE 64
+namespace cuZFP{
+
+// Gather a partial 3D block: copy the nx-by-ny-by-nz valid samples
+// (strides sx, sy, sz) into q, then pad q out to a full 4x4x4 block
+// along each axis in turn.
+template<typename Scalar>
+__device__ __host__ inline
+void gather_partial3(Scalar* q, const Scalar* p, int nx, int ny, int nz, int sx, int sy, int sz)
+{
+  uint x, y, z;
+  for (z = 0; z < nz; z++, p += sz - ny * sy) {
+    for (y = 0; y < ny; y++, p += sy - nx * sx) {
+      for (x = 0; x < nx; x++, p += sx)
+        q[16 * z + 4 * y + x] = *p;
+      // runs once per (z, y) row: the braceless x loop above covers only
+      // the assignment.  The original indentation misleadingly placed
+      // this call inside the x loop; re-indented to match semantics.
+      pad_block(q + 16 * z + 4 * y, nx, 1);
+    }
+    // pad remaining rows of this z-slice along y (stride 4 within q)
+    for (x = 0; x < 4; x++)
+      pad_block(q + 16 * z + x, ny, 4);
+  }
+  // pad remaining z-slices (stride 16 within q)
+  for (y = 0; y < 4; y++)
+    for (x = 0; x < 4; x++)
+      pad_block(q + 4 * y + x, nz, 16);
+}
+
+// Gather a complete 4x4x4 block (strides sx, sy, sz); q is filled in
+// z-major block order.
+template<typename Scalar>
+__device__ __host__ inline
+void gather3(Scalar* q, const Scalar* p, int sx, int sy, int sz)
+{
+  for (int z = 0; z < 4; z++)
+    for (int y = 0; y < 4; y++)
+      for (int x = 0; x < 4; x++)
+        q[16 * z + 4 * y + x] = p[z * sz + y * sy + x * sx];
+}
+
+// Kernel: each CUDA thread encodes one 4x4x4 zfp block of the 3D array
+// 'scalars' (element strides in 'stride') into the bit stream 'stream'.
+// padded_dims are dims rounded up to multiples of 4; tot_blocks is the
+// real zfp block count (the launch may over-provision threads).
+template<class Scalar>
+__global__
+void 
+cudaEncode(const uint maxbits,
+           const Scalar* scalars,
+           Word *stream,
+           const uint3 dims,
+           const int3 stride,
+           const uint3 padded_dims,
+           const uint tot_blocks)
+{
+
+  typedef unsigned long long int ull;
+  typedef long long int ll;
+  // flatten the launch grid to a linear CUDA block id
+  const ull blockId = blockIdx.x +
+                      blockIdx.y * gridDim.x +
+                      gridDim.x * gridDim.y * blockIdx.z;
+
+  // each thread gets a block so the block index is 
+  // the global thread index
+  const uint block_idx = blockId * blockDim.x + threadIdx.x;
+
+  if(block_idx >= tot_blocks)
+  {
+    // we can't launch the exact number of blocks
+    // so just exit if this isn't real
+    return;
+  }
+
+  // zfp block counts along each (padded) axis
+  uint3 block_dims;
+  block_dims.x = padded_dims.x >> 2; 
+  block_dims.y = padded_dims.y >> 2; 
+  block_dims.z = padded_dims.z >> 2; 
+
+  // logical pos in 3d array
+  uint3 block;
+  block.x = (block_idx % block_dims.x) * 4; 
+  block.y = ((block_idx/ block_dims.x) % block_dims.y) * 4; 
+  block.z = (block_idx/ (block_dims.x * block_dims.y)) * 4; 
+
+  // flattened element offset of this block's origin
+  ll offset = (ll)block.x * stride.x + (ll)block.y * stride.y + (ll)block.z * stride.z; 
+  Scalar fblock[ZFP_3D_BLOCK_SIZE]; 
+
+  // edge blocks may extend past the real extents and need padding
+  bool partial = false;
+  if(block.x + 4 > dims.x) partial = true;
+  if(block.y + 4 > dims.y) partial = true;
+  if(block.z + 4 > dims.z) partial = true;
+ 
+  if(partial) 
+  {
+    // clamp to the number of valid samples along each axis
+    const uint nx = block.x + 4 > dims.x ? dims.x - block.x : 4;
+    const uint ny = block.y + 4 > dims.y ? dims.y - block.y : 4;
+    const uint nz = block.z + 4 > dims.z ? dims.z - block.z : 4;
+    gather_partial3(fblock, scalars + offset, nx, ny, nz, stride.x, stride.y, stride.z);
+
+  }
+  else
+  {
+    gather3(fblock, scalars + offset, stride.x, stride.y, stride.z);
+  }
+  zfp_encode_block<Scalar, ZFP_3D_BLOCK_SIZE>(fblock, maxbits, block_idx, stream);  
+
+}
+
+//
+// Launch the encode kernel
+//
+// Pads dims up to whole 4x4x4 zfp blocks, rounds the block count up to
+// a multiple of the CUDA block size, zeroes the output stream (the
+// encoder ORs bits into it), and launches cudaEncode.  Returns the
+// allocated stream size in bytes.
+template<class Scalar>
+size_t encode3launch(uint3 dims, 
+                     int3 stride,
+                     const Scalar *d_data,
+                     Word *stream,
+                     const int maxbits)
+{
+
+  const int cuda_block_size = 128;
+  dim3 block_size = dim3(cuda_block_size, 1, 1);
+
+  // pad each dimension to a multiple of 4 (zfp block width)
+  uint3 zfp_pad(dims); 
+  if(zfp_pad.x % 4 != 0) zfp_pad.x += 4 - dims.x % 4;
+  if(zfp_pad.y % 4 != 0) zfp_pad.y += 4 - dims.y % 4;
+  if(zfp_pad.z % 4 != 0) zfp_pad.z += 4 - dims.z % 4;
+
+  const uint zfp_blocks = (zfp_pad.x * zfp_pad.y * zfp_pad.z) / 64; 
+
+  //
+  // we need to ensure that we launch a multiple of the 
+  // cuda block size
+  //
+  int block_pad = 0; 
+  if(zfp_blocks % cuda_block_size != 0)
+  {
+    block_pad = cuda_block_size - zfp_blocks % cuda_block_size; 
+  }
+
+  size_t total_blocks = block_pad + zfp_blocks;
+
+  dim3 grid_size = calculate_grid_size(total_blocks, cuda_block_size);
+
+  size_t stream_bytes = calc_device_mem3d(zfp_pad, maxbits);
+  //ensure we start with 0s
+  cudaMemset(stream, 0, stream_bytes);
+
+#ifdef CUDA_ZFP_RATE_PRINT
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start);
+#endif
+
+	cudaEncode<Scalar> << <grid_size, block_size>> >
+    (maxbits,
+     d_data,
+     stream,
+     dims,
+     stride,
+     zfp_pad,
+     zfp_blocks);
+
+#ifdef CUDA_ZFP_RATE_PRINT
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaStreamSynchronize(0);
+
+  float miliseconds = 0;
+  cudaEventElapsedTime(&miliseconds, start, stop);
+  float seconds = miliseconds / 1000.f;
+  // throughput of the uncompressed data volume, reported in GiB/s
+  float rate = (float(dims.x * dims.y * dims.z) * sizeof(Scalar) ) / seconds;
+  rate /= 1024.f;
+  rate /= 1024.f;
+  rate /= 1024.f;
+  printf("Encode elapsed time: %.5f (s)\n", seconds);
+  printf("# encode3 rate: %.2f (GB / sec) \n", rate);
+#endif
+  return stream_bytes;
+}
+
+//
+// Public 3D entry point: forwards to the launch routine and reports the
+// number of stream bytes reserved for the encoding.
+//
+template<class Scalar>
+size_t encode(uint3 dims,
+              int3 stride,
+              Scalar *d_data,
+              Word *stream,
+              const int bits_per_block)
+{
+  const size_t stream_bytes =
+    encode3launch<Scalar>(dims, stride, d_data, stream, bits_per_block);
+  return stream_bytes;
+}
+
+}
+#endif
diff --git a/zfp/src/cuda_zfp/pointers.cuh b/zfp/src/cuda_zfp/pointers.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..ee8d773bc806ced70d0d5bab707bbaec83e903af
--- /dev/null
+++ b/zfp/src/cuda_zfp/pointers.cuh
@@ -0,0 +1,25 @@
+#ifndef CUZFP_POINTERS_CUH
+#define CUZFP_POINTERS_CUH
+
+#include "ErrorCheck.h"
+#include <iostream>
+
+
+namespace cuZFP
+{
+// https://gitlab.kitware.com/third-party/nvpipe/blob/master/encode.c
+// Returns true iff ptr refers to CUDA device memory.  Host or
+// unregistered pointers make cudaPointerGetAttributes fail, which is
+// treated as "not a GPU pointer".
+bool is_gpu_ptr(const void *ptr)
+{
+  cudaPointerAttributes atts; 
+  const cudaError_t perr = cudaPointerGetAttributes(&atts, ptr);
+
+  // clear last error so other error checking does 
+  // not pick it up
+  // (a failed cudaPointerGetAttributes leaves a sticky error code)
+  cudaError_t error = cudaGetLastError();
+
+  // NOTE(review): cudaPointerAttributes::memoryType is deprecated since
+  // CUDA 10 (replaced by 'type'); confirm the minimum toolkit supported
+  return perr == cudaSuccess && atts.memoryType == cudaMemoryTypeDevice;
+}
+
+} // namespace cuZFP 
+
+#endif
diff --git a/zfp/src/cuda_zfp/shared.h b/zfp/src/cuda_zfp/shared.h
new file mode 100644
index 0000000000000000000000000000000000000000..52de03adc38cd2e9872ac0ae194698384ead298a
--- /dev/null
+++ b/zfp/src/cuda_zfp/shared.h
@@ -0,0 +1,274 @@
+#ifndef CUZFP_SHARED_H
+#define CUZFP_SHARED_H
+
+//#define CUDA_ZFP_RATE_PRINT 1
+typedef unsigned long long Word;
+#define Wsize ((uint)(CHAR_BIT * sizeof(Word)))
+
+#include "type_info.cuh"
+#include "zfp.h"
+#include <stdio.h>
+
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+#define bitsize(x) (CHAR_BIT * (uint)sizeof(x))
+
+#define LDEXP(x, e) ldexp(x, e)
+
+#define NBMASK 0xaaaaaaaaaaaaaaaaull
+
+__constant__ unsigned char c_perm_1[4];
+__constant__ unsigned char c_perm_2[16];
+__constant__ unsigned char c_perm[64];
+
+namespace cuZFP
+{
+
+// Debug helper: print the binary representation of 'bits', most
+// significant bit first, followed by a newline.
+template<typename T>
+__device__ void print_bits(const T &bits)
+{
+  const int bit_size = sizeof(T) * 8;
+
+  for(int i = bit_size - 1; i >= 0; --i)
+  {
+    T one = 1;
+    T mask = one << i;
+    // isolate bit i and shift it down to the units place
+    T val = (bits & mask) >> i ;
+    printf("%d", (int) val);
+  }
+  printf("\n");
+}
+
+// Bytes of Word storage needed for a 1D array: one maxbits-bit budget
+// per 4-value block, rounded up to whole blocks and whole words.
+size_t calc_device_mem1d(const int dim, 
+                         const int maxbits)
+{
+  const size_t vals_per_block = 4;
+  const size_t total_blocks =
+    (dim + vals_per_block - 1) / vals_per_block;
+  const size_t bits_per_word = sizeof(Word) * 8;
+  const size_t total_bits = (size_t)maxbits * total_blocks;
+  const size_t alloc_size =
+    (total_bits + bits_per_word - 1) / bits_per_word;
+  return alloc_size * sizeof(Word);
+}
+
+// Bytes of Word storage needed for a 2D array: one maxbits-bit budget
+// per 16-value (4x4) block, rounded up to whole blocks and whole words.
+size_t calc_device_mem2d(const uint2 dims, 
+                         const int maxbits)
+{
+  const size_t vals_per_block = 16;
+  // compute the element count in size_t: the 32-bit uint product
+  // dims.x * dims.y overflows for arrays of 2^32 elements or more
+  const size_t num_vals = (size_t)dims.x * (size_t)dims.y;
+  size_t total_blocks = num_vals / vals_per_block; 
+  if(num_vals % vals_per_block != 0) total_blocks++;
+  const size_t bits_per_block = maxbits;
+  const size_t bits_per_word = sizeof(Word) * 8;
+  const size_t total_bits = bits_per_block * total_blocks;
+  size_t alloc_size = total_bits / bits_per_word;
+  if(total_bits % bits_per_word != 0) alloc_size++;
+  return alloc_size * sizeof(Word);
+}
+
+// Bytes of Word storage needed for a 3D array whose dimensions are
+// already padded to multiples of 4, so the element count divides evenly
+// into 64-value (4x4x4) blocks.
+size_t calc_device_mem3d(const uint3 encoded_dims, 
+                         const int bits_per_block)
+{
+  const size_t vals_per_block = 64;
+  // compute in size_t: the 32-bit uint product overflows at 2^32 elements
+  const size_t size = (size_t)encoded_dims.x * encoded_dims.y * encoded_dims.z; 
+  size_t total_blocks = size / vals_per_block; 
+  const size_t bits_per_word = sizeof(Word) * 8;
+  const size_t total_bits = (size_t)bits_per_block * total_blocks;
+  // round up to whole words, consistent with the 1D/2D variants
+  size_t alloc_size = total_bits / bits_per_word;
+  if(total_bits % bits_per_word != 0) alloc_size++;
+  return alloc_size * sizeof(Word);
+}
+
+// Query device 0 for its maximum launch-grid dimensions.
+dim3 get_max_grid_dims()
+{
+  cudaDeviceProp prop; 
+  cudaGetDeviceProperties(&prop, 0);
+  return dim3(prop.maxGridSize[0],
+              prop.maxGridSize[1],
+              prop.maxGridSize[2]);
+}
+
+// size is assumed to have a pad to the nearest cuda block size
+// Chooses a 1/2/3-dimensional launch grid large enough to cover
+// size / cuda_block_size CUDA blocks without exceeding the device's
+// per-axis grid limits.
+dim3 calculate_grid_size(size_t size, size_t cuda_block_size)
+{
+  size_t grids = size / cuda_block_size; // because of pad this will be exact
+  dim3 max_grid_dims = get_max_grid_dims();
+  int dims  = 1;
+  // check to see if we need to add more grids
+  if( grids > max_grid_dims.x)
+  {
+    dims = 2; 
+  }
+  if(grids > max_grid_dims.x * max_grid_dims.y)
+  {
+    dims = 3;
+  }
+
+  dim3 grid_size;
+  grid_size.x = 1;
+  grid_size.y = 1;
+  grid_size.z = 1;
+ 
+  if(dims == 1)
+  {
+    grid_size.x = grids; 
+  }
+
+  if(dims == 2)
+  {
+    // roughly square grid: base x base, plus extra y-rows for the rest
+    float sq_r = sqrt((float)grids);
+    float intpart = 0.;
+    modf(sq_r,&intpart); 
+    uint base = intpart;
+    grid_size.x = base; 
+    grid_size.y = base; 
+    // figure out how many y to add
+    // BUG FIX: the remainder must be measured in CUDA blocks ('grids'),
+    // not padded elements ('size'); using 'size' over-allocated the grid
+    // by roughly a factor of cuda_block_size (the excess blocks exited
+    // early via the tot_blocks guard, but were still scheduled)
+    uint rem = (grids - base * base);
+    uint y_rows = rem / base;
+    if(rem % base != 0) y_rows ++;
+    grid_size.y += y_rows; 
+  }
+
+  if(dims == 3)
+  {
+    // roughly cubic grid: base^3, plus extra z-slabs for the rest
+    float cub_r = pow((float)grids, 1.f/3.f);
+    float intpart = 0.;
+    modf(cub_r,&intpart); 
+    int base = intpart;
+    grid_size.x = base; 
+    grid_size.y = base; 
+    grid_size.z = base; 
+    // figure out how many z to add (same fix: remainder in CUDA blocks)
+    uint rem = (grids - base * base * base);
+    uint z_rows = rem / (base * base);
+    if(rem % (base * base) != 0) z_rows ++;
+    grid_size.z += z_rows; 
+  }
+
+  return grid_size;
+}
+
+
+// map two's complement signed integer to negabinary unsigned integer
+// (64-bit variant): adding the alternating mask 0b1010... and XORing it
+// back out performs the conversion via the carries the add produces
+inline __device__ 
+unsigned long long int int2uint(const long long int x)
+{
+    return (x + (unsigned long long int)0xaaaaaaaaaaaaaaaaull) ^ 
+                (unsigned long long int)0xaaaaaaaaaaaaaaaaull;
+}
+
+// 32-bit overload of the negabinary mapping above
+inline __device__ 
+unsigned int int2uint(const int x)
+{
+    return (x + (unsigned int)0xaaaaaaaau) ^ 
+                (unsigned int)0xaaaaaaaau;
+}
+
+
+// Scale decoded block integer x back to floating point:
+// x * 2^(e - (w - 2)), where w is the scalar type's bit width and e the
+// block's common exponent.
+template<typename Int, typename Scalar>
+__device__
+Scalar
+dequantize(const Int &x, const int &e);
+
+template<>
+__device__
+double
+dequantize<long long int, double>(const long long int &x, const int &e)
+{
+	return LDEXP((double)x, e - (CHAR_BIT * scalar_sizeof<double>() - 2));
+}
+
+template<>
+__device__
+float
+dequantize<int, float>(const int &x, const int &e)
+{
+	return LDEXP((float)x, e - (CHAR_BIT * scalar_sizeof<float>() - 2));
+}
+
+// Integer-typed data is not exponent-scaled; the constant 1 appears to
+// be a placeholder — presumably callers on integer paths ignore the
+// value.  TODO(review): confirm against the decode callers.
+template<>
+__device__
+int
+dequantize<int, int>(const int &x, const int &e)
+{
+	return 1;
+}
+
+template<>
+__device__
+long long int
+dequantize<long long int, long long int>(const long long int &x, const int &e)
+{
+	return 1;
+}
+
+/* inverse lifting transform of 4-vector */
+// In-place inverse of zfp's forward decorrelating lift on four values
+// spaced s elements apart.  Each shift/add statement undoes one forward
+// step; the statement order is significant — do not reorder.
+template<class Int, uint s>
+__device__
+static void
+inv_lift(Int* p)
+{
+	Int x, y, z, w;
+	x = *p; p += s;
+	y = *p; p += s;
+	z = *p; p += s;
+	w = *p; p += s;
+
+	/*
+	** non-orthogonal transform
+	**       ( 4  6 -4 -1) (x)
+	** 1/4 * ( 4  2  4  5) (y)
+	**       ( 4 -2  4 -5) (z)
+	**       ( 4 -6 -4  1) (w)
+	*/
+	y += w >> 1; w -= y >> 1;
+	y += w; w <<= 1; w -= y;
+	z += x; x <<= 1; x -= z;
+	y += z; z <<= 1; z -= y;
+	w += x; x <<= 1; x -= w;
+
+	// write the results back in reverse (p is one stride past w here)
+	p -= s; *p = w;
+	p -= s; *p = z;
+	p -= s; *p = y;
+	p -= s; *p = x;
+}
+
+
+// Returns the constant-memory coefficient-permutation table for a given
+// block size (c_perm* are __constant__ arrays declared above —
+// presumably populated by host-side setup; verify before relying on it).
+template<int BlockSize>
+__device__
+unsigned char* get_perm();
+
+// 4x4x4 (3D) blocks
+template<>
+__device__
+unsigned char* get_perm<64>()
+{
+  return c_perm;
+}
+
+// 4x4 (2D) blocks
+template<>
+__device__
+unsigned char* get_perm<16>()
+{
+  return c_perm_2;
+}
+
+// 4-value (1D) blocks
+template<>
+__device__
+unsigned char* get_perm<4>()
+{
+  return c_perm_1;
+}
+
+
+} // namespace cuZFP
+#endif
diff --git a/zfp/src/cuda_zfp/type_info.cuh b/zfp/src/cuda_zfp/type_info.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..969f5532eed77b705b40f8da3c1181f427e28226
--- /dev/null
+++ b/zfp/src/cuda_zfp/type_info.cuh
@@ -0,0 +1,92 @@
+#ifndef cuZFP_TYPE_INFO
+#define cuZFP_TYPE_INFO
+
+namespace cuZFP {
+
+// Per-scalar-type compile-time traits used by the CUDA encoder/decoder.
+
+// Exponent bias of the IEEE floating-point format (0 for integer types).
+template<typename T> inline __host__ __device__ int get_ebias();
+template<> inline __host__ __device__ int get_ebias<double>() { return 1023; }
+template<> inline __host__ __device__ int get_ebias<float>() { return 127; }
+template<> inline __host__ __device__ int get_ebias<long long int>() { return 0; }
+template<> inline __host__ __device__ int get_ebias<int>() { return 0; }
+
+// Number of exponent bits in the format (0 for integer types).
+template<typename T> inline __host__ __device__ int get_ebits();
+template<> inline __host__ __device__ int get_ebits<double>() { return 11; }
+template<> inline __host__ __device__ int get_ebits<float>() { return 8; }
+template<> inline __host__ __device__ int get_ebits<int>() { return 0; }
+template<> inline __host__ __device__ int get_ebits<long long int>() { return 0; }
+
+// Bit width of the block-integer representation for each scalar type.
+template<typename T> inline __host__ __device__ int get_precision();
+template<> inline __host__ __device__ int get_precision<double>() { return 64; }
+template<> inline __host__ __device__ int get_precision<long long int>() { return 64; }
+template<> inline __host__ __device__ int get_precision<float>() { return 32; }
+template<> inline __host__ __device__ int get_precision<int>() { return 32; }
+
+// Smallest representable exponent (0 for integer types).
+// NOTE(review): -1074 is the double subnormal minimum; for float the
+// IEEE value would be -149 — confirm whether the float entry is intended.
+template<typename T> inline __host__ __device__ int get_min_exp();
+template<> inline __host__ __device__ int get_min_exp<double>() { return -1074; }
+template<> inline __host__ __device__ int get_min_exp<float>() { return -1074; }
+template<> inline __host__ __device__ int get_min_exp<long long int>() { return 0; }
+template<> inline __host__ __device__ int get_min_exp<int>() { return 0; }
+
+// sizeof the scalar type, usable in device code as a constant.
+template<typename T> inline __host__ __device__ int scalar_sizeof();
+
+template<> inline __host__ __device__ int scalar_sizeof<double>() { return 8; }
+template<> inline __host__ __device__ int scalar_sizeof<long long int>() { return 8; }
+template<> inline __host__ __device__ int scalar_sizeof<float>() { return 4; }
+template<> inline __host__ __device__ int scalar_sizeof<int>() { return 4; }
+
+// Maps each scalar type to the signed/unsigned integer types of the
+// same width used internally by the codec.
+template<typename T> struct zfp_traits;
+
+template<> struct zfp_traits<double>
+{
+  typedef unsigned long long int UInt;
+  typedef long long int Int;
+};
+
+template<> struct zfp_traits<long long int>
+{
+  typedef unsigned long long int UInt;
+  typedef long long int Int;
+};
+
+template<> struct zfp_traits<float>
+{
+  typedef unsigned int UInt;
+  typedef int Int;
+};
+
+template<> struct zfp_traits<int>
+{
+  typedef unsigned int UInt;
+  typedef int Int;
+};
+
+// True only for the integer scalar types (int, long long int).
+template<typename T> inline __host__ __device__ bool is_int()
+{
+  return false;
+}
+
+template<> inline __host__ __device__ bool is_int<int>()
+{
+  return true;
+}
+
+template<> inline __host__ __device__ bool is_int<long long int>()
+{
+  return true;
+}
+
+// Selects an unsigned type with at least BlockSize/8 bits, used to hold
+// one bit per block value (bit-plane handling).
+template<int T> struct block_traits;
+
+template<> struct block_traits<1>
+{
+  typedef unsigned char PlaneType;
+};
+
+template<> struct block_traits<2>
+{
+  typedef unsigned short PlaneType;
+};
+
+
+} // namespace cuZFP
+#endif
diff --git a/zfp/src/decode1d.c b/zfp/src/decode1d.c
new file mode 100644
index 0000000000000000000000000000000000000000..93756bf2b6527e6983d1b6248a0b084227fe3233
--- /dev/null
+++ b/zfp/src/decode1d.c
@@ -0,0 +1,13 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block1.h"
+#include "traitsd.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codecf.c"
+#include "template/codec1.c"
+#include "template/decode.c"
+#include "template/decodef.c"
+#include "template/decode1.c"
diff --git a/zfp/src/decode1f.c b/zfp/src/decode1f.c
new file mode 100644
index 0000000000000000000000000000000000000000..55808b474f080058c443712dd2a1cc60615a5518
--- /dev/null
+++ b/zfp/src/decode1f.c
@@ -0,0 +1,13 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block1.h"
+#include "traitsf.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codecf.c"
+#include "template/codec1.c"
+#include "template/decode.c"
+#include "template/decodef.c"
+#include "template/decode1.c"
diff --git a/zfp/src/decode1i.c b/zfp/src/decode1i.c
new file mode 100644
index 0000000000000000000000000000000000000000..22529cc25d887d54e0c47e3374fb99f0a5033f0a
--- /dev/null
+++ b/zfp/src/decode1i.c
@@ -0,0 +1,12 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block1.h"
+#include "traitsi.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codec1.c"
+#include "template/decode.c"
+#include "template/decodei.c"
+#include "template/decode1.c"
diff --git a/zfp/src/decode1l.c b/zfp/src/decode1l.c
new file mode 100644
index 0000000000000000000000000000000000000000..b980cc5d697095560179177b803d73c91d4c9573
--- /dev/null
+++ b/zfp/src/decode1l.c
@@ -0,0 +1,12 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block1.h"
+#include "traitsl.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codec1.c"
+#include "template/decode.c"
+#include "template/decodei.c"
+#include "template/decode1.c"
diff --git a/zfp/src/decode2d.c b/zfp/src/decode2d.c
new file mode 100644
index 0000000000000000000000000000000000000000..2f72c9fc6f5586a0145ecc4ae7aec53d1a5af56b
--- /dev/null
+++ b/zfp/src/decode2d.c
@@ -0,0 +1,13 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block2.h"
+#include "traitsd.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codecf.c"
+#include "template/codec2.c"
+#include "template/decode.c"
+#include "template/decodef.c"
+#include "template/decode2.c"
diff --git a/zfp/src/decode2f.c b/zfp/src/decode2f.c
new file mode 100644
index 0000000000000000000000000000000000000000..a1caffb2a466a056753ad7ee68e1a25e8605c77b
--- /dev/null
+++ b/zfp/src/decode2f.c
@@ -0,0 +1,13 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block2.h"
+#include "traitsf.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codecf.c"
+#include "template/codec2.c"
+#include "template/decode.c"
+#include "template/decodef.c"
+#include "template/decode2.c"
diff --git a/zfp/src/decode2i.c b/zfp/src/decode2i.c
new file mode 100644
index 0000000000000000000000000000000000000000..65de16ba8e30e97c30d8f5441fdd1ead6a365202
--- /dev/null
+++ b/zfp/src/decode2i.c
@@ -0,0 +1,12 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block2.h"
+#include "traitsi.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codec2.c"
+#include "template/decode.c"
+#include "template/decodei.c"
+#include "template/decode2.c"
diff --git a/zfp/src/decode2l.c b/zfp/src/decode2l.c
new file mode 100644
index 0000000000000000000000000000000000000000..0ced03504bedd79f58679f51eecf959ad7f941da
--- /dev/null
+++ b/zfp/src/decode2l.c
@@ -0,0 +1,12 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block2.h"
+#include "traitsl.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codec2.c"
+#include "template/decode.c"
+#include "template/decodei.c"
+#include "template/decode2.c"
diff --git a/zfp/src/decode3d.c b/zfp/src/decode3d.c
new file mode 100644
index 0000000000000000000000000000000000000000..918741fc2dff7ed08fc270b1306a3f2d416f27fd
--- /dev/null
+++ b/zfp/src/decode3d.c
@@ -0,0 +1,13 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block3.h"
+#include "traitsd.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codecf.c"
+#include "template/codec3.c"
+#include "template/decode.c"
+#include "template/decodef.c"
+#include "template/decode3.c"
diff --git a/zfp/src/decode3f.c b/zfp/src/decode3f.c
new file mode 100644
index 0000000000000000000000000000000000000000..30587a7709e015891a40a2e3d27a25a99172b14d
--- /dev/null
+++ b/zfp/src/decode3f.c
@@ -0,0 +1,13 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block3.h"
+#include "traitsf.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codecf.c"
+#include "template/codec3.c"
+#include "template/decode.c"
+#include "template/decodef.c"
+#include "template/decode3.c"
diff --git a/zfp/src/decode3i.c b/zfp/src/decode3i.c
new file mode 100644
index 0000000000000000000000000000000000000000..aa30070dcd6ad637089338034b4595003bcde4ac
--- /dev/null
+++ b/zfp/src/decode3i.c
@@ -0,0 +1,12 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block3.h"
+#include "traitsi.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codec3.c"
+#include "template/decode.c"
+#include "template/decodei.c"
+#include "template/decode3.c"
diff --git a/zfp/src/decode3l.c b/zfp/src/decode3l.c
new file mode 100644
index 0000000000000000000000000000000000000000..1796b79355b6661795bf7f5a7df58376744945e6
--- /dev/null
+++ b/zfp/src/decode3l.c
@@ -0,0 +1,12 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block3.h"
+#include "traitsl.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codec3.c"
+#include "template/decode.c"
+#include "template/decodei.c"
+#include "template/decode3.c"
diff --git a/zfp/src/decode4d.c b/zfp/src/decode4d.c
new file mode 100644
index 0000000000000000000000000000000000000000..500e802a69f783a8c89c5398e73413adbe46741b
--- /dev/null
+++ b/zfp/src/decode4d.c
@@ -0,0 +1,13 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block4.h"
+#include "traitsd.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codecf.c"
+#include "template/codec4.c"
+#include "template/decode.c"
+#include "template/decodef.c"
+#include "template/decode4.c"
diff --git a/zfp/src/decode4f.c b/zfp/src/decode4f.c
new file mode 100644
index 0000000000000000000000000000000000000000..de15b84fa06f9599cb48a67d9a34be82035153d7
--- /dev/null
+++ b/zfp/src/decode4f.c
@@ -0,0 +1,13 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block4.h"
+#include "traitsf.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codecf.c"
+#include "template/codec4.c"
+#include "template/decode.c"
+#include "template/decodef.c"
+#include "template/decode4.c"
diff --git a/zfp/src/decode4i.c b/zfp/src/decode4i.c
new file mode 100644
index 0000000000000000000000000000000000000000..1bfe4aaf28ec2a744adc7e7633c6064e5266d4e1
--- /dev/null
+++ b/zfp/src/decode4i.c
@@ -0,0 +1,12 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block4.h"
+#include "traitsi.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codec4.c"
+#include "template/decode.c"
+#include "template/decodei.c"
+#include "template/decode4.c"
diff --git a/zfp/src/decode4l.c b/zfp/src/decode4l.c
new file mode 100644
index 0000000000000000000000000000000000000000..950f8a0b1a963923fa723970a7a55d0bb48eb456
--- /dev/null
+++ b/zfp/src/decode4l.c
@@ -0,0 +1,12 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block4.h"
+#include "traitsl.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codec4.c"
+#include "template/decode.c"
+#include "template/decodei.c"
+#include "template/decode4.c"
diff --git a/zfp/src/encode1d.c b/zfp/src/encode1d.c
new file mode 100644
index 0000000000000000000000000000000000000000..c96147497272674d1019c3d0ca2033ebedaa7f0e
--- /dev/null
+++ b/zfp/src/encode1d.c
@@ -0,0 +1,13 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block1.h"
+#include "traitsd.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codecf.c"
+#include "template/codec1.c"
+#include "template/encode.c"
+#include "template/encodef.c"
+#include "template/encode1.c"
diff --git a/zfp/src/encode1f.c b/zfp/src/encode1f.c
new file mode 100644
index 0000000000000000000000000000000000000000..9e922e516e9ac3d9ab4d804e12f11a77d4c5a27c
--- /dev/null
+++ b/zfp/src/encode1f.c
@@ -0,0 +1,13 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block1.h"
+#include "traitsf.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codecf.c"
+#include "template/codec1.c"
+#include "template/encode.c"
+#include "template/encodef.c"
+#include "template/encode1.c"
diff --git a/zfp/src/encode1i.c b/zfp/src/encode1i.c
new file mode 100644
index 0000000000000000000000000000000000000000..2d4a8b6a5eec2ec115e282e79030ce4f3ce29a83
--- /dev/null
+++ b/zfp/src/encode1i.c
@@ -0,0 +1,12 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block1.h"
+#include "traitsi.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codec1.c"
+#include "template/encode.c"
+#include "template/encodei.c"
+#include "template/encode1.c"
diff --git a/zfp/src/encode1l.c b/zfp/src/encode1l.c
new file mode 100644
index 0000000000000000000000000000000000000000..746539bbef13cf1a1348af5efa12dbc4b1f5dd00
--- /dev/null
+++ b/zfp/src/encode1l.c
@@ -0,0 +1,12 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block1.h"
+#include "traitsl.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codec1.c"
+#include "template/encode.c"
+#include "template/encodei.c"
+#include "template/encode1.c"
diff --git a/zfp/src/encode2d.c b/zfp/src/encode2d.c
new file mode 100644
index 0000000000000000000000000000000000000000..053efe5e51dbddf40ba6a9b69a4e49dddb3be082
--- /dev/null
+++ b/zfp/src/encode2d.c
@@ -0,0 +1,13 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block2.h"
+#include "traitsd.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codecf.c"
+#include "template/codec2.c"
+#include "template/encode.c"
+#include "template/encodef.c"
+#include "template/encode2.c"
diff --git a/zfp/src/encode2f.c b/zfp/src/encode2f.c
new file mode 100644
index 0000000000000000000000000000000000000000..52321e798dbddba9c169ab74dab252ee108d8e40
--- /dev/null
+++ b/zfp/src/encode2f.c
@@ -0,0 +1,13 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block2.h"
+#include "traitsf.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codecf.c"
+#include "template/codec2.c"
+#include "template/encode.c"
+#include "template/encodef.c"
+#include "template/encode2.c"
diff --git a/zfp/src/encode2i.c b/zfp/src/encode2i.c
new file mode 100644
index 0000000000000000000000000000000000000000..c67d0ed019a87c284fdbda56884bb7458aad3a66
--- /dev/null
+++ b/zfp/src/encode2i.c
@@ -0,0 +1,12 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block2.h"
+#include "traitsi.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codec2.c"
+#include "template/encode.c"
+#include "template/encodei.c"
+#include "template/encode2.c"
diff --git a/zfp/src/encode2l.c b/zfp/src/encode2l.c
new file mode 100644
index 0000000000000000000000000000000000000000..990bc0104dac9124a146345160aa13bf9257119b
--- /dev/null
+++ b/zfp/src/encode2l.c
@@ -0,0 +1,12 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block2.h"
+#include "traitsl.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codec2.c"
+#include "template/encode.c"
+#include "template/encodei.c"
+#include "template/encode2.c"
diff --git a/zfp/src/encode3d.c b/zfp/src/encode3d.c
new file mode 100644
index 0000000000000000000000000000000000000000..4d82484906f22cfe503383fbcf60eb466d946a0f
--- /dev/null
+++ b/zfp/src/encode3d.c
@@ -0,0 +1,13 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block3.h"
+#include "traitsd.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codecf.c"
+#include "template/codec3.c"
+#include "template/encode.c"
+#include "template/encodef.c"
+#include "template/encode3.c"
diff --git a/zfp/src/encode3f.c b/zfp/src/encode3f.c
new file mode 100644
index 0000000000000000000000000000000000000000..0a95c899781bd80d355fec90d1ad424dde58b6bd
--- /dev/null
+++ b/zfp/src/encode3f.c
@@ -0,0 +1,13 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block3.h"
+#include "traitsf.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codecf.c"
+#include "template/codec3.c"
+#include "template/encode.c"
+#include "template/encodef.c"
+#include "template/encode3.c"
diff --git a/zfp/src/encode3i.c b/zfp/src/encode3i.c
new file mode 100644
index 0000000000000000000000000000000000000000..6c78aac34aa4973aad965580cb1d3938408ba19b
--- /dev/null
+++ b/zfp/src/encode3i.c
@@ -0,0 +1,12 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block3.h"
+#include "traitsi.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codec3.c"
+#include "template/encode.c"
+#include "template/encodei.c"
+#include "template/encode3.c"
diff --git a/zfp/src/encode3l.c b/zfp/src/encode3l.c
new file mode 100644
index 0000000000000000000000000000000000000000..931c7424b85e1ad99ddba1b8ae2fa592e935d8e4
--- /dev/null
+++ b/zfp/src/encode3l.c
@@ -0,0 +1,12 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block3.h"
+#include "traitsl.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codec3.c"
+#include "template/encode.c"
+#include "template/encodei.c"
+#include "template/encode3.c"
diff --git a/zfp/src/encode4d.c b/zfp/src/encode4d.c
new file mode 100644
index 0000000000000000000000000000000000000000..5ff58e7c4f2b421262759b7e6a17f36e6b9d5f68
--- /dev/null
+++ b/zfp/src/encode4d.c
@@ -0,0 +1,13 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block4.h"
+#include "traitsd.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codecf.c"
+#include "template/codec4.c"
+#include "template/encode.c"
+#include "template/encodef.c"
+#include "template/encode4.c"
diff --git a/zfp/src/encode4f.c b/zfp/src/encode4f.c
new file mode 100644
index 0000000000000000000000000000000000000000..ba24f586787370fe629d17b7b746c15948e7fbeb
--- /dev/null
+++ b/zfp/src/encode4f.c
@@ -0,0 +1,13 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block4.h"
+#include "traitsf.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codecf.c"
+#include "template/codec4.c"
+#include "template/encode.c"
+#include "template/encodef.c"
+#include "template/encode4.c"
diff --git a/zfp/src/encode4i.c b/zfp/src/encode4i.c
new file mode 100644
index 0000000000000000000000000000000000000000..6e9fc1bfc3e2ec0c6ba39d5823ff4ff57ec30f5c
--- /dev/null
+++ b/zfp/src/encode4i.c
@@ -0,0 +1,12 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block4.h"
+#include "traitsi.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codec4.c"
+#include "template/encode.c"
+#include "template/encodei.c"
+#include "template/encode4.c"
diff --git a/zfp/src/encode4l.c b/zfp/src/encode4l.c
new file mode 100644
index 0000000000000000000000000000000000000000..d5bf86c0916e7f22d378de4f96b4276e3bf6becb
--- /dev/null
+++ b/zfp/src/encode4l.c
@@ -0,0 +1,12 @@
+#include "inline/inline.h"
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "block4.h"
+#include "traitsl.h"
+#include "template/template.h"
+#include "template/codec.h"
+#include "inline/bitstream.c"
+#include "template/codec4.c"
+#include "template/encode.c"
+#include "template/encodei.c"
+#include "template/encode4.c"
diff --git a/zfp/src/inline/bitstream.c b/zfp/src/inline/bitstream.c
new file mode 100644
index 0000000000000000000000000000000000000000..6e96629254b33f39ea5bdb0baa6284c4992ea7cd
--- /dev/null
+++ b/zfp/src/inline/bitstream.c
@@ -0,0 +1,450 @@
+/*
+High-speed in-memory bit stream I/O that supports reading and writing between
+0 and 64 bits at a time.  The implementation, which relies heavily on bit
+shifts, has been carefully written to ensure that all shifts are between
+zero and one less than the width of the type being shifted to avoid undefined
+behavior.  This occasionally causes somewhat convoluted code.
+
+The following assumptions and restrictions apply:
+
+1. The user must allocate a memory buffer large enough to hold the bit stream,
+   whether for reading, writing, or both.  This buffer is associated with the
+   bit stream via stream_open(buffer, bytes), which allocates and returns a
+   pointer to an opaque bit stream struct.  Call stream_close(stream) to
+   deallocate this struct.
+
+2. The stream is either in a read or write state (or, initially, in both
+   states).  When done writing, call stream_flush(stream) before entering
+   read mode to ensure any buffered bits are output.  To enter read mode,
+   call stream_rewind(stream) or stream_rseek(stream, offset) to position
+   the stream at the beginning or at a particular bit offset.  Conversely,
+   stream_rewind(stream) or stream_wseek(stream, offset) positions the
+   stream for writing.  In read mode, the following functions may be called:
+
+     size_t stream_size(stream);
+     size_t stream_rtell(stream);
+     void stream_rewind(stream);
+     void stream_rseek(stream, offset);
+     void stream_skip(stream, uint n);
+     size_t stream_align(stream);
+     uint stream_read_bit(stream);
+     uint64 stream_read_bits(stream, n);
+
+   Each of the above read calls has a corresponding write call:
+
+     size_t stream_size(stream);
+     size_t stream_wtell(stream);
+     void stream_rewind(stream);
+     void stream_wseek(stream, offset);
+     void stream_pad(stream, n);
+     size_t stream_flush(stream);
+     uint stream_write_bit(stream, bit);
+     uint64 stream_write_bits(stream, value, n);
+
+3. The stream buffer is an unsigned integer of a user-specified type given
+   by the BIT_STREAM_WORD_TYPE macro.  Bits are read and written in units of
+   this integer word type.  Supported types are 8, 16, 32, or 64 bits wide.
+   The bit width of the buffer is denoted by 'wsize' and can be accessed via
+   the global constant stream_word_bits.  A small wsize allows for fine
+   granularity reads and writes, and may be preferable when working with many
+   small blocks of data that require non-sequential access.  The default
+   maximum size of 64 bits ensures maximum speed.  Note that even when
+   wsize < 64, it is still possible to read and write up to 64 bits at a time
+   using stream_read_bits() and stream_write_bits().
+
+4. If BIT_STREAM_STRIDED is defined, words read from or written to the stream
+   may be accessed noncontiguously by setting a power-of-two block size (which
+   by default is one word) and a block stride (defaults to zero blocks).  The
+   word pointer is always incremented by one word each time a word is accessed.
+   Once advanced past a block boundary, the word pointer is also advanced by
+   the stride to the next block.  This feature may be used to store blocks of
+   data interleaved, e.g. for progressive coding or for noncontiguous parallel
+   access to the bit stream.  Note that the block size is measured in words,
+   while the stride is measured in multiples of the block size.  Strided access
+   can have a significant performance penalty.
+
+5. Multiple bits are read and written in order of least to most significant
+   bit.  Thus, the statement
+
+       value = stream_write_bits(stream, value, n);
+
+   is essentially equivalent to (but faster than)
+
+       for (i = 0; i < n; i++, value >>= 1)
+         stream_write_bit(value & 1);
+
+   when 0 <= n <= 64.  The same holds for read calls, and thus
+
+       value = stream_read_bits(stream, n);
+
+   is essentially equivalent to
+
+       for (i = 0, value = 0; i < n; i++)
+         value += (uint64)stream_read_bit() << i;
+
+   Note that it is possible to write fewer bits than the argument 'value'
+   holds (possibly even no bits), in which case any unwritten bits are
+   returned.
+
+6. Although the stream_wseek(stream, offset) call allows positioning the
+   stream for writing at any bit offset without any data loss (i.e. all
+   previously written bits preceding the offset remain valid), for efficiency
+   the stream_flush(stream) operation will zero all bits up to the next
+   multiple of wsize bits, thus overwriting bits that were previously stored
+   at that location.  Consequently, random write access is effectively
+   supported only at wsize granularity.  For sequential access, the largest
+   possible wsize is preferred due to higher speed.
+
+7. It is up to the user to adhere to these rules.  For performance reasons,
+   no error checking is done, and in particular buffer overruns are not
+   caught.
+*/
+
+#include <limits.h>
+#include <stdlib.h>
+
+#ifndef inline_
+  #define inline_
+#endif
+
+/* bit stream word/buffer type; granularity of stream I/O operations */
+#ifdef BIT_STREAM_WORD_TYPE
+  /* may be 8-, 16-, 32-, or 64-bit unsigned integer type */
+  typedef BIT_STREAM_WORD_TYPE word;
+#else
+  /* use maximum word size by default for highest speed */
+  typedef uint64 word;
+#endif
+
+/* number of bits in a buffered word */
+#define wsize ((uint)(CHAR_BIT * sizeof(word)))
+
+/* bit stream structure (opaque to caller) */
+struct bitstream {
+  uint bits;   /* number of buffered bits (0 <= bits < wsize) */
+  word buffer; /* buffer for incoming/outgoing bits (buffer < 2^bits) */
+  word* ptr;   /* pointer to next word to be read/written */
+  word* begin; /* beginning of stream */
+  word* end;   /* end of stream (currently unused) */
+#ifdef BIT_STREAM_STRIDED
+  size_t mask;     /* one less the block size in number of words */
+  ptrdiff_t delta; /* number of words between consecutive blocks */
+#endif
+};
+
+/* private functions ------------------------------------------------------- */
+
+/* read a single word from memory */
+static word
+stream_read_word(bitstream* s)
+{
+  word w = *s->ptr++;
+#ifdef BIT_STREAM_STRIDED
+  if (!((s->ptr - s->begin) & s->mask))
+    s->ptr += s->delta;
+#endif
+  return w;
+}
+
+/* write a single word to memory */
+static void
+stream_write_word(bitstream* s, word value)
+{
+  *s->ptr++ = value;
+#ifdef BIT_STREAM_STRIDED
+  if (!((s->ptr - s->begin) & s->mask))
+    s->ptr += s->delta;
+#endif
+}
+
+/* public functions -------------------------------------------------------- */
+
+/* pointer to beginning of stream */
+inline_ void*
+stream_data(const bitstream* s)
+{
+  return s->begin;
+}
+
+/* current byte size of stream (if flushed) */
+inline_ size_t
+stream_size(const bitstream* s)
+{
+  return sizeof(word) * (s->ptr - s->begin);
+}
+
+/* byte capacity of stream */
+inline_ size_t
+stream_capacity(const bitstream* s)
+{
+  return sizeof(word) * (s->end - s->begin);
+}
+
+/* number of words per block */
+inline_ size_t
+stream_stride_block(const bitstream* s)
+{
+#ifdef BIT_STREAM_STRIDED
+  return s->mask + 1;
+#else
+  return 1;
+#endif
+}
+
+/* number of blocks between consecutive stream blocks */
+inline_ ptrdiff_t
+stream_stride_delta(const bitstream* s)
+{
+#ifdef BIT_STREAM_STRIDED
+  return s->delta / (s->mask + 1);
+#else
+  return 0;
+#endif
+}
+
+/* read single bit (0 or 1) */
+inline_ uint
+stream_read_bit(bitstream* s)
+{
+  uint bit;
+  if (!s->bits) {
+    s->buffer = stream_read_word(s);
+    s->bits = wsize;
+  }
+  s->bits--;
+  bit = (uint)s->buffer & 1u;
+  s->buffer >>= 1;
+  return bit;
+}
+
+/* write single bit (must be 0 or 1) */
+inline_ uint
+stream_write_bit(bitstream* s, uint bit)
+{
+  s->buffer += (word)bit << s->bits;
+  if (++s->bits == wsize) {
+    stream_write_word(s, s->buffer);
+    s->buffer = 0;
+    s->bits = 0;
+  }
+  return bit;
+}
+
+/* read 0 <= n <= 64 bits */
+inline_ uint64
+stream_read_bits(bitstream* s, uint n)
+{
+  uint64 value = s->buffer;
+  if (s->bits < n) {
+    /* keep fetching wsize bits until enough bits are buffered */
+    do {
+      /* assert: 0 <= s->bits < n <= 64 */
+      s->buffer = stream_read_word(s);
+      value += (uint64)s->buffer << s->bits;
+      s->bits += wsize;
+    } while (sizeof(s->buffer) < sizeof(value) && s->bits < n);
+    /* assert: 1 <= n <= s->bits < n + wsize */
+    s->bits -= n;
+    if (!s->bits) {
+      /* value holds exactly n bits; no need for masking */
+      s->buffer = 0;
+    }
+    else {
+      /* assert: 1 <= s->bits < wsize */
+      s->buffer >>= wsize - s->bits;
+      /* assert: 1 <= n <= 64 */
+      value &= ((uint64)2 << (n - 1)) - 1;
+    }
+  }
+  else {
+    /* assert: 0 <= n <= s->bits < wsize <= 64 */
+    s->bits -= n;
+    s->buffer >>= n;
+    value &= ((uint64)1 << n) - 1;
+  }
+  return value;
+}
+
+/* write 0 <= n <= 64 low bits of value and return remaining bits */
+inline_ uint64
+stream_write_bits(bitstream* s, uint64 value, uint n)
+{
+  /* append bit string to buffer */
+  s->buffer += (word)(value << s->bits);
+  s->bits += n;
+  /* is buffer full? */
+  if (s->bits >= wsize) {
+    /* 1 <= n <= 64; decrement n to ensure valid right shifts below */
+    value >>= 1;
+    n--;
+    /* assert: 0 <= n < 64; wsize <= s->bits <= wsize + n */
+    do {
+      /* output wsize bits while buffer is full */
+      s->bits -= wsize;
+      /* assert: 0 <= s->bits <= n */
+      stream_write_word(s, s->buffer);
+      /* assert: 0 <= n - s->bits < 64 */
+      s->buffer = (word)(value >> (n - s->bits));
+    } while (sizeof(s->buffer) < sizeof(value) && s->bits >= wsize);
+  }
+  /* assert: 0 <= s->bits < wsize */
+  s->buffer &= ((word)1 << s->bits) - 1;
+  /* assert: 0 <= n < 64 */
+  return value >> n;
+}
+
+/* return bit offset to next bit to be read */
+inline_ size_t
+stream_rtell(const bitstream* s)
+{
+  return wsize * (s->ptr - s->begin) - s->bits;
+}
+
+/* return bit offset to next bit to be written */
+inline_ size_t
+stream_wtell(const bitstream* s)
+{
+  return wsize * (s->ptr - s->begin) + s->bits;
+}
+
+/* position stream for reading or writing at beginning */
+inline_ void
+stream_rewind(bitstream* s)
+{
+  s->ptr = s->begin;
+  s->buffer = 0;
+  s->bits = 0;
+}
+
+/* position stream for reading at given bit offset */
+inline_ void
+stream_rseek(bitstream* s, size_t offset)
+{
+  uint n = offset % wsize;
+  s->ptr = s->begin + offset / wsize;
+  if (n) {
+    s->buffer = stream_read_word(s) >> n;
+    s->bits = wsize - n;
+  }
+  else {
+    s->buffer = 0;
+    s->bits = 0;
+  }
+}
+
+/* position stream for writing at given bit offset */
+inline_ void
+stream_wseek(bitstream* s, size_t offset)
+{
+  uint n = offset % wsize;
+  s->ptr = s->begin + offset / wsize;
+  if (n) {
+    word buffer = *s->ptr;
+    buffer &= ((word)1 << n) - 1;
+    s->buffer = buffer;
+    s->bits = n;
+  }
+  else {
+    s->buffer = 0;
+    s->bits = 0;
+  }
+}
+
+/* skip over the next n bits (n >= 0) */
+inline_ void
+stream_skip(bitstream* s, uint n)
+{
+  stream_rseek(s, stream_rtell(s) + n);
+}
+
+/* append n zero-bits to stream (n >= 0) */
+inline_ void
+stream_pad(bitstream* s, uint n)
+{
+  for (s->bits += n; s->bits >= wsize; s->bits -= wsize) {
+    stream_write_word(s, s->buffer);
+    s->buffer = 0;
+  }
+}
+
+/* align stream on next word boundary */
+inline_ size_t
+stream_align(bitstream* s)
+{
+  uint bits = s->bits;
+  if (bits)
+    stream_skip(s, bits);
+  return bits;
+}
+
+/* write any remaining buffered bits and align stream on next word boundary */
+inline_ size_t
+stream_flush(bitstream* s)
+{
+  uint bits = (wsize - s->bits) % wsize;
+  if (bits)
+    stream_pad(s, bits);
+  return bits;
+}
+
+/* copy n bits from one bit stream to another */
+inline_ void
+stream_copy(bitstream* dst, bitstream* src, size_t n)
+{
+  while (n > wsize) {
+    word w = (word)stream_read_bits(src, wsize);
+    stream_write_bits(dst, w, wsize);
+    n -= wsize;
+  }
+  if (n) {
+    word w = (word)stream_read_bits(src, (uint)n);
+    stream_write_bits(dst, w, (uint)n);
+  }
+}
+
+#ifdef BIT_STREAM_STRIDED
+/* set block size in number of words and spacing in number of blocks */
+inline_ int
+stream_set_stride(bitstream* s, size_t block, ptrdiff_t delta)
+{
+  /* ensure block size is a power of two */
+  if (block & (block - 1))
+    return 0;
+  s->mask = block - 1;
+  s->delta = delta * block;
+  return 1;
+}
+#endif
+
+/* allocate and initialize bit stream to user-allocated buffer */
+inline_ bitstream*
+stream_open(void* buffer, size_t bytes)
+{
+  bitstream* s = (bitstream*)malloc(sizeof(bitstream));
+  if (s) {
+    s->begin = (word*)buffer;
+    s->end = s->begin + bytes / sizeof(word);
+#ifdef BIT_STREAM_STRIDED
+    stream_set_stride(s, 0, 0);
+#endif
+    stream_rewind(s);
+  }
+  return s;
+}
+
+/* close and deallocate bit stream */
+inline_ void
+stream_close(bitstream* s)
+{
+  free(s);
+}
+
+/* make a copy of bit stream to shared memory buffer */
+inline_ bitstream*
+stream_clone(const bitstream* s)
+{
+  bitstream* c = (bitstream*)malloc(sizeof(bitstream));
+  if (c)
+    *c = *s;
+  return c;
+}
diff --git a/zfp/src/inline/inline.h b/zfp/src/inline/inline.h
new file mode 100644
index 0000000000000000000000000000000000000000..e9ade3f11d8f85b64711880cbee8198e737a6448
--- /dev/null
+++ b/zfp/src/inline/inline.h
@@ -0,0 +1,12 @@
+#ifndef INLINE_H
+#define INLINE_H
+
+#ifndef inline_
+  #if __STDC_VERSION__ >= 199901L
+    #define inline_ static inline
+  #else
+    #define inline_ static
+  #endif
+#endif
+
+#endif
diff --git a/zfp/src/share/omp.c b/zfp/src/share/omp.c
new file mode 100644
index 0000000000000000000000000000000000000000..9ee26b9a68aae56f3fb6e75054fb4b4ddb60d786
--- /dev/null
+++ b/zfp/src/share/omp.c
@@ -0,0 +1,25 @@
+#ifdef _OPENMP
+#include <omp.h>
+
+/* number of omp threads to use */
+static int
+thread_count_omp(const zfp_stream* stream)
+{
+  int count = stream->exec.params.omp.threads;
+  /* if no thread count is specified, use default number of threads */
+  if (!count)
+    count = omp_get_max_threads();
+  return count;
+}
+
+/* number of chunks to partition array into */
+static uint
+chunk_count_omp(const zfp_stream* stream, uint blocks, uint threads)
+{
+  uint chunk_size = stream->exec.params.omp.chunk_size;
+  /* if no chunk size is specified, assign one chunk per thread */
+  uint chunks = chunk_size ? (blocks + chunk_size - 1) / chunk_size : threads;
+  return MIN(chunks, blocks);
+}
+
+#endif
diff --git a/zfp/src/share/parallel.c b/zfp/src/share/parallel.c
new file mode 100644
index 0000000000000000000000000000000000000000..8c67d8f49e77800b2f044125c4060f94b11c73b4
--- /dev/null
+++ b/zfp/src/share/parallel.c
@@ -0,0 +1,86 @@
+#ifdef _OPENMP
+
+/* block index at which chunk begins */
+static uint
+chunk_offset(uint blocks, uint chunks, uint chunk)
+{
+  return (uint)((blocks * (uint64)chunk) / chunks);
+}
+
+/* initialize per-thread bit streams for parallel compression */
+static bitstream**
+compress_init_par(zfp_stream* stream, const zfp_field* field, uint chunks, uint blocks)
+{
+  bitstream** bs;
+  size_t size;
+  int copy = 0;
+  uint i;
+
+  /* determine maximum size buffer needed per thread */
+  zfp_field f = *field;
+  switch (zfp_field_dimensionality(field)) {
+    case 1:
+      f.nx = 4 * (blocks + chunks - 1) / chunks;
+      break;
+    case 2:
+      f.nx = 4;
+      f.ny = 4 * (blocks + chunks - 1) / chunks;
+      break;
+    case 3:
+      f.nx = 4;
+      f.ny = 4;
+      f.nz = 4 * (blocks + chunks - 1) / chunks;
+      break;
+    case 4:
+      f.nx = 4;
+      f.ny = 4;
+      f.nz = 4;
+      f.nw = 4 * (blocks + chunks - 1) / chunks;
+      break;
+    default:
+      return 0;
+  }
+  size = zfp_stream_maximum_size(stream, &f);
+
+  /* avoid copies in fixed-rate mode when each bitstream is word aligned */
+  copy |= stream->minbits != stream->maxbits;
+  copy |= (stream->maxbits % stream_word_bits) != 0;
+  copy |= (stream_wtell(stream->stream) % stream_word_bits) != 0;
+
+  /* set up buffer for each thread to compress to */
+  bs = (bitstream**)malloc(chunks * sizeof(bitstream*));
+  for (i = 0; i < chunks; i++) {
+    uint block = chunk_offset(blocks, chunks, i);
+    void* buffer = copy ? malloc(size) : (uchar*)stream_data(stream->stream) + stream_size(stream->stream) + block * stream->maxbits / CHAR_BIT;
+    bs[i] = stream_open(buffer, size);
+  }
+
+  return bs;
+}
+
+/* flush and concatenate bit streams if needed */
+static void
+compress_finish_par(zfp_stream* stream, bitstream** src, uint chunks)
+{
+  bitstream* dst = zfp_stream_bit_stream(stream);
+  int copy = (stream_data(dst) != stream_data(*src));
+  size_t offset = stream_wtell(dst);
+  uint i;
+  for (i = 0; i < chunks; i++) {
+    size_t bits = stream_wtell(src[i]);
+    offset += bits;
+    stream_flush(src[i]);
+    /* concatenate streams if they are not already contiguous */
+    if (copy) {
+      stream_rewind(src[i]);
+      stream_copy(dst, src[i], bits);
+      free(stream_data(src[i]));
+    }
+    stream_close(src[i]);
+  }
+  free(src);
+  if (!copy)
+    stream_wseek(dst, offset);
+}
+
+#endif
diff --git a/zfp/src/template/codec.h b/zfp/src/template/codec.h
new file mode 100644
index 0000000000000000000000000000000000000000..e7149a98ea5f770e6db30a2cd1db9e59e9ed02c7
--- /dev/null
+++ b/zfp/src/template/codec.h
@@ -0,0 +1,3 @@
+#define PERM _t1(perm, DIMS)           /* coefficient order */
+#define BLOCK_SIZE (1 << (2 * DIMS))   /* values per block */
+#define EBIAS ((1 << (EBITS - 1)) - 1) /* exponent bias */
diff --git a/zfp/src/template/codec1.c b/zfp/src/template/codec1.c
new file mode 100644
index 0000000000000000000000000000000000000000..5a4786471a940610d67472fd90858177b7d452d6
--- /dev/null
+++ b/zfp/src/template/codec1.c
@@ -0,0 +1,4 @@
+/* order coefficients by polynomial degree/frequency */
+cache_align_(static const uchar perm_1[4]) = {
+  0, 1, 2, 3
+};
diff --git a/zfp/src/template/codec2.c b/zfp/src/template/codec2.c
new file mode 100644
index 0000000000000000000000000000000000000000..a0f977024d6e73203ecbd14e6b605c949d799023
--- /dev/null
+++ b/zfp/src/template/codec2.c
@@ -0,0 +1,32 @@
+#define index(i, j) ((i) + 4 * (j))
+
+/* order coefficients (i, j) by i + j, then i^2 + j^2 */
+cache_align_(static const uchar perm_2[16]) = {
+  index(0, 0), /*  0 : 0 */
+
+  index(1, 0), /*  1 : 1 */
+  index(0, 1), /*  2 : 1 */
+
+  index(1, 1), /*  3 : 2 */
+
+  index(2, 0), /*  4 : 2 */
+  index(0, 2), /*  5 : 2 */
+
+  index(2, 1), /*  6 : 3 */
+  index(1, 2), /*  7 : 3 */
+
+  index(3, 0), /*  8 : 3 */
+  index(0, 3), /*  9 : 3 */
+
+  index(2, 2), /* 10 : 4 */
+
+  index(3, 1), /* 11 : 4 */
+  index(1, 3), /* 12 : 4 */
+
+  index(3, 2), /* 13 : 5 */
+  index(2, 3), /* 14 : 5 */
+
+  index(3, 3), /* 15 : 6 */
+};
+
+#undef index
diff --git a/zfp/src/template/codec3.c b/zfp/src/template/codec3.c
new file mode 100644
index 0000000000000000000000000000000000000000..b95f302720317b13bea8b53814646dccc42b6de2
--- /dev/null
+++ b/zfp/src/template/codec3.c
@@ -0,0 +1,90 @@
+#define index(i, j, k) ((i) + 4 * ((j) + 4 * (k)))
+
+/* order coefficients (i, j, k) by i + j + k, then i^2 + j^2 + k^2 */
+cache_align_(static const uchar perm_3[64]) = {
+  index(0, 0, 0), /*  0 : 0 */
+
+  index(1, 0, 0), /*  1 : 1 */
+  index(0, 1, 0), /*  2 : 1 */
+  index(0, 0, 1), /*  3 : 1 */
+
+  index(0, 1, 1), /*  4 : 2 */
+  index(1, 0, 1), /*  5 : 2 */
+  index(1, 1, 0), /*  6 : 2 */
+
+  index(2, 0, 0), /*  7 : 2 */
+  index(0, 2, 0), /*  8 : 2 */
+  index(0, 0, 2), /*  9 : 2 */
+
+  index(1, 1, 1), /* 10 : 3 */
+
+  index(2, 1, 0), /* 11 : 3 */
+  index(2, 0, 1), /* 12 : 3 */
+  index(0, 2, 1), /* 13 : 3 */
+  index(1, 2, 0), /* 14 : 3 */
+  index(1, 0, 2), /* 15 : 3 */
+  index(0, 1, 2), /* 16 : 3 */
+
+  index(3, 0, 0), /* 17 : 3 */
+  index(0, 3, 0), /* 18 : 3 */
+  index(0, 0, 3), /* 19 : 3 */
+
+  index(2, 1, 1), /* 20 : 4 */
+  index(1, 2, 1), /* 21 : 4 */
+  index(1, 1, 2), /* 22 : 4 */
+
+  index(0, 2, 2), /* 23 : 4 */
+  index(2, 0, 2), /* 24 : 4 */
+  index(2, 2, 0), /* 25 : 4 */
+
+  index(3, 1, 0), /* 26 : 4 */
+  index(3, 0, 1), /* 27 : 4 */
+  index(0, 3, 1), /* 28 : 4 */
+  index(1, 3, 0), /* 29 : 4 */
+  index(1, 0, 3), /* 30 : 4 */
+  index(0, 1, 3), /* 31 : 4 */
+
+  index(1, 2, 2), /* 32 : 5 */
+  index(2, 1, 2), /* 33 : 5 */
+  index(2, 2, 1), /* 34 : 5 */
+
+  index(3, 1, 1), /* 35 : 5 */
+  index(1, 3, 1), /* 36 : 5 */
+  index(1, 1, 3), /* 37 : 5 */
+
+  index(3, 2, 0), /* 38 : 5 */
+  index(3, 0, 2), /* 39 : 5 */
+  index(0, 3, 2), /* 40 : 5 */
+  index(2, 3, 0), /* 41 : 5 */
+  index(2, 0, 3), /* 42 : 5 */
+  index(0, 2, 3), /* 43 : 5 */
+
+  index(2, 2, 2), /* 44 : 6 */
+
+  index(3, 2, 1), /* 45 : 6 */
+  index(3, 1, 2), /* 46 : 6 */
+  index(1, 3, 2), /* 47 : 6 */
+  index(2, 3, 1), /* 48 : 6 */
+  index(2, 1, 3), /* 49 : 6 */
+  index(1, 2, 3), /* 50 : 6 */
+
+  index(0, 3, 3), /* 51 : 6 */
+  index(3, 0, 3), /* 52 : 6 */
+  index(3, 3, 0), /* 53 : 6 */
+
+  index(3, 2, 2), /* 54 : 7 */
+  index(2, 3, 2), /* 55 : 7 */
+  index(2, 2, 3), /* 56 : 7 */
+
+  index(1, 3, 3), /* 57 : 7 */
+  index(3, 1, 3), /* 58 : 7 */
+  index(3, 3, 1), /* 59 : 7 */
+
+  index(2, 3, 3), /* 60 : 8 */
+  index(3, 2, 3), /* 61 : 8 */
+  index(3, 3, 2), /* 62 : 8 */
+
+  index(3, 3, 3), /* 63 : 9 */
+};
+
+#undef index
diff --git a/zfp/src/template/codec4.c b/zfp/src/template/codec4.c
new file mode 100644
index 0000000000000000000000000000000000000000..b8314525cd42802a3ad8463191cea96bddef9661
--- /dev/null
+++ b/zfp/src/template/codec4.c
@@ -0,0 +1,297 @@
+#define index(i, j, k, l) ((i) + 4 * ((j) + 4 * ((k) + 4 * (l))))
+
+/* order coefficients (i, j, k, l) by i + j + k + l, then i^2 + j^2 + k^2 + l^2 */
+cache_align_(static const uchar perm_4[256]) = {
+  index(0, 0, 0, 0), /*   0 :  0 */
+
+  index(1, 0, 0, 0), /*   1 :  1 */
+  index(0, 1, 0, 0), /*   2 :  1 */
+  index(0, 0, 1, 0), /*   3 :  1 */
+  index(0, 0, 0, 1), /*   4 :  1 */
+
+  index(1, 1, 0, 0), /*   5 :  2 */
+  index(0, 0, 1, 1), /*   6 :  2 */
+  index(1, 0, 1, 0), /*   7 :  2 */
+  index(0, 1, 0, 1), /*   8 :  2 */
+  index(1, 0, 0, 1), /*   9 :  2 */
+  index(0, 1, 1, 0), /*  10 :  2 */
+
+  index(2, 0, 0, 0), /*  11 :  2 */
+  index(0, 2, 0, 0), /*  12 :  2 */
+  index(0, 0, 2, 0), /*  13 :  2 */
+  index(0, 0, 0, 2), /*  14 :  2 */
+
+  index(0, 1, 1, 1), /*  15 :  3 */
+  index(1, 0, 1, 1), /*  16 :  3 */
+  index(1, 1, 0, 1), /*  17 :  3 */
+  index(1, 1, 1, 0), /*  18 :  3 */
+
+  index(2, 1, 0, 0), /*  19 :  3 */
+  index(2, 0, 1, 0), /*  20 :  3 */
+  index(2, 0, 0, 1), /*  21 :  3 */
+  index(0, 2, 1, 0), /*  22 :  3 */
+  index(0, 2, 0, 1), /*  23 :  3 */
+  index(1, 2, 0, 0), /*  24 :  3 */
+  index(0, 0, 2, 1), /*  25 :  3 */
+  index(1, 0, 2, 0), /*  26 :  3 */
+  index(0, 1, 2, 0), /*  27 :  3 */
+  index(1, 0, 0, 2), /*  28 :  3 */
+  index(0, 1, 0, 2), /*  29 :  3 */
+  index(0, 0, 1, 2), /*  30 :  3 */
+
+  index(3, 0, 0, 0), /*  31 :  3 */
+  index(0, 3, 0, 0), /*  32 :  3 */
+  index(0, 0, 3, 0), /*  33 :  3 */
+  index(0, 0, 0, 3), /*  34 :  3 */
+
+  index(1, 1, 1, 1), /*  35 :  4 */
+
+  index(2, 0, 1, 1), /*  36 :  4 */
+  index(2, 1, 0, 1), /*  37 :  4 */
+  index(2, 1, 1, 0), /*  38 :  4 */
+  index(1, 2, 0, 1), /*  39 :  4 */
+  index(1, 2, 1, 0), /*  40 :  4 */
+  index(0, 2, 1, 1), /*  41 :  4 */
+  index(1, 1, 2, 0), /*  42 :  4 */
+  index(0, 1, 2, 1), /*  43 :  4 */
+  index(1, 0, 2, 1), /*  44 :  4 */
+  index(0, 1, 1, 2), /*  45 :  4 */
+  index(1, 0, 1, 2), /*  46 :  4 */
+  index(1, 1, 0, 2), /*  47 :  4 */
+
+  index(2, 2, 0, 0), /*  48 :  4 */
+  index(0, 0, 2, 2), /*  49 :  4 */
+  index(2, 0, 2, 0), /*  50 :  4 */
+  index(0, 2, 0, 2), /*  51 :  4 */
+  index(2, 0, 0, 2), /*  52 :  4 */
+  index(0, 2, 2, 0), /*  53 :  4 */
+
+  index(3, 1, 0, 0), /*  54 :  4 */
+  index(3, 0, 1, 0), /*  55 :  4 */
+  index(3, 0, 0, 1), /*  56 :  4 */
+  index(0, 3, 1, 0), /*  57 :  4 */
+  index(0, 3, 0, 1), /*  58 :  4 */
+  index(1, 3, 0, 0), /*  59 :  4 */
+  index(0, 0, 3, 1), /*  60 :  4 */
+  index(1, 0, 3, 0), /*  61 :  4 */
+  index(0, 1, 3, 0), /*  62 :  4 */
+  index(1, 0, 0, 3), /*  63 :  4 */
+  index(0, 1, 0, 3), /*  64 :  4 */
+  index(0, 0, 1, 3), /*  65 :  4 */
+
+  index(2, 1, 1, 1), /*  66 :  5 */
+  index(1, 2, 1, 1), /*  67 :  5 */
+  index(1, 1, 2, 1), /*  68 :  5 */
+  index(1, 1, 1, 2), /*  69 :  5 */
+
+  index(1, 0, 2, 2), /*  70 :  5 */
+  index(1, 2, 0, 2), /*  71 :  5 */
+  index(1, 2, 2, 0), /*  72 :  5 */
+  index(2, 1, 0, 2), /*  73 :  5 */
+  index(2, 1, 2, 0), /*  74 :  5 */
+  index(0, 1, 2, 2), /*  75 :  5 */
+  index(2, 2, 1, 0), /*  76 :  5 */
+  index(0, 2, 1, 2), /*  77 :  5 */
+  index(2, 0, 1, 2), /*  78 :  5 */
+  index(0, 2, 2, 1), /*  79 :  5 */
+  index(2, 0, 2, 1), /*  80 :  5 */
+  index(2, 2, 0, 1), /*  81 :  5 */
+
+  index(3, 0, 1, 1), /*  82 :  5 */
+  index(3, 1, 0, 1), /*  83 :  5 */
+  index(3, 1, 1, 0), /*  84 :  5 */
+  index(1, 3, 0, 1), /*  85 :  5 */
+  index(1, 3, 1, 0), /*  86 :  5 */
+  index(0, 3, 1, 1), /*  87 :  5 */
+  index(1, 1, 3, 0), /*  88 :  5 */
+  index(0, 1, 3, 1), /*  89 :  5 */
+  index(1, 0, 3, 1), /*  90 :  5 */
+  index(0, 1, 1, 3), /*  91 :  5 */
+  index(1, 0, 1, 3), /*  92 :  5 */
+  index(1, 1, 0, 3), /*  93 :  5 */
+
+  index(3, 2, 0, 0), /*  94 :  5 */
+  index(3, 0, 2, 0), /*  95 :  5 */
+  index(3, 0, 0, 2), /*  96 :  5 */
+  index(0, 3, 2, 0), /*  97 :  5 */
+  index(0, 3, 0, 2), /*  98 :  5 */
+  index(2, 3, 0, 0), /*  99 :  5 */
+  index(0, 0, 3, 2), /* 100 :  5 */
+  index(2, 0, 3, 0), /* 101 :  5 */
+  index(0, 2, 3, 0), /* 102 :  5 */
+  index(2, 0, 0, 3), /* 103 :  5 */
+  index(0, 2, 0, 3), /* 104 :  5 */
+  index(0, 0, 2, 3), /* 105 :  5 */
+
+  index(2, 2, 1, 1), /* 106 :  6 */
+  index(1, 1, 2, 2), /* 107 :  6 */
+  index(2, 1, 2, 1), /* 108 :  6 */
+  index(1, 2, 1, 2), /* 109 :  6 */
+  index(2, 1, 1, 2), /* 110 :  6 */
+  index(1, 2, 2, 1), /* 111 :  6 */
+
+  index(0, 2, 2, 2), /* 112 :  6 */
+  index(2, 0, 2, 2), /* 113 :  6 */
+  index(2, 2, 0, 2), /* 114 :  6 */
+  index(2, 2, 2, 0), /* 115 :  6 */
+
+  index(3, 1, 1, 1), /* 116 :  6 */
+  index(1, 3, 1, 1), /* 117 :  6 */
+  index(1, 1, 3, 1), /* 118 :  6 */
+  index(1, 1, 1, 3), /* 119 :  6 */
+
+  index(3, 2, 1, 0), /* 120 :  6 */
+  index(3, 2, 0, 1), /* 121 :  6 */
+  index(3, 0, 2, 1), /* 122 :  6 */
+  index(3, 1, 2, 0), /* 123 :  6 */
+  index(3, 1, 0, 2), /* 124 :  6 */
+  index(3, 0, 1, 2), /* 125 :  6 */
+  index(0, 3, 2, 1), /* 126 :  6 */
+  index(1, 3, 2, 0), /* 127 :  6 */
+  index(1, 3, 0, 2), /* 128 :  6 */
+  index(0, 3, 1, 2), /* 129 :  6 */
+  index(2, 3, 1, 0), /* 130 :  6 */
+  index(2, 3, 0, 1), /* 131 :  6 */
+  index(1, 0, 3, 2), /* 132 :  6 */
+  index(0, 1, 3, 2), /* 133 :  6 */
+  index(2, 1, 3, 0), /* 134 :  6 */
+  index(2, 0, 3, 1), /* 135 :  6 */
+  index(0, 2, 3, 1), /* 136 :  6 */
+  index(1, 2, 3, 0), /* 137 :  6 */
+  index(2, 1, 0, 3), /* 138 :  6 */
+  index(2, 0, 1, 3), /* 139 :  6 */
+  index(0, 2, 1, 3), /* 140 :  6 */
+  index(1, 2, 0, 3), /* 141 :  6 */
+  index(1, 0, 2, 3), /* 142 :  6 */
+  index(0, 1, 2, 3), /* 143 :  6 */
+
+  index(3, 3, 0, 0), /* 144 :  6 */
+  index(0, 0, 3, 3), /* 145 :  6 */
+  index(3, 0, 3, 0), /* 146 :  6 */
+  index(0, 3, 0, 3), /* 147 :  6 */
+  index(3, 0, 0, 3), /* 148 :  6 */
+  index(0, 3, 3, 0), /* 149 :  6 */
+
+  index(1, 2, 2, 2), /* 150 :  7 */
+  index(2, 1, 2, 2), /* 151 :  7 */
+  index(2, 2, 1, 2), /* 152 :  7 */
+  index(2, 2, 2, 1), /* 153 :  7 */
+
+  index(3, 2, 1, 1), /* 154 :  7 */
+  index(3, 1, 2, 1), /* 155 :  7 */
+  index(3, 1, 1, 2), /* 156 :  7 */
+  index(1, 3, 2, 1), /* 157 :  7 */
+  index(1, 3, 1, 2), /* 158 :  7 */
+  index(2, 3, 1, 1), /* 159 :  7 */
+  index(1, 1, 3, 2), /* 160 :  7 */
+  index(2, 1, 3, 1), /* 161 :  7 */
+  index(1, 2, 3, 1), /* 162 :  7 */
+  index(2, 1, 1, 3), /* 163 :  7 */
+  index(1, 2, 1, 3), /* 164 :  7 */
+  index(1, 1, 2, 3), /* 165 :  7 */
+
+  index(3, 0, 2, 2), /* 166 :  7 */
+  index(3, 2, 0, 2), /* 167 :  7 */
+  index(3, 2, 2, 0), /* 168 :  7 */
+  index(2, 3, 0, 2), /* 169 :  7 */
+  index(2, 3, 2, 0), /* 170 :  7 */
+  index(0, 3, 2, 2), /* 171 :  7 */
+  index(2, 2, 3, 0), /* 172 :  7 */
+  index(0, 2, 3, 2), /* 173 :  7 */
+  index(2, 0, 3, 2), /* 174 :  7 */
+  index(0, 2, 2, 3), /* 175 :  7 */
+  index(2, 0, 2, 3), /* 176 :  7 */
+  index(2, 2, 0, 3), /* 177 :  7 */
+
+  index(1, 0, 3, 3), /* 178 :  7 */
+  index(1, 3, 0, 3), /* 179 :  7 */
+  index(1, 3, 3, 0), /* 180 :  7 */
+  index(3, 1, 0, 3), /* 181 :  7 */
+  index(3, 1, 3, 0), /* 182 :  7 */
+  index(0, 1, 3, 3), /* 183 :  7 */
+  index(3, 3, 1, 0), /* 184 :  7 */
+  index(0, 3, 1, 3), /* 185 :  7 */
+  index(3, 0, 1, 3), /* 186 :  7 */
+  index(0, 3, 3, 1), /* 187 :  7 */
+  index(3, 0, 3, 1), /* 188 :  7 */
+  index(3, 3, 0, 1), /* 189 :  7 */
+
+  index(2, 2, 2, 2), /* 190 :  8 */
+
+  index(3, 1, 2, 2), /* 191 :  8 */
+  index(3, 2, 1, 2), /* 192 :  8 */
+  index(3, 2, 2, 1), /* 193 :  8 */
+  index(2, 3, 1, 2), /* 194 :  8 */
+  index(2, 3, 2, 1), /* 195 :  8 */
+  index(1, 3, 2, 2), /* 196 :  8 */
+  index(2, 2, 3, 1), /* 197 :  8 */
+  index(1, 2, 3, 2), /* 198 :  8 */
+  index(2, 1, 3, 2), /* 199 :  8 */
+  index(1, 2, 2, 3), /* 200 :  8 */
+  index(2, 1, 2, 3), /* 201 :  8 */
+  index(2, 2, 1, 3), /* 202 :  8 */
+
+  index(3, 3, 1, 1), /* 203 :  8 */
+  index(1, 1, 3, 3), /* 204 :  8 */
+  index(3, 1, 3, 1), /* 205 :  8 */
+  index(1, 3, 1, 3), /* 206 :  8 */
+  index(3, 1, 1, 3), /* 207 :  8 */
+  index(1, 3, 3, 1), /* 208 :  8 */
+
+  index(2, 0, 3, 3), /* 209 :  8 */
+  index(2, 3, 0, 3), /* 210 :  8 */
+  index(2, 3, 3, 0), /* 211 :  8 */
+  index(3, 2, 0, 3), /* 212 :  8 */
+  index(3, 2, 3, 0), /* 213 :  8 */
+  index(0, 2, 3, 3), /* 214 :  8 */
+  index(3, 3, 2, 0), /* 215 :  8 */
+  index(0, 3, 2, 3), /* 216 :  8 */
+  index(3, 0, 2, 3), /* 217 :  8 */
+  index(0, 3, 3, 2), /* 218 :  8 */
+  index(3, 0, 3, 2), /* 219 :  8 */
+  index(3, 3, 0, 2), /* 220 :  8 */
+
+  index(3, 2, 2, 2), /* 221 :  9 */
+  index(2, 3, 2, 2), /* 222 :  9 */
+  index(2, 2, 3, 2), /* 223 :  9 */
+  index(2, 2, 2, 3), /* 224 :  9 */
+
+  index(2, 1, 3, 3), /* 225 :  9 */
+  index(2, 3, 1, 3), /* 226 :  9 */
+  index(2, 3, 3, 1), /* 227 :  9 */
+  index(3, 2, 1, 3), /* 228 :  9 */
+  index(3, 2, 3, 1), /* 229 :  9 */
+  index(1, 2, 3, 3), /* 230 :  9 */
+  index(3, 3, 2, 1), /* 231 :  9 */
+  index(1, 3, 2, 3), /* 232 :  9 */
+  index(3, 1, 2, 3), /* 233 :  9 */
+  index(1, 3, 3, 2), /* 234 :  9 */
+  index(3, 1, 3, 2), /* 235 :  9 */
+  index(3, 3, 1, 2), /* 236 :  9 */
+
+  index(0, 3, 3, 3), /* 237 :  9 */
+  index(3, 0, 3, 3), /* 238 :  9 */
+  index(3, 3, 0, 3), /* 239 :  9 */
+  index(3, 3, 3, 0), /* 240 :  9 */
+
+  index(3, 3, 2, 2), /* 241 : 10 */
+  index(2, 2, 3, 3), /* 242 : 10 */
+  index(3, 2, 3, 2), /* 243 : 10 */
+  index(2, 3, 2, 3), /* 244 : 10 */
+  index(3, 2, 2, 3), /* 245 : 10 */
+  index(2, 3, 3, 2), /* 246 : 10 */
+
+  index(1, 3, 3, 3), /* 247 : 10 */
+  index(3, 1, 3, 3), /* 248 : 10 */
+  index(3, 3, 1, 3), /* 249 : 10 */
+  index(3, 3, 3, 1), /* 250 : 10 */
+
+  index(2, 3, 3, 3), /* 251 : 11 */
+  index(3, 2, 3, 3), /* 252 : 11 */
+  index(3, 3, 2, 3), /* 253 : 11 */
+  index(3, 3, 3, 2), /* 254 : 11 */
+
+  index(3, 3, 3, 3), /* 255 : 12 */
+};
+
+#undef index
diff --git a/zfp/src/template/codecf.c b/zfp/src/template/codecf.c
new file mode 100644
index 0000000000000000000000000000000000000000..61003cfb286d6c78afe20e6cdd108c1855f5bf45
--- /dev/null
+++ b/zfp/src/template/codecf.c
@@ -0,0 +1,6 @@
+/* maximum number of bit planes to encode */
+static uint
+precision(int maxexp, uint maxprec, int minexp, int dims)
+{
+  return MIN(maxprec, (uint)MAX(0, maxexp - minexp + 2 * (dims + 1)));
+}
diff --git a/zfp/src/template/compress.c b/zfp/src/template/compress.c
new file mode 100644
index 0000000000000000000000000000000000000000..6a4b370c85630679f48924dca9af1ca90efa1224
--- /dev/null
+++ b/zfp/src/template/compress.c
@@ -0,0 +1,128 @@
+/* compress 1d contiguous array */
+static void
+_t2(compress, Scalar, 1)(zfp_stream* stream, const zfp_field* field)
+{
+  const Scalar* data = (const Scalar*)field->data;
+  uint nx = field->nx;
+  uint mx = nx & ~3u;
+  uint x;
+
+  /* compress array one block of 4 values at a time */
+  for (x = 0; x < mx; x += 4, data += 4)
+    _t2(zfp_encode_block, Scalar, 1)(stream, data);
+  if (x < nx)
+    _t2(zfp_encode_partial_block_strided, Scalar, 1)(stream, data, nx - x, 1);
+}
+
+#if 0
+/* compress 1d strided array */
+static void
+_t2(compress_strided, Scalar, 1)(zfp_stream* stream, const zfp_field* field)
+{
+  const Scalar* data = (const Scalar*)field->data;
+  uint nx = field->nx;
+  uint mx = nx & ~3u;
+  int sx = field->sx ? field->sx : 1;
+  uint x;
+
+  /* compress array one block of 4 values at a time */
+  for (x = 0; x < mx; x += 4, data += 4 * sx)
+    _t2(zfp_encode_block_strided, Scalar, 1)(stream, data, sx);
+  if (x < nx)
+    _t2(zfp_encode_partial_block_strided, Scalar, 1)(stream, data, nx - x, sx);
+}
+#else
+/* compress 1d strided array */
+static void
+_t2(compress_strided, Scalar, 1)(zfp_stream* stream, const zfp_field* field)
+{
+  const Scalar* data = field->data;
+  uint nx = field->nx;
+  int sx = field->sx ? field->sx : 1;
+  uint x;
+
+  /* compress array one block of 4 values at a time */
+  for (x = 0; x < nx; x += 4) {
+    const Scalar* p = data + sx * (ptrdiff_t)x;
+    if (nx - x < 4)
+      _t2(zfp_encode_partial_block_strided, Scalar, 1)(stream, p, nx - x, sx);
+    else
+      _t2(zfp_encode_block_strided, Scalar, 1)(stream, p, sx);
+  }
+}
+#endif
+
+/* compress 2d strided array */
+static void
+_t2(compress_strided, Scalar, 2)(zfp_stream* stream, const zfp_field* field)
+{
+  const Scalar* data = (const Scalar*)field->data;
+  uint nx = field->nx;
+  uint ny = field->ny;
+  int sx = field->sx ? field->sx : 1;
+  int sy = field->sy ? field->sy : nx;
+  uint x, y;
+
+  /* compress array one block of 4x4 values at a time */
+  for (y = 0; y < ny; y += 4)
+    for (x = 0; x < nx; x += 4) {
+      const Scalar* p = data + sx * (ptrdiff_t)x + sy * (ptrdiff_t)y;
+      if (nx - x < 4 || ny - y < 4)
+        _t2(zfp_encode_partial_block_strided, Scalar, 2)(stream, p, MIN(nx - x, 4u), MIN(ny - y, 4u), sx, sy);
+      else
+        _t2(zfp_encode_block_strided, Scalar, 2)(stream, p, sx, sy);
+    }
+}
+
+/* compress 3d strided array */
+static void
+_t2(compress_strided, Scalar, 3)(zfp_stream* stream, const zfp_field* field)
+{
+  const Scalar* data = (const Scalar*)field->data;
+  uint nx = field->nx;
+  uint ny = field->ny;
+  uint nz = field->nz;
+  int sx = field->sx ? field->sx : 1;
+  int sy = field->sy ? field->sy : nx;
+  int sz = field->sz ? field->sz : nx * ny;
+  uint x, y, z;
+
+  /* compress array one block of 4x4x4 values at a time */
+  for (z = 0; z < nz; z += 4)
+    for (y = 0; y < ny; y += 4)
+      for (x = 0; x < nx; x += 4) {
+        const Scalar* p = data + sx * (ptrdiff_t)x + sy * (ptrdiff_t)y + sz * (ptrdiff_t)z;
+        if (nx - x < 4 || ny - y < 4 || nz - z < 4)
+          _t2(zfp_encode_partial_block_strided, Scalar, 3)(stream, p, MIN(nx - x, 4u), MIN(ny - y, 4u), MIN(nz - z, 4u), sx, sy, sz);
+        else
+          _t2(zfp_encode_block_strided, Scalar, 3)(stream, p, sx, sy, sz);
+      }
+}
+
+/* compress 4d strided array */
+static void
+_t2(compress_strided, Scalar, 4)(zfp_stream* stream, const zfp_field* field)
+{
+  const Scalar* data = field->data;
+  uint nx = field->nx;
+  uint ny = field->ny;
+  uint nz = field->nz;
+  uint nw = field->nw;
+  int sx = field->sx ? field->sx : 1;
+  int sy = field->sy ? field->sy : nx;
+  int sz = field->sz ? field->sz : (ptrdiff_t)nx * ny;
+  int sw = field->sw ? field->sw : (ptrdiff_t)nx * ny * nz;
+  uint x, y, z, w;
+
+  /* compress array one block of 4x4x4x4 values at a time */
+  for (w = 0; w < nw; w += 4)
+    for (z = 0; z < nz; z += 4)
+      for (y = 0; y < ny; y += 4)
+        for (x = 0; x < nx; x += 4) {
+          const Scalar* p = data + sx * (ptrdiff_t)x + sy * (ptrdiff_t)y + sz * (ptrdiff_t)z + sw * (ptrdiff_t)w;
+          if (nx - x < 4 || ny - y < 4 || nz - z < 4 || nw - w < 4)
+            _t2(zfp_encode_partial_block_strided, Scalar, 4)(stream, p, MIN(nx - x, 4u), MIN(ny - y, 4u), MIN(nz - z, 4u), MIN(nw - w, 4u), sx, sy, sz, sw);
+          else
+            _t2(zfp_encode_block_strided, Scalar, 4)(stream, p, sx, sy, sz, sw);
+        }
+}
diff --git a/zfp/src/template/cudacompress.c b/zfp/src/template/cudacompress.c
new file mode 100644
index 0000000000000000000000000000000000000000..1d685c92f395f87742f1bd5fe4f5776b5b24a5d2
--- /dev/null
+++ b/zfp/src/template/cudacompress.c
@@ -0,0 +1,44 @@
+#ifdef ZFP_WITH_CUDA
+
+#include "../cuda_zfp/cuZFP.h"
+
+static void 
+_t2(compress_cuda, Scalar, 1)(zfp_stream* stream, const zfp_field* field)
+{
+  if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate)
+  { 
+    cuda_compress(stream, field);   
+  }
+}
+
+/* compress 1d strided array */
+static void 
+_t2(compress_strided_cuda, Scalar, 1)(zfp_stream* stream, const zfp_field* field)
+{
+  if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate)
+  {
+    cuda_compress(stream, field);   
+  }
+}
+
+/* compress 2d strided array */
+static void 
+_t2(compress_strided_cuda, Scalar, 2)(zfp_stream* stream, const zfp_field* field)
+{
+  if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate)
+  {
+    cuda_compress(stream, field);   
+  }
+}
+
+/* compress 3d strided array */
+static void
+_t2(compress_strided_cuda, Scalar, 3)(zfp_stream* stream, const zfp_field* field)
+{
+  if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate)
+  {
+    cuda_compress(stream, field);   
+  }
+}
+
+#endif
diff --git a/zfp/src/template/cudadecompress.c b/zfp/src/template/cudadecompress.c
new file mode 100644
index 0000000000000000000000000000000000000000..4ea4e5bf04fe0d768f53fabc31ce883ca192dd47
--- /dev/null
+++ b/zfp/src/template/cudadecompress.c
@@ -0,0 +1,44 @@
+#ifdef ZFP_WITH_CUDA
+
+#include "../cuda_zfp/cuZFP.h"
+
+static void
+_t2(decompress_cuda, Scalar, 1)(zfp_stream* stream, zfp_field* field)
+{
+  if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate)
+  {
+    cuda_decompress(stream, field);   
+  }
+}
+
+/* decompress 1d strided array */
+static void
+_t2(decompress_strided_cuda, Scalar, 1)(zfp_stream* stream, zfp_field* field)
+{
+  if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate)
+  {
+    cuda_decompress(stream, field);   
+  }
+}
+
+/* decompress 2d strided array */
+static void
+_t2(decompress_strided_cuda, Scalar, 2)(zfp_stream* stream, zfp_field* field)
+{
+  if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate)
+  {
+    cuda_decompress(stream, field);   
+  }
+}
+
+/* decompress 3d strided array */
+static void
+_t2(decompress_strided_cuda, Scalar, 3)(zfp_stream* stream, zfp_field* field)
+{
+  if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate)
+  {
+    cuda_decompress(stream, field);   
+  }
+}
+
+#endif
diff --git a/zfp/src/template/decode.c b/zfp/src/template/decode.c
new file mode 100644
index 0000000000000000000000000000000000000000..e2a2f276da0099f3435264621410d0089137bf94
--- /dev/null
+++ b/zfp/src/template/decode.c
@@ -0,0 +1,141 @@
+#include <limits.h>
+
+static void _t2(inv_xform, Int, DIMS)(Int* p);
+
+/* private functions ------------------------------------------------------- */
+
+/* inverse lifting transform of 4-vector */
+static void
+_t1(inv_lift, Int)(Int* p, uint s)
+{
+  Int x, y, z, w;
+  x = *p; p += s;
+  y = *p; p += s;
+  z = *p; p += s;
+  w = *p; p += s;
+
+  /*
+  ** non-orthogonal transform
+  **       ( 4  6 -4 -1) (x)
+  ** 1/4 * ( 4  2  4  5) (y)
+  **       ( 4 -2  4 -5) (z)
+  **       ( 4 -6 -4  1) (w)
+  */
+  y += w >> 1; w -= y >> 1;
+  y += w; w <<= 1; w -= y;
+  z += x; x <<= 1; x -= z;
+  y += z; z <<= 1; z -= y;
+  w += x; x <<= 1; x -= w;
+
+  p -= s; *p = w;
+  p -= s; *p = z;
+  p -= s; *p = y;
+  p -= s; *p = x;
+}
+
+/* map two's complement signed integer to negabinary unsigned integer */
+static Int
+_t1(uint2int, UInt)(UInt x)
+{
+  return (Int)((x ^ NBMASK) - NBMASK);
+}
+
+/* reorder unsigned coefficients and convert to signed integer */
+static void
+_t1(inv_order, Int)(const UInt* ublock, Int* iblock, const uchar* perm, uint n)
+{
+  do
+    iblock[*perm++] = _t1(uint2int, UInt)(*ublock++);
+  while (--n);
+}
+
+/* decompress sequence of size unsigned integers */
+static uint
+_t1(decode_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec, UInt* restrict_ data, uint size)
+{
+  /* make a copy of bit stream to avoid aliasing */
+  bitstream s = *stream;
+  uint intprec = CHAR_BIT * (uint)sizeof(UInt);
+  uint kmin = intprec > maxprec ? intprec - maxprec : 0;
+  uint bits = maxbits;
+  uint i, k, m, n;
+  uint64 x;
+
+  /* initialize data array to all zeros */
+  for (i = 0; i < size; i++)
+    data[i] = 0;
+
+  /* decode one bit plane at a time from MSB to LSB */
+  for (k = intprec, n = 0; bits && k-- > kmin;) {
+    /* decode first n bits of bit plane #k */
+    m = MIN(n, bits);
+    bits -= m;
+    x = stream_read_bits(&s, m);
+    /* unary run-length decode remainder of bit plane */
+    for (; n < size && bits && (bits--, stream_read_bit(&s)); x += (uint64)1 << n++)
+      for (; n < size - 1 && bits && (bits--, !stream_read_bit(&s)); n++)
+        ;
+    /* deposit bit plane from x */
+    for (i = 0; x; i++, x >>= 1)
+      data[i] += (UInt)(x & 1u) << k;
+  }
+
+  *stream = s;
+  return maxbits - bits;
+}
+
+/* decompress sequence of size > 64 unsigned integers */
+static uint
+_t1(decode_many_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec, UInt* restrict_ data, uint size)
+{
+  /* make a copy of bit stream to avoid aliasing */
+  bitstream s = *stream;
+  uint intprec = CHAR_BIT * (uint)sizeof(UInt);
+  uint kmin = intprec > maxprec ? intprec - maxprec : 0;
+  uint bits = maxbits;
+  uint i, k, m, n;
+
+  /* initialize data array to all zeros */
+  for (i = 0; i < size; i++)
+    data[i] = 0;
+
+  /* decode one bit plane at a time from MSB to LSB */
+  for (k = intprec, n = 0; bits && k-- > kmin;) {
+    /* decode first n bits of bit plane #k */
+    m = MIN(n, bits);
+    bits -= m;
+    for (i = 0; i < m; i++)
+      if (stream_read_bit(&s))
+        data[i] += (UInt)1 << k;
+    /* unary run-length decode remainder of bit plane */
+    for (; n < size && bits && (--bits, stream_read_bit(&s)); data[n] += (UInt)1 << k, n++)
+      for (; n < size - 1 && bits && (--bits, !stream_read_bit(&s)); n++)
+        ;
+  }
+
+  *stream = s;
+  return maxbits - bits;
+}
+
+/* decode block of integers */
+static uint
+_t2(decode_block, Int, DIMS)(bitstream* stream, int minbits, int maxbits, int maxprec, Int* iblock)
+{
+  int bits;
+  cache_align_(UInt ublock[BLOCK_SIZE]);
+  /* decode integer coefficients */
+  if (BLOCK_SIZE <= 64)
+    bits = _t1(decode_ints, UInt)(stream, maxbits, maxprec, ublock, BLOCK_SIZE);
+  else
+    bits = _t1(decode_many_ints, UInt)(stream, maxbits, maxprec, ublock, BLOCK_SIZE);
+  /* read at least minbits bits */
+  if (bits < minbits) {
+    stream_skip(stream, minbits - bits);
+    bits = minbits;
+  }
+  /* reorder unsigned coefficients and convert to signed integer */
+  _t1(inv_order, Int)(ublock, iblock, PERM, BLOCK_SIZE);
+  /* perform decorrelating transform */
+  _t2(inv_xform, Int, DIMS)(iblock);
+  return bits;
+}
diff --git a/zfp/src/template/decode1.c b/zfp/src/template/decode1.c
new file mode 100644
index 0000000000000000000000000000000000000000..68ee0793e82a64c4733d59c255255c7588e3b424
--- /dev/null
+++ b/zfp/src/template/decode1.c
@@ -0,0 +1,53 @@
+/* private functions ------------------------------------------------------- */
+
+/* scatter 4-value block to strided array */
+static void
+_t2(scatter, Scalar, 1)(const Scalar* q, Scalar* p, int sx)
+{
+  uint x;
+  for (x = 0; x < 4; x++, p += sx)
+    *p = *q++;
+}
+
+/* scatter nx-value block to strided array */
+static void
+_t2(scatter_partial, Scalar, 1)(const Scalar* q, Scalar* p, uint nx, int sx)
+{
+  uint x;
+  for (x = 0; x < nx; x++, p += sx)
+    *p = *q++;
+}
+
+/* inverse decorrelating 1D transform */
+static void
+_t2(inv_xform, Int, 1)(Int* p)
+{
+  /* transform along x */
+  _t1(inv_lift, Int)(p, 1);
+}
+
+/* public functions -------------------------------------------------------- */
+
+/* decode 4-value floating-point block and store at p using stride sx */
+uint
+_t2(zfp_decode_block_strided, Scalar, 1)(zfp_stream* stream, Scalar* p, int sx)
+{
+  /* decode contiguous block */
+  cache_align_(Scalar fblock[4]);
+  uint bits = _t2(zfp_decode_block, Scalar, 1)(stream, fblock);
+  /* scatter block to strided array */
+  _t2(scatter, Scalar, 1)(fblock, p, sx);
+  return bits;
+}
+
+/* decode nx-value floating-point block and store at p using stride sx */
+uint
+_t2(zfp_decode_partial_block_strided, Scalar, 1)(zfp_stream* stream, Scalar* p, uint nx, int sx)
+{
+  /* decode contiguous block */
+  cache_align_(Scalar fblock[4]);
+  uint bits = _t2(zfp_decode_block, Scalar, 1)(stream, fblock);
+  /* scatter block to strided array */
+  _t2(scatter_partial, Scalar, 1)(fblock, p, nx, sx);
+  return bits;
+}
diff --git a/zfp/src/template/decode2.c b/zfp/src/template/decode2.c
new file mode 100644
index 0000000000000000000000000000000000000000..23e1892cb5726a1483eccd86e81206f62820ea28
--- /dev/null
+++ b/zfp/src/template/decode2.c
@@ -0,0 +1,60 @@
+/* private functions ------------------------------------------------------- */
+
+/* scatter 4*4 block to strided array */
+static void
+_t2(scatter, Scalar, 2)(const Scalar* q, Scalar* p, int sx, int sy)
+{
+  uint x, y;
+  for (y = 0; y < 4; y++, p += sy - 4 * sx)
+    for (x = 0; x < 4; x++, p += sx)
+      *p = *q++;
+}
+
+/* scatter nx*ny block to strided array */
+static void
+_t2(scatter_partial, Scalar, 2)(const Scalar* q, Scalar* p, uint nx, uint ny, int sx, int sy)
+{
+  uint x, y;
+  for (y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx)
+    for (x = 0; x < nx; x++, p += sx, q++)
+      *p = *q;
+}
+
+/* inverse decorrelating 2D transform */
+static void
+_t2(inv_xform, Int, 2)(Int* p)
+{
+  uint x, y;
+  /* transform along y */
+  for (x = 0; x < 4; x++)
+    _t1(inv_lift, Int)(p + 1 * x, 4);
+  /* transform along x */
+  for (y = 0; y < 4; y++)
+    _t1(inv_lift, Int)(p + 4 * y, 1);
+}
+
+/* public functions -------------------------------------------------------- */
+
+/* decode 4*4 floating-point block and store at p using strides (sx, sy) */
+uint
+_t2(zfp_decode_block_strided, Scalar, 2)(zfp_stream* stream, Scalar* p, int sx, int sy)
+{
+  /* decode contiguous block */
+  cache_align_(Scalar fblock[16]);
+  uint bits = _t2(zfp_decode_block, Scalar, 2)(stream, fblock);
+  /* scatter block to strided array */
+  _t2(scatter, Scalar, 2)(fblock, p, sx, sy);
+  return bits;
+}
+
+/* decode nx*ny floating-point block and store at p using strides (sx, sy) */
+uint
+_t2(zfp_decode_partial_block_strided, Scalar, 2)(zfp_stream* stream, Scalar* p, uint nx, uint ny, int sx, int sy)
+{
+  /* decode contiguous block */
+  cache_align_(Scalar fblock[16]);
+  uint bits = _t2(zfp_decode_block, Scalar, 2)(stream, fblock);
+  /* scatter block to strided array */
+  _t2(scatter_partial, Scalar, 2)(fblock, p, nx, ny, sx, sy);
+  return bits;
+}
diff --git a/zfp/src/template/decode3.c b/zfp/src/template/decode3.c
new file mode 100644
index 0000000000000000000000000000000000000000..b48411821f1cfeb8be349d15645e6a137ec20a42
--- /dev/null
+++ b/zfp/src/template/decode3.c
@@ -0,0 +1,68 @@
+/* private functions ------------------------------------------------------- */
+
+/* scatter 4*4*4 block to strided array */
+static void
+_t2(scatter, Scalar, 3)(const Scalar* q, Scalar* p, int sx, int sy, int sz)
+{
+  uint x, y, z;
+  for (z = 0; z < 4; z++, p += sz - 4 * sy)
+    for (y = 0; y < 4; y++, p += sy - 4 * sx)
+      for (x = 0; x < 4; x++, p += sx)
+        *p = *q++;
+}
+
+/* scatter nx*ny*nz block to strided array */
+static void
+_t2(scatter_partial, Scalar, 3)(const Scalar* q, Scalar* p, uint nx, uint ny, uint nz, int sx, int sy, int sz)
+{
+  uint x, y, z;
+  for (z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 4 * (4 - ny))
+    for (y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 1 * (4 - nx))
+      for (x = 0; x < nx; x++, p += sx, q++)
+        *p = *q;
+}
+
+/* inverse decorrelating 3D transform */
+static void
+_t2(inv_xform, Int, 3)(Int* p)
+{
+  uint x, y, z;
+  /* transform along z */
+  for (y = 0; y < 4; y++)
+    for (x = 0; x < 4; x++)
+      _t1(inv_lift, Int)(p + 1 * x + 4 * y, 16);
+  /* transform along y */
+  for (x = 0; x < 4; x++)
+    for (z = 0; z < 4; z++)
+      _t1(inv_lift, Int)(p + 16 * z + 1 * x, 4);
+  /* transform along x */
+  for (z = 0; z < 4; z++)
+    for (y = 0; y < 4; y++)
+      _t1(inv_lift, Int)(p + 4 * y + 16 * z, 1);
+}
+
+/* public functions -------------------------------------------------------- */
+
+/* decode 4*4*4 floating-point block and store at p using strides (sx, sy, sz) */
+uint
+_t2(zfp_decode_block_strided, Scalar, 3)(zfp_stream* stream, Scalar* p, int sx, int sy, int sz)
+{
+  /* decode contiguous block */
+  cache_align_(Scalar fblock[64]);
+  uint bits = _t2(zfp_decode_block, Scalar, 3)(stream, fblock);
+  /* scatter block to strided array */
+  _t2(scatter, Scalar, 3)(fblock, p, sx, sy, sz);
+  return bits;
+}
+
+/* decode nx*ny*nz floating-point block and store at p using strides (sx, sy, sz) */
+uint
+_t2(zfp_decode_partial_block_strided, Scalar, 3)(zfp_stream* stream, Scalar* p, uint nx, uint ny, uint nz, int sx, int sy, int sz)
+{
+  /* decode contiguous block */
+  cache_align_(Scalar fblock[64]);
+  uint bits = _t2(zfp_decode_block, Scalar, 3)(stream, fblock);
+  /* scatter block to strided array */
+  _t2(scatter_partial, Scalar, 3)(fblock, p, nx, ny, nz, sx, sy, sz);
+  return bits;
+}
diff --git a/zfp/src/template/decode4.c b/zfp/src/template/decode4.c
new file mode 100644
index 0000000000000000000000000000000000000000..8d34abfce25c99ba326da7bfbb8ba68a59830f10
--- /dev/null
+++ b/zfp/src/template/decode4.c
@@ -0,0 +1,78 @@
+/* private functions ------------------------------------------------------- */
+
+/* scatter 4*4*4*4 block to strided array */
+static void
+_t2(scatter, Scalar, 4)(const Scalar* q, Scalar* p, int sx, int sy, int sz, int sw)
+{
+  uint x, y, z, w;
+  for (w = 0; w < 4; w++, p += sw - 4 * sz)
+    for (z = 0; z < 4; z++, p += sz - 4 * sy)
+      for (y = 0; y < 4; y++, p += sy - 4 * sx)
+        for (x = 0; x < 4; x++, p += sx)
+          *p = *q++;
+}
+
+/* scatter nx*ny*nz*nw block to strided array */
+static void
+_t2(scatter_partial, Scalar, 4)(const Scalar* q, Scalar* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw)
+{
+  uint x, y, z, w;
+  for (w = 0; w < nw; w++, p += sw - (ptrdiff_t)nz * sz, q += 16 * (4 - nz))
+    for (z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 4 * (4 - ny))
+      for (y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 1 * (4 - nx))
+        for (x = 0; x < nx; x++, p += sx, q++)
+          *p = *q;
+}
+
+/* inverse decorrelating 4D transform */
+static void
+_t2(inv_xform, Int, 4)(Int* p)
+{
+  uint x, y, z, w;
+  /* transform along w */
+  for (z = 0; z < 4; z++)
+    for (y = 0; y < 4; y++)
+      for (x = 0; x < 4; x++)
+        _t1(inv_lift, Int)(p + 1 * x + 4 * y + 16 * z, 64);
+  /* transform along z */
+  for (y = 0; y < 4; y++)
+    for (x = 0; x < 4; x++)
+      for (w = 0; w < 4; w++)
+        _t1(inv_lift, Int)(p + 64 * w + 1 * x + 4 * y, 16);
+  /* transform along y */
+  for (x = 0; x < 4; x++)
+    for (w = 0; w < 4; w++)
+      for (z = 0; z < 4; z++)
+        _t1(inv_lift, Int)(p + 16 * z + 64 * w + 1 * x, 4);
+  /* transform along x */
+  for (w = 0; w < 4; w++)
+    for (z = 0; z < 4; z++)
+      for (y = 0; y < 4; y++)
+        _t1(inv_lift, Int)(p + 4 * y + 16 * z + 64 * w, 1);
+}
+
+/* public functions -------------------------------------------------------- */
+
+/* decode 4*4*4*4 floating-point block and store at p using strides (sx, sy, sz, sw) */
+uint
+_t2(zfp_decode_block_strided, Scalar, 4)(zfp_stream* stream, Scalar* p, int sx, int sy, int sz, int sw)
+{
+  /* decode contiguous block */
+  cache_align_(Scalar fblock[256]);
+  uint bits = _t2(zfp_decode_block, Scalar, 4)(stream, fblock);
+  /* scatter block to strided array */
+  _t2(scatter, Scalar, 4)(fblock, p, sx, sy, sz, sw);
+  return bits;
+}
+
+/* decode nx*ny*nz*nw floating-point block and store at p using strides (sx, sy, sz, sw) */
+uint
+_t2(zfp_decode_partial_block_strided, Scalar, 4)(zfp_stream* stream, Scalar* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw)
+{
+  /* decode contiguous block */
+  cache_align_(Scalar fblock[256]);
+  uint bits = _t2(zfp_decode_block, Scalar, 4)(stream, fblock);
+  /* scatter block to strided array */
+  _t2(scatter_partial, Scalar, 4)(fblock, p, nx, ny, nz, nw, sx, sy, sz, sw);
+  return bits;
+}
diff --git a/zfp/src/template/decodef.c b/zfp/src/template/decodef.c
new file mode 100644
index 0000000000000000000000000000000000000000..b6abec939201fc0d94f79f77957519ec53e35ced
--- /dev/null
+++ b/zfp/src/template/decodef.c
@@ -0,0 +1,56 @@
+#include <limits.h>
+#include <math.h>
+
+/* private functions ------------------------------------------------------- */
+
+/* map integer x relative to exponent e to floating-point number */
+static Scalar
+_t1(dequantize, Scalar)(Int x, int e)
+{
+  return LDEXP((Scalar)x, e - (CHAR_BIT * (int)sizeof(Scalar) - 2)); /* x * 2^(e - (p - 2)), p = bits in Scalar; inverse of quantize */
+}
+
+/* inverse block-floating-point transform from signed integers */
+static void
+_t1(inv_cast, Scalar)(const Int* iblock, Scalar* fblock, uint n, int emax)
+{
+  /* compute power-of-two scale factor s */
+  Scalar s = _t1(dequantize, Scalar)(1, emax);
+  /* compute p-bit float x = s*y where |y| <= 2^(p-2) - 1 */
+  do
+    *fblock++ = (Scalar)(s * *iblock++);
+  while (--n); /* n > 0 assumed (callers pass BLOCK_SIZE) */
+}
+
+/* public functions -------------------------------------------------------- */
+
+/* decode contiguous floating-point block */
+uint
+_t2(zfp_decode_block, Scalar, DIMS)(zfp_stream* zfp, Scalar* fblock)
+{
+  /* test if block has nonzero values */
+  if (stream_read_bit(zfp->stream)) {
+    cache_align_(Int iblock[BLOCK_SIZE]);
+    /* decode common exponent */
+    uint ebits = EBITS + 1; /* nonzero-flag bit + EBITS exponent bits (encoder writes 2 * e + 1) */
+    int emax = (int)stream_read_bits(zfp->stream, ebits - 1) - EBIAS;
+    int maxprec = precision(emax, zfp->maxprec, zfp->minexp, DIMS);
+    /* decode integer block */
+    uint bits = _t2(decode_block, Int, DIMS)(zfp->stream, zfp->minbits - ebits, zfp->maxbits - ebits, maxprec, iblock); /* NOTE(review): assumes minbits, maxbits >= ebits (uint wrap otherwise) — confirm */
+    /* perform inverse block-floating-point transform */
+    _t1(inv_cast, Scalar)(iblock, fblock, BLOCK_SIZE, emax);
+    return ebits + bits; /* total bits consumed for this block */
+  }
+  else {
+    /* set all values to zero */
+    uint i;
+    for (i = 0; i < BLOCK_SIZE; i++)
+      *fblock++ = 0;
+    if (zfp->minbits > 1) {
+      stream_skip(zfp->stream, zfp->minbits - 1); /* skip zero padding emitted by the encoder */
+      return zfp->minbits;
+    }
+    else
+      return 1;
+  }
+}
diff --git a/zfp/src/template/decodei.c b/zfp/src/template/decodei.c
new file mode 100644
index 0000000000000000000000000000000000000000..b2fb4f440f93abfe65609df81e1a1f9d7da203e0
--- /dev/null
+++ b/zfp/src/template/decodei.c
@@ -0,0 +1,8 @@
+/* public functions -------------------------------------------------------- */
+
+/* decode contiguous integer block */
+uint
+_t2(zfp_decode_block, Int, DIMS)(zfp_stream* zfp, Int* iblock)
+{
+  return _t2(decode_block, Int, DIMS)(zfp->stream, zfp->minbits, zfp->maxbits, zfp->maxprec, iblock); /* decodes directly into iblock; returns bit count from decode_block */
+}
diff --git a/zfp/src/template/decompress.c b/zfp/src/template/decompress.c
new file mode 100644
index 0000000000000000000000000000000000000000..db7bb512fc4c29e1d20eb6849ca87a6ca2f6ffae
--- /dev/null
+++ b/zfp/src/template/decompress.c
@@ -0,0 +1,128 @@
+/* decompress 1d contiguous array */
+static void
+_t2(decompress, Scalar, 1)(zfp_stream* stream, zfp_field* field)
+{
+  Scalar* data = (Scalar*)field->data;
+  uint nx = field->nx;
+  uint mx = nx & ~3u;
+  uint x;
+
+  /* decompress array one block of 4 values at a time */
+  for (x = 0; x < mx; x += 4, data += 4)
+    _t2(zfp_decode_block, Scalar, 1)(stream, data);
+  if (x < nx)
+    _t2(zfp_decode_partial_block_strided, Scalar, 1)(stream, data, nx - x, 1);
+}
+
+#if 0
+/* decompress 1d strided array */
+static void
+_t2(decompress_strided, Scalar, 1)(zfp_stream* stream, zfp_field* field)
+{
+  Scalar* data = (Scalar*)field->data;
+  uint nx = field->nx;
+  uint mx = nx & ~3u;
+  int sx = field->sx ? field->sx : 1;
+  uint x;
+
+  /* decompress array one block of 4 values at a time */
+  for (x = 0; x < mx; x += 4, data += 4 * sx)
+    _t2(zfp_decode_block_strided, Scalar, 1)(stream, data, sx);
+  if (x < nx)
+    _t2(zfp_decode_partial_block_strided, Scalar, 1)(stream, data, nx - x, sx);
+}
+#else
+/* decompress 1d strided array */
+static void
+_t2(decompress_strided, Scalar, 1)(zfp_stream* stream, zfp_field* field)
+{
+  Scalar* data = field->data;
+  uint nx = field->nx;
+  int sx = field->sx ? field->sx : 1;
+  uint x;
+
+  /* decompress array one block of 4 values at a time */
+  for (x = 0; x < nx; x += 4) {
+    Scalar* p = data + sx * (ptrdiff_t)x;
+    if (nx - x < 4)
+      _t2(zfp_decode_partial_block_strided, Scalar, 1)(stream, p, nx - x, sx);
+    else
+      _t2(zfp_decode_block_strided, Scalar, 1)(stream, p, sx);
+  }
+}
+#endif
+
+/* decompress 2d strided array */
+static void
+_t2(decompress_strided, Scalar, 2)(zfp_stream* stream, zfp_field* field)
+{
+  Scalar* data = (Scalar*)field->data;
+  uint nx = field->nx;
+  uint ny = field->ny;
+  int sx = field->sx ? field->sx : 1;
+  int sy = field->sy ? field->sy : nx;
+  uint x, y;
+
+  /* decompress array one block of 4x4 values at a time */
+  for (y = 0; y < ny; y += 4)
+    for (x = 0; x < nx; x += 4) {
+      Scalar* p = data + sx * (ptrdiff_t)x + sy * (ptrdiff_t)y;
+      if (nx - x < 4 || ny - y < 4)
+        _t2(zfp_decode_partial_block_strided, Scalar, 2)(stream, p, MIN(nx - x, 4u), MIN(ny - y, 4u), sx, sy);
+      else
+        _t2(zfp_decode_block_strided, Scalar, 2)(stream, p, sx, sy);
+    }
+}
+
+/* decompress 3d strided array */
+static void
+_t2(decompress_strided, Scalar, 3)(zfp_stream* stream, zfp_field* field)
+{
+  Scalar* data = (Scalar*)field->data;
+  uint nx = field->nx;
+  uint ny = field->ny;
+  uint nz = field->nz;
+  int sx = field->sx ? field->sx : 1;
+  int sy = field->sy ? field->sy : nx;
+  int sz = field->sz ? field->sz : nx * ny;
+  uint x, y, z;
+
+  /* decompress array one block of 4x4x4 values at a time */
+  for (z = 0; z < nz; z += 4)
+    for (y = 0; y < ny; y += 4)
+      for (x = 0; x < nx; x += 4) {
+        Scalar* p = data + sx * (ptrdiff_t)x + sy * (ptrdiff_t)y + sz * (ptrdiff_t)z;
+        if (nx - x < 4 || ny - y < 4 || nz - z < 4)
+          _t2(zfp_decode_partial_block_strided, Scalar, 3)(stream, p, MIN(nx - x, 4u), MIN(ny - y, 4u), MIN(nz - z, 4u), sx, sy, sz);
+        else
+          _t2(zfp_decode_block_strided, Scalar, 3)(stream, p, sx, sy, sz);
+      }
+}
+
+/* decompress 4d strided array */
+static void
+_t2(decompress_strided, Scalar, 4)(zfp_stream* stream, zfp_field* field)
+{
+  Scalar* data = field->data;
+  uint nx = field->nx;
+  uint ny = field->ny;
+  uint nz = field->nz;
+  uint nw = field->nw;
+  int sx = field->sx ? field->sx : 1;
+  int sy = field->sy ? field->sy : nx;
+  int sz = field->sz ? field->sz : (ptrdiff_t)nx * ny;
+  int sw = field->sw ? field->sw : (ptrdiff_t)nx * ny * nz;
+  uint x, y, z, w;
+
+  /* decompress array one block of 4x4x4x4 values at a time */
+  for (w = 0; w < nw; w += 4)
+    for (z = 0; z < nz; z += 4)
+      for (y = 0; y < ny; y += 4)
+        for (x = 0; x < nx; x += 4) {
+          Scalar* p = data + sx * (ptrdiff_t)x + sy * (ptrdiff_t)y + sz * (ptrdiff_t)z + sw * (ptrdiff_t)w;
+          if (nx - x < 4 || ny - y < 4 || nz - z < 4 || nw - w < 4)
+            _t2(zfp_decode_partial_block_strided, Scalar, 4)(stream, p, MIN(nx - x, 4u), MIN(ny - y, 4u), MIN(nz - z, 4u), MIN(nw - w, 4u), sx, sy, sz, sw);
+          else
+            _t2(zfp_decode_block_strided, Scalar, 4)(stream, p, sx, sy, sz, sw);
+        }
+}
diff --git a/zfp/src/template/encode.c b/zfp/src/template/encode.c
new file mode 100644
index 0000000000000000000000000000000000000000..bba18f60dc6b644ae0f48f3a6ca229af0150e30e
--- /dev/null
+++ b/zfp/src/template/encode.c
@@ -0,0 +1,159 @@
+#include <limits.h>
+
+static void _t2(fwd_xform, Int, DIMS)(Int* p);
+
+/* private functions ------------------------------------------------------- */
+
+/* pad partial block of width n <= 4 and stride s */
+static void
+_t1(pad_block, Scalar)(Scalar* p, uint n, uint s)
+{
+  switch (n) {
+    case 0:
+      p[0 * s] = 0; /* no data: fill with zeros */
+      /* FALLTHROUGH */
+    case 1:
+      p[1 * s] = p[0 * s]; /* duplicate value 0 */
+      /* FALLTHROUGH */
+    case 2:
+      p[2 * s] = p[1 * s]; /* duplicate value 1 */
+      /* FALLTHROUGH */
+    case 3:
+      p[3 * s] = p[0 * s]; /* copies value 0, not value 2 — intentional mirroring (matches upstream zfp); do not "fix" */
+      /* FALLTHROUGH */
+    default:
+      break;
+  }
+}
+
+/* forward lifting transform of 4-vector */
+static void
+_t1(fwd_lift, Int)(Int* p, uint s)
+{
+  Int x, y, z, w;
+  x = *p; p += s; /* load the four values at stride s */
+  y = *p; p += s;
+  z = *p; p += s;
+  w = *p; p += s;
+
+  /*
+  ** non-orthogonal transform
+  **        ( 4  4  4  4) (x)
+  ** 1/16 * ( 5  1 -1 -5) (y)
+  **        (-4  4  4 -4) (z)
+  **        (-2  6 -6  2) (w)
+  */
+  x += w; x >>= 1; w -= x;
+  z += y; z >>= 1; y -= z;
+  x += z; x >>= 1; z -= x;
+  w += y; w >>= 1; y -= w;
+  w += y >> 1; y -= w >> 1; /* NOTE(review): step order must mirror inv_lift exactly — keep in sync with the decode template */
+
+  p -= s; *p = w; /* store transformed values back in place */
+  p -= s; *p = z;
+  p -= s; *p = y;
+  p -= s; *p = x;
+}
+
+/* map two's complement signed integer to negabinary unsigned integer */
+static UInt
+_t1(int2uint, Int)(Int x)
+{
+  return ((UInt)x + NBMASK) ^ NBMASK; /* NBMASK = 0xaaaa...: bit positions with negative weight in base -2 */
+}
+
+/* reorder signed coefficients and convert to unsigned integer */
+static void
+_t1(fwd_order, Int)(UInt* ublock, const Int* iblock, const uchar* perm, uint n)
+{
+  do
+    *ublock++ = _t1(int2uint, Int)(iblock[*perm++]); /* perm is the PERM traversal order supplied by encode_block */
+  while (--n); /* n > 0 assumed */
+}
+
+/* compress sequence of size unsigned integers */
+static uint
+_t1(encode_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec, const UInt* restrict_ data, uint size)
+{
+  /* make a copy of bit stream to avoid aliasing */
+  bitstream s = *stream;
+  uint intprec = CHAR_BIT * (uint)sizeof(UInt);
+  uint kmin = intprec > maxprec ? intprec - maxprec : 0; /* lowest bit plane encoded, per precision budget */
+  uint bits = maxbits; /* bit budget remaining */
+  uint i, k, m, n;
+  uint64 x;
+
+  /* encode one bit plane at a time from MSB to LSB */
+  for (k = intprec, n = 0; bits && k-- > kmin;) {
+    /* step 1: extract bit plane #k to x */
+    x = 0;
+    for (i = 0; i < size; i++)
+      x += (uint64)((data[i] >> k) & 1u) << i; /* bit i of plane = bit k of value i (requires size <= 64) */
+    /* step 2: encode first n bits of bit plane */
+    m = MIN(n, bits);
+    bits -= m;
+    x = stream_write_bits(&s, x, m); /* NOTE(review): relies on stream_write_bits returning the unwritten high bits of x — confirm */
+    /* step 3: unary run-length encode remainder of bit plane */
+    for (; n < size && bits && (bits--, stream_write_bit(&s, !!x)); x >>= 1, n++)
+      for (; n < size - 1 && bits && (bits--, !stream_write_bit(&s, x & 1u)); x >>= 1, n++)
+        ;
+  }
+
+  *stream = s;
+  return maxbits - bits; /* number of bits actually written */
+}
+
+/* compress sequence of size > 64 unsigned integers */
+static uint
+_t1(encode_many_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec, const UInt* restrict_ data, uint size)
+{
+  /* make a copy of bit stream to avoid aliasing */
+  bitstream s = *stream;
+  uint intprec = CHAR_BIT * (uint)sizeof(UInt);
+  uint kmin = intprec > maxprec ? intprec - maxprec : 0; /* lowest bit plane encoded, per precision budget */
+  uint bits = maxbits; /* bit budget remaining */
+  uint i, k, m, n, c;
+
+  /* encode one bit plane at a time from MSB to LSB */
+  for (k = intprec, n = 0; bits && k-- > kmin;) {
+    /* step 1: encode first n bits of bit plane #k */
+    m = MIN(n, bits);
+    bits -= m;
+    for (i = 0; i < m; i++)
+      stream_write_bit(&s, (data[i] >> k) & 1u); /* plane streamed bit by bit: a 64-bit word cannot hold size > 64 bits */
+    /* step 2: count remaining one-bits in bit plane */
+    c = 0;
+    for (i = m; i < size; i++)
+      c += (data[i] >> k) & 1u; /* c = one-bits not yet emitted */
+    /* step 3: unary run-length encode remainder of bit plane */
+    for (; n < size && bits && (--bits, stream_write_bit(&s, !!c)); c--, n++)
+      for (; n < size - 1 && bits && (--bits, !stream_write_bit(&s, (data[n] >> k) & 1u)); n++)
+        ;
+  }
+
+  *stream = s;
+  return maxbits - bits; /* number of bits actually written */
+}
+
+/* encode block of integers */
+static uint
+_t2(encode_block, Int, DIMS)(bitstream* stream, int minbits, int maxbits, int maxprec, Int* iblock)
+{
+  int bits;
+  cache_align_(UInt ublock[BLOCK_SIZE]);
+  /* perform decorrelating transform */
+  _t2(fwd_xform, Int, DIMS)(iblock); /* in place: clobbers the caller's iblock */
+  /* reorder signed coefficients and convert to unsigned integer */
+  _t1(fwd_order, Int)(ublock, iblock, PERM, BLOCK_SIZE);
+  /* encode integer coefficients */
+  if (BLOCK_SIZE <= 64) /* a whole bit plane fits in one 64-bit word */
+    bits = _t1(encode_ints, UInt)(stream, maxbits, maxprec, ublock, BLOCK_SIZE);
+  else
+    bits = _t1(encode_many_ints, UInt)(stream, maxbits, maxprec, ublock, BLOCK_SIZE);
+  /* write at least minbits bits by padding with zeros */
+  if (bits < minbits) {
+    stream_pad(stream, minbits - bits);
+    bits = minbits;
+  }
+  return bits; /* total bits written, padding included */
+}
diff --git a/zfp/src/template/encode1.c b/zfp/src/template/encode1.c
new file mode 100644
index 0000000000000000000000000000000000000000..c61849299a4ffbcde0c99ee7b5c90546cfcc3c9c
--- /dev/null
+++ b/zfp/src/template/encode1.c
@@ -0,0 +1,52 @@
+/* private functions ------------------------------------------------------- */
+
+/* gather 4-value block from strided array */
+static void
+_t2(gather, Scalar, 1)(Scalar* q, const Scalar* p, int sx)
+{
+  uint x;
+  for (x = 0; x < 4; x++, p += sx)
+    *q++ = *p; /* q is contiguous; p advances by stride sx */
+}
+
+/* gather nx-value block from strided array */
+static void
+_t2(gather_partial, Scalar, 1)(Scalar* q, const Scalar* p, uint nx, int sx)
+{
+  uint x;
+  for (x = 0; x < nx; x++, p += sx)
+    q[x] = *p;
+  _t1(pad_block, Scalar)(q, nx, 1); /* fill the remaining 4 - nx entries */
+}
+
+/* forward decorrelating 1D transform */
+static void
+_t2(fwd_xform, Int, 1)(Int* p)
+{
+  /* transform along x */
+  _t1(fwd_lift, Int)(p, 1); /* single lift of the 4 contiguous values */
+}
+
+/* public functions -------------------------------------------------------- */
+
+/* encode 4-value floating-point block stored at p using stride sx */
+uint
+_t2(zfp_encode_block_strided, Scalar, 1)(zfp_stream* stream, const Scalar* p, int sx)
+{
+  /* gather block from strided array */
+  cache_align_(Scalar fblock[4]);
+  _t2(gather, Scalar, 1)(fblock, p, sx); /* copy into contiguous scratch first */
+  /* encode floating-point block */
+  return _t2(zfp_encode_block, Scalar, 1)(stream, fblock); /* returns number of bits written */
+}
+
+/* encode nx-value floating-point block stored at p using stride sx */
+uint
+_t2(zfp_encode_partial_block_strided, Scalar, 1)(zfp_stream* stream, const Scalar* p, uint nx, int sx)
+{
+  /* gather block from strided array */
+  cache_align_(Scalar fblock[4]);
+  _t2(gather_partial, Scalar, 1)(fblock, p, nx, sx); /* nx <= 4; remaining slots padded by gather_partial */
+  /* encode floating-point block */
+  return _t2(zfp_encode_block, Scalar, 1)(stream, fblock);
+}
diff --git a/zfp/src/template/encode2.c b/zfp/src/template/encode2.c
new file mode 100644
index 0000000000000000000000000000000000000000..4bec256a630c54e700c43e909bd6e479dbf5baf4
--- /dev/null
+++ b/zfp/src/template/encode2.c
@@ -0,0 +1,62 @@
+/* private functions ------------------------------------------------------- */
+
+/* gather 4*4 block from strided array */
+static void
+_t2(gather, Scalar, 2)(Scalar* q, const Scalar* p, int sx, int sy)
+{
+  uint x, y;
+  for (y = 0; y < 4; y++, p += sy - 4 * sx)
+    for (x = 0; x < 4; x++, p += sx)
+      *q++ = *p;
+}
+
+/* gather nx*ny block from strided array */
+static void
+_t2(gather_partial, Scalar, 2)(Scalar* q, const Scalar* p, uint nx, uint ny, int sx, int sy)
+{
+  uint x, y;
+  for (y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx) {
+    for (x = 0; x < nx; x++, p += sx)
+      q[4 * y + x] = *p;
+    _t1(pad_block, Scalar)(q + 4 * y, nx, 1);
+  }
+  for (x = 0; x < 4; x++)
+    _t1(pad_block, Scalar)(q + x, ny, 4);
+}
+
+/* forward decorrelating 2D transform */
+static void
+_t2(fwd_xform, Int, 2)(Int* p)
+{
+  uint x, y;
+  /* transform along x */
+  for (y = 0; y < 4; y++)
+    _t1(fwd_lift, Int)(p + 4 * y, 1);
+  /* transform along y */
+  for (x = 0; x < 4; x++)
+    _t1(fwd_lift, Int)(p + 1 * x, 4);
+}
+
+/* public functions -------------------------------------------------------- */
+
+/* encode 4*4 floating-point block stored at p using strides (sx, sy) */
+uint
+_t2(zfp_encode_block_strided, Scalar, 2)(zfp_stream* stream, const Scalar* p, int sx, int sy)
+{
+  /* gather block from strided array */
+  cache_align_(Scalar fblock[16]);
+  _t2(gather, Scalar, 2)(fblock, p, sx, sy);
+  /* encode floating-point block */
+  return _t2(zfp_encode_block, Scalar, 2)(stream, fblock);
+}
+
+/* encode nx*ny floating-point block stored at p using strides (sx, sy) */
+uint
+_t2(zfp_encode_partial_block_strided, Scalar, 2)(zfp_stream* stream, const Scalar* p, uint nx, uint ny, int sx, int sy)
+{
+  /* gather block from strided array */
+  cache_align_(Scalar fblock[16]);
+  _t2(gather_partial, Scalar, 2)(fblock, p, nx, ny, sx, sy);
+  /* encode floating-point block */
+  return _t2(zfp_encode_block, Scalar, 2)(stream, fblock);
+}
diff --git a/zfp/src/template/encode3.c b/zfp/src/template/encode3.c
new file mode 100644
index 0000000000000000000000000000000000000000..a16a8add97dd5f3ecd3036982f384f68d9b4dfb3
--- /dev/null
+++ b/zfp/src/template/encode3.c
@@ -0,0 +1,74 @@
+/* private functions ------------------------------------------------------- */
+
+/* gather 4*4*4 block from strided array */
+static void
+_t2(gather, Scalar, 3)(Scalar* q, const Scalar* p, int sx, int sy, int sz)
+{
+  uint x, y, z;
+  for (z = 0; z < 4; z++, p += sz - 4 * sy)
+    for (y = 0; y < 4; y++, p += sy - 4 * sx)
+      for (x = 0; x < 4; x++, p += sx)
+        *q++ = *p;
+}
+
+/* gather nx*ny*nz block (nx, ny, nz <= 4) from strided array, padding to a full 4x4x4 block */
+static void
+_t2(gather_partial, Scalar, 3)(Scalar* q, const Scalar* p, uint nx, uint ny, uint nz, int sx, int sy, int sz)
+{
+  uint x, y, z;
+  for (z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy) {
+    for (y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx) {
+      for (x = 0; x < nx; x++, p += sx)
+        q[16 * z + 4 * y + x] = *p;
+      _t1(pad_block, Scalar)(q + 16 * z + 4 * y, nx, 1); /* pad x to 4 values */
+    }
+    for (x = 0; x < 4; x++)
+      _t1(pad_block, Scalar)(q + 16 * z + x, ny, 4); /* pad y to 4 values */
+  }
+  for (y = 0; y < 4; y++)
+    for (x = 0; x < 4; x++)
+      _t1(pad_block, Scalar)(q + 4 * y + x, nz, 16); /* pad z to 4 values */
+}
+
+/* forward decorrelating 3D transform */
+static void
+_t2(fwd_xform, Int, 3)(Int* p)
+{
+  uint x, y, z;
+  /* transform along x */
+  for (z = 0; z < 4; z++)
+    for (y = 0; y < 4; y++)
+      _t1(fwd_lift, Int)(p + 4 * y + 16 * z, 1);
+  /* transform along y */
+  for (x = 0; x < 4; x++)
+    for (z = 0; z < 4; z++)
+      _t1(fwd_lift, Int)(p + 16 * z + 1 * x, 4);
+  /* transform along z */
+  for (y = 0; y < 4; y++)
+    for (x = 0; x < 4; x++)
+      _t1(fwd_lift, Int)(p + 1 * x + 4 * y, 16);
+}
+
+/* public functions -------------------------------------------------------- */
+
+/* encode 4*4*4 floating-point block stored at p using strides (sx, sy, sz) */
+uint
+_t2(zfp_encode_block_strided, Scalar, 3)(zfp_stream* stream, const Scalar* p, int sx, int sy, int sz)
+{
+  /* gather block from strided array */
+  cache_align_(Scalar fblock[64]);
+  _t2(gather, Scalar, 3)(fblock, p, sx, sy, sz);
+  /* encode floating-point block */
+  return _t2(zfp_encode_block, Scalar, 3)(stream, fblock);
+}
+
+/* encode nx*ny*nz floating-point block stored at p using strides (sx, sy, sz) */
+uint
+_t2(zfp_encode_partial_block_strided, Scalar, 3)(zfp_stream* stream, const Scalar* p, uint nx, uint ny, uint nz, int sx, int sy, int sz)
+{
+  /* gather block from strided array */
+  cache_align_(Scalar fblock[64]);
+  _t2(gather_partial, Scalar, 3)(fblock, p, nx, ny, nz, sx, sy, sz);
+  /* encode floating-point block */
+  return _t2(zfp_encode_block, Scalar, 3)(stream, fblock);
+}
diff --git a/zfp/src/template/encode4.c b/zfp/src/template/encode4.c
new file mode 100644
index 0000000000000000000000000000000000000000..c9ed5425a3b1a8b3cdfd13676904983bebb55427
--- /dev/null
+++ b/zfp/src/template/encode4.c
@@ -0,0 +1,89 @@
+/* private functions ------------------------------------------------------- */
+
+/* gather 4*4*4*4 block from strided array */
+static void
+_t2(gather, Scalar, 4)(Scalar* q, const Scalar* p, int sx, int sy, int sz, int sw)
+{
+  uint x, y, z, w;
+  for (w = 0; w < 4; w++, p += sw - 4 * sz)
+    for (z = 0; z < 4; z++, p += sz - 4 * sy)
+      for (y = 0; y < 4; y++, p += sy - 4 * sx)
+        for (x = 0; x < 4; x++, p += sx)
+          *q++ = *p;
+}
+
+/* gather nx*ny*nz*nw block (each dimension <= 4) from strided array, padding to a full 4x4x4x4 block */
+static void
+_t2(gather_partial, Scalar, 4)(Scalar* q, const Scalar* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw)
+{
+  uint x, y, z, w;
+  for (w = 0; w < nw; w++, p += sw - (ptrdiff_t)nz * sz) {
+    for (z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy) {
+      for (y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx) {
+        for (x = 0; x < nx; x++, p += sx)
+          q[64 * w + 16 * z + 4 * y + x] = *p;
+        _t1(pad_block, Scalar)(q + 64 * w + 16 * z + 4 * y, nx, 1); /* pad x to 4 values */
+      }
+      for (x = 0; x < 4; x++)
+        _t1(pad_block, Scalar)(q + 64 * w + 16 * z + x, ny, 4); /* pad y to 4 values */
+    }
+    for (y = 0; y < 4; y++)
+      for (x = 0; x < 4; x++)
+        _t1(pad_block, Scalar)(q + 64 * w + 4 * y + x, nz, 16); /* pad z to 4 values */
+  }
+  for (z = 0; z < 4; z++)
+    for (y = 0; y < 4; y++)
+      for (x = 0; x < 4; x++)
+        _t1(pad_block, Scalar)(q + 16 * z + 4 * y + x, nw, 64); /* pad w to 4 values */
+}
+
+/* forward decorrelating 4D transform */
+static void
+_t2(fwd_xform, Int, 4)(Int* p)
+{
+  uint x, y, z, w;
+  /* transform along x */
+  for (w = 0; w < 4; w++)
+    for (z = 0; z < 4; z++)
+      for (y = 0; y < 4; y++)
+        _t1(fwd_lift, Int)(p + 4 * y + 16 * z + 64 * w, 1);
+  /* transform along y */
+  for (x = 0; x < 4; x++)
+    for (w = 0; w < 4; w++)
+      for (z = 0; z < 4; z++)
+        _t1(fwd_lift, Int)(p + 16 * z + 64 * w + 1 * x, 4);
+  /* transform along z */
+  for (y = 0; y < 4; y++)
+    for (x = 0; x < 4; x++)
+      for (w = 0; w < 4; w++)
+        _t1(fwd_lift, Int)(p + 64 * w + 1 * x + 4 * y, 16);
+  /* transform along w */
+  for (z = 0; z < 4; z++)
+    for (y = 0; y < 4; y++)
+      for (x = 0; x < 4; x++)
+        _t1(fwd_lift, Int)(p + 1 * x + 4 * y + 16 * z, 64);
+}
+
+/* public functions -------------------------------------------------------- */
+
+/* encode 4*4*4*4 floating-point block stored at p using strides (sx, sy, sz, sw) */
+uint
+_t2(zfp_encode_block_strided, Scalar, 4)(zfp_stream* stream, const Scalar* p, int sx, int sy, int sz, int sw)
+{
+  /* gather block from strided array */
+  cache_align_(Scalar fblock[256]);
+  _t2(gather, Scalar, 4)(fblock, p, sx, sy, sz, sw);
+  /* encode floating-point block */
+  return _t2(zfp_encode_block, Scalar, 4)(stream, fblock);
+}
+
+/* encode nx*ny*nz*nw floating-point block stored at p using strides (sx, sy, sz, sw) */
+uint
+_t2(zfp_encode_partial_block_strided, Scalar, 4)(zfp_stream* stream, const Scalar* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw)
+{
+  /* gather block from strided array */
+  cache_align_(Scalar fblock[256]);
+  _t2(gather_partial, Scalar, 4)(fblock, p, nx, ny, nz, nw, sx, sy, sz, sw);
+  /* encode floating-point block */
+  return _t2(zfp_encode_block, Scalar, 4)(stream, fblock);
+}
diff --git a/zfp/src/template/encodef.c b/zfp/src/template/encodef.c
new file mode 100644
index 0000000000000000000000000000000000000000..874597a779a1edf014d4e810e1917b76a027d024
--- /dev/null
+++ b/zfp/src/template/encodef.c
@@ -0,0 +1,82 @@
+#include <limits.h>
+#include <math.h>
+
+/* private functions ------------------------------------------------------- */
+
+/* return normalized floating-point exponent for x >= 0 */
+static int
+_t1(exponent, Scalar)(Scalar x)
+{
+  if (x > 0) {
+    int e;
+    FREXP(x, &e);
+    /* clamp exponent in case x is denormal */
+    return MAX(e, 1 - EBIAS);
+  }
+  return -EBIAS; /* x == 0: smallest exponent (callers pass a max of |values|) */
+}
+
+/* compute maximum floating-point exponent in block of n values */
+static int
+_t1(exponent_block, Scalar)(const Scalar* p, uint n)
+{
+  Scalar max = 0;
+  do {
+    Scalar f = FABS(*p++);
+    if (max < f)
+      max = f;
+  } while (--n); /* n > 0 assumed (callers pass BLOCK_SIZE) */
+  return _t1(exponent, Scalar)(max);
+}
+
+/* map floating-point number x to integer relative to exponent e */
+static Scalar
+_t1(quantize, Scalar)(Scalar x, int e)
+{
+  return LDEXP(x, (CHAR_BIT * (int)sizeof(Scalar) - 2) - e); /* x * 2^((p - 2) - e); inverse of dequantize */
+}
+
+/* forward block-floating-point transform to signed integers */
+static void
+_t1(fwd_cast, Scalar)(Int* iblock, const Scalar* fblock, uint n, int emax)
+{
+  /* compute power-of-two scale factor s */
+  Scalar s = _t1(quantize, Scalar)(1, emax);
+  /* compute p-bit int y = s*x where x is floating and |y| <= 2^(p-2) - 1 */
+  do
+    *iblock++ = (Int)(s * *fblock++);
+  while (--n); /* n > 0 assumed (callers pass BLOCK_SIZE) */
+}
+
+/* public functions -------------------------------------------------------- */
+
+/* encode contiguous floating-point block */
+uint
+_t2(zfp_encode_block, Scalar, DIMS)(zfp_stream* zfp, const Scalar* fblock)
+{
+  /* compute maximum exponent */
+  int emax = _t1(exponent_block, Scalar)(fblock, BLOCK_SIZE);
+  int maxprec = precision(emax, zfp->maxprec, zfp->minexp, DIMS);
+  uint e = maxprec ? emax + EBIAS : 0; /* biased exponent; 0 when no precision bits remain */
+  /* encode block only if biased exponent is nonzero */
+  if (e) {
+    cache_align_(Int iblock[BLOCK_SIZE]);
+    /* encode common exponent; LSB indicates that exponent is nonzero */
+    int ebits = EBITS + 1;
+    stream_write_bits(zfp->stream, 2 * e + 1, ebits);
+    /* perform forward block-floating-point transform */
+    _t1(fwd_cast, Scalar)(iblock, fblock, BLOCK_SIZE, emax);
+    /* encode integer block */
+    return ebits + _t2(encode_block, Int, DIMS)(zfp->stream, zfp->minbits - ebits, zfp->maxbits - ebits, maxprec, iblock); /* NOTE(review): assumes minbits, maxbits >= ebits (uint wrap otherwise) — confirm */
+  }
+  else {
+    /* write single zero-bit to indicate that all values are zero */
+    stream_write_bit(zfp->stream, 0);
+    if (zfp->minbits > 1) {
+      stream_pad(zfp->stream, zfp->minbits - 1); /* decoder skips this zero padding */
+      return zfp->minbits;
+    }
+    else
+      return 1;
+  }
+}
diff --git a/zfp/src/template/encodei.c b/zfp/src/template/encodei.c
new file mode 100644
index 0000000000000000000000000000000000000000..6f4cb2c11eef2b4e738afcfec6db093d1a851c47
--- /dev/null
+++ b/zfp/src/template/encodei.c
@@ -0,0 +1,13 @@
+/* public functions -------------------------------------------------------- */
+
+/* encode contiguous integer block */
+uint
+_t2(zfp_encode_block, Int, DIMS)(zfp_stream* zfp, const Int* iblock)
+{
+  cache_align_(Int block[BLOCK_SIZE]);
+  uint i;
+  /* copy block */
+  for (i = 0; i < BLOCK_SIZE; i++)
+    block[i] = iblock[i]; /* copy needed: encode_block transforms its input in place and iblock is const */
+  return _t2(encode_block, Int, DIMS)(zfp->stream, zfp->minbits, zfp->maxbits, zfp->maxprec, block);
+}
diff --git a/zfp/src/template/ompcompress.c b/zfp/src/template/ompcompress.c
new file mode 100644
index 0000000000000000000000000000000000000000..a654ac91a7fe067fc88f7b09068123621b3fea01
--- /dev/null
+++ b/zfp/src/template/ompcompress.c
@@ -0,0 +1,265 @@
+#ifdef _OPENMP
+
+/* compress 1d contiguous array in parallel */
+static void
+_t2(compress_omp, Scalar, 1)(zfp_stream* stream, const zfp_field* field)
+{
+  /* array metadata */
+  const Scalar* data = (const Scalar*)field->data;
+  uint nx = field->nx;
+
+  /* number of omp threads, blocks, and chunks */
+  uint threads = thread_count_omp(stream);
+  uint blocks = (nx + 3) / 4;
+  uint chunks = chunk_count_omp(stream, blocks, threads);
+
+  /* allocate per-thread streams */
+  bitstream** bs = compress_init_par(stream, field, chunks, blocks);
+
+  /* compress chunks of blocks in parallel */
+  int chunk;
+  #pragma omp parallel for num_threads(threads)
+  for (chunk = 0; chunk < (int)chunks; chunk++) {
+    /* determine range of block indices assigned to this thread */
+    uint bmin = chunk_offset(blocks, chunks, chunk + 0);
+    uint bmax = chunk_offset(blocks, chunks, chunk + 1);
+    uint block;
+    /* set up thread-local bit stream */
+    zfp_stream s = *stream;
+    zfp_stream_set_bit_stream(&s, bs[chunk]);
+    /* compress sequence of blocks */
+    for (block = bmin; block < bmax; block++) {
+      /* determine block origin x within array */
+      const Scalar* p = data;
+      uint x = 4 * block;
+      p += x;
+      /* compress partial or full block */
+      if (nx - x < 4)
+        _t2(zfp_encode_partial_block_strided, Scalar, 1)(&s, p, MIN(nx - x, 4u), 1);
+      else
+        _t2(zfp_encode_block, Scalar, 1)(&s, p);
+    }
+  }
+
+  /* concatenate per-thread streams */
+  compress_finish_par(stream, bs, chunks);
+}
+
+/* compress 1d strided array in parallel */
+static void
+_t2(compress_strided_omp, Scalar, 1)(zfp_stream* stream, const zfp_field* field)
+{
+  /* array metadata */
+  const Scalar* data = (const Scalar*)field->data;
+  uint nx = field->nx;
+  int sx = field->sx ? field->sx : 1;
+
+  /* number of omp threads, blocks, and chunks */
+  uint threads = thread_count_omp(stream);
+  uint blocks = (nx + 3) / 4;
+  uint chunks = chunk_count_omp(stream, blocks, threads);
+
+  /* allocate per-thread streams */
+  bitstream** bs = compress_init_par(stream, field, chunks, blocks);
+
+  /* compress chunks of blocks in parallel */
+  int chunk;
+  #pragma omp parallel for num_threads(threads)
+  for (chunk = 0; chunk < (int)chunks; chunk++) {
+    /* determine range of block indices assigned to this thread */
+    uint bmin = chunk_offset(blocks, chunks, chunk + 0);
+    uint bmax = chunk_offset(blocks, chunks, chunk + 1);
+    uint block;
+    /* set up thread-local bit stream */
+    zfp_stream s = *stream;
+    zfp_stream_set_bit_stream(&s, bs[chunk]);
+    /* compress sequence of blocks */
+    for (block = bmin; block < bmax; block++) {
+      /* determine block origin x within array */
+      const Scalar* p = data;
+      uint x = 4 * block;
+      p += sx * (ptrdiff_t)x;
+      /* compress partial or full block */
+      if (nx - x < 4)
+        _t2(zfp_encode_partial_block_strided, Scalar, 1)(&s, p, MIN(nx - x, 4u), sx);
+      else
+        _t2(zfp_encode_block_strided, Scalar, 1)(&s, p, sx);
+    }
+  }
+
+  /* concatenate per-thread streams */
+  compress_finish_par(stream, bs, chunks);
+}
+
+/* compress 2d strided array in parallel */
+static void
+_t2(compress_strided_omp, Scalar, 2)(zfp_stream* stream, const zfp_field* field)
+{
+  /* array metadata */
+  const Scalar* data = (const Scalar*)field->data;
+  uint nx = field->nx;
+  uint ny = field->ny;
+  int sx = field->sx ? field->sx : 1;
+  int sy = field->sy ? field->sy : nx;
+
+  /* number of omp threads, blocks, and chunks */
+  uint threads = thread_count_omp(stream);
+  uint bx = (nx + 3) / 4;
+  uint by = (ny + 3) / 4;
+  uint blocks = bx * by;
+  uint chunks = chunk_count_omp(stream, blocks, threads);
+
+  /* allocate per-thread streams */
+  bitstream** bs = compress_init_par(stream, field, chunks, blocks);
+
+  /* compress chunks of blocks in parallel */
+  int chunk;
+  #pragma omp parallel for num_threads(threads)
+  for (chunk = 0; chunk < (int)chunks; chunk++) {
+    /* determine range of block indices assigned to this thread */
+    uint bmin = chunk_offset(blocks, chunks, chunk + 0);
+    uint bmax = chunk_offset(blocks, chunks, chunk + 1);
+    uint block;
+    /* set up thread-local bit stream */
+    zfp_stream s = *stream;
+    zfp_stream_set_bit_stream(&s, bs[chunk]);
+    /* compress sequence of blocks */
+    for (block = bmin; block < bmax; block++) {
+      /* determine block origin (x, y) within array */
+      const Scalar* p = data;
+      uint b = block;
+      uint x, y;
+      x = 4 * (b % bx); b /= bx;
+      y = 4 * b;
+      p += sx * (ptrdiff_t)x + sy * (ptrdiff_t)y;
+      /* compress partial or full block */
+      if (nx - x < 4 || ny - y < 4)
+        _t2(zfp_encode_partial_block_strided, Scalar, 2)(&s, p, MIN(nx - x, 4u), MIN(ny - y, 4u), sx, sy);
+      else
+        _t2(zfp_encode_block_strided, Scalar, 2)(&s, p, sx, sy);
+    }
+  }
+
+  /* concatenate per-thread streams */
+  compress_finish_par(stream, bs, chunks);
+}
+
+/* compress 3d strided array in parallel */
+static void
+_t2(compress_strided_omp, Scalar, 3)(zfp_stream* stream, const zfp_field* field)
+{
+  /* array metadata */
+  const Scalar* data = (const Scalar*)field->data;
+  uint nx = field->nx;
+  uint ny = field->ny;
+  uint nz = field->nz;
+  int sx = field->sx ? field->sx : 1;
+  int sy = field->sy ? field->sy : nx;
+  int sz = field->sz ? field->sz : (ptrdiff_t)nx * ny;
+
+  /* number of omp threads, blocks, and chunks */
+  uint threads = thread_count_omp(stream);
+  uint bx = (nx + 3) / 4;
+  uint by = (ny + 3) / 4;
+  uint bz = (nz + 3) / 4;
+  uint blocks = bx * by * bz;
+  uint chunks = chunk_count_omp(stream, blocks, threads);
+
+  /* allocate per-thread streams */
+  bitstream** bs = compress_init_par(stream, field, chunks, blocks);
+
+  /* compress chunks of blocks in parallel */
+  int chunk;
+  #pragma omp parallel for num_threads(threads)
+  for (chunk = 0; chunk < (int)chunks; chunk++) {
+    /* determine range of block indices assigned to this thread */
+    uint bmin = chunk_offset(blocks, chunks, chunk + 0);
+    uint bmax = chunk_offset(blocks, chunks, chunk + 1);
+    uint block;
+    /* set up thread-local bit stream */
+    zfp_stream s = *stream;
+    zfp_stream_set_bit_stream(&s, bs[chunk]);
+    /* compress sequence of blocks */
+    for (block = bmin; block < bmax; block++) {
+      /* determine block origin (x, y, z) within array */
+      const Scalar* p = data;
+      uint b = block;
+      uint x, y, z;
+      x = 4 * (b % bx); b /= bx;
+      y = 4 * (b % by); b /= by;
+      z = 4 * b;
+      p += sx * (ptrdiff_t)x + sy * (ptrdiff_t)y + sz * (ptrdiff_t)z;
+      /* compress partial or full block */
+      if (nx - x < 4 || ny - y < 4 || nz - z < 4)
+        _t2(zfp_encode_partial_block_strided, Scalar, 3)(&s, p, MIN(nx - x, 4u), MIN(ny - y, 4u), MIN(nz - z, 4u), sx, sy, sz);
+      else
+        _t2(zfp_encode_block_strided, Scalar, 3)(&s, p, sx, sy, sz);
+    }
+  }
+
+  /* concatenate per-thread streams */
+  compress_finish_par(stream, bs, chunks);
+}
+
+/* compress 4d strided array in parallel */
+static void
+_t2(compress_strided_omp, Scalar, 4)(zfp_stream* stream, const zfp_field* field)
+{
+  /* array metadata */
+  const Scalar* data = field->data;
+  uint nx = field->nx;
+  uint ny = field->ny;
+  uint nz = field->nz;
+  uint nw = field->nw;
+  int sx = field->sx ? field->sx : 1;
+  int sy = field->sy ? field->sy : nx;
+  int sz = field->sz ? field->sz : (ptrdiff_t)nx * ny;
+  int sw = field->sw ? field->sw : (ptrdiff_t)nx * ny * nz;
+
+  /* number of omp threads, blocks, and chunks */
+  uint threads = thread_count_omp(stream);
+  uint bx = (nx + 3) / 4;
+  uint by = (ny + 3) / 4;
+  uint bz = (nz + 3) / 4;
+  uint bw = (nw + 3) / 4;
+  uint blocks = bx * by * bz * bw;
+  uint chunks = chunk_count_omp(stream, blocks, threads);
+
+  /* allocate per-thread streams */
+  bitstream** bs = compress_init_par(stream, field, chunks, blocks);
+
+  /* compress chunks of blocks in parallel */
+  int chunk;
+  #pragma omp parallel for num_threads(threads)
+  for (chunk = 0; chunk < (int)chunks; chunk++) {
+    /* determine range of block indices assigned to this thread */
+    uint bmin = chunk_offset(blocks, chunks, chunk + 0);
+    uint bmax = chunk_offset(blocks, chunks, chunk + 1);
+    uint block;
+    /* set up thread-local bit stream */
+    zfp_stream s = *stream;
+    zfp_stream_set_bit_stream(&s, bs[chunk]);
+    /* compress sequence of blocks */
+    for (block = bmin; block < bmax; block++) {
+      /* determine block origin (x, y, z, w) within array */
+      const Scalar* p = data;
+      uint b = block;
+      uint x, y, z, w;
+      x = 4 * (b % bx); b /= bx;
+      y = 4 * (b % by); b /= by;
+      z = 4 * (b % bz); b /= bz;
+      w = 4 * b;
+      p += sx * (ptrdiff_t)x + sy * (ptrdiff_t)y + sz * (ptrdiff_t)z + sw * (ptrdiff_t)w;
+      /* compress partial or full block */
+      if (nx - x < 4 || ny - y < 4 || nz - z < 4 || nw - w < 4)
+        _t2(zfp_encode_partial_block_strided, Scalar, 4)(&s, p, MIN(nx - x, 4u), MIN(ny - y, 4u), MIN(nz - z, 4u), MIN(nw - w, 4u), sx, sy, sz, sw);
+      else
+        _t2(zfp_encode_block_strided, Scalar, 4)(&s, p, sx, sy, sz, sw);
+    }
+  }
+
+  /* concatenate per-thread streams */
+  compress_finish_par(stream, bs, chunks);
+}
+
+#endif
diff --git a/zfp/src/template/template.h b/zfp/src/template/template.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd5becf7e4c25c9bfb326deb68c490b57ddcef41
--- /dev/null
+++ b/zfp/src/template/template.h
@@ -0,0 +1,12 @@
+#ifndef TEMPLATE_H
+#define TEMPLATE_H
+
+/* concatenation */
+#define _cat2(x, y)    x ## _ ## y
+#define _cat3(x, y, z) x ## _ ## y ## _ ## z
+
+/* 1- and 2-argument function templates */
+#define _t1(function, arg)        _cat2(function, arg)
+#define _t2(function, type, dims) _cat3(function, type, dims)
+
+#endif
diff --git a/zfp/src/traitsd.h b/zfp/src/traitsd.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc612b4934e689ad358bf1f4bc87c3b0a9895d4c
--- /dev/null
+++ b/zfp/src/traitsd.h
@@ -0,0 +1,11 @@
+/* double-precision floating-point traits */
+
+#define Scalar double                      /* floating-point type */
+#define Int int64                          /* corresponding signed integer type */
+#define UInt uint64                        /* corresponding unsigned integer type */
+#define EBITS 11                           /* number of exponent bits */
+#define NBMASK UINT64C(0xaaaaaaaaaaaaaaaa) /* negabinary mask */
+
+#define FABS(x) fabs(x)
+#define FREXP(x, e) frexp(x, e)
+#define LDEXP(x, e) ldexp(x, e)
diff --git a/zfp/src/traitsf.h b/zfp/src/traitsf.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba262ac09f8da3efc5e0b67b144e6cc9f9f9cd72
--- /dev/null
+++ b/zfp/src/traitsf.h
@@ -0,0 +1,17 @@
+/* single-precision floating-point traits */
+
+#define Scalar float       /* floating-point type */
+#define Int int32          /* corresponding signed integer type */
+#define UInt uint32        /* corresponding unsigned integer type */
+#define EBITS 8            /* number of exponent bits */
+#define NBMASK 0xaaaaaaaau /* negabinary mask */
+
+#if __STDC_VERSION__ >= 199901L
+  #define FABS(x)     fabsf(x)
+  #define FREXP(x, e) frexpf(x, e)
+  #define LDEXP(x, e) ldexpf(x, e)
+#else
+  #define FABS(x)     (float)fabs(x)
+  #define FREXP(x, e) (void)frexp(x, e)
+  #define LDEXP(x, e) (float)ldexp(x, e)
+#endif
diff --git a/zfp/src/traitsi.h b/zfp/src/traitsi.h
new file mode 100644
index 0000000000000000000000000000000000000000..1daca09f428056d7b9fa0979ae0fa03fc3e7cde9
--- /dev/null
+++ b/zfp/src/traitsi.h
@@ -0,0 +1,6 @@
+/* 32-bit integer traits */
+
+#define Scalar int32       /* integer type */
+#define Int int32          /* corresponding signed integer type */
+#define UInt uint32        /* corresponding unsigned integer type */
+#define NBMASK 0xaaaaaaaau /* negabinary mask */
diff --git a/zfp/src/traitsl.h b/zfp/src/traitsl.h
new file mode 100644
index 0000000000000000000000000000000000000000..c4c853467066db9522703023c10f75cad1538ed1
--- /dev/null
+++ b/zfp/src/traitsl.h
@@ -0,0 +1,6 @@
+/* 64-bit integer traits */
+
+#define Scalar int64                       /* integer type */
+#define Int int64                          /* corresponding signed integer type */
+#define UInt uint64                        /* corresponding unsigned integer type */
+#define NBMASK UINT64C(0xaaaaaaaaaaaaaaaa) /* negabinary mask */
diff --git a/zfp/src/zfp.c b/zfp/src/zfp.c
new file mode 100644
index 0000000000000000000000000000000000000000..049a58692464ec123c8d5e6977504c50971c5490
--- /dev/null
+++ b/zfp/src/zfp.c
@@ -0,0 +1,1025 @@
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "zfp.h"
+#include "zfp/macros.h"
+#include "template/template.h"
+
+/* public data ------------------------------------------------------------- */
+
+export_ const uint zfp_codec_version = ZFP_CODEC;
+export_ const uint zfp_library_version = ZFP_VERSION;
+export_ const char* const zfp_version_string = "zfp version " ZFP_VERSION_STRING " (October 1, 2018)";
+
+/* private functions ------------------------------------------------------- */
+
+static uint
+type_precision(zfp_type type)
+{
+  switch (type) {
+    case zfp_type_int32:
+      return CHAR_BIT * (uint)sizeof(int32);
+    case zfp_type_int64:
+      return CHAR_BIT * (uint)sizeof(int64);
+    case zfp_type_float:
+      return CHAR_BIT * (uint)sizeof(float);
+    case zfp_type_double:
+      return CHAR_BIT * (uint)sizeof(double);
+    default:
+      return 0;
+  }
+}
+
+/* shared code across template instances ------------------------------------*/
+
+#include "share/parallel.c"
+#include "share/omp.c"
+
+/* template instantiation of integer and float compressor -------------------*/
+
+#define Scalar int32
+#include "template/compress.c"
+#include "template/decompress.c"
+#include "template/ompcompress.c"
+#include "template/cudacompress.c"
+#include "template/cudadecompress.c"
+#undef Scalar
+
+#define Scalar int64
+#include "template/compress.c"
+#include "template/decompress.c"
+#include "template/ompcompress.c"
+#include "template/cudacompress.c"
+#include "template/cudadecompress.c"
+#undef Scalar
+
+#define Scalar float
+#include "template/compress.c"
+#include "template/decompress.c"
+#include "template/ompcompress.c"
+#include "template/cudacompress.c"
+#include "template/cudadecompress.c"
+#undef Scalar
+
+#define Scalar double
+#include "template/compress.c"
+#include "template/decompress.c"
+#include "template/ompcompress.c"
+#include "template/cudacompress.c"
+#include "template/cudadecompress.c"
+#undef Scalar
+
+/* public functions: miscellaneous ----------------------------------------- */
+
+size_t
+zfp_type_size(zfp_type type)
+{
+  switch (type) {
+    case zfp_type_int32:
+      return sizeof(int32);
+    case zfp_type_int64:
+      return sizeof(int64);
+    case zfp_type_float:
+      return sizeof(float);
+    case zfp_type_double:
+      return sizeof(double);
+    default:
+      return 0;
+  }
+}
+
+/* public functions: fields ------------------------------------------------ */
+
+zfp_field*
+zfp_field_alloc()
+{
+  zfp_field* field = (zfp_field*)malloc(sizeof(zfp_field));
+  if (field) {
+    field->type = zfp_type_none;
+    field->nx = field->ny = field->nz = field->nw = 0;
+    field->sx = field->sy = field->sz = field->sw = 0;
+    field->data = 0;
+  }
+  return field;
+}
+
+zfp_field*
+zfp_field_1d(void* data, zfp_type type, uint nx)
+{
+  zfp_field* field = zfp_field_alloc();
+  if (field) {
+    field->type = type;
+    field->nx = nx;
+    field->data = data;
+  }
+  return field;
+}
+
+zfp_field*
+zfp_field_2d(void* data, zfp_type type, uint nx, uint ny)
+{
+  zfp_field* field = zfp_field_alloc();
+  if (field) {
+    field->type = type;
+    field->nx = nx;
+    field->ny = ny;
+    field->data = data;
+  }
+  return field;
+}
+
+zfp_field*
+zfp_field_3d(void* data, zfp_type type, uint nx, uint ny, uint nz)
+{
+  zfp_field* field = zfp_field_alloc();
+  if (field) {
+    field->type = type;
+    field->nx = nx;
+    field->ny = ny;
+    field->nz = nz;
+    field->data = data;
+  }
+  return field;
+}
+
+zfp_field*
+zfp_field_4d(void* data, zfp_type type, uint nx, uint ny, uint nz, uint nw)
+{
+  zfp_field* field = zfp_field_alloc();
+  if (field) {
+    field->type = type;
+    field->nx = nx;
+    field->ny = ny;
+    field->nz = nz;
+    field->nw = nw;
+    field->data = data;
+  }
+  return field;
+}
+
+void
+zfp_field_free(zfp_field* field)
+{
+  free(field);
+}
+
+void*
+zfp_field_pointer(const zfp_field* field)
+{
+  return field->data;
+}
+
+zfp_type
+zfp_field_type(const zfp_field* field)
+{
+  return field->type;
+}
+
+uint
+zfp_field_precision(const zfp_field* field)
+{
+  return type_precision(field->type);
+}
+
+uint
+zfp_field_dimensionality(const zfp_field* field)
+{
+  return field->nx ? field->ny ? field->nz ? field->nw ? 4 : 3 : 2 : 1 : 0;
+}
+
+size_t
+zfp_field_size(const zfp_field* field, uint* size)
+{
+  if (size)
+    switch (zfp_field_dimensionality(field)) {
+      case 4:
+        size[3] = field->nw;
+        /* FALLTHROUGH */
+      case 3:
+        size[2] = field->nz;
+        /* FALLTHROUGH */
+      case 2:
+        size[1] = field->ny;
+        /* FALLTHROUGH */
+      case 1:
+        size[0] = field->nx;
+        break;
+    }
+  return (size_t)MAX(field->nx, 1u) * (size_t)MAX(field->ny, 1u) * (size_t)MAX(field->nz, 1u) * (size_t)MAX(field->nw, 1u);
+}
+
+int
+zfp_field_stride(const zfp_field* field, int* stride)
+{
+  if (stride)
+    switch (zfp_field_dimensionality(field)) {
+      case 4:
+        stride[3] = field->sw ? field->sw : field->nx * field->ny * field->nz;
+        /* FALLTHROUGH */
+      case 3:
+        stride[2] = field->sz ? field->sz : field->nx * field->ny;
+        /* FALLTHROUGH */
+      case 2:
+        stride[1] = field->sy ? field->sy : field->nx;
+        /* FALLTHROUGH */
+      case 1:
+        stride[0] = field->sx ? field->sx : 1;
+        break;
+    }
+  return field->sx || field->sy || field->sz || field->sw;
+}
+
+uint64
+zfp_field_metadata(const zfp_field* field)
+{
+  uint64 meta = 0;
+  /* 48 bits for dimensions */
+  switch (zfp_field_dimensionality(field)) {
+    case 1:
+      meta <<= 48; meta += field->nx - 1;
+      break;
+    case 2:
+      meta <<= 24; meta += field->ny - 1;
+      meta <<= 24; meta += field->nx - 1;
+      break;
+    case 3:
+      meta <<= 16; meta += field->nz - 1;
+      meta <<= 16; meta += field->ny - 1;
+      meta <<= 16; meta += field->nx - 1;
+      break;
+    case 4:
+      meta <<= 12; meta += field->nw - 1;
+      meta <<= 12; meta += field->nz - 1;
+      meta <<= 12; meta += field->ny - 1;
+      meta <<= 12; meta += field->nx - 1;
+      break;
+  }
+  /* 2 bits for dimensionality (1D, 2D, 3D, 4D) */
+  meta <<= 2; meta += zfp_field_dimensionality(field) - 1;
+  /* 2 bits for scalar type */
+  meta <<= 2; meta += field->type - 1;
+  return meta;
+}
+
+void
+zfp_field_set_pointer(zfp_field* field, void* data)
+{
+  field->data = data;
+}
+
+zfp_type
+zfp_field_set_type(zfp_field* field, zfp_type type)
+{
+  switch (type) {
+    case zfp_type_int32:
+    case zfp_type_int64:
+    case zfp_type_float:
+    case zfp_type_double:
+      field->type = type;
+      return type;
+    default:
+      return zfp_type_none;
+  }
+}
+
+void
+zfp_field_set_size_1d(zfp_field* field, uint n)
+{
+  field->nx = n;
+  field->ny = 0;
+  field->nz = 0;
+  field->nw = 0;
+}
+
+void
+zfp_field_set_size_2d(zfp_field* field, uint nx, uint ny)
+{
+  field->nx = nx;
+  field->ny = ny;
+  field->nz = 0;
+  field->nw = 0;
+}
+
+void
+zfp_field_set_size_3d(zfp_field* field, uint nx, uint ny, uint nz)
+{
+  field->nx = nx;
+  field->ny = ny;
+  field->nz = nz;
+  field->nw = 0;
+}
+
+void
+zfp_field_set_size_4d(zfp_field* field, uint nx, uint ny, uint nz, uint nw)
+{
+  field->nx = nx;
+  field->ny = ny;
+  field->nz = nz;
+  field->nw = nw;
+}
+
+void
+zfp_field_set_stride_1d(zfp_field* field, int sx)
+{
+  field->sx = sx;
+  field->sy = 0;
+  field->sz = 0;
+  field->sw = 0;
+}
+
+void
+zfp_field_set_stride_2d(zfp_field* field, int sx, int sy)
+{
+  field->sx = sx;
+  field->sy = sy;
+  field->sz = 0;
+  field->sw = 0;
+}
+
+void
+zfp_field_set_stride_3d(zfp_field* field, int sx, int sy, int sz)
+{
+  field->sx = sx;
+  field->sy = sy;
+  field->sz = sz;
+  field->sw = 0;
+}
+
+void
+zfp_field_set_stride_4d(zfp_field* field, int sx, int sy, int sz, int sw)
+{
+  field->sx = sx;
+  field->sy = sy;
+  field->sz = sz;
+  field->sw = sw;
+}
+
+int
+zfp_field_set_metadata(zfp_field* field, uint64 meta)
+{
+  uint64 dims;
+  field->type = (zfp_type)((meta & 0x3u) + 1); meta >>= 2;
+  dims = (meta & 0x3u) + 1; meta >>= 2;
+  switch (dims) {
+    case 1:
+      /* currently dimensions are limited to 2^32 - 1 */
+      field->nx = (meta & UINT64C(0x0000ffffffff)) + 1; meta >>= 48;
+      field->ny = 0;
+      field->nz = 0;
+      field->nw = 0;
+      break;
+    case 2:
+      field->nx = (meta & UINT64C(0xffffff)) + 1; meta >>= 24;
+      field->ny = (meta & UINT64C(0xffffff)) + 1; meta >>= 24;
+      field->nz = 0;
+      field->nw = 0;
+      break;
+    case 3:
+      field->nx = (meta & UINT64C(0xffff)) + 1; meta >>= 16;
+      field->ny = (meta & UINT64C(0xffff)) + 1; meta >>= 16;
+      field->nz = (meta & UINT64C(0xffff)) + 1; meta >>= 16;
+      field->nw = 0;
+      break;
+    case 4:
+      field->nx = (meta & UINT64C(0xfff)) + 1; meta >>= 12;
+      field->ny = (meta & UINT64C(0xfff)) + 1; meta >>= 12;
+      field->nz = (meta & UINT64C(0xfff)) + 1; meta >>= 12;
+      field->nw = (meta & UINT64C(0xfff)) + 1; meta >>= 12;
+      break;
+  }
+  field->sx = field->sy = field->sz = field->sw = 0;
+  return 1;
+}
+
+/* public functions: zfp compressed stream --------------------------------- */
+
+zfp_stream*
+zfp_stream_open(bitstream* stream)
+{
+  zfp_stream* zfp = (zfp_stream*)malloc(sizeof(zfp_stream));
+  if (zfp) {
+    zfp->stream = stream;
+    zfp->minbits = ZFP_MIN_BITS;
+    zfp->maxbits = ZFP_MAX_BITS;
+    zfp->maxprec = ZFP_MAX_PREC;
+    zfp->minexp = ZFP_MIN_EXP;
+    zfp->exec.policy = zfp_exec_serial;
+  }
+  return zfp;
+}
+
+void
+zfp_stream_close(zfp_stream* zfp)
+{
+  free(zfp);
+}
+
+bitstream*
+zfp_stream_bit_stream(const zfp_stream* zfp)
+{
+  return zfp->stream;
+}
+
+zfp_mode
+zfp_stream_compression_mode(const zfp_stream* zfp)
+{
+  if (zfp->minbits > zfp->maxbits || !(0 < zfp->maxprec && zfp->maxprec <= 64))
+    return zfp_mode_null;
+
+  /* default values are considered expert mode */
+  if (zfp->minbits == ZFP_MIN_BITS &&
+      zfp->maxbits == ZFP_MAX_BITS &&
+      zfp->maxprec == ZFP_MAX_PREC &&
+      zfp->minexp == ZFP_MIN_EXP)
+    return zfp_mode_expert;
+
+  /* fixed rate? */
+  if (zfp->minbits == zfp->maxbits &&
+      1 <= zfp->maxbits && zfp->maxbits <= ZFP_MAX_BITS &&
+      zfp->maxprec >= ZFP_MAX_PREC &&
+      zfp->minexp <= ZFP_MIN_EXP)
+    return zfp_mode_fixed_rate;
+
+  /* fixed precision? */
+  if (zfp->minbits <= ZFP_MIN_BITS &&
+      zfp->maxbits >= ZFP_MAX_BITS &&
+      zfp->maxprec >= 1 &&
+      zfp->minexp <= ZFP_MIN_EXP)
+    return zfp_mode_fixed_precision;
+
+  /* fixed accuracy? */
+  if (zfp->minbits <= ZFP_MIN_BITS &&
+      zfp->maxbits >= ZFP_MAX_BITS &&
+      zfp->maxprec >= ZFP_MAX_PREC &&
+      ZFP_MIN_EXP <= zfp->minexp)
+    return zfp_mode_fixed_accuracy;
+
+  return zfp_mode_expert;
+}
+
+uint64
+zfp_stream_mode(const zfp_stream* zfp)
+{
+  uint64 mode = 0;
+  uint minbits;
+  uint maxbits;
+  uint maxprec;
+  uint minexp;
+
+  /* common configurations mapped to short representation */
+  switch(zfp_stream_compression_mode(zfp)) {
+    case zfp_mode_fixed_rate:
+      if (zfp->maxbits <= 2048)
+        /* maxbits is [1, 2048] */
+        /* returns [0, 2047] */
+        return (zfp->maxbits - 1);
+      else
+        break;
+
+    case zfp_mode_fixed_precision:
+      if (zfp->maxprec <= 128)
+        /* maxprec is [1, 128] */
+        /* returns [2048, 2175] */
+        return (zfp->maxprec - 1) + (2048);
+      else
+        break;
+
+    case zfp_mode_fixed_accuracy:
+      if (zfp->minexp <= 843)
+        /* minexp is [ZFP_MIN_EXP=-1074, 843] */
+        /* [2177, ZFP_MODE_SHORT_MAX=4094] */
+        /* +1 because skipped 2176 */
+        return (zfp->minexp - ZFP_MIN_EXP) + (2048 + 128 + 1);
+
+    default:
+      break;
+  }
+
+  /* encode each parameter separately */
+  minbits = MAX(1, MIN(zfp->minbits, 0x8000u)) - 1;
+  maxbits = MAX(1, MIN(zfp->maxbits, 0x8000u)) - 1;
+  maxprec = MAX(1, MIN(zfp->maxprec, 0x0080u)) - 1;
+  minexp = MAX(0, MIN(zfp->minexp + 16495, 0x7fff));
+  mode <<= 15; mode += minexp;
+  mode <<=  7; mode += maxprec;
+  mode <<= 15; mode += maxbits;
+  mode <<= 15; mode += minbits;
+  mode <<= 12; mode += 0xfffu;
+
+  return mode;
+}
+
+void
+zfp_stream_params(const zfp_stream* zfp, uint* minbits, uint* maxbits, uint* maxprec, int* minexp)
+{
+  if (minbits)
+    *minbits = zfp->minbits;
+  if (maxbits)
+    *maxbits = zfp->maxbits;
+  if (maxprec)
+    *maxprec = zfp->maxprec;
+  if (minexp)
+    *minexp = zfp->minexp;
+}
+
+size_t
+zfp_stream_compressed_size(const zfp_stream* zfp)
+{
+  return stream_size(zfp->stream);
+}
+
+size_t
+zfp_stream_maximum_size(const zfp_stream* zfp, const zfp_field* field)
+{
+  uint dims = zfp_field_dimensionality(field);
+  uint mx = (MAX(field->nx, 1u) + 3) / 4;
+  uint my = (MAX(field->ny, 1u) + 3) / 4;
+  uint mz = (MAX(field->nz, 1u) + 3) / 4;
+  uint mw = (MAX(field->nw, 1u) + 3) / 4;
+  size_t blocks = (size_t)mx * (size_t)my * (size_t)mz * (size_t)mw;
+  uint values = 1u << (2 * dims);
+  uint maxbits = 1;
+
+  if (!dims)
+    return 0;
+  switch (field->type) {
+    case zfp_type_none:
+      return 0;
+    case zfp_type_float:
+      maxbits += 8;
+      break;
+    case zfp_type_double:
+      maxbits += 11;
+      break;
+    default:
+      break;
+  }
+  maxbits += values - 1 + values * MIN(zfp->maxprec, type_precision(field->type));
+  maxbits = MIN(maxbits, zfp->maxbits);
+  maxbits = MAX(maxbits, zfp->minbits);
+  return ((ZFP_HEADER_MAX_BITS + blocks * maxbits + stream_word_bits - 1) & ~(stream_word_bits - 1)) / CHAR_BIT;
+}
+
+void
+zfp_stream_set_bit_stream(zfp_stream* zfp, bitstream* stream)
+{
+  zfp->stream = stream;
+}
+
+double
+zfp_stream_set_rate(zfp_stream* zfp, double rate, zfp_type type, uint dims, int wra)
+{
+  uint n = 1u << (2 * dims);
+  uint bits = (uint)floor(n * rate + 0.5);
+  switch (type) {
+    case zfp_type_float:
+      bits = MAX(bits, 1 + 8u);
+      break;
+    case zfp_type_double:
+      bits = MAX(bits, 1 + 11u);
+      break;
+    default:
+      break;
+  }
+  if (wra) {
+    /* for write random access, round up to next multiple of stream word size */
+    bits += (uint)stream_word_bits - 1;
+    bits &= ~(stream_word_bits - 1);
+  }
+  zfp->minbits = bits;
+  zfp->maxbits = bits;
+  zfp->maxprec = ZFP_MAX_PREC;
+  zfp->minexp = ZFP_MIN_EXP;
+  return (double)bits / n;
+}
+
+uint
+zfp_stream_set_precision(zfp_stream* zfp, uint precision)
+{
+  zfp->minbits = ZFP_MIN_BITS;
+  zfp->maxbits = ZFP_MAX_BITS;
+  zfp->maxprec = precision ? MIN(precision, ZFP_MAX_PREC) : ZFP_MAX_PREC;
+  zfp->minexp = ZFP_MIN_EXP;
+  return zfp->maxprec;
+}
+
+double
+zfp_stream_set_accuracy(zfp_stream* zfp, double tolerance)
+{
+  int emin = ZFP_MIN_EXP;
+  if (tolerance > 0) {
+    /* tolerance = x * 2^emin, with 0.5 <= x < 1 */
+    frexp(tolerance, &emin);
+    emin--;
+    /* assert: 2^emin <= tolerance < 2^(emin+1) */
+  }
+  zfp->minbits = ZFP_MIN_BITS;
+  zfp->maxbits = ZFP_MAX_BITS;
+  zfp->maxprec = ZFP_MAX_PREC;
+  zfp->minexp = emin;
+  return tolerance > 0 ? ldexp(1.0, emin) : 0;
+}
+
+zfp_mode
+zfp_stream_set_mode(zfp_stream* zfp, uint64 mode)
+{
+  uint minbits, maxbits, maxprec;
+  int minexp;
+
+  if (mode <= ZFP_MODE_SHORT_MAX) {
+    /* 12-bit (short) encoding of one of three modes */
+    if (mode < 2048) {
+      /* fixed rate */
+      minbits = maxbits = (uint)mode + 1;
+      maxprec = ZFP_MAX_PREC;
+      minexp = ZFP_MIN_EXP;
+    }
+    else if (mode < (2048 + 128)) {
+      /* fixed precision */
+      minbits = ZFP_MIN_BITS;
+      maxbits = ZFP_MAX_BITS;
+      maxprec = (uint)mode + 1 - (2048);
+      minexp = ZFP_MIN_EXP;
+    }
+    else {
+      /* fixed accuracy */
+      minbits = ZFP_MIN_BITS;
+      maxbits = ZFP_MAX_BITS;
+      maxprec = ZFP_MAX_PREC;
+      minexp = (uint)mode + ZFP_MIN_EXP - (2048 + 128 + 1);
+    }
+  }
+  else {
+    /* 64-bit encoding */
+    mode >>= 12; minbits = ((uint)mode & 0x7fffu) + 1;
+    mode >>= 15; maxbits = ((uint)mode & 0x7fffu) + 1;
+    mode >>= 15; maxprec = ((uint)mode & 0x007fu) + 1;
+    mode >>=  7; minexp  = ((uint)mode & 0x7fffu) - 16495;
+  }
+
+  if (!zfp_stream_set_params(zfp, minbits, maxbits, maxprec, minexp))
+    return zfp_mode_null;
+
+  return zfp_stream_compression_mode(zfp);
+}
+
+int
+zfp_stream_set_params(zfp_stream* zfp, uint minbits, uint maxbits, uint maxprec, int minexp)
+{
+  if (minbits > maxbits || !(0 < maxprec && maxprec <= 64))
+    return 0;
+  zfp->minbits = minbits;
+  zfp->maxbits = maxbits;
+  zfp->maxprec = maxprec;
+  zfp->minexp = minexp;
+  return 1;
+}
+
+size_t
+zfp_stream_flush(zfp_stream* zfp)
+{
+  return stream_flush(zfp->stream);
+}
+
+size_t
+zfp_stream_align(zfp_stream* zfp)
+{
+  return stream_align(zfp->stream);
+}
+
+void
+zfp_stream_rewind(zfp_stream* zfp)
+{
+  stream_rewind(zfp->stream);
+}
+
+/* public functions: execution policy -------------------------------------- */
+
+zfp_exec_policy
+zfp_stream_execution(const zfp_stream* zfp)
+{
+  return zfp->exec.policy;
+}
+
+uint
+zfp_stream_omp_threads(const zfp_stream* zfp)
+{
+  return zfp->exec.params.omp.threads;
+}
+
+uint
+zfp_stream_omp_chunk_size(const zfp_stream* zfp)
+{
+  return zfp->exec.params.omp.chunk_size;
+}
+
+int
+zfp_stream_set_execution(zfp_stream* zfp, zfp_exec_policy policy)
+{
+  switch (policy) {
+    case zfp_exec_serial:
+      break;
+#ifdef ZFP_WITH_CUDA
+    case zfp_exec_cuda:
+      break;
+#endif
+    case zfp_exec_omp:
+#ifdef _OPENMP
+      if (zfp->exec.policy != policy) {
+        zfp->exec.params.omp.threads = 0;
+        zfp->exec.params.omp.chunk_size = 0;
+      }
+      break;
+#else
+      return 0;
+#endif
+    default:
+      return 0;
+  }
+  zfp->exec.policy = policy;
+  return 1;
+}
+
+int
+zfp_stream_set_omp_threads(zfp_stream* zfp, uint threads)
+{
+  if (!zfp_stream_set_execution(zfp, zfp_exec_omp))
+    return 0;
+  zfp->exec.params.omp.threads = threads;
+  return 1;
+}
+
+int
+zfp_stream_set_omp_chunk_size(zfp_stream* zfp, uint chunk_size)
+{
+  if (!zfp_stream_set_execution(zfp, zfp_exec_omp))
+    return 0;
+  zfp->exec.params.omp.chunk_size = chunk_size;
+  return 1;
+}
+
+/* public functions: utility functions --------------------------------------*/
+
+void
+zfp_promote_int8_to_int32(int32* oblock, const int8* iblock, uint dims)
+{
+  uint count = 1u << (2 * dims);
+  while (count--)
+    *oblock++ = (int32)*iblock++ << 23;
+}
+
+void
+zfp_promote_uint8_to_int32(int32* oblock, const uint8* iblock, uint dims)
+{
+  uint count = 1u << (2 * dims);
+  while (count--)
+    *oblock++ = ((int32)*iblock++ - 0x80) << 23;
+}
+
+void
+zfp_promote_int16_to_int32(int32* oblock, const int16* iblock, uint dims)
+{
+  uint count = 1u << (2 * dims);
+  while (count--)
+    *oblock++ = (int32)*iblock++ << 15;
+}
+
+void
+zfp_promote_uint16_to_int32(int32* oblock, const uint16* iblock, uint dims)
+{
+  uint count = 1u << (2 * dims);
+  while (count--)
+    *oblock++ = ((int32)*iblock++ - 0x8000) << 15;
+}
+
+void
+zfp_demote_int32_to_int8(int8* oblock, const int32* iblock, uint dims)
+{
+  uint count = 1u << (2 * dims);
+  while (count--) {
+    int32 i = *iblock++ >> 23;
+    *oblock++ = (int8)MAX(-0x80, MIN(i, 0x7f));
+  }
+}
+
+void
+zfp_demote_int32_to_uint8(uint8* oblock, const int32* iblock, uint dims)
+{
+  uint count = 1u << (2 * dims);
+  while (count--) {
+    int32 i = (*iblock++ >> 23) + 0x80;
+    *oblock++ = (uint8)MAX(0x00, MIN(i, 0xff));
+  }
+}
+
+void
+zfp_demote_int32_to_int16(int16* oblock, const int32* iblock, uint dims)
+{
+  uint count = 1u << (2 * dims);
+  while (count--) {
+    int32 i = *iblock++ >> 15;
+    *oblock++ = (int16)MAX(-0x8000, MIN(i, 0x7fff));
+  }
+}
+
+void
+zfp_demote_int32_to_uint16(uint16* oblock, const int32* iblock, uint dims)
+{
+  uint count = 1u << (2 * dims);
+  while (count--) {
+    int32 i = (*iblock++ >> 15) + 0x8000;
+    *oblock++ = (uint16)MAX(0x0000, MIN(i, 0xffff));
+  }
+}
+
+/* public functions: compression and decompression --------------------------*/
+
+size_t
+zfp_compress(zfp_stream* zfp, const zfp_field* field)
+{
+  /* function table [execution][strided][dimensionality][scalar type] */
+  void (*ftable[3][2][4][4])(zfp_stream*, const zfp_field*) = {
+    /* serial */
+    {{{ compress_int32_1,         compress_int64_1,         compress_float_1,         compress_double_1 },
+      { compress_strided_int32_2, compress_strided_int64_2, compress_strided_float_2, compress_strided_double_2 },
+      { compress_strided_int32_3, compress_strided_int64_3, compress_strided_float_3, compress_strided_double_3 },
+      { compress_strided_int32_4, compress_strided_int64_4, compress_strided_float_4, compress_strided_double_4 }},
+     {{ compress_strided_int32_1, compress_strided_int64_1, compress_strided_float_1, compress_strided_double_1 },
+      { compress_strided_int32_2, compress_strided_int64_2, compress_strided_float_2, compress_strided_double_2 },
+      { compress_strided_int32_3, compress_strided_int64_3, compress_strided_float_3, compress_strided_double_3 },
+      { compress_strided_int32_4, compress_strided_int64_4, compress_strided_float_4, compress_strided_double_4 }}},
+
+    /* OpenMP */
+#ifdef _OPENMP
+    {{{ compress_omp_int32_1,         compress_omp_int64_1,         compress_omp_float_1,         compress_omp_double_1 },
+      { compress_strided_omp_int32_2, compress_strided_omp_int64_2, compress_strided_omp_float_2, compress_strided_omp_double_2 },
+      { compress_strided_omp_int32_3, compress_strided_omp_int64_3, compress_strided_omp_float_3, compress_strided_omp_double_3 },
+      { compress_strided_omp_int32_4, compress_strided_omp_int64_4, compress_strided_omp_float_4, compress_strided_omp_double_4 }},
+     {{ compress_strided_omp_int32_1, compress_strided_omp_int64_1, compress_strided_omp_float_1, compress_strided_omp_double_1 },
+      { compress_strided_omp_int32_2, compress_strided_omp_int64_2, compress_strided_omp_float_2, compress_strided_omp_double_2 },
+      { compress_strided_omp_int32_3, compress_strided_omp_int64_3, compress_strided_omp_float_3, compress_strided_omp_double_3 },
+      { compress_strided_omp_int32_4, compress_strided_omp_int64_4, compress_strided_omp_float_4, compress_strided_omp_double_4 }}},
+#else
+    {{{ NULL }}},
+#endif
+
+    /* CUDA */
+#ifdef ZFP_WITH_CUDA
+    {{{ compress_cuda_int32_1,         compress_cuda_int64_1,         compress_cuda_float_1,         compress_cuda_double_1 },
+      { compress_strided_cuda_int32_2, compress_strided_cuda_int64_2, compress_strided_cuda_float_2, compress_strided_cuda_double_2 },
+      { compress_strided_cuda_int32_3, compress_strided_cuda_int64_3, compress_strided_cuda_float_3, compress_strided_cuda_double_3 },
+      { NULL,                            NULL,                            NULL,                            NULL }},
+     {{ compress_strided_cuda_int32_1, compress_strided_cuda_int64_1, compress_strided_cuda_float_1, compress_strided_cuda_double_1 },
+      { compress_strided_cuda_int32_2, compress_strided_cuda_int64_2, compress_strided_cuda_float_2, compress_strided_cuda_double_2 },
+      { compress_strided_cuda_int32_3, compress_strided_cuda_int64_3, compress_strided_cuda_float_3, compress_strided_cuda_double_3 },
+      { NULL,                            NULL,                            NULL,                            NULL }}},
+#else
+    {{{ NULL }}},
+#endif
+  };
+  uint exec = zfp->exec.policy;
+  uint strided = zfp_field_stride(field, NULL);
+  uint dims = zfp_field_dimensionality(field);
+  uint type = field->type;
+
+  switch (type) {
+    case zfp_type_int32:
+    case zfp_type_int64:
+    case zfp_type_float:
+    case zfp_type_double:
+      break;
+    default:
+      return 0;
+  }
+
+  /* return 0 if compression mode is not supported */
+  void (*compress)(zfp_stream*, const zfp_field*) = ftable[exec][strided][dims - 1][type - zfp_type_int32];
+  if (!compress)
+    return 0;
+
+  /* compress field and align bit stream on word boundary */
+  compress(zfp, field);
+  stream_flush(zfp->stream);
+
+  return stream_size(zfp->stream);
+}
+
+size_t
+zfp_decompress(zfp_stream* zfp, zfp_field* field)
+{
+  /* function table [execution][strided][dimensionality][scalar type] */
+  void (*ftable[3][2][4][4])(zfp_stream*, zfp_field*) = {
+    /* serial */
+    {{{ decompress_int32_1,         decompress_int64_1,         decompress_float_1,         decompress_double_1 },
+      { decompress_strided_int32_2, decompress_strided_int64_2, decompress_strided_float_2, decompress_strided_double_2 },
+      { decompress_strided_int32_3, decompress_strided_int64_3, decompress_strided_float_3, decompress_strided_double_3 },
+      { decompress_strided_int32_4, decompress_strided_int64_4, decompress_strided_float_4, decompress_strided_double_4 }},
+     {{ decompress_strided_int32_1, decompress_strided_int64_1, decompress_strided_float_1, decompress_strided_double_1 },
+      { decompress_strided_int32_2, decompress_strided_int64_2, decompress_strided_float_2, decompress_strided_double_2 },
+      { decompress_strided_int32_3, decompress_strided_int64_3, decompress_strided_float_3, decompress_strided_double_3 },
+      { decompress_strided_int32_4, decompress_strided_int64_4, decompress_strided_float_4, decompress_strided_double_4 }}},
+
+    /* OpenMP; not yet supported */
+    {{{ NULL }}},
+
+    /* CUDA */
+#ifdef ZFP_WITH_CUDA
+    {{{ decompress_cuda_int32_1,         decompress_cuda_int64_1,         decompress_cuda_float_1,         decompress_cuda_double_1 },
+      { decompress_strided_cuda_int32_2, decompress_strided_cuda_int64_2, decompress_strided_cuda_float_2, decompress_strided_cuda_double_2 },
+      { decompress_strided_cuda_int32_3, decompress_strided_cuda_int64_3, decompress_strided_cuda_float_3, decompress_strided_cuda_double_3 },
+      { NULL,                            NULL,                            NULL,                            NULL }},
+     {{ decompress_strided_cuda_int32_1, decompress_strided_cuda_int64_1, decompress_strided_cuda_float_1, decompress_strided_cuda_double_1 },
+      { decompress_strided_cuda_int32_2, decompress_strided_cuda_int64_2, decompress_strided_cuda_float_2, decompress_strided_cuda_double_2 },
+      { decompress_strided_cuda_int32_3, decompress_strided_cuda_int64_3, decompress_strided_cuda_float_3, decompress_strided_cuda_double_3 },
+      { NULL,                            NULL,                            NULL,                            NULL }}},
+#else
+    {{{ NULL }}},
+#endif
+  };
+  uint exec = zfp->exec.policy;
+  uint strided = zfp_field_stride(field, NULL);
+  uint dims = zfp_field_dimensionality(field);
+  uint type = field->type;
+
+  switch (type) {
+    case zfp_type_int32:
+    case zfp_type_int64:
+    case zfp_type_float:
+    case zfp_type_double:
+      break;
+    default:
+      return 0;
+  }
+
+  /* return 0 if decompression mode is not supported */
+  void (*decompress)(zfp_stream*, zfp_field*) = ftable[exec][strided][dims - 1][type - zfp_type_int32];
+  if (!decompress)
+    return 0;
+
+  /* decompress field and align bit stream on word boundary */
+  decompress(zfp, field);
+  stream_align(zfp->stream);
+
+  return stream_size(zfp->stream);
+}
+
+size_t
+zfp_write_header(zfp_stream* zfp, const zfp_field* field, uint mask)
+{
+  size_t bits = 0;
+  /* 32-bit magic */
+  if (mask & ZFP_HEADER_MAGIC) {
+    stream_write_bits(zfp->stream, 'z', 8);
+    stream_write_bits(zfp->stream, 'f', 8);
+    stream_write_bits(zfp->stream, 'p', 8);
+    stream_write_bits(zfp->stream, zfp_codec_version, 8);
+    bits += ZFP_MAGIC_BITS;
+  }
+  /* 52-bit field metadata */
+  if (mask & ZFP_HEADER_META) {
+    uint64 meta = zfp_field_metadata(field);
+    stream_write_bits(zfp->stream, meta, ZFP_META_BITS);
+    bits += ZFP_META_BITS;
+  }
+  /* 12- or 64-bit compression parameters */
+  if (mask & ZFP_HEADER_MODE) {
+    uint64 mode = zfp_stream_mode(zfp);
+    uint size = mode > ZFP_MODE_SHORT_MAX ? ZFP_MODE_LONG_BITS : ZFP_MODE_SHORT_BITS;
+    stream_write_bits(zfp->stream, mode, size);
+    bits += size;
+  }
+  return bits;
+}
+
+size_t
+zfp_read_header(zfp_stream* zfp, zfp_field* field, uint mask)
+{
+  size_t bits = 0;
+  if (mask & ZFP_HEADER_MAGIC) {
+    if (stream_read_bits(zfp->stream, 8) != 'z' ||
+        stream_read_bits(zfp->stream, 8) != 'f' ||
+        stream_read_bits(zfp->stream, 8) != 'p' ||
+        stream_read_bits(zfp->stream, 8) != zfp_codec_version)
+      return 0;
+    bits += ZFP_MAGIC_BITS;
+  }
+  if (mask & ZFP_HEADER_META) {
+    uint64 meta = stream_read_bits(zfp->stream, ZFP_META_BITS);
+    if (!zfp_field_set_metadata(field, meta))
+      return 0;
+    bits += ZFP_META_BITS;
+  }
+  if (mask & ZFP_HEADER_MODE) {
+    uint64 mode = stream_read_bits(zfp->stream, ZFP_MODE_SHORT_BITS);
+    bits += ZFP_MODE_SHORT_BITS;
+    if (mode > ZFP_MODE_SHORT_MAX) {
+      uint size = ZFP_MODE_LONG_BITS - ZFP_MODE_SHORT_BITS;
+      mode += stream_read_bits(zfp->stream, size) << ZFP_MODE_SHORT_BITS;
+      bits += size;
+    }
+    if (zfp_stream_set_mode(zfp, mode) == zfp_mode_null)
+      return 0;
+  }
+  return bits;
+}
diff --git a/zfp/tests/CMakeLists.txt b/zfp/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2a44af0160ad3becccbd8f9e2930870c372ecabb
--- /dev/null
+++ b/zfp/tests/CMakeLists.txt
@@ -0,0 +1,21 @@
# regression test driver; exercises compression modes and compressed arrays
add_executable(testzfp testzfp.cpp)
target_link_libraries(testzfp zfp)
target_compile_definitions(testzfp PRIVATE ${zfp_defs})

# quick tests on 2^12-scalar arrays: one test per dimensionality (1d-4d)
# and scalar precision (fp32/fp64)
option(ZFP_BUILD_TESTING_SMALL "Enable small-sized array testing" ON)
if(ZFP_BUILD_TESTING_SMALL)
  foreach(D IN ITEMS 1 2 3 4)
    foreach(P IN ITEMS 32 64)
      add_test(NAME small-arrays-${D}d-fp${P} COMMAND testzfp small ${D}d fp${P})
    endforeach()
  endforeach()
endif()

# slower tests on 2^24-scalar arrays; disabled by default
option(ZFP_BUILD_TESTING_LARGE "Enable large-sized array testing" OFF)
if(ZFP_BUILD_TESTING_LARGE)
  foreach(D IN ITEMS 1 2 3 4)
    foreach(P IN ITEMS 32 64)
      add_test(NAME large-arrays-${D}d-fp${P} COMMAND testzfp large ${D}d fp${P})
    endforeach()
  endforeach()
endif()
diff --git a/zfp/tests/Makefile b/zfp/tests/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..2c496ee3513eb6dc3294065052e67c749affa2d6
--- /dev/null
+++ b/zfp/tests/Makefile
@@ -0,0 +1,16 @@
# Build the zfp regression test driver (testzfp).
include ../Config

BINDIR = ../bin
TARGETS = $(BINDIR)/testzfp
CXXLIBS = -L../lib -lzfp

# all, test, and clean are commands, not files
.PHONY: all test clean

all: $(TARGETS)

# depend on the built library so the test relinks when libzfp changes;
# create $(BINDIR) first (matches utils/Makefile, needed on a clean checkout)
$(BINDIR)/testzfp: testzfp.cpp ../lib/$(LIBZFP)
	mkdir -p $(BINDIR)
	$(CXX) $(CXXFLAGS) -I../array testzfp.cpp $(CXXLIBS) -o $@

test: $(BINDIR)/testzfp
	$(BINDIR)/testzfp

clean:
	rm -f $(TARGETS)
diff --git a/zfp/tests/testzfp.cpp b/zfp/tests/testzfp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..883e4d56a348cf405cfbe596f494526ffc5df59d
--- /dev/null
+++ b/zfp/tests/testzfp.cpp
@@ -0,0 +1,1017 @@
+#include <algorithm>
+#include <cmath>
+#include <ctime>
+#include <cstdio>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <numeric>
+#include <sstream>
+#include <string>
+#include "zfp.h"
+#include "zfparray1.h"
+#include "zfparray2.h"
+#include "zfparray3.h"
+
// selects total test-array size; the per-dimension edge length is the
// dims-th root of the scalar count
enum ArraySize {
  Small  = 0, // 2^12 = 4096 scalars (2^12 = (2^6)^2 = (2^4)^3 = (2^3)^4)
  Large  = 1  // 2^24 = 16 M scalars (2^24 = (2^12)^2 = (2^8)^3 = (2^6)^4)
};

// scalar precisions exercised by the tests
enum ScalarType {
  Float =  0, // 32-bit single precision
  Double = 1  // 64-bit double precision
};

static const int width = 72; // characters per line
+
+inline uint
+mask(uint i)
+{
+  return 1u << i;
+}
+
+inline uint
+test_size(ArraySize size)
+{
+  return 2u << size;
+}
+
// refine 1D array f[m] to g[2m]: even output samples copy the coarse value;
// odd samples interpolate with the periodic 4-tap stencil {-1, 9, 9, -1}/16
inline void
refine1d(int* g, const int* f, size_t m)
{
  const int w[4] = { -1, 9, 9, -1 };
  const size_t n = 2 * m;

  for (size_t x = 0; x < n; x++) {
    int sum;
    if (x & 1u) {
      // odd: weighted sum of four periodically wrapped coarse neighbors
      sum = 0;
      for (int i = 0; i < 4; i++)
        sum += w[i] * f[(x / 2 + i - 1 + m) % m];
    }
    else {
      // even: stencil collapses onto one sample (weights sum to 16)
      sum = 16 * f[x / 2];
    }
    g[x] = sum / 16;
  }
}
+
// refine 2D array f[m][m] to g[2m][2m] using a separable 4-tap stencil
// {-1, 9, 9, -1}/16 per axis with periodic wrap; even coordinates reduce
// to the coarse sample along that axis
inline void
refine2d(int* g, const int* f, size_t m)
{
  const int w[4] = { -1, 9, 9, -1 };
  const size_t n = 2 * m;

  for (size_t y = 0; y < n; y++)
    for (size_t x = 0; x < n; x++) {
      int sum = 0;
      for (int j = 0; j < 4; j++) {
        // coarse row index: wrapped neighbor for odd y, direct copy for even
        const size_t sy = y & 1u ? (y / 2 + j - 1 + m) % m : y / 2;
        const int* row = f + m * sy;
        for (int i = 0; i < 4; i++) {
          const size_t sx = x & 1u ? (x / 2 + i - 1 + m) % m : x / 2;
          sum += w[i] * w[j] * row[sx];
        }
      }
      // normalize by 16 per dimension
      g[x + n * y] = sum / (16 * 16);
    }
}
+
// refine 3D array f[m][m][m] to g[2m][2m][2m]
// Separable upsampling: along each axis, an odd output coordinate applies
// the periodic 4-tap stencil {-1, 9, 9, -1}/16 to the coarse grid, while an
// even coordinate collapses to the coarse sample (the weights sum to 16).
inline void
refine3d(int* g, const int* f, size_t m)
{
  const int weight[4] = { -1, 9, 9, -1 };
  const size_t n = 2 * m;

  for (size_t z = 0; z < n; z++)
    for (size_t y = 0; y < n; y++)
      for (size_t x = 0; x < n; x++) {
        int s = 0;
        for (int k = 0; k < 4; k++) {
          // coarse source index per axis, wrapped periodically for odd outputs
          size_t zz = z & 1u ? (z / 2 + k - 1 + m) % m : z / 2;
          for (int j = 0; j < 4; j++) {
            size_t yy = y & 1u ? (y / 2 + j - 1 + m) % m : y / 2;
            for (int i = 0; i < 4; i++) {
              size_t xx = x & 1u ? (x / 2 + i - 1 + m) % m : x / 2;
              s += weight[i] * weight[j] * weight[k] * f[xx + m * (yy + m * zz)];
            }
          }
        }
        // normalize by 16 per dimension
        g[x + n * (y + n * z)] = s / (16 * 16 * 16);
      }
}
+
// refine 4D array f[m][m][m][m] to g[2m][2m][2m][2m]
// 4D analog of refine1d/2d/3d: separable periodic 4-tap interpolation
// ({-1, 9, 9, -1}/16 per axis) for odd coordinates, direct copy for even.
inline void
refine4d(int* g, const int* f, size_t m)
{
  const int weight[4] = { -1, 9, 9, -1 };
  const size_t n = 2 * m;

  for (size_t w = 0; w < n; w++)
    for (size_t z = 0; z < n; z++)
      for (size_t y = 0; y < n; y++)
        for (size_t x = 0; x < n; x++) {
          int s = 0;
          for (int l = 0; l < 4; l++) {
            // coarse source index per axis, wrapped for odd outputs
            size_t ww = w & 1u ? (w / 2 + l - 1 + m) % m : w / 2;
            for (int k = 0; k < 4; k++) {
              size_t zz = z & 1u ? (z / 2 + k - 1 + m) % m : z / 2;
              for (int j = 0; j < 4; j++) {
                size_t yy = y & 1u ? (y / 2 + j - 1 + m) % m : y / 2;
                for (int i = 0; i < 4; i++) {
                  size_t xx = x & 1u ? (x / 2 + i - 1 + m) % m : x / 2;
                  s += weight[i] * weight[j] * weight[k] * weight[l] * f[xx + m * (yy + m * (zz + m * ww))];
                }
              }
            }
          }
          // normalize by 16 per dimension
          g[x + n * (y + n * (z + n * w))] = s / (16 * 16 * 16 * 16);
        }
}
+
// convert fixed-point ints to reals by scaling with 2^-12
template <typename real>
inline void
convert_ints_to_reals(real* data, const int* f, size_t n)
{
  for (size_t k = 0; k < n; k++) {
    const real value = real(f[k]);
    data[k] = std::ldexp(value, -12);
  }
}
+
+// generate 1D test array of size n
+template <typename real>
+inline bool
+gen_array_1d(real* data, size_t n)
+{
+  // ensure n >= 4 is a power of two
+  if (n < 4 || n & (n - 1))
+    return false;
+
+  // initialize 4-element integer array
+  int* f = new int[n];
+  std::fill(f, f + 4, 0);
+  for (uint x = 1; x < 3; x++)
+    f[x] = 0x10000 * (1 - 2 * int(x & 1u));
+
+  // refine to n-element array
+  int* g = new int[n];
+  for (size_t m = 4; m < n; m *= 2) {
+    refine1d(g, f, m);
+    std::swap(f, g);
+  }
+  delete[] g;
+
+  // convert ints to real type
+  convert_ints_to_reals(data, f, n);
+  delete[] f;
+
+  return true;
+}
+
// generate 2D test array of size n^2; returns false unless n >= 4 is a
// power of two.  Seeds a 4x4 checkerboard of +/-2^16 in the interior,
// refines it to n^2 samples, and rescales to the real type.
template <typename real>
inline bool
gen_array_2d(real* data, size_t n)
{
  // ensure n >= 4 is a power of two
  if (n < 4 || n & (n - 1))
    return false;

  // initialize 4x4 integer array (checkerboard sign from parity of x ^ y)
  int* f = new int[n * n];
  std::fill(f, f + 4 * 4, 0);
  for (uint y = 1; y < 3; y++)
    for (uint x = 1; x < 3; x++)
      f[x + 4 * y] = 0x10000 * (1 - 2 * int((x ^ y) & 1u));

  // refine to n^2 array, ping-ponging between the two buffers
  int* g = new int[n * n];
  for (size_t m = 4; m < n; m *= 2) {
    refine2d(g, f, m);
    std::swap(f, g);
  }
  delete[] g;

  // convert ints to real type
  convert_ints_to_reals(data, f, n * n);
  delete[] f;

  return true;
}
+
// generate 3D test array of size n^3; returns false unless n >= 4 is a
// power of two.  Seeds a 4x4x4 checkerboard of +/-2^16 in the interior,
// refines it to n^3 samples, and rescales to the real type.
template <typename real>
inline bool
gen_array_3d(real* data, size_t n)
{
  // ensure n >= 4 is a power of two
  if (n < 4 || n & (n - 1))
    return false;

  // initialize 4x4x4 integer array
  // (note: "<= 2u" bounds are equivalent to the "< 3" used by the 1d/2d/4d
  // generators -- loops run over x, y, z in {1, 2})
  int* f = new int[n * n * n];
  std::fill(f, f + 4 * 4 * 4, 0);
  for (uint z = 1; z <= 2u; z++)
    for (uint y = 1; y <= 2u; y++)
      for (uint x = 1; x <= 2u; x++)
        f[x + 4 * (y + 4 * z)] = 0x10000 * (1 - 2 * int((x ^ y ^ z) & 1u));

  // refine to n^3 array, ping-ponging between the two buffers
  int* g = new int[n * n * n];
  for (size_t m = 4; m < n; m *= 2) {
    refine3d(g, f, m);
    std::swap(f, g);
  }
  delete[] g;

  // convert ints to real type
  convert_ints_to_reals(data, f, n * n * n);
  delete[] f;

  return true;
}
+
// generate 4D test array of size n^4; returns false unless n >= 4 is a
// power of two.  Seeds a 4x4x4x4 checkerboard of +/-2^16 in the interior,
// refines it to n^4 samples, and rescales to the real type.
template <typename real>
inline bool
gen_array_4d(real* data, size_t n)
{
  // ensure n >= 4 is a power of two
  if (n < 4 || n & (n - 1))
    return false;

  // initialize 4x4x4x4 integer array (checkerboard sign from x ^ y ^ z ^ w)
  int* f = new int[n * n * n * n];
  std::fill(f, f + 4 * 4 * 4 * 4, 0);
  for (uint w = 1; w < 3; w++)
    for (uint z = 1; z < 3; z++)
      for (uint y = 1; y < 3; y++)
        for (uint x = 1; x < 3; x++)
          f[x + 4 * (y + 4 * (z + 4 * w))] = 0x10000 * (1 - 2 * int((x ^ y ^ z ^ w) & 1u));

  // refine to n^4 array, ping-ponging between the two buffers
  int* g = new int[n * n * n * n];
  for (size_t m = 4; m < n; m *= 2) {
    refine4d(g, f, m);
    std::swap(f, g);
  }
  delete[] g;

  // convert ints to real type
  convert_ints_to_reals(data, f, n * n * n * n);
  delete[] f;

  return true;
}
+
+// initialize array
+template <typename Scalar>
+inline void
+initialize(Scalar* p, uint dims, ArraySize array_size)
+{
+  size_t size = 1ul << ((array_size == Small ? 12 : 24) / dims);
+
+  switch (dims) {
+    default:
+    case 1:
+      gen_array_1d<Scalar>(p, size);
+      break;
+    case 2:
+      gen_array_2d<Scalar>(p, size);
+      break;
+    case 3:
+      gen_array_3d<Scalar>(p, size);
+      break;
+    case 4:
+      gen_array_4d<Scalar>(p, size);
+      break;
+  }
+}
+
+// compute checksum
+inline uint32
+hash(const void* p, size_t n)
+{
+  uint32 h = 0;
+  for (const uchar* q = static_cast<const uchar*>(p); n; q++, n--) {
+    // Jenkins one-at-a-time hash; see http://www.burtleburtle.net/bob/hash/doobs.html
+    h += *q;
+    h += h << 10;
+    h ^= h >>  6;
+  }
+  h += h <<  3;
+  h ^= h >> 11;
+  h += h << 15;
+  return h;
+}
+
// test fixed-rate mode
// Compresses `input` at the requested rate and checks that the compressed
// size matches the rate exactly, then decompresses and checks that the max
// pointwise error stays within `tolerance`.  Prints one OK/FAIL line per
// sub-test; returns the number of failed sub-tests (0-2).
template <typename Scalar>
inline uint
test_rate(zfp_stream* stream, const zfp_field* input, double rate, Scalar tolerance, bool timings = false)
{
  uint failures = 0;
  size_t n = zfp_field_size(input, NULL);
  uint dims = zfp_field_dimensionality(input);
  zfp_type type = zfp_field_type(input);

  // allocate memory for compressed data
  // (set_rate returns the rate actually granted, which is checked below)
  rate = zfp_stream_set_rate(stream, rate, type, dims, 0);
  size_t bufsize = zfp_stream_maximum_size(stream, input);
  uchar* buffer = new uchar[bufsize];
  bitstream* s = stream_open(buffer, bufsize);
  zfp_stream_set_bit_stream(stream, s);

  // perform compression test
  std::ostringstream status;
  status << "  compress:  ";
  status << " rate=" << std::fixed << std::setprecision(0) << std::setw(2) << rate;
  clock_t c = clock();
  zfp_stream_rewind(stream);
  size_t outsize = zfp_compress(stream, input);
  double time = double(clock() - c) / CLOCKS_PER_SEC;
  // NOTE(review): if compression finishes within one clock tick, time == 0
  // and throughput becomes inf; only printed when timings is set
  double throughput = (n * sizeof(Scalar)) / (0x100000 * time);
  if (timings)
    status << " throughput=" << std::setprecision(1) << std::setw(6) << throughput << " MB/s";
  bool pass = true;
  // make sure compressed size matches rate
  size_t bytes = (size_t)floor(rate * zfp_field_size(input, NULL) / CHAR_BIT + 0.5);
  if (outsize != bytes) {
    status << " [" << outsize << " != " << bytes << "]";
    pass = false;
  }
  std::cout << std::setw(width) << std::left << status.str() << (pass ? " OK " : "FAIL") << std::endl;
  if (!pass)
    failures++;

  // perform decompression test
  status.str("");
  status << "  decompress:";
  status << " rate=" << std::fixed << std::setprecision(0) << std::setw(2) << rate;
  // decompress into a separate buffer sharing the input's field layout
  Scalar* g = new Scalar[n];
  zfp_field* output = zfp_field_alloc();
  *output = *input;
  zfp_field_set_pointer(output, g);
  c = clock();
  zfp_stream_rewind(stream);
  pass = !!zfp_decompress(stream, output);
  if (!pass)
    status << " [decompression failed]";
  else {
    double time = double(clock() - c) / CLOCKS_PER_SEC;
    double throughput = (n * sizeof(Scalar)) / (0x100000 * time);
    if (timings)
      status << " throughput=" << std::setprecision(1) << std::setw(6) << throughput << " MB/s";
    // compute max error
    Scalar* f = static_cast<Scalar*>(zfp_field_pointer(input));
    Scalar emax = 0;
    for (uint i = 0; i < n; i++)
      emax = std::max(emax, std::abs(f[i] - g[i]));
    status << std::scientific;
    status.precision(3);
    // make sure max error is within tolerance
    if (emax <= tolerance)
      status << " " << emax << " <= " << tolerance;
    else {
      status << " [" << emax << " > " << tolerance << "]";
      pass = false;
    }
  }
  zfp_field_free(output);
  delete[] g;
  stream_close(s);
  delete[] buffer;
  std::cout << std::setw(width) << std::left << status.str() << (pass ? " OK " : "FAIL") << std::endl;
  if (!pass)
    failures++;

  return failures;
}
+
// test fixed-precision mode
// Compresses `input` with the given precision and checks the compressed
// size against the expected `bytes`, then verifies decompression succeeds
// (no error-bound check in this mode).  Prints one OK/FAIL line per
// sub-test; returns the number of failed sub-tests (0-2).
template <typename Scalar>
inline uint
test_precision(zfp_stream* stream, const zfp_field* input, uint precision, size_t bytes)
{
  uint failures = 0;
  size_t n = zfp_field_size(input, NULL);

  // allocate memory for compressed data
  zfp_stream_set_precision(stream, precision);
  size_t bufsize = zfp_stream_maximum_size(stream, input);
  uchar* buffer = new uchar[bufsize];
  bitstream* s = stream_open(buffer, bufsize);
  zfp_stream_set_bit_stream(stream, s);

  // perform compression test
  std::ostringstream status;
  status << "  compress:  ";
  status << " precision=" << std::setw(2) << precision;
  zfp_stream_rewind(stream);
  size_t outsize = zfp_compress(stream, input);
  double ratio = double(n * sizeof(Scalar)) / outsize;
  status << " ratio=" << std::fixed << std::setprecision(3) << std::setw(7) << ratio;
  bool pass = true;
  // make sure compressed size agrees (acts as a regression checksum)
  if (outsize != bytes) {
    status << " [" << outsize << " != " << bytes << "]";
    pass = false;
  }
  std::cout << std::setw(width) << std::left << status.str() << (pass ? " OK " : "FAIL") << std::endl;
  if (!pass)
    failures++;

  // perform decompression test (success only; no error bound applies)
  status.str("");
  status << "  decompress:";
  status << " precision=" << std::setw(2) << precision;
  Scalar* g = new Scalar[n];
  zfp_field* output = zfp_field_alloc();
  *output = *input;
  zfp_field_set_pointer(output, g);
  zfp_stream_rewind(stream);
  pass = !!zfp_decompress(stream, output);
  if (!pass)
    status << " [decompression failed]";
  zfp_field_free(output);
  delete[] g;
  stream_close(s);
  delete[] buffer;
  std::cout << std::setw(width) << std::left << status.str() << (pass ? " OK " : "FAIL") << std::endl;
  if (!pass)
    failures++;

  return failures;
}
+
// test fixed-accuracy mode
// Compresses `input` with the given absolute error tolerance and checks the
// compressed size against `bytes`, then decompresses and verifies the max
// error honors the tolerance.  A zero tolerance (near-lossless request) is
// reported but not failed when emax > 0.  Returns failed sub-tests (0-2).
template <typename Scalar>
inline uint
test_accuracy(zfp_stream* stream, const zfp_field* input, Scalar tolerance, size_t bytes)
{
  uint failures = 0;
  size_t n = zfp_field_size(input, NULL);

  // allocate memory for compressed data
  // (set_accuracy returns the tolerance actually granted)
  tolerance = static_cast<Scalar>(zfp_stream_set_accuracy(stream, tolerance));
  size_t bufsize = zfp_stream_maximum_size(stream, input);
  uchar* buffer = new uchar[bufsize];
  bitstream* s = stream_open(buffer, bufsize);
  zfp_stream_set_bit_stream(stream, s);

  // perform compression test
  std::ostringstream status;
  status << "  compress:  ";
  status << " tolerance=" << std::scientific << std::setprecision(3) << tolerance;
  zfp_stream_rewind(stream);
  size_t outsize = zfp_compress(stream, input);
  double ratio = double(n * sizeof(Scalar)) / outsize;
  status << " ratio=" << std::fixed << std::setprecision(3) << std::setw(7) << ratio;
  bool pass = true;
  // make sure compressed size agrees (acts as a regression checksum)
  if (outsize != bytes) {
    status << " [" << outsize << " != " << bytes << "]";
    pass = false;
  }
  std::cout << std::setw(width) << std::left << status.str() << (pass ? " OK " : "FAIL") << std::endl;
  if (!pass)
    failures++;

  // perform decompression test
  status.str("");
  status << "  decompress:";
  status << " tolerance=" << std::scientific << std::setprecision(3) << tolerance;
  Scalar* g = new Scalar[n];
  zfp_field* output = zfp_field_alloc();
  *output = *input;
  zfp_field_set_pointer(output, g);
  zfp_stream_rewind(stream);
  pass = !!zfp_decompress(stream, output);
  if (!pass)
    status << " [decompression failed]";
  else {
    // compute max error
    Scalar* f = static_cast<Scalar*>(zfp_field_pointer(input));
    Scalar emax = 0;
    for (uint i = 0; i < n; i++)
      emax = std::max(emax, std::abs(f[i] - g[i]));
    status << std::scientific << std::setprecision(3) << " ";
    // make sure max error is within tolerance
    if (emax <= tolerance)
      status << emax << " <= " << tolerance;
    else if (tolerance == 0)
      // zero tolerance is a best-effort request; report, don't fail
      status << "(" << emax << " > 0)";
    else {
      status << "[" << emax << " > " << tolerance << "]";
      pass = false;
    }
  }
  zfp_field_free(output);
  delete[] g;
  stream_close(s);
  delete[] buffer;
  std::cout << std::setw(width) << std::left << status.str() << (pass ? " OK " : "FAIL") << std::endl;
  if (!pass)
    failures++;

  return failures;
}
+
// perform 1D differencing
// Forward-differences the compressed array in place, then reduces the max
// of the differenced values into a(0); exercises read-write access through
// the compressed-array operator().
template <typename Scalar>
inline void
update_array1(zfp::array1<Scalar>& a)
{
  // a(i) <- a(i) - a(i + 1)
  for (uint i = 0; i < a.size() - 1; i++)
    a(i) -= a(i + 1);
  // fold the max into a(0); the i == 0 iteration is a harmless no-op
  for (uint i = 0; i < a.size() - 1; i++)
    a(0) = std::max(a(0), a(i));
}
+
// perform 2D differencing
// Forward-differences the compressed array along x then y, then reduces
// the max of the interior values into a(0, 0).
template <typename Scalar>
inline void
update_array2(zfp::array2<Scalar>& a)
{
  // difference along x
  for (uint j = 0; j < a.size_y(); j++)
    for (uint i = 0; i < a.size_x() - 1; i++)
      a(i, j) -= a(i + 1, j);
  // difference along y
  for (uint j = 0; j < a.size_y() - 1; j++)
    for (uint i = 0; i < a.size_x(); i++)
      a(i, j) -= a(i, j + 1);
  // fold the max into a(0, 0)
  for (uint j = 0; j < a.size_y() - 1; j++)
    for (uint i = 0; i < a.size_x() - 1; i++)
      a(0, 0) = std::max(a(0, 0), a(i, j));
}
+
// perform 3D differencing
// Forward-differences the compressed array along x, y, then z, then
// reduces the max of the interior values into a(0, 0, 0).
template <typename Scalar>
inline void
update_array3(zfp::array3<Scalar>& a)
{
  // difference along x
  for (uint k = 0; k < a.size_z(); k++)
    for (uint j = 0; j < a.size_y(); j++)
      for (uint i = 0; i < a.size_x() - 1; i++)
        a(i, j, k) -= a(i + 1, j, k);
  // difference along y
  for (uint k = 0; k < a.size_z(); k++)
    for (uint j = 0; j < a.size_y() - 1; j++)
      for (uint i = 0; i < a.size_x(); i++)
        a(i, j, k) -= a(i, j + 1, k);
  // difference along z
  for (uint k = 0; k < a.size_z() - 1; k++)
    for (uint j = 0; j < a.size_y(); j++)
      for (uint i = 0; i < a.size_x(); i++)
        a(i, j, k) -= a(i, j, k + 1);
  // fold the max into a(0, 0, 0)
  for (uint k = 0; k < a.size_z() - 1; k++)
    for (uint j = 0; j < a.size_y() - 1; j++)
      for (uint i = 0; i < a.size_x() - 1; i++)
        a(0, 0, 0) = std::max(a(0, 0, 0), a(i, j, k));
}
+
// dispatch update_array<Array> to the dimension-specific updater; explicit
// specializations cover each supported array class and scalar type
template <class Array>
inline void update_array(Array& a);

template <>
inline void
update_array(zfp::array1<float>& a) { update_array1(a); }

template <>
inline void
update_array(zfp::array1<double>& a) { update_array1(a); }

template <>
inline void
update_array(zfp::array2<float>& a) { update_array2(a); }

template <>
inline void
update_array(zfp::array2<double>& a) { update_array2(a); }

template <>
inline void
update_array(zfp::array3<float>& a) { update_array3(a); }

template <>
inline void
update_array(zfp::array3<double>& a) { update_array3(a); }
+
// test random-accessible array primitive
// First verifies that the compressed array `a`, constructed from `f`,
// matches the source within `tolerance`; then applies update_array and
// checks the reduced max (left in a[0]) against the expected `dfmax`
// within 0.1% relative error.  Returns failed sub-tests (0-2).
template <class Array, typename Scalar>
inline uint
test_array(Array& a, const Scalar* f, uint n, double tolerance, double dfmax)
{
  uint failures = 0;

  // test construction
  std::ostringstream status;
  status << "  construct: ";
  Scalar emax = 0;
  for (uint i = 0; i < n; i++)
    emax = std::max(emax, std::abs(f[i] - a[i]));
  status << std::scientific;
  status.precision(3);
  // make sure max error is within tolerance
  bool pass = true;
  if (emax <= tolerance)
    status << " " << emax << " <= " << tolerance;
  else {
    status << " [" << emax << " > " << tolerance << "]";
    pass = false;
  }

  std::cout << std::setw(width) << std::left << status.str() << (pass ? " OK " : "FAIL") << std::endl;
  if (!pass)
    failures++;

  // test array updates (read-modify-write through the array proxy)
  status.str("");
  status << "  update:    ";
  update_array(a);
  // update_array leaves its max-reduction result in element 0
  Scalar amax = a[0];
  pass = true;
  if (std::abs(amax - dfmax) <= 1e-3 * dfmax)
    status << " " << amax << " ~ " << dfmax;
  else {
    status << " [" << amax << " != " << dfmax << "]";
    pass = false;
  }

  std::cout << std::setw(width) << std::left << status.str() << (pass ? " OK " : "FAIL") << std::endl;
  if (!pass)
    failures++;

  return failures;
}
+
// test small or large d-dimensional arrays of type Scalar
// Top-level regression driver for one (dims, size, scalar-type) combination:
// builds the test field, verifies its checksum, then runs fixed-rate,
// fixed-precision, fixed-accuracy, and compressed-array tests against
// hard-coded expected values.  Returns the total number of failures.
template <typename Scalar>
inline uint
test(uint dims, ArraySize array_size)
{
  uint failures = 0;
  uint m = test_size(array_size);
  // total scalar count n = m^12: 2^12 (Small) or 2^24 (Large)
  uint n = m * m * m * m * m * m * m * m * m * m * m * m;
  Scalar* f = new Scalar[n];

  // determine array size (edge lengths chosen so the product equals n)
  uint nx, ny, nz ,nw;
  zfp_field* field = zfp_field_alloc();
  zfp_field_set_type(field, zfp::codec<Scalar>::type);
  zfp_field_set_pointer(field, f);
  switch (dims) {
    case 1:
      nx = n;
      ny = nz = nw = 0;
      zfp_field_set_size_1d(field, nx);
      break;
    case 2:
      nx = ny = m * m * m * m * m * m;
      nz = nw = 0;
      zfp_field_set_size_2d(field, nx, ny);
      break;
    case 3:
      nx = ny = nz = m * m * m * m;
      nw = 0;
      zfp_field_set_size_3d(field, nx, ny, nz);
      break;
    case 4:
      nx = ny = nz = nw = m * m * m;
      zfp_field_set_size_4d(field, nx, ny, nz, nw);
      break;
    default:
      // NOTE(review): this early return leaks f and field; harmless for a
      // test driver but worth tidying
      std::cout << "invalid dimensions " << dims << std::endl;
      return 1;
  }
  initialize<Scalar>(f, dims, array_size);
  // t selects the scalar-type row in the expected-value tables below
  uint t = (zfp_field_type(field) == zfp_type_float ? 0 : 1);
  std::cout << "testing " << dims << "D array of " << (t == 0 ? "floats" : "doubles") << std::endl;

  // test data integrity: the generated field must match these reference
  // checksums or all expected values below are meaningless
  uint32 checksum[2][2][4] = {
    // small
    {{ 0x54174c44u, 0x86609589u, 0xfc0a6a76u, 0xa3481e00u },
     { 0x7d257bb6u, 0x294bb210u, 0x68614d26u, 0xf6bd3a21u }},
    // large
    {{ 0xd1ce1aceu, 0x644274dau, 0xc0ad63fau, 0x700de480u },
     { 0xc3ed7116u, 0x644e2117u, 0xd7464b07u, 0x2516382eu }},
  };
  uint32 h = hash(f, n * sizeof(Scalar));
  if (h != checksum[array_size][t][dims - 1])
    std::cout << "warning: test data checksum " << std::hex << h << " != " << checksum[array_size][t][dims - 1] << "; tests below may fail" << std::endl;

  // open compressed stream
  zfp_stream* stream = zfp_stream_open(0);

  // test fixed rate: rate starts at 2 (float) or 1 (double) bits/scalar and
  // quadruples up to 32 (float) or 64 (double)
  for (uint rate = 2u >> t, i = 0; rate <= 32 * (t + 1); rate *= 4, i++) {
    // expected max errors, indexed [size][type][dims - 1][rate step]
    double emax[2][2][4][4] = {
      // small
      {
        {
          {1.627e+01, 8.277e-02, 0.000e+00},
          {1.500e+00, 3.663e-03, 0.000e+00},
          {1.500e+00, 9.583e-03, 0.000e+00},
          {1.373e+01, 6.633e-01, 0.000e+00},
        },
        {
          {1.627e+01, 1.601e+01, 1.832e-04, 0.000e+00},
          {2.376e+01, 1.797e-01, 8.584e-06, 0.000e+00},
          {5.210e+00, 2.002e-01, 3.338e-05, 0.000e+00},
          {1.016e+01, 8.985e+00, 3.312e-03, 0.000e+00},
        },
      },
      // large
      {
        {
          {1.627e+01, 2.100e-02, 0.000e+00},
          {1.624e-01, 7.439e-05, 0.000e+00},
          {1.001e-02, 7.248e-05, 0.000e+00},
          {2.527e-02, 2.460e-04, 0.000e+00},
        },
        {
          {1.627e+01, 1.601e+01, 2.289e-05, 0.000e+00},
          {1.607e+01, 2.076e-03, 0.000e+00, 0.000e+00},
          {1.407e-01, 7.344e-04, 0.000e+00, 0.000e+00},
          {1.436e-01, 2.659e-03, 8.801e-08, 0.000e+00},
        }
      }
    };
    failures += test_rate<Scalar>(stream, field, rate, static_cast<Scalar>(emax[array_size][t][dims - 1][i]), array_size == Large);
  }

  if (stream_word_bits != 64)
    std::cout << "warning: stream word size is smaller than 64; tests below may fail" << std::endl;

  // test fixed precision: starts at 4 (float) or 8 (double) bit planes and
  // doubles for three steps
  for (uint prec = 4u << t, i = 0; i < 3; prec *= 2, i++) {
    // expected compressed sizes, indexed [size][type][dims - 1][step]
    size_t bytes[2][2][4][3] = {
      // small
      {
        {
          {2192, 3280, 6328},
          { 592, 1328, 4384},
          { 152, 1040, 4600},
          {  64, 1760, 5856},
        },
        {
          {3664, 6712, 14104},
          {1424, 4480, 12616},
          {1064, 4624, 12808},
          {1768, 5864, 14056},
        },
      },
      // large
      {
        {
          {8965672, 13160560, 21835352},
          {2235560,  3512848, 10309240},
          { 568456,  1361056,  8759696},
          { 134344,   739632,  8896360},
        },
        {
          {14733112, 23407904, 44997832},
          { 3905240, 10701640, 40856544},
          { 1458368,  8857008, 41270184},
          {  763928,  8920656, 41574712},
        },
      }
    };
    failures += test_precision<Scalar>(stream, field, prec, bytes[array_size][t][dims - 1][i]);
  }

  // test fixed accuracy with three tolerances: 1e-3, near machine epsilon,
  // and 0 (best effort)
  for (uint i = 0; i < 3; i++) {
    Scalar tol[] = { Scalar(1e-3), 2 * std::numeric_limits<Scalar>::epsilon(), 0 };
    // expected compressed sizes, indexed [size][type][dims - 1][tolerance]
    size_t bytes[2][2][4][3] = {
      // small
      {
        {
          {6328, 11944, 13720},
          {4936, 11064, 12520},
          {6104, 11752, 12784},
          {9440, 14048, 14048},
        },
        {
          {6712, 25888, 29064},
          {5032, 26016, 28984},
          {6128, 27120, 29192},
          {9448, 30440, 30440},
        },
      },
      // large
      {
        {
          {21815976, 38285256, 43425280},
          { 9187232, 32695984, 40464144},
          { 8914336, 33364208, 41172864},
          {12109200, 35921784, 41550416},
        },
        {
          {23388528, 79426016,  88659304},
          { 9579632, 89770896, 103388072},
          { 9011648, 94009072, 107606336},
          {12133496, 97126288, 107911568},
        },
      }
    };
    failures += test_accuracy<Scalar>(stream, field, tol[i], bytes[array_size][t][dims - 1][i]);
  }

  // test compressed array support (1d-3d only)
  // expected max construction error at 16 bits/scalar
  double emax[2][2][3] = {
    // small
    {
      {4.578e-05, 7.630e-06, 3.148e-05},
      {1.832e-04, 8.584e-06, 3.338e-05},
    },
    // large
    {
      {0.000e+00, 0.000e+00, 0.000e+00},
      {2.289e-05, 0.000e+00, 0.000e+00},
    }
  };
  // expected max of the differenced field computed by update_array
  double dfmax[2][2][3] = {
    // small
    {
      {2.155e-02, 3.755e-01, 1.846e+00},
      {2.155e-02, 3.755e-01, 1.846e+00},
    },
    // large
    {
      {2.441e-04, 4.883e-04, 1.221e-03},
      {2.670e-04, 4.883e-04, 1.221e-03},
    }
  };
  double rate = 16;
  switch (dims) {
    case 1: {
        zfp::array1<Scalar> a(nx, rate, f);
        failures += test_array(a, f, n, static_cast<Scalar>(emax[array_size][t][dims - 1]), static_cast<Scalar>(dfmax[array_size][t][dims - 1]));
      }
      break;
    case 2: {
        zfp::array2<Scalar> a(nx, ny, rate, f);
        failures += test_array(a, f, n, static_cast<Scalar>(emax[array_size][t][dims - 1]), static_cast<Scalar>(dfmax[array_size][t][dims - 1]));
      }
      break;
    case 3: {
        zfp::array3<Scalar> a(nx, ny, nz, rate, f);
        failures += test_array(a, f, n, static_cast<Scalar>(emax[array_size][t][dims - 1]), static_cast<Scalar>(dfmax[array_size][t][dims - 1]));
      }
      break;
    case 4: // 4D arrays not yet supported
      break;
  }

  std::cout << std::endl;
  zfp_stream_close(stream);
  zfp_field_free(field);

  delete[] f;
  return failures;
}
+
// various library and compiler sanity checks
// Verifies assumptions the regression tests rely on: matching header/binary
// versions, exact integer widths, arithmetic right shifts, and 64-bit
// stream words.  Returns the number of failed checks.
inline uint
common_tests()
{
  uint failures = 0;
  // test library version: header macros must match the linked binary
  if (zfp_codec_version != ZFP_CODEC || zfp_library_version != ZFP_VERSION) {
    std::cout << "library header and binary version mismatch" << std::endl;
    failures++;
  }
  // ensure integer type sizes are correct
  if (sizeof(int8) != 1u || sizeof(uint8) != 1u) {
    std::cout << "8-bit integer type is not one byte wide" << std::endl;
    failures++;
  }
  if (sizeof(int16) != 2u || sizeof(uint16) != 2u) {
    std::cout << "16-bit integer type is not two bytes wide" << std::endl;
    failures++;
  }
  if (sizeof(int32) != 4u || sizeof(uint32) != 4u) {
    std::cout << "32-bit integer type is not four bytes wide" << std::endl;
    failures++;
  }
  if (sizeof(int64) != 8u || sizeof(uint64) != 8u) {
    std::cout << "64-bit integer type is not eight bytes wide" << std::endl;
    failures++;
  }
  // ensure signed right shifts are arithmetic (implementation-defined in C++)
  int32 x32 = -2;
  if ((x32 >> 1) != -1 || (x32 >> 2) != -1) {
    std::cout << "32-bit arithmetic right shift not supported" << std::endl;
    failures++;
  }
  int64 x64 = -2;
  if ((x64 >> 1) != INT64C(-1) || (x64 >> 2) != INT64C(-1)) {
    std::cout << "64-bit arithmetic right shift not supported" << std::endl;
    failures++;
  }
  // testing requires default (64-bit) stream words, since the expected
  // compressed sizes depend on the stream word size
  if (stream_word_bits != 64) {
    std::cout << "regression testing requires BIT_STREAM_WORD_TYPE=uint64" << std::endl;
    failures++;
  }
  return failures;
}
+
// entry point: prints version and platform info, parses command-line
// selectors (sizes, scalar types, dimensionalities), runs sanity checks,
// then runs the selected regression tests; exits nonzero on any failure
int main(int argc, char* argv[])
{
  std::cout << zfp_version_string << std::endl;
  std::cout << "library version " << zfp_library_version << std::endl;
  std::cout << "CODEC version " << zfp_codec_version << std::endl;
  std::cout << "data model ";
  // encode sizeof(uint64|void*|long|int) - 1 as one hex nibble each to
  // recognize the platform data model
  size_t model = ((sizeof(uint64) - 1) << 12) +
                 ((sizeof(void*) - 1) << 8) +
                 ((sizeof(unsigned long int) - 1) << 4) +
                 ((sizeof(unsigned int) - 1) << 0);
  switch (model) {
    case 0x7331u:
      std::cout << "LP32";
      break;
    case 0x7333u:
      std::cout << "ILP32";
      break;
    case 0x7733u:
      std::cout << "LLP64";
      break;
    case 0x7773u:
      std::cout << "LP64";
      break;
    case 0x7777u:
      std::cout << "ILP64";
      break;
    default:
      std::cout << "unknown (0x" << std::hex << model << ")";
      break;
  }
  std::cout << std::endl;
  std::cout << std::endl;

  // bit masks of selected sizes, scalar types, and dimensionalities
  uint sizes = 0;
  uint types = 0;
  uint dims = 0;

  for (int i = 1; i < argc; i++)
    if (std::string(argv[i]) == "small")
      sizes |= mask(Small);
    else if (std::string(argv[i]) == "large")
      sizes |= mask(Large);
    else if (std::string(argv[i]) == "float" || std::string(argv[i]) == "fp32")
      types |= mask(Float);
    else if (std::string(argv[i]) == "double" || std::string(argv[i]) == "fp64")
      types |= mask(Double);
    else if (std::string(argv[i]) == "1d")
      dims |= mask(1);
    else if (std::string(argv[i]) == "2d")
      dims |= mask(2);
    else if (std::string(argv[i]) == "3d")
      dims |= mask(3);
    else if (std::string(argv[i]) == "4d")
      dims |= mask(4);
    else if (std::string(argv[i]) == "all") {
      sizes |= mask(Small) | mask(Large);
      types |= mask(Float) | mask(Double);
      dims |= mask(1) | mask(2) | mask(3) | mask(4);
    }
    else {
      std::cerr << "Usage: testzfp [all] [small|large] [fp32|fp64|float|double] [1d|2d|3d|4d]" << std::endl;
      return EXIT_FAILURE;
    }

  // use defaults if not specified
  if (!sizes)
    sizes = mask(Small);
  if (!types)
    types = mask(Float) | mask(Double);
  if (!dims)
    dims = mask(1) | mask(2) | mask(3) | mask(4);

  // test library and compiler; abort if the environment is unusable
  uint failures = common_tests();
  if (failures)
    return EXIT_FAILURE;

  // test arrays for each selected size, dimensionality, and scalar type
  for (int size = Small; size <= Large; size++)
    if (sizes & mask(ArraySize(size))) {
      for (uint d = 1; d <= 4; d++)
        if (dims & mask(d)) {
          if (types & mask(Float))
            failures += test<float>(d, ArraySize(size));
          if (types & mask(Double))
            failures += test<double>(d, ArraySize(size));
       }
    }

  if (failures)
    std::cout << failures << " test(s) failed" << std::endl;
  else
    std::cout << "all tests passed" << std::endl;

  return failures ? EXIT_FAILURE : EXIT_SUCCESS;
}
diff --git a/zfp/travis.sh b/zfp/travis.sh
new file mode 100755
index 0000000000000000000000000000000000000000..2314e0376889f7d8a7a8bc8de1343c3abb334f99
--- /dev/null
+++ b/zfp/travis.sh
@@ -0,0 +1,17 @@
#!/usr/bin/env sh
# CI driver: configure, build, and test zfp in two configurations.
# C/C++ standards can be overridden via C_STANDARD / CXX_STANDARD.
set -e

# -p keeps the script re-runnable when a build directory already exists
# (plain mkdir would abort under `set -e`)
mkdir -p build
cd build

# build/test without OpenMP, with CFP
cmake .. -DCMAKE_C_STANDARD=${C_STANDARD:-99} -DCMAKE_CXX_STANDARD=${CXX_STANDARD:-98} -DZFP_WITH_OPENMP=OFF -DBUILD_CFP=ON
cmake --build .
ctest -V -C "Debug"

# start the second configuration from a clean build tree
rm -rf ./*

# build/test with OpenMP, with CFP custom namespace
cmake .. -DCMAKE_C_STANDARD=${C_STANDARD:-99} -DCMAKE_CXX_STANDARD=${CXX_STANDARD:-98} -DBUILD_CFP=ON -DCFP_NAMESPACE=cfp2
cmake --build .
ctest -V -C "Debug"
diff --git a/zfp/utils/CMakeLists.txt b/zfp/utils/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a960d2c9cdbf2363689bbe47130578d8655bd7f4
--- /dev/null
+++ b/zfp/utils/CMakeLists.txt
@@ -0,0 +1,6 @@
+# Build the zfp command-line utility from zfp.c.
+add_executable(zfpcmd zfp.c)
+# Name the produced binary "zfp" even though the target is "zfpcmd"
+# (presumably to avoid a name clash with the zfp library target -- confirm).
+set_property(TARGET zfpcmd PROPERTY OUTPUT_NAME zfp)
+target_link_libraries(zfpcmd zfp)
+# Link libm explicitly on platforms where the math routines live in a
+# separate library (HAVE_LIBM_MATH is set by the top-level configuration).
+if(HAVE_LIBM_MATH)
+  target_link_libraries(zfpcmd m)
+endif()
diff --git a/zfp/utils/Makefile b/zfp/utils/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..50a40ce9e7d8dfd7c15f48468fefb9db0ccc08df
--- /dev/null
+++ b/zfp/utils/Makefile
@@ -0,0 +1,12 @@
+include ../Config
+
+TARGET = ../bin/zfp
+
+# "all" and "clean" are commands, not files; declare them phony so that a
+# stray file with either name cannot shadow the target and make it a no-op.
+.PHONY: all clean
+
+all: $(TARGET)
+
+# Build the zfp command-line tool directly from zfp.c against libzfp.
+$(TARGET): zfp.c ../lib/$(LIBZFP)
+	mkdir -p ../bin
+	$(CC) $(CFLAGS) zfp.c -L../lib -lzfp -lm -o $(TARGET)
+
+clean:
+	rm -f $(TARGET) fields.o
diff --git a/zfp/utils/zfp.c b/zfp/utils/zfp.c
new file mode 100644
index 0000000000000000000000000000000000000000..d4f4cf6f512fe8a3bfa8b5f412948eb9b4a24ee1
--- /dev/null
+++ b/zfp/utils/zfp.c
@@ -0,0 +1,646 @@
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "zfp.h"
+#include "zfp/macros.h"
+
+/*
+File I/O is done using the following combinations of i, o, s, and z:
+- i   : read uncompressed
+- z   : read compressed
+- i, s: read uncompressed, print stats
+- i, o: read and write uncompressed
+- i, z: read uncompressed, write compressed
+- z, o: read compressed, write uncompressed
+
+The 7 major tasks to be accomplished are:
+- read uncompressed:  i
+- read compressed:    !i
+- compress:           i
+- write compressed:   i && z
+- decompress:         o || s || (!i && z)
+- write uncompressed: o
+- compute stats:      s
+*/
+
+/* compute and print reconstruction error */
+/* Compares the original (fin) and reconstructed (fout) arrays of n scalars
+   of the given zfp type and prints RMSE, range-normalized RMSE, maximum
+   absolute error, and PSNR (relative to the data range) to stderr. */
+static void
+print_error(const void* fin, const void* fout, zfp_type type, size_t n)
+{
+  /* typed views of the original (in*) and reconstructed (out*) buffers */
+  const int32* in32 = (const int32*)fin;
+  const int64* in64 = (const int64*)fin;
+  const float* inflt = (const float*)fin;
+  const double* indbl = (const double*)fin;
+  const int32* out32 = (const int32*)fout;
+  const int64* out64 = (const int64*)fout;
+  const float* outflt = (const float*)fout;
+  const double* outdbl = (const double*)fout;
+  double vmin = +DBL_MAX; /* smallest original value seen */
+  double vmax = -DBL_MAX; /* largest original value seen */
+  double esum = 0;        /* running sum of squared errors */
+  double emax = 0;        /* largest absolute error */
+  double rmse = 0;
+  double nrmse = 0;
+  double psnr = 0;
+  size_t i;
+
+  for (i = 0; i < n; i++) {
+    double err, val;
+    /* fetch the i'th original value and its absolute reconstruction error */
+    if (type == zfp_type_int32) {
+      err = fabs((double)(in32[i] - out32[i]));
+      val = (double)in32[i];
+    }
+    else if (type == zfp_type_int64) {
+      err = fabs((double)(in64[i] - out64[i]));
+      val = (double)in64[i];
+    }
+    else if (type == zfp_type_float) {
+      err = fabs((double)(inflt[i] - outflt[i]));
+      val = (double)inflt[i];
+    }
+    else if (type == zfp_type_double) {
+      err = fabs(indbl[i] - outdbl[i]);
+      val = indbl[i];
+    }
+    else
+      return; /* unrecognized scalar type: nothing to report */
+    if (emax < err)
+      emax = err;
+    esum += err * err;
+    if (vmin > val)
+      vmin = val;
+    if (vmax < val)
+      vmax = val;
+  }
+  /* NOTE(review): vmax == vmin (constant field) or erms == 0 yields inf/nan
+     in nrmse/psnr; output formatting tolerates this */
+  rmse = sqrt(esum / n);
+  nrmse = rmse / (vmax - vmin);
+  psnr = 20 * log10((vmax - vmin) / (2 * rmse));
+  fprintf(stderr, " rmse=%.4g nrmse=%.4g maxe=%.4g psnr=%.2f", rmse, nrmse, emax, psnr);
+}
+
+/* print the version banner and full command-line help to stderr, then
+   terminate the program with a failure status (this function never returns) */
+static void
+usage()
+{
+  fprintf(stderr, "%s\n", zfp_version_string);
+  fprintf(stderr, "Usage: zfp <options>\n");
+  fprintf(stderr, "General options:\n");
+  fprintf(stderr, "  -h : read/write array and compression parameters from/to compressed header\n");
+  fprintf(stderr, "  -q : quiet mode; suppress output\n");
+  fprintf(stderr, "  -s : print error statistics\n");
+  fprintf(stderr, "Input and output:\n");
+  fprintf(stderr, "  -i <path> : uncompressed binary input file (\"-\" for stdin)\n");
+  fprintf(stderr, "  -o <path> : decompressed binary output file (\"-\" for stdout)\n");
+  fprintf(stderr, "  -z <path> : compressed input (w/o -i) or output file (\"-\" for stdin/stdout)\n");
+  fprintf(stderr, "Array type and dimensions (needed with -i):\n");
+  fprintf(stderr, "  -f : single precision (float type)\n");
+  fprintf(stderr, "  -d : double precision (double type)\n");
+  fprintf(stderr, "  -t <i32|i64|f32|f64> : integer or floating scalar type\n");
+  fprintf(stderr, "  -1 <nx> : dimensions for 1D array a[nx]\n");
+  fprintf(stderr, "  -2 <nx> <ny> : dimensions for 2D array a[ny][nx]\n");
+  fprintf(stderr, "  -3 <nx> <ny> <nz> : dimensions for 3D array a[nz][ny][nx]\n");
+  fprintf(stderr, "  -4 <nx> <ny> <nz> <nw> : dimensions for 4D array a[nw][nz][ny][nx]\n");
+  fprintf(stderr, "Compression parameters (needed with -i):\n");
+  fprintf(stderr, "  -r <rate> : fixed rate (# compressed bits per floating-point value)\n");
+  fprintf(stderr, "  -p <precision> : fixed precision (# uncompressed bits per value)\n");
+  fprintf(stderr, "  -a <tolerance> : fixed accuracy (absolute error tolerance)\n");
+  fprintf(stderr, "  -c <minbits> <maxbits> <maxprec> <minexp> : advanced usage\n");
+  fprintf(stderr, "      minbits : min # bits per 4^d values in d dimensions\n");
+  fprintf(stderr, "      maxbits : max # bits per 4^d values in d dimensions (0 for unlimited)\n");
+  fprintf(stderr, "      maxprec : max # bits of precision per value (0 for full)\n");
+  fprintf(stderr, "      minexp : min bit plane # coded (-1074 for all bit planes)\n");
+  fprintf(stderr, "Execution parameters:\n");
+  fprintf(stderr, "  -x serial : serial compression (default)\n");
+  fprintf(stderr, "  -x omp[=threads[,chunk_size]] : OpenMP parallel compression\n");
+  fprintf(stderr, "  -x cuda : CUDA fixed rate parallel compression/decompression\n");
+  fprintf(stderr, "Examples:\n");
+  fprintf(stderr, "  -i file : read uncompressed file and compress to memory\n");
+  fprintf(stderr, "  -z file : read compressed file and decompress to memory\n");
+  fprintf(stderr, "  -i ifile -z zfile : read uncompressed ifile, write compressed zfile\n");
+  fprintf(stderr, "  -z zfile -o ofile : read compressed zfile, write decompressed ofile\n");
+  fprintf(stderr, "  -i ifile -o ofile : read ifile, compress, decompress, write ofile\n");
+  fprintf(stderr, "  -i file -s : read uncompressed file, compress to memory, print stats\n");
+  fprintf(stderr, "  -i - -o - -s : read stdin, compress, decompress, write stdout, print stats\n");
+  fprintf(stderr, "  -f -3 100 100 100 -r 16 : 2x fixed-rate compression of 100x100x100 floats\n");
+  fprintf(stderr, "  -d -1 1000000 -r 32 : 2x fixed-rate compression of 1M doubles\n");
+  fprintf(stderr, "  -d -2 1000 1000 -p 32 : 32-bit precision compression of 1000x1000 doubles\n");
+  fprintf(stderr, "  -d -1 1000000 -a 1e-9 : compression of 1M doubles with < 1e-9 max error\n");
+  fprintf(stderr, "  -d -1 1000000 -c 64 64 0 -1074 : 4x fixed-rate compression of 1M doubles\n");
+  fprintf(stderr, "  -x omp=16,256 : parallel compression with 16 threads, 256-block chunks\n");
+  exit(EXIT_FAILURE);
+}
+
+/* zfp command-line driver: reads uncompressed (-i) and/or compressed (-z)
+   data, (de)compresses it per the selected mode, optionally writes the
+   results (-z/-o), and reports compression and error statistics */
+int main(int argc, char* argv[])
+{
+  /* default settings */
+  zfp_type type = zfp_type_none; /* scalar type of uncompressed data */
+  size_t typesize = 0;           /* byte size of one scalar (0 = unknown) */
+  uint dims = 0;                 /* array dimensionality, 1-4 (0 = unknown) */
+  uint nx = 0;
+  uint ny = 0;
+  uint nz = 0;
+  uint nw = 0;
+  size_t count = 0;              /* total number of scalars */
+  double rate = 0;
+  uint precision = 0;
+  double tolerance = 0;
+  uint minbits = ZFP_MIN_BITS;
+  uint maxbits = ZFP_MAX_BITS;
+  uint maxprec = ZFP_MAX_PREC;
+  int minexp = ZFP_MIN_EXP;
+  int header = 0;                /* read/write header? (-h) */
+  int quiet = 0;                 /* suppress output? (-q) */
+  int stats = 0;                 /* print error statistics? (-s) */
+  char* inpath = 0;
+  char* zfppath = 0;
+  char* outpath = 0;
+  char mode = 0;                 /* compression mode: 'a', 'c', 'p', or 'r' */
+  zfp_exec_policy exec = zfp_exec_serial;
+  uint threads = 0;              /* # OpenMP threads (0 = default) */
+  uint chunk_size = 0;           /* OpenMP chunk size in blocks (0 = default) */
+
+  /* local variables */
+  int i;
+  zfp_field* field = NULL;       /* array meta data */
+  zfp_stream* zfp = NULL;        /* compressed stream */
+  bitstream* stream = NULL;      /* bit stream backing zfp */
+  void* fi = NULL;               /* uncompressed input data */
+  void* fo = NULL;               /* decompressed output data */
+  void* buffer = NULL;           /* compressed data */
+  size_t rawsize = 0;            /* byte size of uncompressed data */
+  size_t zfpsize = 0;            /* byte size of compressed data */
+  size_t bufsize = 0;            /* byte size of compressed buffer */
+
+  if (argc == 1)
+    usage();
+
+  /* parse command-line arguments; multi-argument options consume the
+     following argv entries and abort to usage() on any malformed value */
+  for (i = 1; i < argc; i++) {
+    if (argv[i][0] != '-' || argv[i][2])
+      usage();
+    switch (argv[i][1]) {
+      case '1':
+        if (++i == argc || sscanf(argv[i], "%u", &nx) != 1)
+          usage();
+        ny = nz = nw = 1;
+        dims = 1;
+        break;
+      case '2':
+        if (++i == argc || sscanf(argv[i], "%u", &nx) != 1 ||
+            ++i == argc || sscanf(argv[i], "%u", &ny) != 1)
+          usage();
+        nz = nw = 1;
+        dims = 2;
+        break;
+      case '3':
+        if (++i == argc || sscanf(argv[i], "%u", &nx) != 1 ||
+            ++i == argc || sscanf(argv[i], "%u", &ny) != 1 ||
+            ++i == argc || sscanf(argv[i], "%u", &nz) != 1)
+          usage();
+        nw = 1;
+        dims = 3;
+        break;
+      case '4':
+        if (++i == argc || sscanf(argv[i], "%u", &nx) != 1 ||
+            ++i == argc || sscanf(argv[i], "%u", &ny) != 1 ||
+            ++i == argc || sscanf(argv[i], "%u", &nz) != 1 ||
+            ++i == argc || sscanf(argv[i], "%u", &nw) != 1)
+          usage();
+        dims = 4;
+        break;
+      case 'a':
+        if (++i == argc || sscanf(argv[i], "%lf", &tolerance) != 1)
+          usage();
+        mode = 'a';
+        break;
+      case 'c':
+        if (++i == argc || sscanf(argv[i], "%u", &minbits) != 1 ||
+            ++i == argc || sscanf(argv[i], "%u", &maxbits) != 1 ||
+            ++i == argc || sscanf(argv[i], "%u", &maxprec) != 1 ||
+            ++i == argc || sscanf(argv[i], "%d", &minexp) != 1)
+          usage();
+        mode = 'c';
+        break;
+      case 'd':
+        type = zfp_type_double;
+        break;
+      case 'f':
+        type = zfp_type_float;
+        break;
+      case 'h':
+        header = 1;
+        break;
+      case 'i':
+        if (++i == argc)
+          usage();
+        inpath = argv[i];
+        break;
+      case 'o':
+        if (++i == argc)
+          usage();
+        outpath = argv[i];
+        break;
+      case 'p':
+        if (++i == argc || sscanf(argv[i], "%u", &precision) != 1)
+          usage();
+        mode = 'p';
+        break;
+      case 'q':
+        quiet = 1;
+        break;
+      case 'r':
+        if (++i == argc || sscanf(argv[i], "%lf", &rate) != 1)
+          usage();
+        mode = 'r';
+        break;
+      case 's':
+        stats = 1;
+        break;
+      case 't':
+        if (++i == argc)
+          usage();
+        if (!strcmp(argv[i], "i32"))
+          type = zfp_type_int32;
+        else if (!strcmp(argv[i], "i64"))
+          type = zfp_type_int64;
+        else if (!strcmp(argv[i], "f32"))
+          type = zfp_type_float;
+        else if (!strcmp(argv[i], "f64"))
+          type = zfp_type_double;
+        else
+          usage();
+        break;
+      case 'x':
+        /* most specific omp= form must be tried first */
+        if (++i == argc)
+          usage();
+        if (!strcmp(argv[i], "serial"))
+          exec = zfp_exec_serial;
+        else if (!strcmp(argv[i], "cuda"))
+          exec = zfp_exec_cuda;
+        else if (sscanf(argv[i], "omp=%u,%u", &threads, &chunk_size) == 2)
+          exec = zfp_exec_omp;
+        else if (sscanf(argv[i], "omp=%u", &threads) == 1) {
+          exec = zfp_exec_omp;
+          chunk_size = 0;
+        }
+        else if (!strcmp(argv[i], "omp")) {
+          exec = zfp_exec_omp;
+          threads = 0;
+          chunk_size = 0;
+        }
+        else
+          usage();
+        break;
+      case 'z':
+        if (++i == argc)
+          usage();
+        zfppath = argv[i];
+        break;
+      default:
+        usage();
+        break;
+    }
+  }
+
+  typesize = zfp_type_size(type);
+  count = (size_t)nx * (size_t)ny * (size_t)nz * (size_t)nw;
+
+  /* make sure one of the array dimensions is not zero when the dimensions
+     must come from the command line (with -h and no -i, the dimensions are
+     read from the compressed-stream header instead) */
+  if ((inpath || !header) && !count) {
+    fprintf(stderr, "array size must be nonzero\n");
+    return EXIT_FAILURE;
+  }
+
+  /* make sure we have an input file */
+  if (!inpath && !zfppath) {
+    fprintf(stderr, "must specify uncompressed or compressed input file via -i or -z\n");
+    return EXIT_FAILURE;
+  }
+
+  /* make sure we know floating-point type */
+  if ((inpath || !header) && !typesize) {
+    fprintf(stderr, "must specify scalar type via -f, -d, or -t or header via -h\n");
+    return EXIT_FAILURE;
+  }
+
+  /* make sure we know array dimensions */
+  if ((inpath || !header) && !dims) {
+    fprintf(stderr, "must specify array dimensions via -1, -2, -3, or -4 or header via -h\n");
+    return EXIT_FAILURE;
+  }
+
+  /* make sure we know (de)compression mode and parameters */
+  if ((inpath || !header) && !mode) {
+    fprintf(stderr, "must specify compression parameters via -a, -c, -p, or -r or header via -h\n");
+    return EXIT_FAILURE;
+  }
+
+  /* make sure we have input file for stats */
+  if (stats && !inpath) {
+    fprintf(stderr, "must specify input file via -i to compute stats\n");
+    return EXIT_FAILURE;
+  }
+
+  /* make sure meta data comes from header or command line, not both */
+  if (!inpath && zfppath && header && (typesize || dims)) {
+    fprintf(stderr, "cannot specify both field type/size and header\n");
+    return EXIT_FAILURE;
+  }
+
+  zfp = zfp_stream_open(NULL);
+  field = zfp_field_alloc();
+
+  /* read uncompressed or compressed file */
+  if (inpath) {
+    /* read uncompressed input file */
+    FILE* file = !strcmp(inpath, "-") ? stdin : fopen(inpath, "rb");
+    if (!file) {
+      fprintf(stderr, "cannot open input file\n");
+      return EXIT_FAILURE;
+    }
+    rawsize = typesize * count;
+    fi = malloc(rawsize);
+    if (!fi) {
+      fprintf(stderr, "cannot allocate memory\n");
+      return EXIT_FAILURE;
+    }
+    if (fread(fi, typesize, count, file) != count) {
+      fprintf(stderr, "cannot read input file\n");
+      return EXIT_FAILURE;
+    }
+    fclose(file);
+    zfp_field_set_pointer(field, fi);
+  }
+  else {
+    /* read compressed input file in increasingly large chunks */
+    FILE* file = !strcmp(zfppath, "-") ? stdin : fopen(zfppath, "rb");
+    if (!file) {
+      fprintf(stderr, "cannot open compressed file\n");
+      return EXIT_FAILURE;
+    }
+    bufsize = 0x100;
+    do {
+      /* double the buffer until the whole stream fits */
+      bufsize *= 2;
+      buffer = realloc(buffer, bufsize);
+      if (!buffer) {
+        fprintf(stderr, "cannot allocate memory\n");
+        return EXIT_FAILURE;
+      }
+      zfpsize += fread((uchar*)buffer + zfpsize, 1, bufsize - zfpsize, file);
+    } while (zfpsize == bufsize);
+    if (ferror(file)) {
+      fprintf(stderr, "cannot read compressed file\n");
+      return EXIT_FAILURE;
+    }
+    fclose(file);
+
+    /* associate bit stream with buffer */
+    stream = stream_open(buffer, bufsize);
+    if (!stream) {
+      fprintf(stderr, "cannot open compressed stream\n");
+      return EXIT_FAILURE;
+    }
+    zfp_stream_set_bit_stream(zfp, stream);
+  }
+
+  /* set field dimensions and (de)compression parameters */
+  if (inpath || !header) {
+    /* initialize uncompressed field */
+    zfp_field_set_type(field, type);
+    switch (dims) {
+      case 1:
+        zfp_field_set_size_1d(field, nx);
+        break;
+      case 2:
+        zfp_field_set_size_2d(field, nx, ny);
+        break;
+      case 3:
+        zfp_field_set_size_3d(field, nx, ny, nz);
+        break;
+      case 4:
+        zfp_field_set_size_4d(field, nx, ny, nz, nw);
+        break;
+    }
+
+    /* set (de)compression mode */
+    switch (mode) {
+      case 'a':
+        zfp_stream_set_accuracy(zfp, tolerance);
+        break;
+      case 'p':
+        zfp_stream_set_precision(zfp, precision);
+        break;
+      case 'r':
+        zfp_stream_set_rate(zfp, rate, type, dims, 0);
+        break;
+      case 'c':
+        /* expert mode: 0 means "use the most permissive setting" */
+        if (!maxbits)
+          maxbits = ZFP_MAX_BITS;
+        if (!maxprec)
+          maxprec = zfp_field_precision(field);
+        if (!zfp_stream_set_params(zfp, minbits, maxbits, maxprec, minexp)) {
+          fprintf(stderr, "invalid compression parameters\n");
+          return EXIT_FAILURE;
+        }
+        break;
+    }
+  }
+
+  /* specify execution policy */
+  switch (exec) {
+    case zfp_exec_omp:
+      if (!zfp_stream_set_execution(zfp, exec) ||
+          !zfp_stream_set_omp_threads(zfp, threads) ||
+          !zfp_stream_set_omp_chunk_size(zfp, chunk_size)) {
+        fprintf(stderr, "OpenMP execution not available\n");
+        return EXIT_FAILURE;
+      }
+      break;
+    case zfp_exec_cuda:
+      /* explicit case so a cuda failure reports the right message */
+      if (!zfp_stream_set_execution(zfp, exec)) {
+        fprintf(stderr, "cuda execution not available\n");
+        return EXIT_FAILURE;
+      }
+      break;
+    case zfp_exec_serial:
+    default:
+      if (!zfp_stream_set_execution(zfp, exec)) {
+        fprintf(stderr, "serial execution not available\n");
+        return EXIT_FAILURE;
+      }
+      break;
+  }
+
+  /* compress input file if provided */
+  if (inpath) {
+    /* allocate buffer for compressed data */
+    bufsize = zfp_stream_maximum_size(zfp, field);
+    if (!bufsize) {
+      fprintf(stderr, "invalid compression parameters\n");
+      return EXIT_FAILURE;
+    }
+    buffer = malloc(bufsize);
+    if (!buffer) {
+      fprintf(stderr, "cannot allocate memory\n");
+      return EXIT_FAILURE;
+    }
+
+    /* associate compressed bit stream with memory buffer */
+    stream = stream_open(buffer, bufsize);
+    if (!stream) {
+      fprintf(stderr, "cannot open compressed stream\n");
+      return EXIT_FAILURE;
+    }
+    zfp_stream_set_bit_stream(zfp, stream);
+
+    /* specify execution policy */
+    switch (exec) {
+      case zfp_exec_omp:
+        if (!zfp_stream_set_execution(zfp, exec) ||
+            !zfp_stream_set_omp_threads(zfp, threads) ||
+            !zfp_stream_set_omp_chunk_size(zfp, chunk_size)) {
+          fprintf(stderr, "OpenMP execution not available\n");
+          return EXIT_FAILURE;
+        }
+        break;
+      case zfp_exec_cuda:
+        if (!zfp_stream_set_execution(zfp, exec)) {
+          fprintf(stderr, "cuda execution not available\n");
+          return EXIT_FAILURE;
+        }
+        break; /* was missing: fell through into the serial case */
+      case zfp_exec_serial:
+      default:
+        if (!zfp_stream_set_execution(zfp, exec)) {
+          fprintf(stderr, "serial execution not available\n");
+          return EXIT_FAILURE;
+        }
+        break;
+    }
+
+    /* optionally write header */
+    if (header && !zfp_write_header(zfp, field, ZFP_HEADER_FULL)) {
+      fprintf(stderr, "cannot write header\n");
+      return EXIT_FAILURE;
+    }
+
+    /* compress data */
+    zfpsize = zfp_compress(zfp, field);
+    if (zfpsize == 0) {
+      fprintf(stderr, "compression failed\n");
+      return EXIT_FAILURE;
+    }
+
+    /* optionally write compressed data */
+    if (zfppath) {
+      FILE* file = !strcmp(zfppath, "-") ? stdout : fopen(zfppath, "wb");
+      if (!file) {
+        fprintf(stderr, "cannot create compressed file\n");
+        return EXIT_FAILURE;
+      }
+      if (fwrite(buffer, 1, zfpsize, file) != zfpsize) {
+        fprintf(stderr, "cannot write compressed file\n");
+        return EXIT_FAILURE;
+      }
+      fclose(file);
+    }
+  }
+
+  /* decompress data if necessary */
+  if ((!inpath && zfppath) || outpath || stats) {
+    /* obtain metadata from header when present */
+    zfp_stream_rewind(zfp);
+    if (header) {
+      if (!zfp_read_header(zfp, field, ZFP_HEADER_FULL)) {
+        fprintf(stderr, "incorrect or missing header\n");
+        return EXIT_FAILURE;
+      }
+      /* header supersedes command-line metadata; accept any supported type */
+      type = field->type;
+      typesize = zfp_type_size(type);
+      if (!typesize) {
+        fprintf(stderr, "unsupported type\n");
+        return EXIT_FAILURE;
+      }
+      nx = MAX(field->nx, 1u);
+      ny = MAX(field->ny, 1u);
+      nz = MAX(field->nz, 1u);
+      nw = MAX(field->nw, 1u);
+      /* recompute scalar count from the header dimensions */
+      count = (size_t)nx * (size_t)ny * (size_t)nz * (size_t)nw;
+    }
+
+    /* specify execution policy */
+    switch (exec) {
+      case zfp_exec_omp:
+        /* only serial and cuda decompression are implemented */
+        fprintf(stderr, "OpenMP decompression not available\n");
+        return EXIT_FAILURE;
+      case zfp_exec_cuda:
+        if (!zfp_stream_set_execution(zfp, exec)) {
+          fprintf(stderr, "cuda execution not available\n");
+          return EXIT_FAILURE;
+        }
+        break; /* was missing: fell through into the serial case */
+      case zfp_exec_serial:
+      default:
+        if (!zfp_stream_set_execution(zfp, exec)) {
+          fprintf(stderr, "serial execution not available\n");
+          return EXIT_FAILURE;
+        }
+        break;
+    }
+
+    /* allocate memory for decompressed data */
+    rawsize = typesize * count;
+    fo = malloc(rawsize);
+    if (!fo) {
+      fprintf(stderr, "cannot allocate memory\n");
+      return EXIT_FAILURE;
+    }
+    zfp_field_set_pointer(field, fo);
+
+    /* decompress data */
+    while (!zfp_decompress(zfp, field)) {
+      /* fall back on serial decompression if execution policy not supported */
+      if (inpath && zfp_stream_execution(zfp) != zfp_exec_serial) {
+        if (!zfp_stream_set_execution(zfp, zfp_exec_serial)) {
+          fprintf(stderr, "cannot change execution policy\n");
+          return EXIT_FAILURE;
+        }
+      }
+      else {
+        fprintf(stderr, "decompression failed\n");
+        return EXIT_FAILURE;
+      }
+    }
+
+    /* optionally write reconstructed data */
+    if (outpath) {
+      FILE* file = !strcmp(outpath, "-") ? stdout : fopen(outpath, "wb");
+      if (!file) {
+        fprintf(stderr, "cannot create output file\n");
+        return EXIT_FAILURE;
+      }
+      if (fwrite(fo, typesize, count, file) != count) {
+        fprintf(stderr, "cannot write output file\n");
+        return EXIT_FAILURE;
+      }
+      fclose(file);
+    }
+  }
+
+  /* print compression and error statistics */
+  if (!quiet) {
+    const char* type_name[] = { "int32", "int64", "float", "double" };
+    fprintf(stderr, "type=%s nx=%u ny=%u nz=%u nw=%u", type_name[type - zfp_type_int32], nx, ny, nz, nw);
+    fprintf(stderr, " raw=%lu zfp=%lu ratio=%.3g rate=%.4g", (unsigned long)rawsize, (unsigned long)zfpsize, (double)rawsize / zfpsize, CHAR_BIT * (double)zfpsize / count);
+    if (stats)
+      print_error(fi, fo, type, count);
+    fprintf(stderr, "\n");
+  }
+
+  /* free allocated storage */
+  zfp_field_free(field);
+  zfp_stream_close(zfp);
+  stream_close(stream);
+  free(buffer);
+  free(fi);
+  free(fo);
+
+  return EXIT_SUCCESS;
+}