Source code for pygeoprocessing.geoprocessing

# coding=UTF-8
"""A collection of raster and vector algorithms and utilities."""
import collections
import functools
import logging
import math
import os
import pprint
import queue
import re
import shutil
import sys
import tempfile
import threading
import time
import warnings

import numpy
import numpy.ma
import rtree
import scipy.interpolate
import scipy.ndimage
import scipy.signal
import scipy.signal.signaltools
import scipy.sparse
import shapely.ops
import shapely.prepared
import shapely.wkb
from osgeo import gdal
from osgeo import gdal_array
from osgeo import gdalconst
from osgeo import ogr
from osgeo import osr

from . import geoprocessing_core
from .geoprocessing_core import DEFAULT_CREATION_OPTIONS
from .geoprocessing_core import DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS
from .geoprocessing_core import DEFAULT_OSR_AXIS_MAPPING_STRATEGY
from .geoprocessing_core import INT8_CREATION_OPTIONS

# This is used to efficiently pass data to the raster stats worker if available
if sys.version_info >= (3, 8):
    import multiprocessing.shared_memory

GDAL_VERSION = tuple(int(_) for _ in gdal.__version__.split('.'))


class ReclassificationMissingValuesError(Exception):
    """Raised when a raster value is not a valid key to a dictionary.

    Attributes:
        msg (str) - error message
        missing_values (list) - a list of the missing values from the raster
            that are not keys in the dictionary
    """

    def __init__(self, missing_values, raster_path, value_map):
        """See Attributes for args docstring."""
        self.msg = (
            f'The following {missing_values.size} raster values '
            f'{missing_values} from "{raster_path}" do not have '
            f'corresponding entries in the value map: {value_map}.')
        self.missing_values = missing_values
        super().__init__(self.msg)
LOGGER = logging.getLogger(__name__)

# Used in joining finished TaskGraph Tasks.
_MAX_TIMEOUT = 60.0

_VALID_GDAL_TYPES = (
    set([getattr(gdal, x) for x in dir(gdal.gdalconst) if 'GDT_' in x]))

_LOGGING_PERIOD = 5.0  # min 5.0 seconds per update log message for the module
_LARGEST_ITERBLOCK = 2**16  # largest block for iterblocks to read in cells
class TimedLoggingAdapter(logging.LoggerAdapter):
    """A logging adapter to restrict logging based on a timer.

    The objective is to have a ``logging.LOGGER``-like object that can be
    called multiple times in rapid succession, but with log messages only
    propagating every X seconds.

    This object is helpful for creating consistency in logging callbacks
    and is derived from the python stdlib ``logging.LoggerAdapter``.
    """

    def __init__(self, interval_s=_LOGGING_PERIOD):
        """Initialize the timed logging adapter.

        Args:
            interval_s (float): The logging interval, in seconds.
                Defaults to ``_LOGGING_PERIOD``.
        """
        logging.LoggerAdapter.__init__(self, LOGGER, extra=None)
        self.interval = interval_s
        self.last_time = time.time()

    def log(self, level, msg, *args, **kwargs):
        """Log a ``LogRecord``.

        Args:
            level (int): The logging level.
            msg (str): The log message.
            args (list): The user-defined positional arguments for the log
                message.
            kwargs (dict): The user-defined keyword arguments for the log
                message.

        Returns:
            ``None``.
        """
        # The stacklevel arg to logging was introduced to logging in python
        # 3.8 and there isn't a clear way to spoof this in python 3.7, so
        # ignore.
        if sys.version_info >= (3, 8):
            # Don't override user-defined stacklevel if present.
            if 'stacklevel' not in kwargs:
                # Based on logging internals, 3 is the expected stack depth.
                kwargs['stacklevel'] = 3

                # Python 3.11 modified the stacklevel argument to be more
                # consistent with the behavior in the warnings module.
                # https://github.com/python/cpython/issues/89334
                if sys.version_info >= (3, 11):
                    kwargs['stacklevel'] -= 1

        now = time.time()
        if now >= self.last_time + self.interval:
            self.last_time = now
            self.logger.log(level, msg, *args, **kwargs)
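# Example (illustrative sketch, not part of the library): using
# ``TimedLoggingAdapter`` inside a tight loop so that progress messages
# propagate at most once per interval no matter how often ``info`` is
# called. The loop bound and message are hypothetical.
def _example_timed_logging():
    timed_logger = TimedLoggingAdapter(interval_s=5.0)
    for i in range(10_000_000):
        # Called millions of times, but logs at most every 5 seconds.
        timed_logger.info('processed %d items', i)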
def raster_calculator(
        base_raster_path_band_const_list, local_op, target_raster_path,
        datatype_target, nodata_target, calc_raster_stats=True,
        use_shared_memory=False, largest_block=_LARGEST_ITERBLOCK,
        max_timeout=_MAX_TIMEOUT,
        raster_driver_creation_tuple=DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS):
    """Apply a local raster operation on a stack of rasters.

    This function applies a user defined function across the pixel stacks
    of a set of rasters. The rasters in
    ``base_raster_path_band_const_list`` must be spatially aligned and have
    the same cell sizes.

    Args:
        base_raster_path_band_const_list (sequence): a sequence containing:

            * ``(str, int)`` tuples, referring to a raster path/band index
              pair to use as an input.
            * ``numpy.ndarray`` s of up to two dimensions. These inputs
              must all be broadcastable to each other AND the size of the
              raster inputs.
            * ``(object, 'raw')`` tuples, where ``object`` will be passed
              directly into the ``local_op``.

            All rasters must have the same raster size. If only arrays are
            input, numpy arrays must be broadcastable to each other and the
            final raster size will be the final broadcast array shape. A
            ValueError is raised if only "raw" inputs are passed.
        local_op (function): a function that must take in as many
            parameters as there are elements in
            ``base_raster_path_band_const_list``. The parameters in
            ``local_op`` will map 1-to-1 in order with the values in
            ``base_raster_path_band_const_list``. ``raster_calculator``
            will call ``local_op`` to generate the pixel values in
            ``target_raster`` along memory block aligned processing
            windows. Note any particular call to ``local_op`` will have
            the arguments from ``base_raster_path_band_const_list`` sliced
            to overlap that window. If an argument from
            ``base_raster_path_band_const_list`` is a raster path/band
            tuple, it will be passed to ``local_op`` as a 2D numpy array of
            pixel values that align with the processing window that
            ``local_op`` is targeting. A 2D or 1D array will be sliced to
            match the processing window and in the case of a 1D array
            tiled in whatever dimension is flat. If an argument is a
            scalar it is passed as a scalar. The return value must be a
            2D array of the same size as any of the input parameter 2D
            arrays and contain the desired pixel values for the target
            raster.
        target_raster_path (string): the path of the output raster. The
            projection, size, and cell size will be the same as the rasters
            in ``base_raster_path_band_const_list`` or the final broadcast
            size of the constant/ndarray values in the list.
        datatype_target (gdal datatype; int): the desired GDAL output type
            of the target raster.
        nodata_target (numerical value): the desired nodata value of the
            target raster.
        calc_raster_stats (boolean): If True, calculates and sets raster
            statistics (min, max, mean, and stdev) for target raster.
        use_shared_memory (boolean): If True, uses Python Multiprocessing
            shared memory to calculate raster stats for faster performance.
            This feature is available for Python >= 3.8 and will otherwise
            be ignored for earlier versions of Python.
        largest_block (int): Attempts to internally iterate over raster
            blocks with this many elements. Useful in cases where the
            blocksize is relatively small, memory is available, and the
            function call overhead dominates the iteration. Defaults to
            ``_LARGEST_ITERBLOCK``. A value of anything less than the
            original blocksize of the raster will result in blocksizes
            equal to the original size.
        max_timeout (float): amount of time in seconds to wait for stats
            worker thread to join. Default is ``_MAX_TIMEOUT``.
        raster_driver_creation_tuple (tuple): a tuple containing a GDAL
            driver name string as the first element and a GDAL creation
            options tuple/list as the second. Defaults to
            geoprocessing.DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS.

    Return:
        None

    Raises:
        ValueError: invalid input provided
    """
    if not base_raster_path_band_const_list:
        raise ValueError(
            "`base_raster_path_band_const_list` is empty and "
            "should have at least one value.")

    # It's a common error to not pass in path/band tuples, so check for
    # that and report error if so
    bad_raster_path_list = False
    if not isinstance(base_raster_path_band_const_list, (list, tuple)):
        bad_raster_path_list = True
    else:
        for value in base_raster_path_band_const_list:
            if (not _is_raster_path_band_formatted(value) and
                    not isinstance(value, numpy.ndarray) and
                    not (isinstance(value, tuple) and len(value) == 2 and
                         value[1] == 'raw')):
                bad_raster_path_list = True
                break
    if bad_raster_path_list:
        raise ValueError(
            "Expected a sequence of path / integer band tuples, "
            "ndarrays, or (value, 'raw') pairs for "
            "`base_raster_path_band_const_list`, instead got: "
            "%s" % pprint.pformat(base_raster_path_band_const_list))

    # check that any rasters exist on disk and have enough bands
    not_found_paths = []
    gdal.PushErrorHandler('CPLQuietErrorHandler')
    base_raster_path_band_list = [
        path_band for path_band in base_raster_path_band_const_list
        if _is_raster_path_band_formatted(path_band)]
    for value in base_raster_path_band_list:
        if gdal.OpenEx(value[0], gdal.OF_RASTER) is None:
            not_found_paths.append(value[0])
    gdal.PopErrorHandler()
    if not_found_paths:
        raise ValueError(
            "The following files were expected but do not exist on the "
            "filesystem: " + str(not_found_paths))

    # check that band index exists in raster
    invalid_band_index_list = []
    for value in base_raster_path_band_list:
        raster = gdal.OpenEx(value[0], gdal.OF_RASTER)
        if not (1 <= value[1] <= raster.RasterCount):
            invalid_band_index_list.append(value)
        raster = None
    if invalid_band_index_list:
        raise ValueError(
            "The following rasters do not contain requested band "
            "indexes: %s" % invalid_band_index_list)

    # check that the target raster is not also an input raster
    if target_raster_path in [x[0] for x in base_raster_path_band_list]:
        raise ValueError(
            "%s is used as a target path, but it is also in the base input "
            "path list %s" % (
                target_raster_path, str(base_raster_path_band_const_list)))

    # check that raster inputs are all the same dimensions
    raster_info_list = [
        get_raster_info(path_band[0])
        for path_band in base_raster_path_band_const_list
        if _is_raster_path_band_formatted(path_band)]
    geospatial_info_set = set()
    for raster_info in raster_info_list:
        geospatial_info_set.add(raster_info['raster_size'])

    if len(geospatial_info_set) > 1:
        raise ValueError(
            "Input Rasters are not the same dimensions. The "
            "following raster sizes are not identical %s" % str(
                geospatial_info_set))

    numpy_broadcast_list = [
        x for x in base_raster_path_band_const_list
        if isinstance(x, numpy.ndarray)]
    stats_worker_thread = None
    try:
        # numpy.broadcast can only take up to 32 arguments, this loop works
        # around that restriction:
        while len(numpy_broadcast_list) > 1:
            numpy_broadcast_list = (
                [numpy.broadcast(*numpy_broadcast_list[:32])] +
                numpy_broadcast_list[32:])
        if numpy_broadcast_list:
            numpy_broadcast_size = numpy_broadcast_list[0].shape
    except ValueError:
        # this gets raised if numpy.broadcast fails
        raise ValueError(
            "Numpy array inputs cannot be broadcast into a single shape "
            "%s" % numpy_broadcast_list)

    if numpy_broadcast_list and len(numpy_broadcast_list[0].shape) > 2:
        raise ValueError(
            "Numpy array inputs must be 2 dimensions or less %s" %
            numpy_broadcast_list)

    # if there are both rasters and arrays, check the numpy shape will
    # be broadcastable with raster shape
    if raster_info_list and numpy_broadcast_list:
        # geospatial lists x/y order and numpy does y/x so reverse size
        # list
        raster_shape = tuple(reversed(raster_info_list[0]['raster_size']))
        invalid_broadcast_size = False
        if len(numpy_broadcast_size) == 1:
            # if there's only one dimension it should match the last
            # dimension first, in the raster case this is the columns
            # because of the row/column order of numpy. No problem if
            # that value is ``1`` because it will be broadcast, otherwise
            # it should be the same as the raster.
            if (numpy_broadcast_size[0] != raster_shape[1] and
                    numpy_broadcast_size[0] != 1):
                invalid_broadcast_size = True
        else:
            for dim_index in range(2):
                # no problem if 1 because it'll broadcast, otherwise must
                # be the same value
                if (numpy_broadcast_size[dim_index] !=
                        raster_shape[dim_index] and
                        numpy_broadcast_size[dim_index] != 1):
                    invalid_broadcast_size = True
        if invalid_broadcast_size:
            raise ValueError(
                "Raster size %s cannot be broadcast to numpy shape %s" % (
                    raster_shape, numpy_broadcast_size))

    # create a "canonical" argument list that's bands, 2d numpy arrays, or
    # raw values only
    base_canonical_arg_list = []
    base_raster_list = []
    base_band_list = []
    for value in base_raster_path_band_const_list:
        # the input has been tested and value is either a raster/path band
        # tuple, 1d ndarray, 2d ndarray, or (value, 'raw') tuple.
        if _is_raster_path_band_formatted(value):
            # it's a raster/path band, keep track of open raster and band
            # for later so we can `None` them.
            base_raster_list.append(gdal.OpenEx(value[0], gdal.OF_RASTER))
            base_band_list.append(
                base_raster_list[-1].GetRasterBand(value[1]))
            base_canonical_arg_list.append(base_band_list[-1])
        elif isinstance(value, numpy.ndarray):
            if value.ndim == 1:
                # easier to process as a 2d array for writing to band
                base_canonical_arg_list.append(
                    value.reshape((1, value.shape[0])))
            else:
                # dimensions are two because we checked earlier.
                base_canonical_arg_list.append(value)
        elif isinstance(value, tuple):
            base_canonical_arg_list.append(value)
        else:
            raise ValueError(
                "An unexpected ``value`` occurred. This should never "
                "happen. Value: %r" % value)

    # create target raster
    if raster_info_list:
        # if rasters are passed, the target is the same size as the raster
        n_cols, n_rows = raster_info_list[0]['raster_size']
    elif numpy_broadcast_list:
        # numpy arrays in args and no raster result is broadcast shape
        # expanded to two dimensions if necessary
        if len(numpy_broadcast_size) == 1:
            n_rows, n_cols = 1, numpy_broadcast_size[0]
        else:
            n_rows, n_cols = numpy_broadcast_size
    else:
        raise ValueError(
            "Only (object, 'raw') values have been passed. Raster "
            "calculator requires at least a raster or numpy array as a "
            "parameter. This is the input list: %s" % pprint.pformat(
                base_raster_path_band_const_list))

    if datatype_target not in _VALID_GDAL_TYPES:
        raise ValueError(
            'Invalid target type, should be a gdal.GDT_* type, received '
            '"%s"' % datatype_target)

    # create target raster
    raster_driver = gdal.GetDriverByName(raster_driver_creation_tuple[0])
    try:
        os.makedirs(os.path.dirname(target_raster_path))
    except OSError:
        pass
    target_raster = raster_driver.Create(
        target_raster_path, n_cols, n_rows, 1, datatype_target,
        options=raster_driver_creation_tuple[1])
    target_band = target_raster.GetRasterBand(1)
    if nodata_target is not None:
        target_band.SetNoDataValue(nodata_target)
    if base_raster_list:
        # use the first raster in the list for the projection and
        # geotransform
        target_raster.SetProjection(base_raster_list[0].GetProjection())
        target_raster.SetGeoTransform(
            base_raster_list[0].GetGeoTransform())
    target_band.FlushCache()
    target_raster.FlushCache()

    try:
        timed_logger = TimedLoggingAdapter(_LOGGING_PERIOD)

        block_offset_list = list(iterblocks(
            (target_raster_path, 1), offset_only=True,
            largest_block=largest_block))

        if calc_raster_stats:
            # this queue is used to send computed valid blocks of
            # the raster to an incremental statistics calculator worker
            stats_worker_queue = queue.Queue()
            exception_queue = queue.Queue()
            if sys.version_info >= (3, 8) and use_shared_memory:
                # The stats worker keeps running variables as a float64, so
                # all input rasters are dtype float64 -- make the shared
                # memory size equivalent.
                block_size_bytes = (
                    numpy.dtype(numpy.float64).itemsize *
                    block_offset_list[0]['win_xsize'] *
                    block_offset_list[0]['win_ysize'])
                shared_memory = multiprocessing.shared_memory.SharedMemory(
                    create=True, size=block_size_bytes)
        else:
            stats_worker_queue = None

        if calc_raster_stats:
            # To avoid doing two passes on the raster to calculate standard
            # deviation, we implement a continuous statistics calculation
            # as the raster is computed. This computational effort is high
            # and benefits from running in parallel. This queue and worker
            # takes a valid block of a raster and incrementally calculates
            # the raster's statistics. When ``None`` is pushed to the queue
            # the worker will finish and return a (min, max, mean, std)
            # tuple.
            LOGGER.info('starting stats_worker')
            stats_worker_thread = threading.Thread(
                target=geoprocessing_core.stats_worker,
                args=(stats_worker_queue, len(block_offset_list)))
            stats_worker_thread.daemon = True
            stats_worker_thread.start()
            LOGGER.info('started stats_worker %s', stats_worker_thread)

        pixels_processed = 0
        n_pixels = n_cols * n_rows

        # iterate over each block and calculate local_op
        for block_offset in block_offset_list:
            # read input blocks
            offset_list = (block_offset['yoff'], block_offset['xoff'])
            blocksize = (
                block_offset['win_ysize'], block_offset['win_xsize'])
            data_blocks = []
            for value in base_canonical_arg_list:
                if isinstance(value, gdal.Band):
                    data_blocks.append(value.ReadAsArray(**block_offset))
                    # I've encountered the following error when a gdal
                    # raster is corrupt, often from multiple threads
                    # writing to the same file. This helps to catch the
                    # error early rather than lead to confusing values of
                    # ``data_blocks`` later.
                    if not isinstance(data_blocks[-1], numpy.ndarray):
                        raise ValueError(
                            f"got a {data_blocks[-1]} when trying to read "
                            f"{value.GetDataset().GetFileList()} at "
                            f"{block_offset}, expected numpy.ndarray.")
                elif isinstance(value, numpy.ndarray):
                    # must be numpy array and all have been conditioned to
                    # be 2d, so start with 0:1 slices and expand if possible
                    slice_list = [slice(0, 1)] * 2
                    tile_dims = list(blocksize)
                    for dim_index in [0, 1]:
                        if value.shape[dim_index] > 1:
                            slice_list[dim_index] = slice(
                                offset_list[dim_index],
                                offset_list[dim_index] +
                                blocksize[dim_index],)
                            tile_dims[dim_index] = 1
                    data_blocks.append(
                        numpy.tile(value[tuple(slice_list)], tile_dims))
                else:
                    # must be a raw tuple
                    data_blocks.append(value[0])

            target_block = local_op(*data_blocks)

            if (not isinstance(target_block, numpy.ndarray) or
                    target_block.shape != blocksize):
                raise ValueError(
                    "Expected `local_op` to return a numpy.ndarray of "
                    "shape %s but got this instead: %s" % (
                        blocksize, target_block))

            target_band.WriteArray(
                target_block, yoff=block_offset['yoff'],
                xoff=block_offset['xoff'])

            # send result to stats calculator
            if stats_worker_queue:
                # guard against an undefined nodata target
                if nodata_target is not None:
                    target_block = target_block[
                        target_block != nodata_target]
                target_block = target_block.astype(
                    numpy.float64).flatten()
                if sys.version_info >= (3, 8) and use_shared_memory:
                    shared_memory_array = numpy.ndarray(
                        target_block.shape, dtype=target_block.dtype,
                        buffer=shared_memory.buf)
                    shared_memory_array[:] = target_block[:]

                    stats_worker_queue.put((
                        shared_memory_array.shape,
                        shared_memory_array.dtype, shared_memory))
                else:
                    stats_worker_queue.put(target_block)

            pixels_processed += blocksize[0] * blocksize[1]
            timed_logger.info(
                '%s %.1f%% complete',
                os.path.basename(target_raster_path),
                float(pixels_processed) / n_pixels * 100.0)
        LOGGER.info('100.0% complete')

        if calc_raster_stats:
            LOGGER.info("Waiting for raster stats worker result.")
            stats_worker_thread.join(max_timeout)
            if stats_worker_thread.is_alive():
                LOGGER.error("stats_worker_thread.join() timed out")
                raise RuntimeError("stats_worker_thread.join() timed out")
            payload = stats_worker_queue.get(True, max_timeout)
            if payload is not None:
                target_min, target_max, target_mean, target_stddev = payload

                # In Cython 3.0.0+, taking a square root may return a
                # complex. Using only the real component of the complex
                # value mimics the C behavior that we expect from our
                # stats worker.
                if isinstance(target_stddev, complex):
                    target_stddev = target_stddev.real

                target_band.SetStatistics(
                    float(target_min), float(target_max),
                    float(target_mean), float(target_stddev))
                target_band.FlushCache()
    except Exception:
        LOGGER.exception('exception encountered in raster_calculator')
        raise
    finally:
        # This block ensures that rasters are destroyed even if there's an
        # exception raised.
        base_band_list[:] = []
        base_raster_list[:] = []
        target_band.FlushCache()
        target_band = None
        target_raster.FlushCache()
        target_raster = None

        if calc_raster_stats and stats_worker_thread:
            if stats_worker_thread.is_alive():
                stats_worker_queue.put(None, True, max_timeout)
                LOGGER.info("Waiting for raster stats worker result.")
                stats_worker_thread.join(max_timeout)
                if stats_worker_thread.is_alive():
                    LOGGER.error("stats_worker_thread.join() timed out")
                    raise RuntimeError(
                        "stats_worker_thread.join() timed out")

            if sys.version_info >= (3, 8) and use_shared_memory:
                LOGGER.debug(
                    f'unlink shared memory for process {os.getpid()}')
                shared_memory.close()
                shared_memory.unlink()
                LOGGER.debug(
                    f'unlinked shared memory for process {os.getpid()}')

            # check for an exception in the workers, otherwise get result
            # and pass to writer
            try:
                exception = exception_queue.get_nowait()
                LOGGER.error("Exception encountered at termination.")
                raise exception
            except queue.Empty:
                pass
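# Example (illustrative sketch, not part of the library): summing two
# aligned single-band Float32 rasters with ``raster_calculator``. The
# paths and nodata value are hypothetical; nodata handling is omitted for
# brevity.
def _example_raster_calculator_sum(raster_a_path, raster_b_path, out_path):
    nodata = -1.0

    def _sum_op(array_a, array_b):
        # local_op receives 2D blocks aligned to the processing window.
        return array_a + array_b

    raster_calculator(
        [(raster_a_path, 1), (raster_b_path, 1)], _sum_op, out_path,
        gdal.GDT_Float32, nodata)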
def array_equals_nodata(array, nodata):
    """Check for the presence of ``nodata`` values in ``array``.

    The comparison supports ``numpy.nan`` and unset (``None``) nodata
    values.

    Args:
        array (numpy array): the array to mask for nodata values.
        nodata (number): the nodata value to check for. Supports
            ``numpy.nan``.

    Returns:
        A boolean numpy array with values of 1 where ``array`` is equal to
        ``nodata`` and 0 otherwise.
    """
    # If nodata is undefined, nothing matches nodata.
    if nodata is None:
        return numpy.zeros(array.shape, dtype=bool)

    # comparing an integer array against numpy.nan works correctly and is
    # faster than using numpy.isclose().
    if numpy.issubdtype(array.dtype, numpy.integer):
        return array == nodata
    return numpy.isclose(array, nodata, equal_nan=True)
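# Example (illustrative sketch): computing the mean of valid pixels only.
# The nodata value is hypothetical; ``numpy.nan`` and ``None`` nodata are
# also handled by ``array_equals_nodata``.
def _example_valid_pixel_mean(array, nodata):
    valid_mask = ~array_equals_nodata(array, nodata)
    return array[valid_mask].mean()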
def choose_dtype(*raster_paths):
    """Choose an appropriate dtype for an output derived from the given inputs.

    Returns the dtype with the greatest size/precision among the inputs,
    so that information will not be lost.

    Args:
        *raster_paths: series of raster path strings

    Returns:
        numpy dtype
    """
    dtypes = [get_raster_info(path)['numpy_type'] for path in raster_paths]
    return numpy.result_type(*dtypes)
def choose_nodata(dtype):
    """Choose an appropriate nodata value for data of a given dtype.

    Args:
        dtype (numpy.dtype): data type for which to choose nodata

    Returns:
        number to use as nodata value
    """
    try:
        return float(numpy.finfo(dtype).max)
    except ValueError:
        return int(numpy.iinfo(dtype).max)
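# Example (illustrative sketch): picking an output dtype and nodata value
# for a set of hypothetical input rasters, mirroring what ``raster_map``
# does internally when these are not supplied.
def _example_choose_output_type(raster_paths):
    target_dtype = choose_dtype(*raster_paths)   # widest input dtype
    target_nodata = choose_nodata(target_dtype)  # max representable value
    return target_dtype, target_nodata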
def raster_map(op, rasters, target_path, target_nodata=None,
               target_dtype=None,
               raster_driver_creation_tuple=DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS):
    """Apply a pixelwise function to a series of raster inputs.

    The output raster will have nodata where any input raster has nodata.
    Raster inputs are split into aligned blocks, and the function is
    applied individually to each stack of blocks (as numpy arrays).

    Args:
        op (function): Function to apply to the inputs. It should accept a
            number of arguments equal to the length of ``rasters``. It
            should return a numpy array with the same shape as its array
            input(s).
        rasters (list[str]): Paths to rasters to input to ``op``, in the
            order that they will be passed to ``op``. All rasters should
            be aligned and have the same dimensions.
        target_path (str): path to write out the output raster.
        target_nodata (number): Nodata value to use for the output raster.
            Optional. If not provided, a suitable nodata value will be
            chosen.
        target_dtype (numpy.dtype): dtype to use for the output. Optional.
            If not provided, a suitable dtype will be chosen.
        raster_driver_creation_tuple (tuple): a tuple containing a GDAL
            driver name string as the first element and a GDAL creation
            options tuple/list as the second. Defaults to
            geoprocessing.DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS. If the
            ``target_dtype`` is int8, the ``PIXELTYPE=SIGNEDBYTE`` option
            will be added to the creation options tuple if it is not
            already there.

    Returns:
        ``None``
    """
    nodatas = []
    for raster in rasters:
        raster_info = get_raster_info(raster)
        if raster_info['n_bands'] > 1:
            LOGGER.warning(f'{raster} has more than one band. Only the '
                           'first band will be used.')
        nodatas.append(raster_info['nodata'][0])

    # choose an appropriate dtype if none was given
    if target_dtype is None:
        target_dtype = choose_dtype(*rasters)

    # choose an appropriate nodata value if none was given
    # if the user provides a target nodata,
    # check that it can fit in the target dtype
    if target_nodata is None:
        target_nodata = choose_nodata(target_dtype)
    else:
        if not numpy.can_cast(target_nodata, target_dtype):
            raise ValueError(
                f'Target nodata value {target_nodata} is incompatible '
                f'with the target dtype {target_dtype}')

    driver, options = raster_driver_creation_tuple
    gdal_type, type_creation_options = _numpy_to_gdal_type(target_dtype)
    options = list(options) + type_creation_options

    def apply_op(*arrays):
        """Apply the function ``op`` to the input arrays.

        Args:
            *arrays: numpy arrays with the same shape.

        Returns:
            numpy array
        """
        result = numpy.full(
            arrays[0].shape, target_nodata, dtype=target_dtype)
        # make a mask that is True where all input arrays are valid,
        # and False where any input array is invalid.
        valid_mask = numpy.full(arrays[0].shape, True)
        for array, nodata in zip(arrays, nodatas):
            valid_mask &= ~array_equals_nodata(array, nodata)
        # mask all arrays to the area where they all are valid
        masked_arrays = [array[valid_mask] for array in arrays]
        # apply op to the masked arrays in order
        result[valid_mask] = op(*masked_arrays)
        return result

    raster_calculator(
        [(path, 1) for path in rasters],  # assume the first band
        apply_op,
        target_path,
        gdal_type,
        target_nodata,
        raster_driver_creation_tuple=(driver, options))
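# Example (illustrative sketch): the same raster sum as shown after
# ``raster_calculator``, expressed with ``raster_map``, which handles the
# nodata masking and output dtype/nodata choice automatically. Paths are
# hypothetical.
def _example_raster_map_sum(raster_a_path, raster_b_path, out_path):
    raster_map(
        lambda a, b: a + b,  # applied only to mutually valid pixels
        [raster_a_path, raster_b_path],
        out_path)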
def raster_reduce(function, raster_path_band, initializer,
                  mask_nodata=True, largest_block=_LARGEST_ITERBLOCK):
    """Cumulatively apply a reducing function to each block of a raster.

    This effectively reduces the entire raster to a single value, but it
    works by blocks to be memory-efficient.

    The ``function`` signature should be ``function(aggregator, block)``,
    where ``aggregator`` is the aggregated value so far, and ``block`` is
    a flattened numpy array containing the data from the block to reduce
    next. ``function`` is called once on each block. On the first
    ``function`` call, ``aggregator`` is initialized with ``initializer``.
    The return value from each ``function`` call is passed in as the
    ``aggregator`` argument to the subsequent ``function`` call. When all
    blocks have been reduced, the return value of the final ``function``
    call is returned.

    Example:
        Calculate the sum of all values in a raster::

            raster_reduce(lambda total, block: total + numpy.sum(block),
                          (raster_path, 1), 0)

        Calculate a histogram of all values in a raster::

            def add_to_histogram(histogram, block):
                return histogram + numpy.histogram(block, bins=10)[0]

            raster_reduce(add_to_histogram, (raster_path, 1),
                          numpy.zeros(10))

        Calculate the sum of all values in a raster, excluding nodata::

            nodata = pygeoprocessing.get_raster_info(
                raster_path)['nodata'][0]

            def sum_excluding_nodata(total, block):
                return total + numpy.sum(block[block != nodata])

            raster_reduce(sum_excluding_nodata, (raster_path, 1), 0)

    Args:
        function (func): function to apply to each raster block
        raster_path_band (tuple): (path, band) tuple of the raster to
            reduce
        initializer (obj): value to initialize the aggregator for the
            first function call
        mask_nodata (bool): if True, mask out nodata before aggregating. A
            flattened array of non-nodata pixels from each block is passed
            to the ``function``. If False, each block is passed to the
            ``function`` without masking.
        largest_block (int): largest block parameter to pass to
            ``iterblocks``

    Returns:
        aggregate value, the final value returned from ``function``
    """
    aggregator = initializer
    timed_logger = TimedLoggingAdapter(_LOGGING_PERIOD)
    pixels_processed = 0
    raster_info = get_raster_info(raster_path_band[0])
    x_size, y_size = raster_info['raster_size']
    n_pixels = x_size * y_size

    for (_, block) in iterblocks(raster_path_band,
                                 largest_block=largest_block):
        if mask_nodata:
            data = block[~array_equals_nodata(
                block, raster_info['nodata'][raster_path_band[1] - 1])]
        else:
            data = block.flatten()
        aggregator = function(aggregator, data)
        pixels_processed += block.size
        timed_logger.info(
            f'{raster_path_band[0]} reduce '
            f'{pixels_processed / n_pixels * 100:.1f}% complete')
    LOGGER.info('100.0% complete')
    return aggregator
def align_and_resize_raster_stack(
        base_raster_path_list, target_raster_path_list,
        resample_method_list, target_pixel_size, bounding_box_mode,
        base_vector_path_list=None, raster_align_index=None,
        base_projection_wkt_list=None, target_projection_wkt=None,
        mask_options=None, vector_mask_options=None,
        gdal_warp_options=None,
        raster_driver_creation_tuple=DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS,
        osr_axis_mapping_strategy=DEFAULT_OSR_AXIS_MAPPING_STRATEGY,
        working_dir=None):
    """Generate rasters from a base such that they align geospatially.

    This function resizes base rasters that are in the same geospatial
    projection such that the result is an aligned stack of rasters that
    have the same cell size, dimensions, and bounding box. This is
    achieved by clipping or resizing the rasters to intersected, unioned,
    or equivocated bounding boxes of all the raster and vector input.

    Args:
        base_raster_path_list (sequence): a sequence of base raster paths
            that will be transformed and will be used to determine the
            target bounding box.
        target_raster_path_list (sequence): a sequence of raster paths
            that will be created to one-to-one map with
            ``base_raster_path_list`` as aligned versions of those
            original rasters. If there are duplicate paths in this list,
            the function will raise a ValueError.
        resample_method_list (sequence): a sequence of resampling methods
            which one to one map each path in ``base_raster_path_list``
            during resizing. Each element must be one of
            "near|bilinear|cubic|cubicspline|lanczos|mode".
        target_pixel_size (list/tuple): the target raster's x and y pixel
            size example: (30, -30).
        bounding_box_mode (string): one of "union", "intersection", or a
            sequence of floats of the form [minx, miny, maxx, maxy] in the
            target projection coordinate system. Depending on the value,
            output extents are defined as the union, intersection, or the
            explicit bounding box.
        base_vector_path_list (sequence): a sequence of base vector paths
            whose bounding boxes will be used to determine the final
            bounding box of the raster stack if mode is 'union' or
            'intersection'. If mode is 'bb=[...]' then these vectors are
            not used in any calculation.
        raster_align_index (int): indicates the index of a raster in
            ``base_raster_path_list`` that the target rasters' bounding
            boxes pixels should align with. This feature allows rasters
            whose raster dimensions are the same, but bounding boxes
            slightly shifted less than a pixel size to align with a
            desired grid layout. If ``None`` then the bounding box of the
            target rasters is calculated as the precise intersection,
            union, or bounding box.
        base_projection_wkt_list (sequence): if not ``None``, this is a
            sequence of base projections of the rasters in
            ``base_raster_path_list``. If a value is ``None``, the
            projection is read directly from the raster. Use this argument
            if there are rasters with no projection defined, but the
            projections are known.
        target_projection_wkt (string): if not ``None``, this is the
            desired projection of all target rasters in Well Known Text
            format, and target rasters will be warped to this projection.
            If ``None``, the base SRS will be passed to the target.
        mask_options (dict): optional, if not None, this is a dictionary
            of options to use an existing vector's geometry to mask out
            pixels in the target raster that do not overlap the vector's
            geometry. Keys to this dictionary are:

            * ``'mask_vector_path'`` (str): path to the mask vector file.
              This vector will be automatically projected to the target
              projection if its base coordinate system does not match the
              target.
            * ``'mask_layer_name'`` (str): the layer name to use for
              masking. If this key is not in the dictionary the default is
              to use the layer at index 0.
            * ``'mask_vector_where_filter'`` (str): an SQL WHERE string.
              This will be used to filter the geometry in the mask. Ex:
              ``'id > 10'`` would use all features whose field value of
              'id' is > 10.
            * ``'mask_raster_path'`` (str): Optional. the string path to
              where the mask raster should be written on disk. If not
              provided, a temporary file will be created within
              ``working_dir``.

        vector_mask_options (dict): optional. Alias for ``mask_options``.
            This parameter is deprecated and will be removed in a future
            version of ``pygeoprocessing``.
        gdal_warp_options (sequence): if present, the contents of this
            list are passed to the ``warpOptions`` parameter of
            ``gdal.Warp``. See the `GDAL Warp documentation
            <https://gdal.org/api/gdalwarp_cpp.html#_CPPv415GDALWarpOptions>`_
            for valid options.
        raster_driver_creation_tuple (tuple): a tuple containing a GDAL
            driver name string as the first element and a GDAL creation
            options tuple/list as the second. Defaults to a GTiff driver
            tuple defined at
            geoprocessing.DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS.
        osr_axis_mapping_strategy (int): OSR axis mapping strategy for
            ``SpatialReference`` objects. Defaults to
            ``geoprocessing.DEFAULT_OSR_AXIS_MAPPING_STRATEGY``. This
            parameter should not be changed unless you know what you are
            doing.
        working_dir=None (str): if present, the path to a directory within
            which a temporary directory will be created. If not provided,
            the new directory will be created within the system's
            temporary directory.

    Returns:
        None

    Raises:
        ValueError
            If any combination of the raw bounding boxes, raster bounding
            boxes, vector bounding boxes, and/or vector_mask bounding box
            does not overlap to produce a valid target.
        ValueError
            If any of the input or target lists are of different lengths.
        ValueError
            If there are duplicate paths on the target list which would
            risk corrupted output.
        ValueError
            If some combination of base, target, and embedded source
            reference systems results in an ambiguous target coordinate
            system.
        ValueError
            If ``mask_options`` is not None but the ``mask_vector_path``
            is undefined or doesn't point to a valid file.
        ValueError
            If ``pixel_size`` is not a 2 element sequence of numbers.
    """
    if vector_mask_options is not None:
        warnings.warn('The vector_mask_options parameter is deprecated '
                      'and will be removed in a future release of '
                      'pygeoprocessing. Please use mask_options instead.',
                      DeprecationWarning)
        mask_options = vector_mask_options

    # make sure that the input lists are of the same length
    list_lengths = [
        len(base_raster_path_list), len(target_raster_path_list),
        len(resample_method_list)]
    if len(set(list_lengths)) != 1:
        raise ValueError(
            "base_raster_path_list, target_raster_path_list, and "
            "resample_method_list must be the same length "
            " current lengths are %s" % (str(list_lengths)))

    unique_targets = set(target_raster_path_list)
    if len(unique_targets) != len(target_raster_path_list):
        seen = set()
        duplicate_list = []
        for path in target_raster_path_list:
            if path not in seen:
                seen.add(path)
            else:
                duplicate_list.append(path)
        raise ValueError(
            "There are duplicated paths on the target list. This is an "
            "invalid state of ``target_path_list``. Duplicates: %s" % (
                duplicate_list))

    # we can accept 'union', 'intersection', or a 4 element list/tuple
    if bounding_box_mode not in ["union", "intersection"] and (
            not isinstance(bounding_box_mode, (list, tuple)) or
            len(bounding_box_mode) != 4):
        raise ValueError("Unknown bounding_box_mode %s" % (
            str(bounding_box_mode)))

    n_rasters = len(base_raster_path_list)
    if ((raster_align_index is not None) and
            ((raster_align_index < 0) or
             (raster_align_index >= n_rasters))):
        raise ValueError(
            "Alignment index is out of bounds of the datasets index: %s"
            " n_elements %s" % (raster_align_index, n_rasters))

    _assert_is_valid_pixel_size(target_pixel_size)

    # used to get bounding box, projection, and possible alignment info
    raster_info_list = [
        get_raster_info(path) for path in base_raster_path_list]

    # get the literal or intersecting/unioned bounding box
    if isinstance(bounding_box_mode, (list, tuple)):
        # if it's a sequence or tuple, it must be a manual bounding box
        LOGGER.debug(
            "assuming manual bounding box mode of %s", bounding_box_mode)
        target_bounding_box = bounding_box_mode
    else:
        # either intersection or union, get list of bounding boxes,
        # reproject if necessary, and reduce to a single box
        if base_vector_path_list is not None:
            # vectors are only interesting for their bounding boxes;
            # that's why this construction is inside an else.
            vector_info_list = [
                get_vector_info(path) for path in base_vector_path_list]
        else:
            vector_info_list = []

        raster_bounding_box_list = []
        for raster_index, raster_info in enumerate(raster_info_list):
            # this block calculates the base projection of ``raster_info``
            # if ``target_projection_wkt`` is defined, thus implying a
            # reprojection will be necessary.
            if target_projection_wkt:
                if base_projection_wkt_list and \
                        base_projection_wkt_list[raster_index]:
                    # a base is defined, use that
                    base_raster_projection_wkt = \
                        base_projection_wkt_list[raster_index]
                else:
                    # otherwise use the raster's projection and there must
                    # be one since we're reprojecting
                    base_raster_projection_wkt = \
                        raster_info['projection_wkt']
                    if not base_raster_projection_wkt:
                        raise ValueError(
                            "no projection for raster %s" %
                            base_raster_path_list[raster_index])
                # since the base spatial reference is potentially
                # different than the target, we need to transform the base
                # bounding box into target coordinates so later we can
                # calculate accurate bounding box overlaps in the target
                # coordinate system
                raster_bounding_box_list.append(
                    transform_bounding_box(
                        raster_info['bounding_box'],
                        base_raster_projection_wkt,
                        target_projection_wkt))
            else:
                raster_bounding_box_list.append(
                    raster_info['bounding_box'])

        # include the vector bounding box information to make a global
        # list of target bounding boxes
        bounding_box_list = [
            vector_info['bounding_box']
            if target_projection_wkt is None else
            transform_bounding_box(
                vector_info['bounding_box'],
                vector_info['projection_wkt'], target_projection_wkt)
            for vector_info in vector_info_list] + raster_bounding_box_list

        target_bounding_box = merge_bounding_box_list(
            bounding_box_list, bounding_box_mode)

    if mask_options:
        # ensure the mask exists and intersects with the target bounding
        # box
        if 'mask_vector_path' not in mask_options:
            raise ValueError(
                'mask_options passed, but no value for '
                '"mask_vector_path": %s', mask_options)

        mask_vector_info = get_vector_info(
            mask_options['mask_vector_path'])

        if 'mask_vector_where_filter' in mask_options:
            # the bounding box only exists for the filtered features
            mask_vector = gdal.OpenEx(
                mask_options['mask_vector_path'], gdal.OF_VECTOR)
            mask_layer = mask_vector.GetLayer()
            mask_layer.SetAttributeFilter(
                mask_options['mask_vector_where_filter'])
            mask_bounding_box = merge_bounding_box_list(
                [[feature.GetGeometryRef().GetEnvelope()[i]
                  for i in [0, 2, 1, 3]] for feature in mask_layer],
                'union')
            mask_layer = None
            mask_vector = None
        else:
            # if no where filter then use the raw vector bounding box
            mask_bounding_box = mask_vector_info['bounding_box']

        mask_vector_projection_wkt = mask_vector_info['projection_wkt']
        if mask_vector_projection_wkt is not None and \
                target_projection_wkt is not None:
            mask_vector_bb = transform_bounding_box(
                mask_bounding_box, mask_vector_info['projection_wkt'],
                target_projection_wkt)
        else:
            mask_vector_bb = mask_vector_info['bounding_box']
        # Calling `merge_bounding_box_list` will raise a ValueError if the
        # bounding box of the mask and the target do not intersect. The
        # result is otherwise not used.
        _ = merge_bounding_box_list(
            [target_bounding_box, mask_vector_bb], 'intersection')

    if raster_align_index is not None and raster_align_index >= 0:
        # bounding box needs alignment
        align_bounding_box = (
            raster_info_list[raster_align_index]['bounding_box'])
        align_pixel_size = (
            raster_info_list[raster_align_index]['pixel_size'])
        # adjust bounding box so lower left corner aligns with a pixel in
        # raster[raster_align_index]
        for index in [0, 1]:
            n_pixels = int(
                (target_bounding_box[index] -
                 align_bounding_box[index]) /
                float(align_pixel_size[index]))
            target_bounding_box[index] = (
                n_pixels * align_pixel_size[index] +
                align_bounding_box[index])

    if mask_options:
        # Create a warped VRT.
        # This allows us to cheaply figure out the dimensions, projection,
        # etc. of the target raster without actually warping the entire
        # raster to a GTiff.
        temp_working_dir = tempfile.mkdtemp(dir=working_dir, prefix='mask-')
        warped_vrt = os.path.join(temp_working_dir, 'warped.vrt')
        warp_raster(
            base_raster_path=base_raster_path_list[0],
            target_pixel_size=target_pixel_size,
            target_raster_path=warped_vrt,
            resample_method='near',
            target_bb=target_bounding_box,
            raster_driver_creation_tuple=('VRT', []),
            target_projection_wkt=target_projection_wkt,
            base_projection_wkt=(
                None if not base_projection_wkt_list
                else base_projection_wkt_list[0]),
            gdal_warp_options=gdal_warp_options)

        # Convert the warped VRT to a GTiff for rasterization.
        if 'mask_raster_path' in mask_options:
            mask_raster_path = mask_options['mask_raster_path']
        else:
            # Add the mask raster path to the vector mask options to force
            # warp_raster to use the existing raster mask.
            mask_raster_path = os.path.join(temp_working_dir, 'mask.tif')
            mask_options['mask_raster_path'] = mask_raster_path
        new_raster_from_base(
            warped_vrt, mask_raster_path, gdal.GDT_Byte, [0], [0])

        # Rasterize the vector onto the new GTiff.
        rasterize(mask_options['mask_vector_path'], mask_raster_path,
                  burn_values=[1],
                  where_clause=(
                      mask_options['mask_vector_where_filter']
                      if 'mask_vector_where_filter' in mask_options
                      else None))

    for index, (base_path, target_path, resample_method) in enumerate(zip(
            base_raster_path_list, target_raster_path_list,
            resample_method_list)):
        warp_raster(
            base_path, target_pixel_size, target_path, resample_method,
            target_bb=target_bounding_box,
            raster_driver_creation_tuple=(raster_driver_creation_tuple),
            target_projection_wkt=target_projection_wkt,
            base_projection_wkt=(
                None if not base_projection_wkt_list
                else base_projection_wkt_list[index]),
            mask_options=mask_options,
            gdal_warp_options=gdal_warp_options)
        LOGGER.info(
            '%d of %d aligned: %s', index+1, n_rasters,
            os.path.basename(target_path))

    LOGGER.info("aligned all %d rasters.", n_rasters)

    if mask_options:
        shutil.rmtree(temp_working_dir, ignore_errors=True)
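# Example (illustrative sketch): aligning two hypothetical rasters to the
# intersection of their bounding boxes at a 30-unit pixel size, snapping
# the output grid to the first raster. Paths are hypothetical.
def _example_align_stack(dem_path, landcover_path, out_dir):
    align_and_resize_raster_stack(
        [dem_path, landcover_path],
        [os.path.join(out_dir, 'dem_aligned.tif'),
         os.path.join(out_dir, 'landcover_aligned.tif')],
        ['bilinear', 'near'],  # continuous vs. categorical data
        (30, -30), 'intersection',
        raster_align_index=0)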
def new_raster_from_base(
        base_path, target_path, datatype, band_nodata_list,
        fill_value_list=None, n_rows=None, n_cols=None,
        raster_driver_creation_tuple=DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS):
    """Create new raster by copying spatial reference/geotransform of base.

    A convenience function to simplify the creation of a new raster from
    the basis of an existing one. Depending on the input mode, one can
    create a new raster of the same dimensions, geotransform, and
    georeference as the base. Other options are provided to change the
    raster dimensions, number of bands, nodata values, data type, and core
    raster creation options.

    Args:
        base_path (string): path to existing raster.
        target_path (string): path to desired target raster.
        datatype: the pixel datatype of the output raster, for example
            gdal.GDT_Float32. See the following header file for supported
            pixel types:
            http://www.gdal.org/gdal_8h.html#22e22ce0a55036a96f652765793fb7a4
        band_nodata_list (sequence): list of nodata values, one for each
            band, to set on target raster. If value is 'None' the nodata
            value is not set for that band. The number of target bands is
            inferred from the length of this list.
        fill_value_list (sequence): list of values to fill each band with.
            If None, no filling is done.
        n_rows (int): if not None, defines the number of target raster
            rows.
        n_cols (int): if not None, defines the number of target raster
            columns.
        raster_driver_creation_tuple (tuple): a tuple containing a GDAL
            driver name string as the first element and a GDAL creation
            options tuple/list as the second. Defaults to a GTiff driver
            tuple defined at
            geoprocessing.DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS.

    Return:
        None
    """
    base_raster = gdal.OpenEx(base_path, gdal.OF_RASTER)
    if n_rows is None:
        n_rows = base_raster.RasterYSize
    if n_cols is None:
        n_cols = base_raster.RasterXSize
    driver = gdal.GetDriverByName(raster_driver_creation_tuple[0])

    local_raster_creation_options = list(raster_driver_creation_tuple[1])

    numpy_dtype = _gdal_to_numpy_type(
        datatype, local_raster_creation_options)

    base_band = base_raster.GetRasterBand(1)
    block_size = base_band.GetBlockSize()
    # It's not clear how or IF we can determine if the output should be
    # striped or tiled. Here we leave it up to the default inputs or if
    # it's obviously not striped we tile.
    if not any(
            ['TILED' in option
             for option in local_raster_creation_options]):
        # TILED not set, so lets try to set it to a reasonable value
        if block_size[0] != n_cols:
            # if x block is not the width of the raster it *must* be tiled
            # otherwise okay if it's striped or tiled, I can't construct a
            # test case to cover this, but there is nothing in the spec
            # that restricts this so I have it just in case.
            local_raster_creation_options.append('TILED=YES')

    if not any(
            ['BLOCK' in option
             for option in local_raster_creation_options]):
        # not defined, so lets copy what we know from the current raster
        local_raster_creation_options.extend([
            'BLOCKXSIZE=%d' % block_size[0],
            'BLOCKYSIZE=%d' % block_size[1]])

    # make target directory if it doesn't exist
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        pass

    base_band = None
    n_bands = len(band_nodata_list)
    target_raster = driver.Create(
        target_path, n_cols, n_rows, n_bands, datatype,
        options=local_raster_creation_options)
    target_raster.SetProjection(base_raster.GetProjection())
    target_raster.SetGeoTransform(base_raster.GetGeoTransform())
    base_raster = None

    for index, nodata_value in enumerate(band_nodata_list):
        if nodata_value is None:
            continue
        target_band = target_raster.GetRasterBand(index + 1)
        try:
            target_band.SetNoDataValue(nodata_value.item())
        except AttributeError:
            target_band.SetNoDataValue(nodata_value)

    target_raster.FlushCache()
    timed_logger = TimedLoggingAdapter(_LOGGING_PERIOD)
    pixels_processed = 0
    n_pixels = n_cols * n_rows

    if fill_value_list is not None:
        for index, fill_value in enumerate(fill_value_list):
            if fill_value is None:
                continue
            target_band = target_raster.GetRasterBand(index + 1)
            # some rasters are very large and a fill can appear to cause
            # computation to hang. This block, though possibly slightly
            # less efficient than ``band.Fill`` will give real-time
            # feedback about how the fill is progressing.
            for offsets in iterblocks((target_path, 1), offset_only=True):
                shape = (offsets['win_ysize'], offsets['win_xsize'])
                fill_array = numpy.full(
                    shape, fill_value, dtype=numpy_dtype)
                pixels_processed += (
                    offsets['win_ysize'] * offsets['win_xsize'])
                target_band.WriteArray(
                    fill_array, offsets['xoff'], offsets['yoff'])
                timed_logger.info(
                    f'filling new raster {target_path} with {fill_value} '
                    f'-- {float(pixels_processed)/n_pixels*100.0:.2f}% '
                    f'complete')
            target_band = None
    target_band = None
    target_raster = None
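# Example (illustrative sketch): creating a single-band Float32 raster
# with the same extent, projection, and blocksize as a hypothetical base
# raster, filled with its nodata value. The nodata value is hypothetical.
def _example_new_raster(base_path, target_path):
    nodata = -9999.0
    new_raster_from_base(
        base_path, target_path, gdal.GDT_Float32,
        band_nodata_list=[nodata], fill_value_list=[nodata])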
def create_raster_from_vector_extents(
        base_vector_path, target_raster_path, target_pixel_size,
        target_pixel_type, target_nodata, fill_value=None,
        raster_driver_creation_tuple=DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS):
    """Create a blank raster based on a vector file extent.

    Args:
        base_vector_path (string): path to vector shapefile to base the
            bounding box for the target raster.
        target_raster_path (string): path to location of generated
            geotiff; the upper left hand corner of this raster will be
            aligned with the bounding box of the source vector and the
            extent will be exactly equal to or contain the source vector's
            bounding box, depending on whether the pixel size divides
            evenly into the source bounding box; if not, coordinates will
            be rounded up to contain the original extent.
        target_pixel_size (list/tuple): the x/y pixel size as a sequence

            Example::

                [30.0, -30.0]

        target_pixel_type (int): gdal GDT pixel type of target raster
        target_nodata (numeric): target nodata value. Can be None if no
            nodata value is needed.
        fill_value (int/float): value to fill in the target raster; no
            fill if value is None
        raster_driver_creation_tuple (tuple): a tuple containing a GDAL
            driver name string as the first element and a GDAL creation
            options tuple/list as the second. Defaults to a GTiff driver
            tuple defined at
            geoprocessing.DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS.

    Return:
        None
    """
    if target_pixel_type not in _VALID_GDAL_TYPES:
        raise ValueError(
            f'Invalid target type, should be a gdal.GDT_* type, received '
            f'"{target_pixel_type}"')
    # Determine the width and height of the tiff in pixels based on the
    # maximum size of the combined envelope of all the features
    vector = gdal.OpenEx(base_vector_path, gdal.OF_VECTOR)
    shp_extent = None
    for layer_index in range(vector.GetLayerCount()):
        layer = vector.GetLayer(layer_index)
        for feature in layer:
            try:
                # envelope is [xmin, xmax, ymin, ymax]
                feature_extent = feature.GetGeometryRef().GetEnvelope()
                if shp_extent is None:
                    shp_extent = list(feature_extent)
                else:
                    # expand bounds of current bounding box to include
                    # that of the newest feature
                    shp_extent = [
                        f(shp_extent[index], feature_extent[index])
                        for index, f in enumerate([min, max, min, max])]
            except AttributeError as error:
                # For some valid OGR objects the geometry can be undefined
                # since it's valid to have a NULL entry in the attribute
                # table. This is expressed as a None value in the geometry
                # reference; this feature won't contribute.
                LOGGER.warning(error)
        layer = None
    target_srs_wkt = vector.GetLayer(0).GetSpatialRef().ExportToWkt()
    vector = None

    if shp_extent is None:
        raise ValueError(
            f'the vector at {base_vector_path} has no geometry, cannot '
            f'create a raster from these extents')

    create_raster_from_bounding_box(
        target_bounding_box=[
            shp_extent[0], shp_extent[2], shp_extent[1], shp_extent[3]],
        target_raster_path=target_raster_path,
        target_pixel_size=target_pixel_size,
        target_pixel_type=target_pixel_type,
        target_srs_wkt=target_srs_wkt,
        target_nodata=target_nodata,
        fill_value=fill_value,
        raster_driver_creation_tuple=raster_driver_creation_tuple)
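# Example (illustrative sketch): creating a 10-unit resolution byte raster
# covering a hypothetical watershed vector, filled with zeros.
def _example_raster_from_vector(watersheds_path, target_path):
    create_raster_from_vector_extents(
        watersheds_path, target_path,
        target_pixel_size=(10, -10),
        target_pixel_type=gdal.GDT_Byte,
        target_nodata=255, fill_value=0)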
def create_raster_from_bounding_box(
        target_bounding_box, target_raster_path, target_pixel_size,
        target_pixel_type, target_srs_wkt, target_nodata, fill_value=None,
        raster_driver_creation_tuple=DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS):
    """Create a raster from a given bounding box.

    Args:
        target_bounding_box (tuple): a 4-element iterable of
            (minx, miny, maxx, maxy) in projected units matching the SRS
            of ``target_srs_wkt``.
        target_raster_path (string): The path to where the new raster
            should be created on disk.
        target_pixel_size (tuple): A 2-element tuple of the (x, y) pixel
            size of the target raster. Elements are in units of the target
            SRS.
        target_pixel_type (int): The GDAL GDT_* type of the target raster.
        target_srs_wkt (string): The SRS of the target raster, in
            Well-Known Text format.
        target_nodata (float): The nodata value of the target raster, or
            ``None`` if no nodata value is to be set.
        fill_value=None (number): If provided, the value that the target
            raster should be filled with.
        raster_driver_creation_tuple (tuple): a tuple containing a GDAL
            driver name string as the first element and a GDAL creation
            options tuple/list as the second. Defaults to a GTiff driver
            tuple defined at
            geoprocessing.DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS.

    Returns:
        ``None``
    """
    if target_pixel_type not in _VALID_GDAL_TYPES:
        raise ValueError(
            f'Invalid target type, should be a gdal.GDT_* type, received '
            f'"{target_pixel_type}"')
    bbox_minx, bbox_miny, bbox_maxx, bbox_maxy = target_bounding_box

    driver = gdal.GetDriverByName(raster_driver_creation_tuple[0])
    n_bands = 1

    # determine the raster size that bounds the input bounding box and
    # then adjust the bounding box to be that size
    target_x_size = int(abs(
        float(bbox_maxx - bbox_minx) / target_pixel_size[0]))
    target_y_size = int(abs(
        float(bbox_maxy - bbox_miny) / target_pixel_size[1]))

    x_residual = (
        abs(target_x_size * target_pixel_size[0]) -
        (bbox_maxx - bbox_minx))
    if not numpy.isclose(x_residual, 0.0):
        target_x_size += 1
    y_residual = (
        abs(target_y_size * target_pixel_size[1]) -
        (bbox_maxy - bbox_miny))
    if not numpy.isclose(y_residual, 0.0):
        target_y_size += 1

    if target_x_size == 0:
        LOGGER.warning(
            "bounding_box is so small that x dimension rounds to 0; "
            "clamping to 1.")
        target_x_size = 1
    if target_y_size == 0:
        LOGGER.warning(
            "bounding_box is so small that y dimension rounds to 0; "
            "clamping to 1.")
        target_y_size = 1

    raster = driver.Create(
        target_raster_path, target_x_size, target_y_size, n_bands,
        target_pixel_type, options=raster_driver_creation_tuple[1])
    raster.SetProjection(target_srs_wkt)

    # Set the transform based on the upper left corner and given pixel
    # dimensions.
    x_source = bbox_maxx if target_pixel_size[0] < 0 else bbox_minx
    y_source = bbox_maxy if target_pixel_size[1] < 0 else bbox_miny
    raster_transform = [
        x_source, target_pixel_size[0], 0,
        y_source, 0, target_pixel_size[1]]
    raster.SetGeoTransform(raster_transform)

    # Fill the band if requested.
    band = raster.GetRasterBand(1)
    if fill_value is not None:
        band.Fill(fill_value)

    # Set the nodata value.
    if target_nodata is not None:
        band.SetNoDataValue(float(target_nodata))

    band = None
    raster = None
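# Example (illustrative sketch): creating a raster over a hypothetical UTM
# bounding box at 30-unit resolution. The x extent (990) divides evenly
# into 33 columns, but the y extent (991) does not, so the raster is
# expanded by one row to fully cover the box. The EPSG code is a
# hypothetical choice.
def _example_raster_from_bbox(target_path):
    srs = osr.SpatialReference()
    srs.ImportFromEPSG(32610)  # WGS 84 / UTM zone 10N
    create_raster_from_bounding_box(
        [443000, 4925000, 443990, 4925991],  # [minx, miny, maxx, maxy]
        target_path, (30, -30), gdal.GDT_Float32, srs.ExportToWkt(),
        target_nodata=-1)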
def interpolate_points(
        base_vector_path, vector_attribute_field,
        target_raster_path_band, interpolation_mode):
    """Interpolate point values onto an existing raster.

    Args:
        base_vector_path (string): path to a shapefile that contains point
            vector layers.
        vector_attribute_field (string): a string in the vector referenced
            at ``base_vector_path`` that refers to a numeric value in the
            vector's attribute table. This is the value that will be
            interpolated across the raster.
        target_raster_path_band (tuple): a path/band number tuple to an
            existing raster which likely intersects or is nearby the
            source vector. The band in this raster will take on the
            interpolated numerical values provided at each point.
        interpolation_mode (string): the interpolation method to use for
            scipy.interpolate.griddata, one of 'linear', 'near', or
            'cubic'.

    Return:
        None
    """
    source_vector = gdal.OpenEx(base_vector_path, gdal.OF_VECTOR)
    point_list = []
    value_list = []
    for layer_index in range(source_vector.GetLayerCount()):
        layer = source_vector.GetLayer(layer_index)
        for point_feature in layer:
            value = point_feature.GetField(vector_attribute_field)
            # Add in the numpy notation which is row, col
            # Here the point geometry is in the form x, y (col, row)
            geometry = point_feature.GetGeometryRef()
            point = geometry.GetPoint()
            point_list.append([point[1], point[0]])
            value_list.append(value)

    point_array = numpy.array(point_list)
    value_array = numpy.array(value_list)

    # getting the offsets first before the raster is opened in update mode
    offset_list = list(
        iterblocks(target_raster_path_band, offset_only=True))
    target_raster = gdal.OpenEx(
        target_raster_path_band[0], gdal.OF_RASTER | gdal.GA_Update)
    band = target_raster.GetRasterBand(target_raster_path_band[1])
    nodata = band.GetNoDataValue()
    geotransform = target_raster.GetGeoTransform()
    for offset in offset_list:
        grid_y, grid_x = numpy.mgrid[
            offset['yoff']:offset['yoff']+offset['win_ysize'],
            offset['xoff']:offset['xoff']+offset['win_xsize']]
        grid_y = grid_y * geotransform[5] + geotransform[3]
        grid_x = grid_x * geotransform[1] + geotransform[0]

        # this is to be consistent with GDAL 2.0's change of 'nearest' to
        # 'near' for an interpolation scheme that SciPy did not change.
        if interpolation_mode == 'near':
            interpolation_mode = 'nearest'
        raster_out_array = scipy.interpolate.griddata(
            point_array, value_array, (grid_y, grid_x),
            interpolation_mode, nodata)
        band.WriteArray(raster_out_array, offset['xoff'], offset['yoff'])
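# Example (illustrative sketch): interpolating a hypothetical 'depth'
# field from a point shapefile onto band 1 of an existing raster.
def _example_interpolate_depth(points_path, raster_path):
    interpolate_points(
        points_path, 'depth', (raster_path, 1),
        interpolation_mode='linear')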
def align_bbox(geotransform, bbox):
    """Pad a bounding box to align with the grid of a given geotransform.

    Ignores row and column rotation.

    Args:
        geotransform (list): GDAL raster geotransform to align to
        bbox (list): bounding box in the form [minx, miny, maxx, maxy]

    Returns:
        padded bounding box in the form [minx, miny, maxx, maxy]

    Raises:
        ValueError if invalid geotransform or bounding box are provided
    """
    # NOTE: x_origin and y_origin do not necessarily equal minx and miny.
    # If pixel_width is positive, x_origin = minx
    # If pixel_width is negative, x_origin = maxx
    # If pixel height is positive, y_origin = miny
    # If pixel height is negative, y_origin = maxy
    x_origin, pixel_width, r_x, y_origin, r_y, pixel_height = geotransform
    if r_x != 0 or r_y != 0:
        LOGGER.warning('Row and/or column rotation supplied to align_bbox '
                       'will be ignored')
    if pixel_width == 0 or pixel_height == 0:
        raise ValueError('Pixel width and height must not be 0')
    if bbox[2] < bbox[0] or bbox[3] < bbox[1]:
        raise ValueError('bbox must be in order [minX, minY, maxX, maxY]')
    if ((pixel_width > 0 and bbox[0] < x_origin) or
            (pixel_height > 0 and bbox[1] < y_origin) or
            (pixel_width < 0 and bbox[0] > x_origin) or
            (pixel_height < 0 and bbox[1] > y_origin)):
        raise ValueError('bbox must fall within geotransform grid')

    return [
        x_origin + abs(pixel_width) * math.floor(
            (bbox[0] - x_origin) / pixel_width * numpy.sign(pixel_width)),
        y_origin + abs(pixel_height) * math.floor(
            (bbox[1] - y_origin) / pixel_height *
            numpy.sign(pixel_height)),
        x_origin + abs(pixel_width) * math.ceil(
            (bbox[2] - x_origin) / pixel_width * numpy.sign(pixel_width)),
        y_origin + abs(pixel_height) * math.ceil(
            (bbox[3] - y_origin) / pixel_height *
            numpy.sign(pixel_height))]
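# Worked example (illustrative): ``align_bbox`` pads a box outward to the
# pixel grid. With origin (100, 200), pixel width 10 and pixel height -10
# (a north-up grid), the box [105, 147, 133, 181] floors/ceils to the
# enclosing grid-aligned box [100, 140, 140, 190].
def _example_align_bbox():
    assert align_bbox(
        [100, 10, 0, 200, 0, -10], [105, 147, 133, 181]) == \
        [100, 140, 140, 190]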
[docs]def zonal_statistics(
        base_raster_path_band, aggregate_vector_path,
        aggregate_layer_name=None, ignore_nodata=True,
        polygons_might_overlap=True, include_value_counts=False,
        working_dir=None):
    """Collect stats on pixel values which lie within polygons.

    This function summarizes raster statistics including min, max, mean, and
    pixel count over the regions on the raster that are overlapped by the
    polygons in the vector layer. Statistics are calculated in two passes,
    where first polygons aggregate over pixels in the raster whose centers
    intersect with the polygon. In the second pass, any polygons that are
    not aggregated use their bounding box to intersect with the raster for
    overlap statistics.

    Statistics are calculated on the set of pixels that fall within each
    feature polygon. If ``ignore_nodata`` is false, nodata pixels are
    considered valid when calculating the statistics:

        - 'min': minimum valid pixel value
        - 'max': maximum valid pixel value
        - 'sum': sum of valid pixel values
        - 'count': number of valid pixels
        - 'nodata_count': number of nodata pixels
        - 'value_counts': number of pixels having each unique value

    Note:
        There may be some degenerate cases where the bounding box vs. actual
        geometry intersection would be incorrect, but these are so unlikely
        that they would essentially have to be constructed by hand. If you
        encounter one of these please create an issue at
        https://github.com/natcap/pygeoprocessing/issues with the datasets
        used.

    Args:
        base_raster_path_band (tuple or list[tuple]): base raster
            (path, band) tuple(s) to analyze. If a list of multiple raster
            bands is provided, they must be aligned (all having the same
            bounding box, geotransform, and projection).
        aggregate_vector_path (string): a path to a polygon vector whose
            geometric features indicate the areas in
            ``base_raster_path_band`` to calculate zonal statistics.
        aggregate_layer_name (string): name of vector layer that will be
            used to aggregate results over. If set to None, the first layer
            in the DataSource will be used as retrieved by ``.GetLayer()``.
            Note: it is normal and expected to set this field at None if the
            aggregating vector dataset has a single layer as many do,
            including the common 'ESRI Shapefile'.
        ignore_nodata (boolean): if true, then nodata pixels are not
            accounted for when calculating min, max, count, or mean.
            However, the value of ``nodata_count`` will always be the
            number of nodata pixels aggregated under the polygon.
        polygons_might_overlap (boolean): if True the function calculates
            aggregation coverage close to optimally by rasterizing sets of
            polygons that don't overlap. However, this step can be
            computationally expensive for cases where there are many
            polygons. Setting this flag to ``False`` directs the function
            to rasterize in one step.
        include_value_counts (boolean): If True, the function tallies the
            number of pixels of each value under the polygon. This is
            useful for classified rasters but could exhaust available
            memory when run on a continuous (floating-point) raster.
            Defaults to False.
        working_dir (string): If not None, indicates where temporary files
            should be created during this run.

    Return:
        If ``base_raster_path_band`` is a tuple, the return value is a
        nested dictionary of stats for that raster band. Top-level keys are
        the aggregate feature FIDs. Each nested FID dictionary then
        contains statistics about that feature: 'min', 'max', 'sum',
        'count', 'nodata_count', and optionally 'value_counts'.
    Example::

        {
            0: {
                'min': 0,
                'max': 14,
                'sum': 42,
                'count': 8,
                'nodata_count': 1,
                'value_counts': {
                    2: 5,
                    4: 1,
                    14: 2
                }
            }
        }

    If ``base_raster_path_band`` is a list of tuples, the return value is a
    list of the nested dictionaries described above. Each dictionary in the
    list contains the stats calculated for the corresponding raster band in
    the ``base_raster_path_band`` list.

    Raises:
        ValueError if ``base_raster_path_band`` is incorrectly formatted, or
            if not all of the input raster bands are aligned with each other
        RuntimeError if the aggregate vector or layer cannot be opened

    """
    # Check that the raster path/band input is formatted correctly
    multi_raster_mode = isinstance(base_raster_path_band, list)
    if not multi_raster_mode:
        base_raster_path_band = [base_raster_path_band]
    for path_band_tuple in base_raster_path_band:
        if not _is_raster_path_band_formatted(path_band_tuple):
            raise ValueError(
                '`base_raster_path_band` not formatted as expected. '
                f'Expected (path, band_index), received '
                f'{repr(path_band_tuple)}')

    # Check that all input rasters are aligned. This should hold true if
    # they have the same projection, geotransform, and bounding box.
    for attr in ['geotransform', 'bounding_box', 'projection_wkt']:
        vals = set()
        for path, _ in base_raster_path_band:
            raster_info = get_raster_info(path)
            vals.add(str(raster_info[attr]))
        if len(vals) > 1:
            raise ValueError(
                'All input rasters must be aligned. Multiple values of '
                f'"{attr}" were found among the input rasters: {vals}')

    # Check that the aggregate vector and layer can be opened
    aggregate_vector = gdal.OpenEx(aggregate_vector_path, gdal.OF_VECTOR)
    if aggregate_vector is None:
        raise RuntimeError(
            f"Could not open aggregate vector at {aggregate_vector_path}")
    if aggregate_layer_name is None:
        aggregate_layer = aggregate_vector.GetLayer()
    else:
        aggregate_layer = aggregate_vector.GetLayerByName(
            aggregate_layer_name)
    if aggregate_layer is None:
        raise RuntimeError(
            f"Could not open layer {aggregate_layer_name} of "
            f"{aggregate_vector_path}")

    # Define the default/empty statistics values. These values will be
    # returned for features that have no geometry or don't overlap any
    # valid pixels.
    def default_aggregate_dict():
        default_aggregate_dict = {
            'min': None,
            'max': None,
            'count': 0,
            'nodata_count': 0,
            'sum': 0}
        if include_value_counts:
            default_aggregate_dict['value_counts'] = collections.Counter()
        return default_aggregate_dict

    # Create a copy of the aggregate vector with the FID copied into a
    # persistent attribute. Do this before reprojecting because converting
    # to GPKG can change the FIDs.
    # If the aggregate vector already contains a field called
    # 'original_fid', this will break and GDAL will emit a warning.
    temp_working_dir = tempfile.mkdtemp(dir=working_dir)
    copy_path = os.path.join(temp_working_dir, 'vector_copy.gpkg')
    target_vector = ogr.GetDriverByName('GPKG').CreateDataSource(copy_path)
    target_layer_id = 'disjoint_vector'
    target_layer = target_vector.CreateLayer(
        name=target_layer_id,
        srs=aggregate_layer.GetSpatialRef(),
        geom_type=ogr.wkbPolygon)
    fid_field_name = 'original_fid'
    target_layer.CreateField(ogr.FieldDefn(fid_field_name, ogr.OFTInteger))
    valid_fid_set = set()
    aggregate_stats_list = [{} for _ in base_raster_path_band]
    original_to_new_fid_map = {}
    for feature in aggregate_layer:
        fid = feature.GetFID()
        # Initialize the output data structure:
        # a list of zonal_stats dicts, one for each raster input
        for stats_dict in aggregate_stats_list:
            stats_dict[fid] = default_aggregate_dict()
        geom_ref = feature.GetGeometryRef()
        if geom_ref is None:
            LOGGER.warning(
                f'Skipping feature with FID {fid} because it has no '
                'geometry')
            continue
        valid_fid_set.add(fid)
        feature_copy = ogr.Feature(target_layer.GetLayerDefn())
        feature_copy.SetGeometry(geom_ref.Clone())
        feature_copy.SetField(fid_field_name, fid)
        target_layer.CreateFeature(feature_copy)
        original_to_new_fid_map[fid] = feature_copy.GetFID()
    target_layer, target_vector, feature, feature_copy = (
        None, None, None, None)
    geom_ref, aggregate_layer, aggregate_vector = None, None, None

    # Reproject the vector to match the raster projection
    target_vector_path = os.path.join(temp_working_dir, 'reprojected.gpkg')
    reproject_vector(
        base_vector_path=copy_path,
        target_projection_wkt=raster_info['projection_wkt'],
        target_path=target_vector_path,
        layer_id=target_layer_id,
        driver_name='GPKG',
        copy_fields=True)
    target_vector = gdal.OpenEx(target_vector_path, gdal.OF_VECTOR)

    try:
        bbox_intersection = merge_bounding_box_list(
            [raster_info['bounding_box'],
             get_vector_info(target_vector_path)['bounding_box']],
            'intersection')
    except ValueError as err:
        if 'Bounding boxes do not intersect' in repr(err):
            LOGGER.error(
                f'Aggregate vector {aggregate_vector_path} does not '
                'intersect the input raster(s)')
            if multi_raster_mode:
                return aggregate_stats_list
            else:
                return aggregate_stats_list[0]
        else:
            # this would be very unexpected to get here, but if it happened
            # and we didn't raise an exception, execution could get weird.
            raise

    # Expand the intersection bounding box to align with the nearest pixels
    # in the original raster
    aligned_bbox = align_bbox(
        raster_info['geotransform'], bbox_intersection)

    # Clip base rasters to their intersection with the aggregate vector
    LOGGER.info('Clipping rasters to their intersection with the vector')
    target_raster_path_band_list = []
    for i, (base_path, band) in enumerate(base_raster_path_band):
        raster_info = get_raster_info(base_path)
        if (raster_info['datatype'] in {gdal.GDT_Float32, gdal.GDT_Float64}
                and include_value_counts):
            LOGGER.warning(
                'Value counts requested on a floating-point raster: '
                f'{base_path}. This can cause excessive memory usage if '
                'the raster has continuous values.')
        target_path = os.path.join(temp_working_dir, f'{i}.tif')
        target_raster_path_band_list.append((target_path, band))
        gdal.Warp(
            destNameOrDestDS=target_path,
            srcDSOrSrcDSTab=base_path,
            format='GTIFF',
            # specify the original pixel size because warp doesn't
            # necessarily preserve it by default.
            # resolution should always be positive
            xRes=abs(raster_info['pixel_size'][0]),
            yRes=abs(raster_info['pixel_size'][1]),
            outputBounds=aligned_bbox,
            callback=_make_logger_callback("Warp %.1f%% complete %s"))

    # Calculate disjoint polygon sets
    if polygons_might_overlap:
        LOGGER.info('calculating disjoint polygon sets')
        # Only consider polygons that overlap the rasters.
        # Use the original vector to be sure that the correct FIDs are
        # returned.
        disjoint_fid_sets = calculate_disjoint_polygon_set(
            aggregate_vector_path, bounding_box=bbox_intersection)
    else:
        disjoint_fid_sets = [valid_fid_set]

    # Rasterize each disjoint polygon set onto its own raster layer
    fid_nodata = numpy.iinfo(numpy.uint16).max
    fid_raster_paths = []
    for i, disjoint_fid_set in enumerate(disjoint_fid_sets):
        fid_raster_path = os.path.join(temp_working_dir, f'fid_set_{i}.tif')
        fid_set_str = ", ".join(str(fid) for fid in disjoint_fid_set)
        gdal.Rasterize(
            destNameOrDestDS=fid_raster_path,
            srcDS=target_vector,
            allTouched=False,
            attribute=fid_field_name,
            noData=fid_nodata,
            outputBounds=aligned_bbox,
            xRes=abs(raster_info['pixel_size'][0]),  # resolution must be > 0
            yRes=abs(raster_info['pixel_size'][1]),
            format='GTIFF',
            outputType=gdal.GDT_UInt16,
            creationOptions=DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS[1],
            callback=_make_logger_callback(
                f'rasterizing disjoint polygon set {i + 1} of '
                f'{len(disjoint_fid_sets)} set %.1f%% complete (%s)'),
            where=(f'{fid_field_name} IN ({fid_set_str})'))
        fid_raster_paths.append(fid_raster_path)

    timed_logger = TimedLoggingAdapter(_LOGGING_PERIOD)

    # Calculate statistics for each raster and each feature,
    # working block-wise through the rasters
    for i, (raster_path, band) in enumerate(target_raster_path_band_list):
        LOGGER.info('calculating stats on raster '
                    f'{i} of {len(target_raster_path_band_list)}')
        # fetch the block offsets before the raster is opened for writing
        offset_list = list(
            iterblocks((raster_path, band), offset_only=True))
        nodata = get_raster_info(raster_path)['nodata'][band - 1]
        data_source = gdal.OpenEx(raster_path, gdal.OF_RASTER)
        data_band = data_source.GetRasterBand(band)
        found_fids = set()  # track FIDs found on at least one pixel
        for set_index, fid_raster_path in enumerate(fid_raster_paths):
            LOGGER.info(
                f'disjoint polygon set {set_index + 1} of '
                f'{len(fid_raster_paths)}')
            fid_raster = gdal.OpenEx(fid_raster_path, gdal.OF_RASTER)
            fid_band = fid_raster.GetRasterBand(1)
            for offset_index, offset in enumerate(offset_list):
                timed_logger.info(
                    "%.1f%% done calculating stats for polygon set %s on "
                    "raster %s",
                    offset_index / len(offset_list) * 100, set_index, i)
                fid_block = fid_band.ReadAsArray(**offset)
                data_block = data_band.ReadAsArray(**offset)
                # Update stats for each FID found in this block of data
                for fid in numpy.unique(fid_block):
                    if fid == fid_nodata:
                        continue
                    found_fids.add(fid)
                    # get the pixels that fall within the feature
                    feature_data = data_block[fid_block == fid]
                    nodata_mask = array_equals_nodata(feature_data, nodata)
                    aggregate_stats_list[i][fid]['nodata_count'] += (
                        nodata_mask.sum())
                    if ignore_nodata:
                        feature_data = feature_data[~nodata_mask]
                    if feature_data.size == 0:
                        continue
                    # compute stats
                    if aggregate_stats_list[i][fid]['min'] is None:
                        # initialize min/max to an arbitrary data value
                        aggregate_stats_list[i][fid]['min'] = feature_data[0]
                        aggregate_stats_list[i][fid]['max'] = feature_data[0]
                    aggregate_stats_list[i][fid]['min'] = min(
                        feature_data.min(),
                        aggregate_stats_list[i][fid]['min'])
                    aggregate_stats_list[i][fid]['max'] = max(
                        feature_data.max(),
                        aggregate_stats_list[i][fid]['max'])
                    aggregate_stats_list[i][fid]['sum'] += (
                        feature_data.sum())
                    aggregate_stats_list[i][fid]['count'] += (
                        feature_data.size)
                    if include_value_counts:
                        # .update() here is operating on a Counter, so
                        # values are ADDED, not replaced.
                        aggregate_stats_list[i][fid]['value_counts'].update(
                            dict(zip(*numpy.unique(
                                feature_data, return_counts=True))))
            fid_band, fid_raster = None, None

        # Handle edge cases: features that have a geometry but do not
        # overlap the center point of any pixel will not be captured by the
        # method above.
        unset_fids = valid_fid_set.difference(found_fids)
        x_origin, pixel_width, _, y_origin, _, pixel_height = (
            data_source.GetGeoTransform())
        # subtract 1 because bands are 1-indexed
        raster_nodata = get_raster_info(raster_path)['nodata'][band - 1]
        target_layer = target_vector.GetLayerByName(target_layer_id)
        for unset_fid in unset_fids:
            # Look up by the new FID;
            # FIDs in target_layer may not be the same as in the input layer
            unset_feat = target_layer.GetFeature(
                original_to_new_fid_map[unset_fid])
            unset_geom_ref = unset_feat.GetGeometryRef()
            geom_x_min, geom_x_max, geom_y_min, geom_y_max = (
                unset_geom_ref.GetEnvelope())
            unset_geom_ref, unset_feat = None, None
            if pixel_width < 0:
                geom_x_min, geom_x_max = geom_x_max, geom_x_min
            if pixel_height < 0:
                geom_y_min, geom_y_max = geom_y_max, geom_y_min
            xoff = int((geom_x_min - x_origin) / pixel_width)
            yoff = int((geom_y_min - y_origin) / pixel_height)
            win_xsize = int(numpy.ceil(
                (geom_x_max - x_origin) / pixel_width)) - xoff
            win_ysize = int(numpy.ceil(
                (geom_y_max - y_origin) / pixel_height)) - yoff

            # clamp offset to the side of the raster if it's negative
            if xoff < 0:
                win_xsize += xoff
                xoff = 0
            if yoff < 0:
                win_ysize += yoff
                yoff = 0

            # clamp the window to the side of the raster if too big
            if xoff + win_xsize > data_band.XSize:
                win_xsize = data_band.XSize - xoff
            if yoff + win_ysize > data_band.YSize:
                win_ysize = data_band.YSize - yoff

            if win_xsize <= 0 or win_ysize <= 0:
                continue

            # here we consider the pixels that intersect with the geometry's
            # bounding box as being the proxy for the intersection with the
            # polygon itself. This is not a bad approximation since the case
            # that caused the polygon to be skipped in the first phase is
            # that it is as small as a pixel. There could be some degenerate
            # cases that make this estimation very wrong, but we do not know
            # of any that would come from natural data. If you do encounter
            # such a dataset please email the description and dataset to
            # jdouglass@stanford.edu.
            unset_fid_block = data_band.ReadAsArray(
                xoff=xoff, yoff=yoff,
                win_xsize=win_xsize, win_ysize=win_ysize)
            unset_fid_nodata_mask = array_equals_nodata(
                unset_fid_block, raster_nodata)
            if ignore_nodata:
                unset_fid_block = unset_fid_block[~unset_fid_nodata_mask]
            if unset_fid_block.size == 0:
                aggregate_stats_list[i][unset_fid]['min'] = 0
                aggregate_stats_list[i][unset_fid]['max'] = 0
                aggregate_stats_list[i][unset_fid]['sum'] = 0
            else:
                aggregate_stats_list[i][unset_fid]['min'] = numpy.min(
                    unset_fid_block)
                aggregate_stats_list[i][unset_fid]['max'] = numpy.max(
                    unset_fid_block)
                aggregate_stats_list[i][unset_fid]['sum'] = numpy.sum(
                    unset_fid_block)
            aggregate_stats_list[i][unset_fid]['count'] = (
                unset_fid_block.size)
            aggregate_stats_list[i][unset_fid]['nodata_count'] = numpy.sum(
                unset_fid_nodata_mask)
            if include_value_counts:
                # .update() here is operating on a Counter, so values are
                # ADDED, not replaced.
                aggregate_stats_list[i][unset_fid]['value_counts'].update(
                    dict(zip(*numpy.unique(
                        unset_fid_block, return_counts=True))))

        # Convert counter object to dictionary
        if include_value_counts:
            for key, value in aggregate_stats_list[i].items():
                aggregate_stats_list[i][key]['value_counts'] = dict(
                    aggregate_stats_list[i][key]['value_counts'])

    LOGGER.info(
        'all done processing polygon sets for '
        f'{os.path.basename(aggregate_vector_path)}')

    # dereference gdal objects
    data_band, data_source = None, None
    target_layer, target_vector = None, None
    shutil.rmtree(temp_working_dir)
    if multi_raster_mode:
        return aggregate_stats_list
    else:
        return aggregate_stats_list[0]
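# Example usage (a hedged sketch; ``landcover.tif`` and ``watersheds.gpkg``
# are hypothetical inputs): collect per-feature statistics keyed by FID.
#
#     >>> stats = zonal_statistics(
#     ...     ('landcover.tif', 1), 'watersheds.gpkg',
#     ...     include_value_counts=True)
#     >>> stats[1]['count'], stats[1]['value_counts']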
[docs]def get_vector_info(vector_path, layer_id=0):
    """Get information about a GDAL vector.

    Args:
        vector_path (str): a path to a GDAL vector.
        layer_id (str/int): name or index of underlying layer to analyze.
            Defaults to 0.

    Raises:
        ValueError if ``vector_path`` does not exist on disk or cannot be
            opened as a gdal.OF_VECTOR.

    Return:
        vector_properties (dictionary): a dictionary with the following
            key-value pairs:

            * ``'projection_wkt'`` (string): projection of the vector in
              Well Known Text.
            * ``'bounding_box'`` (sequence): sequence of floats representing
              the bounding box in projected coordinates in the order
              [minx, miny, maxx, maxy].
            * ``'file_list'`` (sequence): sequence of string paths to the
              files that make up this vector.

    """
    vector = gdal.OpenEx(vector_path, gdal.OF_VECTOR)
    if not vector:
        raise ValueError(
            "Could not open %s as a gdal.OF_VECTOR" % vector_path)
    vector_properties = {}
    vector_properties['file_list'] = vector.GetFileList()
    layer = vector.GetLayer(iLayer=layer_id)
    # projection is same for all layers, so just use the first one
    spatial_ref = layer.GetSpatialRef()
    if spatial_ref:
        vector_projection_wkt = spatial_ref.ExportToWkt()
    else:
        vector_projection_wkt = None
    vector_properties['projection_wkt'] = vector_projection_wkt
    layer_bb = layer.GetExtent()
    layer = None
    vector = None
    # convert from [minx,maxx,miny,maxy] to [minx,miny,maxx,maxy]
    vector_properties['bounding_box'] = [layer_bb[i] for i in [0, 2, 1, 3]]
    return vector_properties
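# Example usage (a hedged sketch; ``roads.shp`` is a hypothetical input):
#
#     >>> info = get_vector_info('roads.shp')
#     >>> info['bounding_box']    # [minx, miny, maxx, maxy]
#     >>> info['projection_wkt']  # None if the vector has no SRS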
[docs]def get_raster_info(raster_path):
    """Get information about a GDAL raster (dataset).

    Args:
        raster_path (String): a path to a GDAL raster.

    Raises:
        ValueError if ``raster_path`` is not a file or cannot be opened as
            a ``gdal.OF_RASTER``.

    Return:
        raster_properties (dictionary): a dictionary with the properties
            stored under relevant keys.

            * ``'pixel_size'`` (tuple): (pixel x-size, pixel y-size)
              from geotransform.
            * ``'raster_size'`` (tuple): number of raster pixels in (x, y)
              direction.
            * ``'nodata'`` (sequence): a sequence of the nodata values in
              the bands of the raster in the same order as increasing band
              index.
            * ``'n_bands'`` (int): number of bands in the raster.
            * ``'geotransform'`` (tuple): a 6-tuple representing the
              geotransform of (x origin, x-increase, xy-increase, y origin,
              yx-increase, y-increase).
            * ``'datatype'`` (int): An instance of an enumerated gdal.GDT_*
              int that represents the datatype of the raster.
            * ``'projection_wkt'`` (string): projection of the raster in
              Well Known Text.
            * ``'bounding_box'`` (sequence): sequence of floats representing
              the bounding box in projected coordinates in the order
              [minx, miny, maxx, maxy]
            * ``'block_size'`` (tuple): underlying x/y raster block size for
              efficient reading.
            * ``'numpy_type'`` (numpy type): this is the equivalent numpy
              datatype for the raster bands including signed bytes.
            * ``'overviews'`` (sequence): A list of (x, y) tuples for the
              number of pixels in the width and height of each overview
              level of the raster.
            * ``'file_list'`` (sequence): A list of files that make up this
              raster.

    """
    raster = gdal.OpenEx(raster_path, gdal.OF_RASTER)
    if not raster:
        raise ValueError(
            "Could not open %s as a gdal.OF_RASTER" % raster_path)
    raster_properties = {}
    raster_properties['file_list'] = raster.GetFileList()
    projection_wkt = raster.GetProjection()
    if not projection_wkt:
        projection_wkt = None
    raster_properties['projection_wkt'] = projection_wkt
    geo_transform = raster.GetGeoTransform()
    raster_properties['geotransform'] = geo_transform
    raster_properties['pixel_size'] = (geo_transform[1], geo_transform[5])
    raster_properties['raster_size'] = (
        raster.GetRasterBand(1).XSize,
        raster.GetRasterBand(1).YSize)
    raster_properties['n_bands'] = raster.RasterCount
    raster_properties['nodata'] = [
        raster.GetRasterBand(index).GetNoDataValue() for index in range(
            1, raster_properties['n_bands']+1)]
    # GDAL creates overviews for the whole raster but has overviews accessed
    # per band. We assume that all bands have the same overviews.
    raster_properties['overviews'] = []
    for overview_index in range(raster.GetRasterBand(1).GetOverviewCount()):
        overview_band = raster.GetRasterBand(1).GetOverview(overview_index)
        raster_properties['overviews'].append((
            overview_band.XSize, overview_band.YSize))

    # blocksize is the same for all bands, so we can just get the first
    raster_properties['block_size'] = raster.GetRasterBand(1).GetBlockSize()

    # we don't really know how the geotransform is laid out, all we can do
    # is calculate the x and y bounds, then take the appropriate min/max
    x_bounds = [
        geo_transform[0],
        geo_transform[0] +
        raster_properties['raster_size'][0] * geo_transform[1] +
        raster_properties['raster_size'][1] * geo_transform[2]]
    y_bounds = [
        geo_transform[3],
        geo_transform[3] +
        raster_properties['raster_size'][0] * geo_transform[4] +
        raster_properties['raster_size'][1] * geo_transform[5]]

    raster_properties['bounding_box'] = [
        numpy.min(x_bounds), numpy.min(y_bounds),
        numpy.max(x_bounds), numpy.max(y_bounds)]

    # datatype is the same for the whole raster, but is associated with band
    band = raster.GetRasterBand(1)
    raster_properties['datatype'] = band.DataType
    raster_properties['numpy_type'] = _gdal_to_numpy_type(
        band.DataType, band.GetMetadata('IMAGE_STRUCTURE'))
    band = None
    raster = None
    return raster_properties
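# Example usage (a hedged sketch; ``dem.tif`` is a hypothetical input):
#
#     >>> info = get_raster_info('dem.tif')
#     >>> info['pixel_size']   # e.g. (30.0, -30.0)
#     >>> info['nodata'][0]    # nodata value of band 1
#     >>> info['raster_size']  # (n_cols, n_rows)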
[docs]def reproject_vector(
        base_vector_path, target_projection_wkt, target_path, layer_id=0,
        driver_name='ESRI Shapefile', copy_fields=True,
        target_layer_name=None,
        osr_axis_mapping_strategy=DEFAULT_OSR_AXIS_MAPPING_STRATEGY):
    """Reproject OGR DataSource (vector).

    Transforms the features of the base vector to the desired output
    projection in a new vector.

    Note:
        If the ESRI Shapefile driver is used, the ``target_layer_name``
        optional parameter is ignored. ESRI Shapefiles by definition use
        the filename to define the layer name.

    Args:
        base_vector_path (string): Path to the base shapefile to transform.
        target_projection_wkt (string): the desired output projection in
            Well Known Text (by layer.GetSpatialRef().ExportToWkt())
        target_path (string): the filepath to the transformed shapefile
        layer_id (str/int): name or index of layer in ``base_vector_path``
            to reproject. Defaults to 0.
        driver_name (string): String to pass to ogr.GetDriverByName,
            defaults to 'ESRI Shapefile'.
        copy_fields (bool or iterable): If True, all the fields in
            ``base_vector_path`` will be copied to ``target_path`` during
            the reprojection step. If it is an iterable, it will contain
            the field names to exclusively copy. An unmatched fieldname
            will be ignored. If ``False`` no fields are copied into the
            new vector.
        target_layer_name=None (str): The name to use for the target layer
            in the new vector. If ``None`` (the default), the layer name
            from the source layer will be used.
        osr_axis_mapping_strategy (int): OSR axis mapping strategy for
            ``SpatialReference`` objects. Defaults to
            ``geoprocessing.DEFAULT_OSR_AXIS_MAPPING_STRATEGY``. This
            parameter should not be changed unless you know what you are
            doing.

    Returns:
        None
    """
    base_vector = gdal.OpenEx(base_vector_path, gdal.OF_VECTOR)

    # if this file already exists, then remove it
    if os.path.isfile(target_path):
        LOGGER.warning(
            "%s already exists, removing and overwriting", target_path)
        os.remove(target_path)

    target_sr = osr.SpatialReference(target_projection_wkt)

    # create a new vector from the original datasource
    target_driver = ogr.GetDriverByName(driver_name)
    target_vector = target_driver.CreateDataSource(target_path)

    layer = base_vector.GetLayer(layer_id)
    layer_dfn = layer.GetLayerDefn()

    # Create new layer for target_vector using same name and
    # geometry type from base vector but new projection
    layer_name = layer_dfn.GetName()
    if target_layer_name is not None:
        layer_name = target_layer_name
        if driver_name == 'ESRI Shapefile':
            target_file_basename = os.path.splitext(
                os.path.basename(target_path))[0]
            if layer_name != target_file_basename:
                LOGGER.warning(
                    f'Ignoring user-defined layer name {layer_name}. '
                    'Defining a layer name is incompatible with the ESRI '
                    'Shapefile vector format. '
                    'Use the filename instead or '
                    'use a different vector format.')

    target_layer = target_vector.CreateLayer(
        layer_name, target_sr, layer_dfn.GetGeomType())

    # this will map the target field index to the base index it came from
    # in case we don't need to copy all the fields
    target_to_base_field_id_map = {}
    if copy_fields:
        # Get the number of fields in original_layer
        original_field_count = layer_dfn.GetFieldCount()

        # For every field that's copying, create a duplicate field in the
        # new layer
        for fld_index in range(original_field_count):
            original_field = layer_dfn.GetFieldDefn(fld_index)
            field_name = original_field.GetName()
            if copy_fields is True or field_name in copy_fields:
                target_field = ogr.FieldDefn(
                    field_name, original_field.GetType())
                target_layer.CreateField(target_field)
                target_to_base_field_id_map[
                    len(target_to_base_field_id_map)] = fld_index

    # Get the SR of the original_layer to use in transforming
    base_sr = layer.GetSpatialRef()
    base_sr.SetAxisMappingStrategy(osr_axis_mapping_strategy)
    target_sr.SetAxisMappingStrategy(osr_axis_mapping_strategy)

    # Create a coordinate transformation
    coord_trans = osr.CreateCoordinateTransformation(base_sr, target_sr)

    # Copy all of the features in layer to the new shapefile
    target_layer.StartTransaction()
    error_count = 0
    timed_logger = TimedLoggingAdapter(_LOGGING_PERIOD)
    LOGGER.info("starting reprojection")
    for feature_index, base_feature in enumerate(layer):
        timed_logger.info(
            "reprojection approximately %.1f%% complete on %s",
            100.0 * float(feature_index+1) / (layer.GetFeatureCount()),
            os.path.basename(target_path))

        geom = base_feature.GetGeometryRef()
        if geom is None:
            # we encountered this error occasionally when transforming
            # clipped global polygons. Not clear what is happening but
            # perhaps a feature was retained that otherwise wouldn't have
            # been included in the clip
            error_count += 1
            continue

        # Transform geometry into format desired for the new projection
        error_code = geom.Transform(coord_trans)
        if error_code != 0:  # error
            # this could be caused by an out of range transformation;
            # whatever the case, don't put the transformed poly into the
            # output set
            error_count += 1
            continue

        # Copy original_datasource's feature and set as new shapes feature
        target_feature = ogr.Feature(target_layer.GetLayerDefn())
        target_feature.SetGeometry(geom)

        # For all the fields in the feature set the field values from the
        # source field
        for target_index, base_index in (
                target_to_base_field_id_map.items()):
            target_feature.SetField(
                target_index, base_feature.GetField(base_index))

        target_layer.CreateFeature(target_feature)
        target_feature = None
        base_feature = None
    target_layer.CommitTransaction()

    LOGGER.info(
        "reprojection 100.0%% complete on %s",
        os.path.basename(target_path))
    if error_count > 0:
        LOGGER.warning(
            '%d features out of %d were unable to be transformed and are'
            ' not in the output vector at %s', error_count,
            layer.GetFeatureCount(), target_path)
    layer = None
    base_vector = None
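# Example usage (a hedged sketch; ``parcels.shp`` is a hypothetical input):
# reproject a vector to a new SRS given as WKT, copying only one field.
#
#     >>> from osgeo import osr
#     >>> srs = osr.SpatialReference()
#     >>> _ = srs.ImportFromEPSG(32611)  # WGS 84 / UTM zone 11N
#     >>> reproject_vector(
#     ...     'parcels.shp', srs.ExportToWkt(), 'parcels_utm.shp',
#     ...     copy_fields=['parcel_id'])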
[docs]def reclassify_raster(
        base_raster_path_band, value_map, target_raster_path,
        target_datatype, target_nodata, values_required=True,
        raster_driver_creation_tuple=DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS):
    """Reclassify pixel values in a raster.

    A function to reclassify values in a raster to any output type. By
    default the values except for nodata must be in ``value_map``. Include
    the base raster nodata value in ``value_map`` to reclassify nodata
    pixels to a valid value.

    Args:
        base_raster_path_band (tuple): a tuple including file path to a
            raster and the band index to operate over.
            ex: (path, band_index)
        value_map (dictionary): a dictionary of values of
            {source_value: dest_value, ...} where source_value's type is
            the same as the values in ``base_raster_path`` at band
            ``band_index``. Cannot be empty and must contain a mapping for
            all raster values if ``values_required=True``. If nodata is
            mapped, nodata will be reclassified, otherwise nodata will be
            set to ``target_nodata``.
        target_raster_path (string): target raster output path; overwritten
            if it exists
        target_datatype (gdal type): the numerical type for the target
            raster
        target_nodata (numerical type): the nodata value for the target
            raster. Must be the same type as target_datatype. All nodata
            pixels in the base raster will be reclassified to this value,
            unless the base raster nodata values are present in
            ``value_map``.
        values_required (bool): If True, raise a ValueError if there is a
            value in the raster that is not found in ``value_map``.
        raster_driver_creation_tuple (tuple): a tuple containing a GDAL
            driver name string as the first element and a GDAL creation
            options tuple/list as the second. Defaults to a GTiff driver
            tuple defined at
            geoprocessing.DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS.

    Return:
        None

    Raises:
        ReclassificationMissingValuesError if ``values_required`` is
            ``True`` and a pixel value from ``base_raster_path_band`` is
            not a key in ``value_map``.
    """
    if len(value_map) == 0:
        raise ValueError("value_map must contain at least one value")
    if not _is_raster_path_band_formatted(base_raster_path_band):
        raise ValueError(
            "Expected a (path, band_id) tuple, instead got '%s'" %
            base_raster_path_band)
    raster_info = get_raster_info(base_raster_path_band[0])
    nodata = raster_info['nodata'][base_raster_path_band[1]-1]

    # If nodata was included in the value_map pop it from our lists
    # and handle it separately. Doing this on the off chance nodata is
    # a max or min float which can cause floating point discrepancies.
    value_map_copy = value_map.copy()
    nodata_dest_value = target_nodata
    if nodata is not None:
        for key, val in value_map.items():
            if numpy.isclose(key, nodata):
                nodata_dest_value = val
                del value_map_copy[key]
                break

    if target_nodata is None and nodata_dest_value is None:
        raise ValueError(
            "target_nodata was set to None and the base raster nodata"
            " value was not represented in the value_map."
            " reclassify_raster does not assume the base raster nodata"
            " value should be used as the target_nodata value. "
            "Set target_nodata to a valid number and/or add a base raster"
            " nodata value mapping to value_map.")

    keys = sorted(numpy.array(list(value_map_copy.keys())))
    values = numpy.array([value_map_copy[x] for x in keys])

    numpy_dtype = _gdal_to_numpy_type(
        target_datatype, raster_driver_creation_tuple[1])

    def _map_dataset_to_value_op(original_values):
        """Convert a block of original values to the lookup values."""
        out_array = numpy.full(
            original_values.shape, target_nodata, dtype=numpy_dtype)
        if nodata is None:
            valid_mask = numpy.full(original_values.shape, True)
        else:
            valid_mask = ~numpy.isclose(original_values, nodata)
        out_array[~valid_mask] = nodata_dest_value
        if values_required:
            unique = numpy.unique(original_values[valid_mask])
            has_map = numpy.isin(unique, keys)
            if not all(has_map):
                missing_values = unique[~has_map]
                raise ReclassificationMissingValuesError(
                    missing_values, base_raster_path_band[0], value_map)
        index = numpy.digitize(
            original_values[valid_mask], keys, right=True)
        out_array[valid_mask] = values[index]
        return out_array

    raster_calculator(
        [base_raster_path_band], _map_dataset_to_value_op,
        target_raster_path, target_datatype, target_nodata,
        raster_driver_creation_tuple=raster_driver_creation_tuple)
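# Example usage (a hedged sketch; ``landcover.tif`` is a hypothetical
# input): map landcover codes 1, 2, 3 to coefficients in a float32 output.
#
#     >>> from osgeo import gdal
#     >>> reclassify_raster(
#     ...     ('landcover.tif', 1), {1: 0.5, 2: 0.9, 3: 0.1},
#     ...     'coefficients.tif', gdal.GDT_Float32, -1.0)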
[docs]def warp_raster(
        base_raster_path, target_pixel_size, target_raster_path,
        resample_method, target_bb=None, base_projection_wkt=None,
        target_projection_wkt=None, n_threads=None, mask_options=None,
        vector_mask_options=None, gdal_warp_options=None, working_dir=None,
        use_overview_level=-1,
        raster_driver_creation_tuple=DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS,
        osr_axis_mapping_strategy=DEFAULT_OSR_AXIS_MAPPING_STRATEGY):
    """Resize/resample raster to desired pixel size, bbox and projection.

    Args:
        base_raster_path (string): path to base raster.
        target_pixel_size (list/tuple): a two element sequence indicating
            the x and y pixel size in projected units.
        target_raster_path (string): the location of the resized and
            resampled raster.
        resample_method (string): the resampling algorithm. Must be a valid
            resampling algorithm for ``gdal.Warp``, one of:
            'rms | mode | sum | q1 | near | q3 | average | cubicspline |
            bilinear | max | med | min | cubic | lanczos'
        target_bb (sequence): if None, target bounding box is the same as
            the source bounding box. Otherwise it's a sequence of float
            describing target bounding box in target coordinate system as
            [minx, miny, maxx, maxy].
        base_projection_wkt (string): if not None, interpret the projection
            of ``base_raster_path`` as this.
        target_projection_wkt (string): if not None, desired target
            projection in Well Known Text format.
        n_threads (int): optional, if not None this sets the ``N_THREADS``
            option for ``gdal.Warp``.
        mask_options (dict or None): optional. If None, no masking will be
            done. If a dict, it is a dictionary of options relating to the
            dataset mask. Keys to this dictionary are:

            * ``'mask_vector_path'``: (str) path to the mask vector file.
              This vector will be automatically projected to the target
              projection if its base coordinate system does not match the
              target. Where there are geometries in this vector, pixels in
              ``base_raster_path`` will propagate to
              ``target_raster_path``.
            * ``'mask_layer_id'``: (int/str) the layer index or name to use
              for masking, if this key is not in the dictionary the default
              is to use the layer at index 0.
            * ``'mask_vector_where_filter'``: (str) an SQL ``WHERE`` string
              that can be used to filter the geometry in the mask. Ex:
              'id > 10' would use all features whose field value of 'id'
              is > 10.
            * ``'mask_raster_path'``: (str). If present in the dict, all
              other keys in ``mask_options`` are ignored. This string must
              be a path to a raster representing a validity mask, where
              pixel values of 1 indicate validity. This raster must be in
              the same projection and have the same dimensions as the
              target warped raster. The general (and easiest) use case for
              ``warp_raster`` is to use ``'mask_vector_path'`` instead.

        vector_mask_options=None (dict): Alias for ``mask_options``. This
            option is deprecated and will be removed in a future release of
            ``pygeoprocessing``.
        gdal_warp_options (sequence): if present, the contents of this list
            are passed to the ``warpOptions`` parameter of ``gdal.Warp``.
            See the GDAL Warp documentation for valid options.
        working_dir (string): if defined uses this directory to make
            temporary working files for calculation. Otherwise uses
            system's temp directory.
        use_overview_level=-1 (int/str): The overview level to use for
            warping. A value of ``-1`` (the default) indicates that the
            base raster should be used for the source pixels. A value of
            ``'AUTO'`` will make GDAL select the overview with the
            resolution that is closest to the target pixel size and warp
            using that overview's pixel values.
Any other integer indicates that that overview index should be used. For example, suppose the raster has overviews at levels 2, 4 and 8. To use level 2, set ``use_overview_level=0``. To use level 8, set ``use_overview_level=2``. raster_driver_creation_tuple (tuple): a tuple containing a GDAL driver name string as the first element and a GDAL creation options tuple/list as the second. Defaults to a GTiff driver tuple defined at geoprocessing.DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS. osr_axis_mapping_strategy (int): OSR axis mapping strategy for ``SpatialReference`` objects. Defaults to ``geoprocessing.DEFAULT_OSR_AXIS_MAPPING_STRATEGY``. This parameter should not be changed unless you know what you are doing. Return: None Raises: ValueError if ``pixel_size`` is not a 2 element sequence of numbers. ValueError if ``mask_options`` is not None but the ``mask_vector_path`` is undefined or doesn't point to a valid file. """ _assert_is_valid_pixel_size(target_pixel_size) base_raster_info = get_raster_info(base_raster_path) if target_projection_wkt is None: target_projection_wkt = base_raster_info['projection_wkt'] if vector_mask_options is not None: warnings.warn('The vector_mask_options parameter is deprecated and ' 'will be removed in a future release of ' 'pygeoprocessing. Please use mask_options instead.', DeprecationWarning) mask_options = vector_mask_options if target_bb is None: # ensure it's a sequence so we can modify it working_bb = list(get_raster_info(base_raster_path)['bounding_box']) # transform the working_bb if target_projection_wkt is not None if target_projection_wkt is not None: LOGGER.debug( "transforming bounding box from %s ", working_bb) working_bb = transform_bounding_box( base_raster_info['bounding_box'], base_raster_info['projection_wkt'], target_projection_wkt) LOGGER.debug( "transforming bounding to %s ", working_bb) else: # ensure it's a sequence so we can modify it working_bb = list(target_bb) # determine the raster size that bounds the input bounding box and then # adjust the bounding box to be that size target_x_size = int(abs( float(working_bb[2] - working_bb[0]) / target_pixel_size[0])) target_y_size = int(abs( float(working_bb[3] - working_bb[1]) / target_pixel_size[1])) # sometimes bounding boxes are numerically perfect, this checks for that x_residual = ( abs(target_x_size * target_pixel_size[0]) - (working_bb[2] - working_bb[0])) if not numpy.isclose(x_residual, 0.0): target_x_size += 1 y_residual = ( abs(target_y_size * target_pixel_size[1]) - (working_bb[3] - working_bb[1])) if not numpy.isclose(y_residual, 0.0): target_y_size += 1 if target_x_size == 0: LOGGER.warning( "bounding_box is so small that x dimension rounds to 0; " "clamping to 1.") target_x_size = 1 if target_y_size == 0: LOGGER.warning( "bounding_box is so small that y dimension rounds to 0; " "clamping to 1.") target_y_size = 1 # this ensures the bounding boxes perfectly fit a multiple of the target # pixel size working_bb[2] = working_bb[0] + abs(target_pixel_size[0] * target_x_size) working_bb[3] = working_bb[1] + abs(target_pixel_size[1] * target_y_size) reproject_callback = _make_logger_callback( "Warp %.1f%% complete %s") warp_options = [] if n_threads: warp_options.append('NUM_THREADS=%d' % n_threads) if gdal_warp_options: warp_options.extend(gdal_warp_options) mask_vector_path = None mask_layer_id = 0 mask_vector_where_filter = None if mask_options: if 'mask_raster_path' not in mask_options: # translate pygeoprocessing terminology into GDAL warp options. 
            if 'mask_vector_path' not in mask_options:
                raise ValueError(
                    'mask_options passed, but no value for '
                    f'"mask_vector_path": {mask_options}')
            mask_vector_path = mask_options['mask_vector_path']
            if not os.path.exists(mask_vector_path):
                raise ValueError(
                    f'The mask vector at {mask_vector_path} was not found.')
            if 'mask_layer_id' in mask_options:
                mask_layer_id = mask_options['mask_layer_id']
            if 'mask_vector_where_filter' in mask_options:
                mask_vector_where_filter = (
                    mask_options['mask_vector_where_filter'])

    if mask_options:
        temp_working_dir = tempfile.mkdtemp(dir=working_dir)
        warped_raster_path = os.path.join(
            temp_working_dir, os.path.basename(target_raster_path).replace(
                '.tif', '_nonmasked.tif'))
    else:
        # if there is no vector path the result is the warp
        warped_raster_path = target_raster_path

    base_raster = gdal.OpenEx(base_raster_path, gdal.OF_RASTER)
    raster_creation_options = list(raster_driver_creation_tuple[1])
    _, type_creation_options = _numpy_to_gdal_type(
        base_raster_info['numpy_type'])
    raster_creation_options += type_creation_options

    gdal.Warp(
        warped_raster_path, base_raster,
        format=raster_driver_creation_tuple[0],
        outputBounds=working_bb,
        xRes=abs(target_pixel_size[0]),
        yRes=abs(target_pixel_size[1]),
        resampleAlg=resample_method,
        outputBoundsSRS=target_projection_wkt,
        srcSRS=base_projection_wkt,
        dstSRS=target_projection_wkt,
        multithread=True if warp_options else False,
        warpOptions=warp_options,
        overviewLevel=use_overview_level,
        creationOptions=raster_creation_options,
        callback=reproject_callback,
        callback_data=[target_raster_path])

    if mask_options:
        if 'mask_raster_path' in mask_options:
            # If the user provided a mask raster, use that directly; assume
            # it's been rasterized correctly.
            source_raster_info = get_raster_info(warped_raster_path)
            source_nodata = source_raster_info['nodata'][0]

            def _mask_values(array, mask):
                output = numpy.full(array.shape, source_nodata)
                valid_mask = (
                    (mask == 1) &
                    ~array_equals_nodata(array, source_nodata))
                output[valid_mask] = array[valid_mask]
                return output

            raster_calculator(
                [(warped_raster_path, 1),
                 (mask_options['mask_raster_path'], 1)],
                _mask_values, target_raster_path,
                source_raster_info['datatype'], source_nodata)
        else:
            # If the user did not provide a mask in raster form, we can
            # just call down to ``mask_raster``, which will rasterize the
            # vector and then mask out pixels in ``warped_raster_path``
            # for us.
            updated_raster_driver_creation_tuple = (
                raster_driver_creation_tuple[0],
                tuple(raster_creation_options))
            mask_raster(
                (warped_raster_path, 1), mask_options['mask_vector_path'],
                target_raster_path, mask_layer_id=mask_layer_id,
                where_clause=mask_vector_where_filter,
                target_mask_value=None, working_dir=temp_working_dir,
                all_touched=False,
                raster_driver_creation_tuple=(
                    updated_raster_driver_creation_tuple))
        shutil.rmtree(temp_working_dir)
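# Example usage (a hedged sketch; ``dem.tif`` and ``aoi.gpkg`` are
# hypothetical inputs): resample to 90-unit pixels and mask to an
# area-of-interest vector.
#
#     >>> warp_raster(
#     ...     'dem.tif', (90.0, -90.0), 'dem_90m.tif', 'bilinear',
#     ...     mask_options={'mask_vector_path': 'aoi.gpkg'})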
[docs]def rasterize(
        vector_path, target_raster_path, burn_values=None, option_list=None,
        layer_id=0, where_clause=None):
    """Project a vector onto an existing raster.

    Burn the layer at ``layer_id`` in ``vector_path`` to an existing
    raster at ``target_raster_path_band``.

    Args:
        vector_path (string): filepath to vector to rasterize.
        target_raster_path (string): path to an existing raster to burn
            vector into. Can have multiple bands.
        burn_values (list/tuple): optional sequence of values to burn into
            each band of the raster. If used, should have the same length
            as number of bands at the ``target_raster_path`` raster. If
            ``None`` then ``option_list`` must have a valid value.
        option_list (list/tuple): optional a sequence of burn options, if
            None then a valid value for ``burn_values`` must exist.
            Otherwise, each element is a string of the form:

            * ``"ATTRIBUTE=?"``: Identifies an attribute field on the
              features to be used for a burn in value. The value will be
              burned into all output bands. If specified, ``burn_values``
              will not be used and can be None.
            * ``"CHUNKYSIZE=?"``: The height in lines of the chunk to
              operate on. The larger the chunk size the less times we need
              to make a pass through all the shapes. If it is not set or
              set to zero the default chunk size will be used. Default size
              will be estimated based on the GDAL cache buffer size using
              formula: ``cache_size_bytes/scanline_size_bytes``, so the
              chunk will not exceed the cache.
            * ``"ALL_TOUCHED=TRUE/FALSE"``: May be set to ``TRUE`` to set
              all pixels touched by the line or polygons, not just those
              whose center is within the polygon or that are selected by
              Bresenham's line algorithm. Defaults to ``FALSE``.
            * ``"BURN_VALUE_FROM"``: May be set to "Z" to use the Z values
              of the geometries. The value from burn_values or the
              attribute field value is added to this before burning. In
              default case dfBurnValue is burned as it is (richpsharp:
              note, I'm not sure what this means, but copied from formal
              docs). This is implemented properly only for points and
              lines for now. Polygons will be burned using the Z value
              from the first point.
            * ``"MERGE_ALG=REPLACE/ADD"``: REPLACE results in overwriting
              of value, while ADD adds the new value to the existing
              raster, suitable for heatmaps for instance.

            Example::

                ["ATTRIBUTE=npv", "ALL_TOUCHED=TRUE"]

        layer_id (str/int): name or index of the layer to rasterize.
            Defaults to 0.
        where_clause (str): If not None, is an SQL query-like string to
            filter which features are used to rasterize, (e.g.
            where="value=1").

    Return:
        None
    """
    gdal.PushErrorHandler('CPLQuietErrorHandler')
    raster = gdal.OpenEx(
        target_raster_path, gdal.GA_Update | gdal.OF_RASTER)
    gdal.PopErrorHandler()

    if raster is None:
        raise ValueError(
            "%s doesn't exist, but needed to rasterize." %
            target_raster_path)

    rasterize_callback = _make_logger_callback(
        "pygeoprocessing.rasterize RasterizeLayer %.1f%% complete %s")

    if burn_values is None:
        burn_values = []
    if option_list is None:
        option_list = []

    if not burn_values and not option_list:
        raise ValueError(
            "Neither `burn_values` nor `option_list` is set. At least "
            "one must have a value.")

    if not isinstance(burn_values, (list, tuple)):
        raise ValueError(
            "`burn_values` is not a list/tuple, the value passed is "
            f"'{repr(burn_values)}'")
    if not isinstance(option_list, (list, tuple)):
        raise ValueError(
            "`option_list` is not a list/tuple, the value passed is "
            f"'{repr(option_list)}'")

    vector = gdal.OpenEx(vector_path, gdal.OF_VECTOR)
    layer = vector.GetLayer(layer_id)
    if where_clause:
        layer.SetAttributeFilter(where_clause)

    try:
        result = gdal.RasterizeLayer(
            raster, [1], layer, burn_values=burn_values,
            options=option_list, callback=rasterize_callback)
    except Exception:
        # something bad happened, but still clean up
        # this case came out of a flaky test condition where the raster
        # would still be in use by the rasterize layer function
        LOGGER.exception('bad error on rasterizelayer')
        result = -1
    layer = None
    vector = None

    if result != 0:
        # need this __swig_destroy__ because we sometimes encounter a flaky
        # test where the path to the raster cannot be cleaned up because
        # it is still in use somewhere, likely a bug in gdal.RasterizeLayer
        # note it is only invoked if there is a serious error
        gdal.Dataset.__swig_destroy__(raster)
        raise RuntimeError('Rasterize returned a nonzero exit code.')
    raster = None
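# Example usage (a hedged sketch; ``roads.gpkg`` is a hypothetical input
# and ``mask.tif`` an existing raster): burn 1 into every pixel touched by
# a road feature.
#
#     >>> rasterize(
#     ...     'roads.gpkg', 'mask.tif', burn_values=[1],
#     ...     option_list=['ALL_TOUCHED=TRUE'])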
[docs]def calculate_disjoint_polygon_set( vector_path, layer_id=0, bounding_box=None, geometries_may_touch=False): """Create a sequence of sets of polygons that don't overlap. Determining the minimal number of those sets is an np-complete problem so this is an approximation that builds up sets of maximal subsets. Args: vector_path (string): a path to an OGR vector. layer_id (str/int): name or index of underlying layer in ``vector_path`` to calculate disjoint set. Defaults to 0. bounding_box (sequence): sequence of floats representing a bounding box to filter any polygons by. If a feature in ``vector_path`` does not intersect this bounding box it will not be considered in the disjoint calculation. Coordinates are in the order [minx, miny, maxx, maxy]. geometries_may_touch=False(bool): If ``True``, geometries in a subset are allowed to have touching boundaries, but are not allowed to have intersecting interiors. If ``False`` (the default), no geometries in a subset may intersect in any way. Return: subset_list (sequence): sequence of sets of FIDs from vector_path """ vector = gdal.OpenEx(vector_path, gdal.OF_VECTOR) vector_layer = vector.GetLayer(layer_id) feature_count = vector_layer.GetFeatureCount() if feature_count == 0: raise RuntimeError('Vector must have geometries but does not: %s' % vector_path) LOGGER.info("build shapely polygon list") timed_logger = TimedLoggingAdapter(_LOGGING_PERIOD) if bounding_box is None: bounding_box = get_vector_info(vector_path)['bounding_box'] bounding_box = shapely.prepared.prep(shapely.geometry.box(*bounding_box)) shapely_polygon_lookup = {} for poly_feat in vector_layer: poly_geom_ref = poly_feat.GetGeometryRef() if poly_geom_ref is None: LOGGER.warning( f'no geometry in {vector_path} FID: {poly_feat.GetFID()}, ' 'skipping...') continue if poly_geom_ref.IsEmpty(): LOGGER.warning( f'empty geometry in {vector_path} FID: {poly_feat.GetFID()}, ' 'skipping...') continue # with GDAL>=3.3.0 ExportToWkb returns a bytearray instead of bytes shapely_polygon_lookup[poly_feat.GetFID()] = ( shapely.wkb.loads(bytes(poly_geom_ref.ExportToWkb()))) poly_geom_ref = None poly_feat = None LOGGER.info("build shapely rtree index") r_tree_index_stream = [ (poly_fid, poly.bounds, None) for poly_fid, poly in shapely_polygon_lookup.items() if bounding_box.intersects(poly)] if r_tree_index_stream: poly_rtree_index = rtree.index.Index(r_tree_index_stream) else: LOGGER.warning("no polygons intersected the bounding box") return [] vector_layer = None vector = None LOGGER.info( 'poly feature lookup 100.0%% complete on %s', os.path.basename(vector_path)) LOGGER.info('build poly intersection lookup') poly_intersect_lookup = collections.defaultdict(set) for poly_index, (poly_fid, poly_geom) in enumerate( shapely_polygon_lookup.items()): timed_logger.info( "poly intersection lookup approximately %.1f%% complete " "on %s", 100.0 * float(poly_index+1) / len( shapely_polygon_lookup), os.path.basename(vector_path)) possible_intersection_set = list(poly_rtree_index.intersection( poly_geom.bounds)) # no reason to prep the polygon to intersect itself if len(possible_intersection_set) > 1: polygon = shapely.prepared.prep(poly_geom) else: polygon = poly_geom for intersect_poly_fid in possible_intersection_set: # If geometries touch (share 1+ boundary point), then do not count # it as an intersection. 
if geometries_may_touch and polygon.touches( shapely_polygon_lookup[intersect_poly_fid]): continue if intersect_poly_fid == poly_fid or polygon.intersects( shapely_polygon_lookup[intersect_poly_fid]): poly_intersect_lookup[poly_fid].add(intersect_poly_fid) polygon = None LOGGER.info( 'poly intersection feature lookup 100.0%% complete on %s', os.path.basename(vector_path)) # Build maximal subsets subset_list = [] while len(poly_intersect_lookup) > 0: # sort polygons by increasing number of intersections intersections_list = [ (len(poly_intersect_set), poly_fid, poly_intersect_set) for poly_fid, poly_intersect_set in poly_intersect_lookup.items()] intersections_list.sort() # build maximal subset maximal_set = set() for _, poly_fid, poly_intersect_set in intersections_list: timed_logger.info( "maximal subset build approximately %.1f%% complete " "on %s", 100.0 * float( feature_count - len(poly_intersect_lookup)) / feature_count, os.path.basename(vector_path)) if not poly_intersect_set.intersection(maximal_set): # no intersection, add poly_fid to the maximal set and remove # the polygon from the lookup maximal_set.add(poly_fid) del poly_intersect_lookup[poly_fid] # remove all the polygons from intersections once they're computed for poly_fid, poly_intersect_set in poly_intersect_lookup.items(): poly_intersect_lookup[poly_fid] = ( poly_intersect_set.difference(maximal_set)) subset_list.append(maximal_set) LOGGER.info( 'maximal subset build 100.0%% complete on %s', os.path.basename(vector_path)) return subset_list
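# Example usage (a hedged sketch; ``parcels.gpkg`` is a hypothetical
# input): split possibly-overlapping polygons into sets of FIDs that can
# each be rasterized without collisions.
#
#     >>> for fid_set in calculate_disjoint_polygon_set('parcels.gpkg'):
#     ...     pass  # process each non-overlapping set of feature FIDs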
[docs]def distance_transform_edt(
        base_region_raster_path_band, target_distance_raster_path,
        sampling_distance=(1., 1.), working_dir=None,
        raster_driver_creation_tuple=DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS):
    """Calculate the euclidean distance transform on base raster.

    Calculates the euclidean distance transform on the base raster in units
    of pixels multiplied by an optional scalar constant. The implementation
    is based on the algorithm described in: Meijster, Arnold, Jos BTM
    Roerdink, and Wim H. Hesselink. "A general algorithm for computing
    distance transforms in linear time." Mathematical Morphology and its
    applications to image and signal processing. Springer, Boston, MA,
    2002. 331-340.

    The base mask raster represents the area to distance transform from as
    any pixel that is not 0 or nodata. It is computationally convenient to
    calculate the distance transform on the entire raster irrespective of
    nodata placement and thus produces a raster that will have distance
    transform values even in pixels that are nodata in the base.

    Args:
        base_region_raster_path_band (tuple): a tuple including file path
            to a raster and the band index to define the base region
            pixels. Any pixel that is not 0 and not nodata is considered
            to be part of the region.
        target_distance_raster_path (string): path to the target raster
            that is the exact euclidean distance transform from any pixel
            in the base raster that is not nodata and not 0. The units are
            in ``(pixel distance * sampling_distance)``.
        sampling_distance (tuple/list): an optional parameter used to scale
            the pixel distances when calculating the distance transform.
            Defaults to (1.0, 1.0). First element indicates the distance
            traveled in the x direction when changing a column index, and
            the second element in y when changing a row index. Both values
            must be > 0.
        working_dir (string): If not None, indicates where temporary files
            should be created during this run.
        raster_driver_creation_tuple (tuple): a tuple containing a GDAL
            driver name string as the first element and a GDAL creation
            options tuple/list as the second. Defaults to a GTiff driver
            tuple defined at
            geoprocessing.DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS.

    Return:
        None
    """
    working_raster_paths = {}
    for raster_prefix in ['region_mask_raster', 'g_raster']:
        with tempfile.NamedTemporaryFile(
                prefix=raster_prefix, suffix='.tif', delete=False,
                dir=working_dir) as tmp_file:
            working_raster_paths[raster_prefix] = tmp_file.name
    nodata = (get_raster_info(base_region_raster_path_band[0])['nodata'])[
        base_region_raster_path_band[1]-1]
    nodata_out = 255

    def mask_op(base_array):
        """Convert base_array to 1 if not 0 and nodata, 0 otherwise."""
        if nodata is not None:
            return ~numpy.isclose(base_array, nodata) & (base_array != 0)
        else:
            return base_array != 0

    if not isinstance(sampling_distance, (tuple, list)):
        raise ValueError(
            "`sampling_distance` should be a tuple/list, instead it's %s" %
            type(sampling_distance))
    sample_d_x, sample_d_y = sampling_distance
    if sample_d_x <= 0. or sample_d_y <= 0.:
        raise ValueError(
            "Sample distances must be > 0.0, instead got %s" %
            (sampling_distance,))

    raster_calculator(
        [base_region_raster_path_band], mask_op,
        working_raster_paths['region_mask_raster'], gdal.GDT_Byte,
        nodata_out, calc_raster_stats=False,
        raster_driver_creation_tuple=raster_driver_creation_tuple)
    geoprocessing_core._distance_transform_edt(
        working_raster_paths['region_mask_raster'],
        working_raster_paths['g_raster'], sampling_distance[0],
        sampling_distance[1], target_distance_raster_path,
        raster_driver_creation_tuple)

    for path in working_raster_paths.values():
        try:
            os.remove(path)
        except OSError:
            LOGGER.warning("couldn't remove file %s", path)
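# Example usage (a hedged sketch; ``streams.tif`` is a hypothetical raster
# where stream pixels are nonzero): distance to the nearest stream in
# projected units, assuming 30-unit square pixels.
#
#     >>> distance_transform_edt(
#     ...     ('streams.tif', 1), 'dist_to_stream.tif',
#     ...     sampling_distance=(30.0, 30.0))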
def _next_regular(base): """Find the next regular number greater than or equal to base. Regular numbers are composites of the prime factors 2, 3, and 5. Also known as 5-smooth numbers or Hamming numbers, these are the optimal size for inputs to FFTPACK. This source was taken directly from scipy.signaltools and saves us from having to access a protected member in a library that could change in future releases: https://github.com/scipy/scipy/blob/v0.17.1/scipy/signal/signaltools.py#L211 Args: base (int): a positive integer to start to find the next Hamming number. Return: The next regular number greater than or equal to ``base``. """ if base <= 6: return base # Quickly check if it's already a power of 2 if not (base & (base-1)): return base match = float('inf') # Anything found will be smaller p5 = 1 while p5 < base: p35 = p5 while p35 < base: # Ceiling integer division, avoiding conversion to float # (quotient = ceil(base / p35)) quotient = -(-base // p35) # Quickly find next power of 2 >= quotient p2 = 2**((quotient - 1).bit_length()) N = p2 * p35 if N == base: return N elif N < match: match = N p35 *= 3 if p35 == base: return p35 if p35 < match: match = p35 p5 *= 5 if p5 == base: return p5 if p5 < match: match = p5 return match
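# Example (a hedged sketch): _next_regular rounds up to the nearest
# 5-smooth number, which keeps FFT input sizes fast. The outputs below
# were traced by hand from the algorithm above.
#
#     >>> [_next_regular(n) for n in (7, 13, 97, 1000)]
#     [8, 15, 100, 1000]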
[docs]def convolve_2d(
        signal_path_band, kernel_path_band, target_path,
        ignore_nodata_and_edges=False, mask_nodata=True,
        normalize_kernel=False, target_datatype=gdal.GDT_Float64,
        target_nodata=None, working_dir=None, set_tol_to_zero=1e-8,
        max_timeout=_MAX_TIMEOUT,
        raster_driver_creation_tuple=DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS):
    """Convolve 2D kernel over 2D signal.

    Convolves the raster in ``kernel_path_band`` over ``signal_path_band``.
    Nodata values are treated as 0.0 during the convolution and masked to
    nodata for the output result where ``signal_path`` has nodata.

    Note with default values, boundary effects can be seen in the result
    where the kernel would hang off the edge of the raster or in regions
    with nodata pixels. The function would treat these areas as values with
    "0.0" by default thus pulling the total convolution down in these
    areas. This is similar to setting ``mode='same'`` in Numpy's
    ``convolve`` function:
    https://numpy.org/doc/stable/reference/generated/numpy.convolve.html

    This boundary effect can be avoided by setting
    ``ignore_nodata_and_edges=True`` which normalizes the target result by
    dynamically accounting for the number of valid signal pixels the kernel
    overlapped during the convolution step.

    Args:
        signal_path_band (tuple): a 2 tuple of the form
            (filepath to signal raster, band index).
        kernel_path_band (tuple): a 2 tuple of the form
            (filepath to kernel raster, band index), all pixel values
            should be valid -- output is not well defined if the kernel
            raster has nodata values. To create a kernel raster, see the
            documentation and helper functions available in the
            :doc:`pygeoprocessing kernels module
            <pygeoprocessing.kernels>`.
        target_path (string): filepath to target raster that's the
            convolution of signal with kernel. Output will be a single band
            raster of same size and projection as ``signal_path_band``. Any
            nodata pixels that align with ``signal_path_band`` will be set
            to nodata.
        ignore_nodata_and_edges (boolean): If true, any pixels that are
            equal to ``signal_path_band``'s nodata value or signal pixels
            where the kernel extends beyond the edge of the raster are not
            included when averaging the convolution filter. This has the
            effect of "spreading" the result as though nodata and edges
            beyond the bounds of the raster are 0s. If set to false this
            tends to "pull" the signal away from nodata holes or raster
            edges. Set this value to ``True`` to avoid distorting signal
            values near edges for large integrating kernels.
            It can be useful to set this value to ``True`` to fill nodata
            holes through distance weighted averaging. In this case
            ``mask_nodata`` must be set to ``False`` so the result does not
            mask out these areas which are filled in. When using this
            technique be careful of cases where the kernel does not extend
            over any areas except nodata holes, in this case the resulting
            values in these areas will be nonsensical numbers, perhaps
            numerical infinity or NaNs.
        normalize_kernel (boolean): If true, the result is divided by the
            sum of the kernel.
        mask_nodata (boolean): If true, ``target_path`` raster's output is
            nodata where ``signal_path_band``'s pixels were nodata. Note
            that setting ``ignore_nodata_and_edges`` to ``True`` while
            setting ``mask_nodata`` to ``False`` can allow for a technique
            involving distance weighted averaging to define areas that
            would otherwise be nodata. Be careful in cases where the kernel
            does not extend over any valid non-nodata area since the result
            can be numerical infinity or NaNs.
target_datatype (GDAL type): a GDAL raster type to set the output raster type to, as well as the type to calculate the convolution in. Defaults to GDT_Float64. Note signed byte is not supported. target_nodata (int/float): nodata value to set on output raster. If ``target_datatype`` is not gdal.GDT_Float64, this value must be set. Otherwise defaults to the minimum value of a float32. raster_creation_options (sequence): an argument list that will be passed to the GTiff driver for creating ``target_path``. Useful for blocksizes, compression, and more. working_dir (string): If not None, indicates where temporary files should be created during this run. set_tol_to_zero (float): any value within +- this from 0.0 will get set to 0.0. This is to handle numerical roundoff errors that sometimes result in "numerical zero", such as -1.782e-18 that cannot be tolerated by users of this function. If `None` no adjustment will be done to output values. max_timeout (float): maximum amount of time to wait for worker thread to terminate. raster_driver_creation_tuple (tuple): a tuple containing a GDAL driver name string as the first element and a GDAL creation options tuple/list as the second. Defaults to a GTiff driver tuple defined at geoprocessing.DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS. Returns: ``None`` Raises: ValueError: if ``ignore_nodata_and_edges`` is ``True`` and ``mask_nodata`` is ``False``. ValueError: if ``signal_path_band`` or ``kernel_path_band`` is a row based blocksize which would result in slow runtimes due to gdal cache thrashing. """ if target_datatype is not gdal.GDT_Float64 and target_nodata is None: raise ValueError( "`target_datatype` is set, but `target_nodata` is None. " "`target_nodata` must be set if `target_datatype` is not " "`gdal.GDT_Float64`. `target_nodata` is set to None.") if target_nodata is None: target_nodata = float(numpy.finfo(numpy.float32).min) if ignore_nodata_and_edges and not mask_nodata: LOGGER.debug( 'ignore_nodata_and_edges is True while mask_nodata is False -- ' 'this can yield a nonsensical result in areas where the kernel ' 'touches only nodata values.') bad_raster_path_list = [] for raster_id, raster_path_band in [ ('signal', signal_path_band), ('kernel', kernel_path_band)]: if (not _is_raster_path_band_formatted(raster_path_band)): bad_raster_path_list.append((raster_id, raster_path_band)) if bad_raster_path_list: raise ValueError( "Expected raster path band sequences for the following arguments " f"but instead got: {bad_raster_path_list}") signal_raster_info = get_raster_info(signal_path_band[0]) kernel_raster_info = get_raster_info(kernel_path_band[0]) for info_dict, raster_path_band in zip( [signal_raster_info, kernel_raster_info], [signal_path_band, kernel_path_band]): if 1 in info_dict['block_size']: raise ValueError( f'{raster_path_band} has a row blocksize which can make this ' f'function run very slow, create a square blocksize using ' f'`warp_raster` or `align_and_resize_raster_stack` which ' f'creates square blocksizes by default') # The nodata value is reset to a different value at the end of this # function. 
Here 0 is chosen as a default value since data are # incrementally added to the raster new_raster_from_base( signal_path_band[0], target_path, target_datatype, [0], raster_driver_creation_tuple=raster_driver_creation_tuple) n_cols_signal, n_rows_signal = signal_raster_info['raster_size'] n_cols_kernel, n_rows_kernel = kernel_raster_info['raster_size'] s_path_band = signal_path_band k_path_band = kernel_path_band s_nodata = signal_raster_info['nodata'][0] # we need the original signal raster info because we want the output to # be clipped and NODATA masked to it signal_raster = gdal.OpenEx(signal_path_band[0], gdal.OF_RASTER) signal_band = signal_raster.GetRasterBand(signal_path_band[1]) # getting the offset list before it's opened for updating target_offset_list = list(iterblocks((target_path, 1), offset_only=True)) target_raster = gdal.OpenEx(target_path, gdal.OF_RASTER | gdal.GA_Update) target_band = target_raster.GetRasterBand(1) # if we're ignoring nodata, we need to make a parallel convolved signal # of the nodata mask if ignore_nodata_and_edges: raster_file, mask_raster_path = tempfile.mkstemp( suffix='.tif', prefix='convolved_mask', dir=os.path.dirname(target_path)) os.close(raster_file) new_raster_from_base( signal_path_band[0], mask_raster_path, gdal.GDT_Float64, [0.0], raster_driver_creation_tuple=raster_driver_creation_tuple) mask_raster = gdal.OpenEx( mask_raster_path, gdal.GA_Update | gdal.OF_RASTER) mask_band = mask_raster.GetRasterBand(1) LOGGER.info('starting convolve') timed_logger = TimedLoggingAdapter(_LOGGING_PERIOD) # calculate the kernel sum for normalization kernel_nodata = kernel_raster_info['nodata'][0] kernel_sum = 0.0 for _, kernel_block in iterblocks(kernel_path_band): if kernel_nodata is not None and ignore_nodata_and_edges: kernel_block[numpy.isclose(kernel_block, kernel_nodata)] = 0.0 kernel_sum += numpy.sum(kernel_block) # limit the size of the work queue since a large kernel / signal with small # block size can have a large memory impact when queuing offset lists. work_queue = queue.Queue(10) signal_offset_list = list(iterblocks(s_path_band, offset_only=True)) kernel_offset_list = list(iterblocks(k_path_band, offset_only=True)) n_blocks = len(signal_offset_list) * len(kernel_offset_list) LOGGER.debug('start fill work queue thread') def _fill_work_queue(): """Asynchronously fill the work queue.""" LOGGER.debug('fill work queue') for signal_offset in signal_offset_list: for kernel_offset in kernel_offset_list: work_queue.put((signal_offset, kernel_offset)) work_queue.put(None) LOGGER.debug('work queue full') fill_work_queue_worker = threading.Thread( target=_fill_work_queue) fill_work_queue_worker.daemon = True fill_work_queue_worker.start() # limit the size of the write queue so we don't accidentally load a whole # array into memory LOGGER.debug('start worker thread') write_queue = queue.Queue(10) worker = threading.Thread( target=_convolve_2d_worker, args=( signal_path_band, kernel_path_band, ignore_nodata_and_edges, normalize_kernel, set_tol_to_zero, work_queue, write_queue)) worker.daemon = True worker.start() n_blocks_processed = 0 LOGGER.info(f'{n_blocks} sent to workers, wait for worker results') while True: # the timeout guards against a worst case scenario where the # ``_convolve_2d_worker`` has crashed. 
write_payload = write_queue.get(timeout=_MAX_TIMEOUT) if write_payload: (index_dict, result, mask_result, left_index_raster, right_index_raster, top_index_raster, bottom_index_raster, left_index_result, right_index_result, top_index_result, bottom_index_result) = write_payload else: worker.join(max_timeout) break output_array = numpy.empty( (index_dict['win_ysize'], index_dict['win_xsize']), dtype=numpy.float32) # the inital data value in target_band is 0 because that is the # temporary nodata selected so that manual resetting of initial # data values weren't necessary. at the end of this function the # target nodata value is set to `target_nodata`. current_output = target_band.ReadAsArray(**index_dict) # read the signal block so we know where the nodata are potential_nodata_signal_array = signal_band.ReadAsArray(**index_dict) valid_mask = numpy.ones( potential_nodata_signal_array.shape, dtype=bool) # guard against a None nodata value if s_nodata is not None and mask_nodata: valid_mask[:] = ( ~numpy.isclose(potential_nodata_signal_array, s_nodata)) output_array[:] = target_nodata output_array[valid_mask] = ( (result[top_index_result:bottom_index_result, left_index_result:right_index_result])[valid_mask] + current_output[valid_mask]) target_band.WriteArray( output_array, xoff=index_dict['xoff'], yoff=index_dict['yoff']) if ignore_nodata_and_edges: # we'll need to save off the mask convolution so we can divide # it in total later current_mask = mask_band.ReadAsArray(**index_dict) output_array[valid_mask] = ( (mask_result[ top_index_result:bottom_index_result, left_index_result:right_index_result])[valid_mask] + current_mask[valid_mask]) mask_band.WriteArray( output_array, xoff=index_dict['xoff'], yoff=index_dict['yoff']) n_blocks_processed += 1 timed_logger.info( "convolution worker approximately %.1f%% complete on %s", 100.0 * float(n_blocks_processed) / (n_blocks), os.path.basename(target_path)) LOGGER.info( f"convolution worker 100.0% complete on " f"{os.path.basename(target_path)}") target_band.FlushCache() if ignore_nodata_and_edges: signal_nodata = get_raster_info(signal_path_band[0])['nodata'][ signal_path_band[1]-1] LOGGER.info( "need to normalize result so nodata values are not included") mask_pixels_processed = 0 mask_band.FlushCache() for target_offset_data in target_offset_list: target_block = target_band.ReadAsArray( **target_offset_data).astype(numpy.float64) signal_block = signal_band.ReadAsArray(**target_offset_data) mask_block = mask_band.ReadAsArray(**target_offset_data) if mask_nodata and signal_nodata is not None: valid_mask = ~numpy.isclose(signal_block, signal_nodata) else: valid_mask = numpy.ones(target_block.shape, dtype=bool) valid_mask &= (mask_block > 0) # divide the target_band by the mask_band target_block[valid_mask] /= mask_block[valid_mask].astype( numpy.float64) # scale by kernel sum if necessary since mask division will # automatically normalize kernel if not normalize_kernel: target_block[valid_mask] *= kernel_sum target_band.WriteArray( target_block, xoff=target_offset_data['xoff'], yoff=target_offset_data['yoff']) mask_pixels_processed += target_block.size timed_logger.info( f"""convolution nodata normalizer approximately { 100 * mask_pixels_processed / (n_cols_signal * n_rows_signal) :.1f}% complete on {os.path.basename(target_path)}""") mask_raster = None mask_band = None os.remove(mask_raster_path) LOGGER.info( f"convolution nodata normalize 100.0% complete on " f"{os.path.basename(target_path)}") # set the nodata value from 0 to a reasonable value for 
the result target_band.SetNoDataValue(target_nodata) target_band = None target_raster = None
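
# A hedged usage sketch for ``convolve_2d`` (not part of the original
# module). ``signal.tif``, ``kernel.tif``, and ``smoothed.tif`` are
# hypothetical paths; the kernel raster could be built with the helpers in
# ``pygeoprocessing.kernels``.
def _example_convolve_2d():
    """Sketch: smooth a raster, ignoring nodata holes and edge effects."""
    convolve_2d(
        ('signal.tif', 1), ('kernel.tif', 1), 'smoothed.tif',
        ignore_nodata_and_edges=True)
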
[docs]def iterblocks(
        raster_path_band, largest_block=_LARGEST_ITERBLOCK,
        offset_only=False):
    """Iterate across all the memory blocks in the input raster.

    Result is a generator of block location information and numpy arrays.

    This is especially useful when a single value needs to be derived from
    the pixel values in a raster, such as the sum total of all pixel values,
    or a sequence of unique raster values. In such cases,
    ``raster_calculator`` is overkill, since it writes out a raster.

    As a generator, this can be combined multiple times with ``zip()`` to
    iterate 'simultaneously' over multiple rasters, though the user should
    be careful to do so only with prealigned rasters.

    Args:
        raster_path_band (tuple): a path/band index tuple to indicate which
            raster band iterblocks should iterate over.
        largest_block (int): Attempts to iterate over raster blocks with
            this many elements. Useful in cases where the blocksize is
            relatively small, memory is available, and the function call
            overhead dominates the iteration. Defaults to
            ``_LARGEST_ITERBLOCK`` (2**16). Any value less than the
            original blocksize of the raster will result in blocksizes
            equal to the original size.
        offset_only (boolean): defaults to False, if True ``iterblocks``
            only returns offset dictionary and doesn't read any binary data
            from the raster. This can be useful when iterating over writing
            to an output.

    Yields:
        If ``offset_only`` is false, on each iteration a tuple containing a
        dict of block data and a 2-dimensional numpy array is yielded. The
        dict of block data has these attributes:

        * ``data['xoff']`` - The X offset of the upper-left-hand corner of
          the block.
        * ``data['yoff']`` - The Y offset of the upper-left-hand corner of
          the block.
        * ``data['win_xsize']`` - The width of the block.
        * ``data['win_ysize']`` - The height of the block.

        If ``offset_only`` is True, the function returns only the block
        offset data and does not attempt to read binary data from the
        raster.

    """
    if not _is_raster_path_band_formatted(raster_path_band):
        raise ValueError(
            "`raster_path_band` not formatted as expected. Expects "
            "(path, band_index), received %s" % repr(raster_path_band))
    raster = gdal.OpenEx(raster_path_band[0], gdal.OF_RASTER)
    if raster is None:
        raise ValueError(
            "Raster at %s could not be opened." % raster_path_band[0])
    band = raster.GetRasterBand(raster_path_band[1])
    block = band.GetBlockSize()
    cols_per_block = block[0]
    rows_per_block = block[1]

    n_cols = raster.RasterXSize
    n_rows = raster.RasterYSize

    block_area = cols_per_block * rows_per_block
    # try to make block wider
    if int(largest_block / block_area) > 0:
        width_factor = int(largest_block / block_area)
        cols_per_block *= width_factor
        if cols_per_block > n_cols:
            cols_per_block = n_cols
        block_area = cols_per_block * rows_per_block
    # try to make block taller
    if int(largest_block / block_area) > 0:
        height_factor = int(largest_block / block_area)
        rows_per_block *= height_factor
        if rows_per_block > n_rows:
            rows_per_block = n_rows

    n_col_blocks = int(math.ceil(n_cols / float(cols_per_block)))
    n_row_blocks = int(math.ceil(n_rows / float(rows_per_block)))

    for row_block_index in range(n_row_blocks):
        row_offset = row_block_index * rows_per_block
        row_block_width = n_rows - row_offset
        if row_block_width > rows_per_block:
            row_block_width = rows_per_block
        for col_block_index in range(n_col_blocks):
            col_offset = col_block_index * cols_per_block
            col_block_width = n_cols - col_offset
            if col_block_width > cols_per_block:
                col_block_width = cols_per_block

            offset_dict = {
                'xoff': col_offset,
                'yoff': row_offset,
                'win_xsize': col_block_width,
                'win_ysize': row_block_width,
            }
            if offset_only:
                yield offset_dict
            else:
                yield (offset_dict, band.ReadAsArray(**offset_dict))

    band = None
    raster = None

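
# An illustrative sketch (not part of the original module): sum all valid
# pixels of band 1 of a raster with ``iterblocks`` without loading the
# whole band into memory. ``raster_path`` is a hypothetical single-band
# raster.
def _example_block_sum(raster_path):
    """Sketch: block-wise sum of non-nodata pixels in band 1."""
    nodata = get_raster_info(raster_path)['nodata'][0]
    total = 0.0
    for _, block in iterblocks((raster_path, 1)):
        if nodata is not None:
            # drop nodata pixels before accumulating
            block = block[~numpy.isclose(block, nodata)]
        total += float(numpy.sum(block))
    return total
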
[docs]def transform_bounding_box(
        bounding_box, base_projection_wkt, target_projection_wkt,
        edge_samples=11,
        osr_axis_mapping_strategy=DEFAULT_OSR_AXIS_MAPPING_STRATEGY):
    """Transform input bounding box to output projection.

    This transform accounts for the fact that the reprojected square
    bounding box might be warped in the new coordinate system. To account
    for this, the function samples points along the original bounding box
    edges and attempts to make the largest bounding box around any
    transformed point on the edge whether corners or warped edges.

    Args:
        bounding_box (sequence): a sequence of 4 coordinates in the
            ``base_projection_wkt`` coordinate system describing the bound
            in the order [xmin, ymin, xmax, ymax].
        base_projection_wkt (string): the spatial reference of the input
            coordinate system in Well Known Text.
        target_projection_wkt (string): the spatial reference of the
            desired output coordinate system in Well Known Text.
        edge_samples (int): the number of interpolated points along each
            bounding box edge to sample along. A value of 2 will sample
            just the corners while a value of 3 will sample the corners and
            the edge midpoints.
        osr_axis_mapping_strategy (int): OSR axis mapping strategy for
            ``SpatialReference`` objects. Defaults to
            ``geoprocessing.DEFAULT_OSR_AXIS_MAPPING_STRATEGY``. This
            parameter should not be changed unless you know what you are
            doing.

    Return:
        A list of the form [xmin, ymin, xmax, ymax] that describes the
        largest fitting bounding box around the original warped bounding
        box in the ``target_projection_wkt`` coordinate system.

    Raises:
        ``ValueError`` if resulting transform yields non-finite coordinates.
        This would indicate an ill-posed transform region that the user
        should address.

    """
    base_ref = osr.SpatialReference()
    base_ref.ImportFromWkt(base_projection_wkt)

    target_ref = osr.SpatialReference()
    target_ref.ImportFromWkt(target_projection_wkt)

    base_ref.SetAxisMappingStrategy(osr_axis_mapping_strategy)
    target_ref.SetAxisMappingStrategy(osr_axis_mapping_strategy)

    # Create a coordinate transformation
    transformer = osr.CreateCoordinateTransformation(base_ref, target_ref)

    def _transform_point(point):
        """Transform an (x,y) point tuple from base_ref to target_ref."""
        trans_x, trans_y, _ = (transformer.TransformPoint(*point))
        return (trans_x, trans_y)

    # The following list comprehension iterates over each edge of the
    # bounding box, divides each edge into ``edge_samples`` number of
    # points, then reduces that list to an appropriate ``bounding_fn``
    # given the edge. For example the left edge needs to be the minimum x
    # coordinate so we generate ``edge_samples`` number of points between
    # the upper left and lower left point, transform them all to the new
    # coordinate system then get the minimum x coordinate
    # "min(p[0] ...)" of the batch.
    # points are numbered from 0 starting at the upper left as follows:
    # 0--3
    # |  |
    # 1--2
    p_0 = numpy.array((bounding_box[0], bounding_box[3]))
    p_1 = numpy.array((bounding_box[0], bounding_box[1]))
    p_2 = numpy.array((bounding_box[2], bounding_box[1]))
    p_3 = numpy.array((bounding_box[2], bounding_box[3]))
    raw_bounding_box = [
        bounding_fn(
            [_transform_point(
                p_a * v + p_b * (1 - v)) for v in numpy.linspace(
                    0, 1, edge_samples)])
        for p_a, p_b, bounding_fn in [
            (p_0, p_1, lambda p_list: min([p[0] for p in p_list])),
            (p_1, p_2, lambda p_list: min([p[1] for p in p_list])),
            (p_2, p_3, lambda p_list: max([p[0] for p in p_list])),
            (p_3, p_0, lambda p_list: max([p[1] for p in p_list]))]]

    # sometimes a transform will be so tight that a sampling around it may
    # flip the coordinate system. This flips it back. I found this when
    # transforming the bounding box of Gibraltar in a utm coordinate system
    # to lat/lng.
    minx, maxx = sorted([raw_bounding_box[0], raw_bounding_box[2]])
    miny, maxy = sorted([raw_bounding_box[1], raw_bounding_box[3]])
    transformed_bounding_box = [minx, miny, maxx, maxy]

    if not all(numpy.isfinite(numpy.array(transformed_bounding_box))):
        raise ValueError(
            f'Could not transform bounding box from base to target '
            f'projection. Some transformed coordinates are not finite: '
            f'{transformed_bounding_box}, base bounding box may not fit into '
            f'target coordinate projection system.\n'
            f'Original bounding box: {bounding_box}\n'
            f'Base projection: {base_projection_wkt}\n'
            f'Target projection: {target_projection_wkt}\n')
    return transformed_bounding_box

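
# A hedged usage sketch for ``transform_bounding_box`` (not part of the
# original module): reproject a WGS84 lat/lng box to web mercator. EPSG
# codes are illustrative, and the sketch assumes the module's default axis
# mapping strategy keeps (x, y) coordinate ordering.
def _example_transform_bounding_box():
    """Sketch: transform a lat/lng bounding box to EPSG:3857."""
    wgs84 = osr.SpatialReference()
    wgs84.ImportFromEPSG(4326)
    mercator = osr.SpatialReference()
    mercator.ImportFromEPSG(3857)
    return transform_bounding_box(
        [-1.0, 50.0, 1.0, 51.0], wgs84.ExportToWkt(),
        mercator.ExportToWkt())
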
[docs]def mask_raster(
        base_raster_path_band, mask_vector_path, target_mask_raster_path,
        mask_layer_id=0, target_mask_value=None, working_dir=None,
        all_touched=False, where_clause=None,
        raster_driver_creation_tuple=DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS):
    """Mask a raster band with a given vector.

    Args:
        base_raster_path_band (tuple): a (path, band number) tuple indicating
            the data to mask.
        mask_vector_path (str): path to a vector whose polygons define the
            mask; any pixel of ``base_raster_path_band`` outside those
            polygons is set to ``target_mask_value`` if defined, or else to
            ``base_raster_path_band``'s nodata value.
        target_mask_raster_path (str): path to desired target raster that
            is a copy of ``base_raster_path_band`` except any pixels that do
            not intersect with ``mask_vector_path`` are set to
            ``target_mask_value`` or ``base_raster_path_band``'s nodata value
            if ``target_mask_value`` is None.
        mask_layer_id (str/int): an index or name to identify the mask
            geometry layer in ``mask_vector_path``, default is 0.
        target_mask_value (numeric): If not None, this value is written to
            any pixel in ``base_raster_path_band`` that does not intersect
            with ``mask_vector_path``. Otherwise the nodata value of
            ``base_raster_path_band`` is used.
        working_dir (str): this is a path to a directory that can be used to
            hold temporary files required to complete this operation.
        all_touched (bool): if False, a pixel is only masked if its centroid
            intersects with the mask. If True a pixel is masked if any point
            of the pixel intersects the polygon mask.
        where_clause (str): (optional) if not None, it is an SQL compatible
            where clause that can be used to filter the features that are
            used to mask the base raster.
        raster_driver_creation_tuple (tuple): a tuple containing a GDAL
            driver name string as the first element and a GDAL creation
            options tuple/list as the second. Defaults to a GTiff driver
            tuple defined at
            geoprocessing.DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS.

    Return:
        None

    """
    with tempfile.NamedTemporaryFile(
            prefix='mask_raster', delete=False, suffix='.tif',
            dir=working_dir) as mask_raster_file:
        mask_raster_path = mask_raster_file.name

    new_raster_from_base(
        base_raster_path_band[0], mask_raster_path, gdal.GDT_Byte, [255],
        fill_value_list=[0],
        raster_driver_creation_tuple=raster_driver_creation_tuple)

    base_raster_info = get_raster_info(base_raster_path_band[0])

    rasterize(
        mask_vector_path, mask_raster_path, burn_values=[1],
        layer_id=mask_layer_id,
        option_list=[('ALL_TOUCHED=%s' % all_touched).upper()],
        where_clause=where_clause)

    base_nodata = base_raster_info['nodata'][base_raster_path_band[1]-1]

    if target_mask_value is None:
        mask_value = base_nodata
        if mask_value is None:
            LOGGER.warning(
                "No mask value was passed and the base raster's nodata "
                "value is undefined, defaulting to 0 as the target mask "
                "value.")
            mask_value = 0
    else:
        mask_value = target_mask_value

    def mask_op(base_array, mask_array):
        result = numpy.copy(base_array)
        result[mask_array == 0] = mask_value
        return result

    raster_calculator(
        [base_raster_path_band, (mask_raster_path, 1)], mask_op,
        target_mask_raster_path, base_raster_info['datatype'], base_nodata,
        raster_driver_creation_tuple=raster_driver_creation_tuple)

    os.remove(mask_raster_path)

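
# A hedged usage sketch for ``mask_raster`` (not part of the original
# module). Paths are hypothetical; pixels outside the ``aoi.gpkg`` polygons
# are written as the base raster's nodata value.
def _example_mask_raster():
    """Sketch: clip raster values to an area-of-interest polygon."""
    mask_raster(
        ('landcover.tif', 1), 'aoi.gpkg', 'landcover_masked.tif',
        all_touched=True)
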
def _gdal_to_numpy_type(gdal_type, metadata):
    """Calculate the equivalent numpy datatype from a GDAL type and metadata.

    Args:
        gdal_type: GDAL.GDT_* data type code
        metadata: mapping or list of strings to check for the existence of
            the 'PIXELTYPE=SIGNEDBYTE' flag

    Returns:
        numpy.dtype that is the equivalent of the input gdal type

    Raises:
        ValueError if an unsupported data type is entered

    """
    if (GDAL_VERSION < (3, 7, 0) and gdal_type == gdal.GDT_Byte and
            (('PIXELTYPE=SIGNEDBYTE' in metadata) or
             ('PIXELTYPE' in metadata and
              metadata['PIXELTYPE'] == 'SIGNEDBYTE'))):
        return numpy.int8

    numpy_type = gdal_array.GDALTypeCodeToNumericTypeCode(gdal_type)
    if numpy_type is None:
        raise ValueError(f"Unsupported DataType: {gdal_type}")
    return numpy_type


def _numpy_to_gdal_type(numpy_type):
    """Calculate the equivalent GDAL type and metadata from a numpy type.

    Args:
        numpy_type: numpy data type

    Returns:
        (gdal type, metadata) tuple. gdal type is a gdal.GDT_* type code.
        metadata is an empty list in most cases, or
        ['PIXELTYPE=SIGNEDBYTE'] if needed to indicate a signed byte type.

    Raises:
        ValueError if an unsupported data type is entered

    """
    numpy_dtype = numpy.dtype(numpy_type)
    if GDAL_VERSION < (3, 7, 0) and numpy_dtype == numpy.dtype(numpy.int8):
        return gdal.GDT_Byte, ['PIXELTYPE=SIGNEDBYTE']

    gdal_type = gdal_array.NumericTypeCodeToGDALTypeCode(numpy_dtype)
    if gdal_type is None:
        raise ValueError(f"Unsupported DataType: {numpy_type}")
    return gdal_type, []

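
# An illustrative round trip (not part of the original module): numpy
# float32 maps to gdal.GDT_Float32 and back. On GDAL < 3.7, numpy.int8
# instead maps to GDT_Byte plus the 'PIXELTYPE=SIGNEDBYTE' creation option.
def _example_type_round_trip():
    """Sketch: round-trip a numpy dtype through the GDAL type helpers."""
    gdal_type, type_metadata = _numpy_to_gdal_type(numpy.float32)
    assert gdal_type == gdal.GDT_Float32 and type_metadata == []
    assert _gdal_to_numpy_type(gdal_type, type_metadata) == numpy.float32
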
[docs]def merge_bounding_box_list(bounding_box_list, bounding_box_mode):
    """Create a single bounding box by union or intersection of the list.

    Args:
        bounding_box_list (sequence): a sequence of bounding box coordinates
            in the order [minx, miny, maxx, maxy].
        bounding_box_mode (string): either ``'union'`` or ``'intersection'``
            for the corresponding reduction mode.

    Return:
        A four tuple bounding box that is the union or intersection of the
        input bounding boxes.

    Raises:
        ValueError if the bounding boxes in ``bounding_box_list`` do not
            intersect when ``bounding_box_mode`` is 'intersection'.

    """
    def _merge_bounding_boxes(bb1, bb2, mode):
        """Merge two bounding boxes through union or intersection.

        Args:
            bb1, bb2 (sequence): sequence of float representing bounding box
                in the form bb=[minx,miny,maxx,maxy]
            mode (string): one of 'union' or 'intersection'

        Return:
            Reduced bounding box of bb1/bb2 depending on mode.

        """
        def _less_than_or_equal(x_val, y_val):
            return x_val if x_val <= y_val else y_val

        def _greater_than(x_val, y_val):
            return x_val if x_val > y_val else y_val

        if mode == "union":
            comparison_ops = [
                _less_than_or_equal, _less_than_or_equal,
                _greater_than, _greater_than]
        if mode == "intersection":
            comparison_ops = [
                _greater_than, _greater_than,
                _less_than_or_equal, _less_than_or_equal]
        bb_out = [op(x, y) for op, x, y in zip(comparison_ops, bb1, bb2)]
        return bb_out

    result_bb = functools.reduce(
        functools.partial(_merge_bounding_boxes, mode=bounding_box_mode),
        bounding_box_list)
    if result_bb[0] > result_bb[2] or result_bb[1] > result_bb[3]:
        raise ValueError(
            "Bounding boxes do not intersect. Base list: %s mode: %s "
            " result: %s" % (bounding_box_list, bounding_box_mode, result_bb))
    return result_bb

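
# A worked example for ``merge_bounding_box_list`` (not part of the
# original module): the union spans both boxes while the intersection
# keeps only the overlap.
def _example_merge_bounding_boxes():
    """Sketch: union and intersection of two overlapping boxes."""
    boxes = [[0, 0, 2, 2], [1, 1, 3, 3]]
    assert merge_bounding_box_list(boxes, 'union') == [0, 0, 3, 3]
    assert merge_bounding_box_list(boxes, 'intersection') == [1, 1, 2, 2]
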
[docs]def get_gis_type(path):
    """Calculate the GIS type of the file located at ``path``.

    Args:
        path (str): path to a file on disk or network.

    Raises:
        ValueError if ``path`` is not a file or cannot be opened as a
        ``gdal.OF_RASTER`` or ``gdal.OF_VECTOR``.

    Return:
        A bitwise OR of all GIS types that PyGeoprocessing models, currently
        this is ``pygeoprocessing.RASTER_TYPE``, or
        ``pygeoprocessing.VECTOR_TYPE``.

    """
    from pygeoprocessing import UNKNOWN_TYPE
    gis_type = UNKNOWN_TYPE
    gis_raster = gdal.OpenEx(path, gdal.OF_RASTER)
    if gis_raster is not None:
        from pygeoprocessing import RASTER_TYPE
        gis_type |= RASTER_TYPE
        gis_raster = None
    gis_vector = gdal.OpenEx(path, gdal.OF_VECTOR)
    if gis_vector is not None:
        from pygeoprocessing import VECTOR_TYPE
        gis_type |= VECTOR_TYPE
        gis_vector = None
    if gis_type == UNKNOWN_TYPE:
        raise ValueError(
            f"Could not open {path} as a gdal.OF_RASTER or gdal.OF_VECTOR.")
    return gis_type

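
# A hedged usage sketch for ``get_gis_type`` (not part of the original
# module). Because the return value is a bitmask, test membership with a
# bitwise AND rather than equality if a file could be both types.
def _example_is_raster(path):
    """Sketch: report whether ``path`` is openable as a raster."""
    from pygeoprocessing import RASTER_TYPE
    return bool(get_gis_type(path) & RASTER_TYPE)
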
def _make_logger_callback(message):
    """Build a timed logger callback that prints ``message`` replaced.

    Args:
        message (string): a string that expects two ``%``-style replacement
            variables, the first for the % complete from ``df_complete``,
            the second from ``p_progress_arg[0]``.

    Return:
        Function with signature:
            logger_callback(df_complete, psz_message, p_progress_arg)

    """
    def logger_callback(df_complete, _, p_progress_arg):
        """Argument names come from the GDAL API for callbacks."""
        try:
            current_time = time.time()
            if ((current_time - logger_callback.last_time) > 5.0 or
                    (df_complete == 1.0 and
                     logger_callback.total_time >= 5.0)):
                # In some multiprocess applications I was encountering a
                # ``p_progress_arg`` of None. This is unexpected and I
                # suspect was an issue for some kind of GDAL race condition.
                # So I'm guarding against it here and reporting an
                # appropriate log if it occurs.
                if p_progress_arg:
                    LOGGER.info(message, df_complete * 100, p_progress_arg[0])
                else:
                    LOGGER.info(message, df_complete * 100, '')
                # accumulate the elapsed time since the last logged update
                # so the 100% message is only logged for operations that ran
                # for a nontrivial total amount of time
                logger_callback.total_time += (
                    current_time - logger_callback.last_time)
                logger_callback.last_time = current_time
        except AttributeError:
            logger_callback.last_time = time.time()
            logger_callback.total_time = 0.0
        except Exception:
            LOGGER.exception(
                "Unhandled error occurred while logging "
                "progress. df_complete: %s, p_progress_arg: %s",
                df_complete, p_progress_arg)

    return logger_callback


def _is_raster_path_band_formatted(raster_path_band):
    """Return true if raster path band is a (str, int) tuple/list."""
    if not isinstance(raster_path_band, (list, tuple)):
        return False
    elif len(raster_path_band) != 2:
        return False
    elif not isinstance(raster_path_band[0], str):
        return False
    elif not isinstance(raster_path_band[1], int):
        return False
    else:
        return True


def _convolve_2d_worker(
        signal_path_band, kernel_path_band,
        ignore_nodata, normalize_kernel, set_tol_to_zero,
        work_queue, write_queue):
    """Worker function to be used by ``convolve_2d``.

    Args:
        signal_path_band (tuple): a 2 tuple of the form
            (filepath to signal raster, band index).
        kernel_path_band (tuple): a 2 tuple of the form
            (filepath to kernel raster, band index).
        ignore_nodata (boolean): If true, any pixels that are equal to
            ``signal_path_band``'s nodata value are not included when
            averaging the convolution filter.
        normalize_kernel (boolean): If true, the result is divided by the
            sum of the kernel.
        set_tol_to_zero (float): any result value within +- this of 0.0 is
            set to 0.0; if None, no adjustment is made.
        work_queue (Queue): will contain (signal_offset, kernel_offset)
            tuples that can be used to read raster blocks directly using
            GDAL ReadAsArray(**offset). Indicates the block to operate on.
        write_queue (Queue): mechanism to pass result back to the writer,
            contains a (index_dict, result, mask_result,
            left_index_raster, right_index_raster,
            top_index_raster, bottom_index_raster,
            left_index_result, right_index_result,
            top_index_result, bottom_index_result) tuple that's used for
            writing and masking.

    Return:
        None

    """
    signal_raster = gdal.OpenEx(signal_path_band[0], gdal.OF_RASTER)
    kernel_raster = gdal.OpenEx(kernel_path_band[0], gdal.OF_RASTER)
    signal_band = signal_raster.GetRasterBand(signal_path_band[1])
    kernel_band = kernel_raster.GetRasterBand(kernel_path_band[1])

    signal_raster_info = get_raster_info(signal_path_band[0])
    kernel_raster_info = get_raster_info(kernel_path_band[0])

    n_cols_signal, n_rows_signal = signal_raster_info['raster_size']
    n_cols_kernel, n_rows_kernel = kernel_raster_info['raster_size']
    signal_nodata = signal_raster_info['nodata'][0]
    kernel_nodata = kernel_raster_info['nodata'][0]

    mask_result = None  # in case no mask is needed, variable is still defined

    # calculate the kernel sum for normalization
    kernel_sum = 0.0
    for _, kernel_block in iterblocks(kernel_path_band):
        if kernel_nodata is not None and ignore_nodata:
            kernel_block[numpy.isclose(kernel_block, kernel_nodata)] = 0.0
        kernel_sum += numpy.sum(kernel_block)

    while True:
        payload = work_queue.get()
        if payload is None:
            break
        signal_offset, kernel_offset = payload

        # ensure signal and kernel are internally float64 precision
        # irrespective of their base type
        signal_block = signal_band.ReadAsArray(**signal_offset).astype(
            numpy.float64)
        kernel_block = kernel_band.ReadAsArray(**kernel_offset).astype(
            numpy.float64)

        # don't ever convolve the nodata value
        if signal_nodata is not None:
            signal_nodata_mask = numpy.isclose(signal_block, signal_nodata)
            signal_block[signal_nodata_mask] = 0.0
            if not ignore_nodata:
                signal_nodata_mask[:] = 0
        else:
            signal_nodata_mask = numpy.zeros(
                signal_block.shape, dtype=bool)

        left_index_raster = (
            signal_offset['xoff'] - n_cols_kernel // 2 +
            kernel_offset['xoff'])
        right_index_raster = (
            signal_offset['xoff'] - n_cols_kernel // 2 +
            kernel_offset['xoff'] + signal_offset['win_xsize'] +
            kernel_offset['win_xsize'] - 1)
        top_index_raster = (
            signal_offset['yoff'] - n_rows_kernel // 2 +
            kernel_offset['yoff'])
        bottom_index_raster = (
            signal_offset['yoff'] - n_rows_kernel // 2 +
            kernel_offset['yoff'] + signal_offset['win_ysize'] +
            kernel_offset['win_ysize'] - 1)

        # it's possible that the piece of the integrating kernel
        # doesn't affect the final result, if so we should skip
        if (right_index_raster < 0 or
                bottom_index_raster < 0 or
                left_index_raster > n_cols_signal or
                top_index_raster > n_rows_signal):
            continue

        if kernel_nodata is not None:
            kernel_block[numpy.isclose(kernel_block, kernel_nodata)] = 0.0

        if normalize_kernel:
            kernel_block /= kernel_sum

        # determine the output convolve shape
        shape = (
            numpy.array(signal_block.shape) +
            numpy.array(kernel_block.shape) - 1)

        # add zero padding so FFT is fast
        fshape = [_next_regular(int(d)) for d in shape]

        signal_fft = numpy.fft.rfftn(signal_block, fshape)
        kernel_fft = numpy.fft.rfftn(kernel_block, fshape)

        # this variable determines the output slice that doesn't include
        # the padded array region made for fast FFTs.
        fslice = tuple([slice(0, int(sz)) for sz in shape])
        # classic FFT convolution
        result = numpy.fft.irfftn(signal_fft * kernel_fft, fshape)[fslice]
        # nix any roundoff error; compare against 0.0 with
        # ``set_tol_to_zero`` as the absolute tolerance so the behavior
        # matches the documented "+- set_tol_to_zero from 0.0" contract
        if set_tol_to_zero is not None:
            result[numpy.isclose(result, 0.0, atol=set_tol_to_zero)] = 0.0

        # if we're ignoring nodata, we need to make a convolution of the
        # nodata mask too
        if ignore_nodata:
            mask_fft = numpy.fft.rfftn(
                numpy.where(signal_nodata_mask, 0.0, 1.0), fshape)
            mask_result = numpy.fft.irfftn(
                mask_fft * kernel_fft, fshape)[fslice]

        left_index_result = 0
        right_index_result = result.shape[1]
        top_index_result = 0
        bottom_index_result = result.shape[0]

        # we might abut the edge of the raster, clip if so
        if left_index_raster < 0:
            left_index_result = -left_index_raster
            left_index_raster = 0
        if top_index_raster < 0:
            top_index_result = -top_index_raster
            top_index_raster = 0
        if right_index_raster > n_cols_signal:
            right_index_result -= right_index_raster - n_cols_signal
            right_index_raster = n_cols_signal
        if bottom_index_raster > n_rows_signal:
            bottom_index_result -= (
                bottom_index_raster - n_rows_signal)
            bottom_index_raster = n_rows_signal

        # Add result to current output to account for overlapping edges
        index_dict = {
            'xoff': left_index_raster,
            'yoff': top_index_raster,
            'win_xsize': right_index_raster-left_index_raster,
            'win_ysize': bottom_index_raster-top_index_raster
        }

        write_queue.put(
            (index_dict, result, mask_result,
             left_index_raster, right_index_raster,
             top_index_raster, bottom_index_raster,
             left_index_result, right_index_result,
             top_index_result, bottom_index_result))

    # Indicates worker has terminated
    write_queue.put(None)


def _assert_is_valid_pixel_size(target_pixel_size):
    """Return true if ``target_pixel_size`` is a valid 2 element sequence.

    Raises ValueError if not a two element list/tuple and/or the values in
    the sequence are not numerical.

    """
    def _is_number(x):
        """Return true if x is a number."""
        try:
            if isinstance(x, str):
                return False
            float(x)
            return True
        except (ValueError, TypeError):
            return False

    if not isinstance(target_pixel_size, (list, tuple)):
        raise ValueError(
            "target_pixel_size is not a tuple, its value was '%s'" %
            repr(target_pixel_size))

    if (len(target_pixel_size) != 2 or
            not all([_is_number(x) for x in target_pixel_size])):
        raise ValueError(
            "Invalid value for `target_pixel_size`, expected two numerical "
            "elements, got: %s" % repr(target_pixel_size))
    return True

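
# An illustrative check for ``_assert_is_valid_pixel_size`` (not part of
# the original module): a numeric (x, y) tuple passes, while a non-numeric
# element raises ValueError.
def _example_pixel_size_validation():
    """Sketch: validate a typical 30m (x, -y) pixel size tuple."""
    assert _assert_is_valid_pixel_size((30.0, -30.0))
    try:
        _assert_is_valid_pixel_size((30.0, 'not a number'))
    except ValueError:
        pass  # expected for a non-numeric element
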
[docs]def shapely_geometry_to_vector(
        shapely_geometry_list, target_vector_path, projection_wkt,
        vector_format, fields=None, attribute_list=None,
        ogr_geom_type=ogr.wkbPolygon):
    """Convert list of geometry to vector on disk.

    Args:
        shapely_geometry_list (list): a list of Shapely objects.
        target_vector_path (str): path to target vector.
        projection_wkt (str): WKT for target vector.
        vector_format (str): GDAL driver name for target vector.
        fields (dict): a python dictionary mapping string fieldname to OGR
            Fieldtypes, if None no fields are added.
        attribute_list (list of dicts): a list of python dictionaries
            mapping fieldname to field value for each geometry in
            `shapely_geometry_list`, if None, no attributes are created.
        ogr_geom_type (ogr geometry enumerated type): sets the target layer
            geometry type. Defaults to wkbPolygon.

    Return:
        None

    """
    if fields is None:
        fields = {}

    if attribute_list is None:
        attribute_list = [{} for _ in range(len(shapely_geometry_list))]

    num_geoms = len(shapely_geometry_list)
    num_attrs = len(attribute_list)
    if num_geoms != num_attrs:
        raise ValueError(
            f"Geometry count ({num_geoms}) and attribute count "
            f"({num_attrs}) do not match.")

    vector_driver = ogr.GetDriverByName(vector_format)
    target_vector = vector_driver.CreateDataSource(target_vector_path)
    layer_name = os.path.basename(os.path.splitext(target_vector_path)[0])
    projection = osr.SpatialReference()
    projection.ImportFromWkt(projection_wkt)
    target_layer = target_vector.CreateLayer(
        layer_name, srs=projection, geom_type=ogr_geom_type)

    for field_name, field_type in fields.items():
        target_layer.CreateField(ogr.FieldDefn(field_name, field_type))
    layer_defn = target_layer.GetLayerDefn()

    for feature_index, (shapely_feature, feature_attributes) in enumerate(
            zip(shapely_geometry_list, attribute_list)):
        new_feature = ogr.Feature(layer_defn)
        new_geometry = ogr.CreateGeometryFromWkb(shapely_feature.wkb)
        new_feature.SetGeometry(new_geometry)

        if set(fields.keys()) != set(feature_attributes.keys()):
            raise ValueError(
                f"The fields and attributes for feature {feature_index} "
                f"do not match. Field definitions={fields}. "
                f"Feature attributes={feature_attributes}.")

        for field_name, field_value in feature_attributes.items():
            new_feature.SetField(field_name, field_value)
        target_layer.CreateFeature(new_feature)

    target_layer = None
    target_vector = None

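
# A hedged usage sketch for ``shapely_geometry_to_vector`` (not part of the
# original module): write two labeled points to a hypothetical GeoPackage.
# Assumes ``shapely.geometry`` imports cleanly alongside the module's other
# shapely imports.
def _example_points_to_vector(projection_wkt):
    """Sketch: save point geometries with a string 'name' field."""
    import shapely.geometry
    shapely_geometry_to_vector(
        [shapely.geometry.Point(0, 0), shapely.geometry.Point(1, 1)],
        'points.gpkg', projection_wkt, 'GPKG',
        fields={'name': ogr.OFTString},
        attribute_list=[{'name': 'origin'}, {'name': 'corner'}],
        ogr_geom_type=ogr.wkbPoint)
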
[docs]def numpy_array_to_raster(
        base_array, target_nodata, pixel_size, origin, projection_wkt,
        target_path,
        raster_driver_creation_tuple=DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS):
    """Create a single band raster of size ``base_array.shape``.

    The GDAL datatype of the target raster is determined by the numpy dtype
    of ``base_array``.

    Note:
        The ``origin`` and ``pixel_size`` parameters must both be defined
        properly as 2-tuples of floats, or else must both be set to
        ``None``. A ``ValueError`` will be raised otherwise.

    Args:
        base_array (numpy.array): a 2d numpy array.
        target_nodata (numeric): nodata value of target array, can be None.
        pixel_size (tuple): square dimensions (in ``(x, y)``) of pixel. Can
            be None to indicate no stated pixel size.
        origin (tuple/list): x/y coordinate of the raster origin. Can be
            None to indicate no stated origin.
        projection_wkt (str): target projection in wkt. Can be None to
            indicate no projection/SRS.
        target_path (str): path to the raster to create, which will have
            the same datatype and contents as ``base_array``.
        raster_driver_creation_tuple (tuple): a tuple containing a GDAL
            driver name string as the first element and a GDAL creation
            options tuple/list as the second. Defaults to
            geoprocessing.DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS.

    Return:
        None

    """
    driver_name, creation_options = raster_driver_creation_tuple
    raster_driver = gdal.GetDriverByName(driver_name)
    ny, nx = base_array.shape
    gdal_type, type_creation_options = _numpy_to_gdal_type(base_array.dtype)
    new_raster = raster_driver.Create(
        target_path, nx, ny, 1, gdal_type,
        options=list(creation_options) + type_creation_options)
    if projection_wkt is not None:
        new_raster.SetProjection(projection_wkt)
    if origin is not None and pixel_size is not None:
        new_raster.SetGeoTransform(
            [origin[0], pixel_size[0], 0, origin[1], 0, pixel_size[1]])
    elif origin is not None or pixel_size is not None:
        raise ValueError(
            "Origin and pixel size must both be defined or both be None")
    new_band = new_raster.GetRasterBand(1)
    if target_nodata is not None:
        new_band.SetNoDataValue(target_nodata)
    new_band.WriteArray(base_array)
    new_band = None
    new_raster = None


[docs]def raster_to_numpy_array(raster_path, band_id=1):
    """Read the entire contents of the raster band to a numpy array.

    Args:
        raster_path (str): path to raster.
        band_id (int): band in the raster to read.

    Return:
        numpy array contents of `band_id` in raster.

    """
    raster = gdal.OpenEx(raster_path, gdal.OF_RASTER)
    band = raster.GetRasterBand(band_id)
    array = band.ReadAsArray()
    band = None
    raster = None
    return array

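
# A hedged round-trip sketch for ``numpy_array_to_raster`` and
# ``raster_to_numpy_array`` (not part of the original module). The path,
# origin, and pixel size are illustrative; EPSG:32631 is an arbitrary
# projected coordinate system.
def _example_array_raster_round_trip():
    """Sketch: write an array to a GTiff and read it back unchanged."""
    srs = osr.SpatialReference()
    srs.ImportFromEPSG(32631)
    array = numpy.arange(12, dtype=numpy.float32).reshape((3, 4))
    numpy_array_to_raster(
        array, -1.0, (30.0, -30.0), (444720.0, 3751320.0),
        srs.ExportToWkt(), 'example.tif')
    assert (raster_to_numpy_array('example.tif') == array).all()
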
[docs]def stitch_rasters(
        base_raster_path_band_list, resample_method_list,
        target_stitch_raster_path_band, overlap_algorithm='etch',
        area_weight_m2_to_wgs84=False,
        osr_axis_mapping_strategy=DEFAULT_OSR_AXIS_MAPPING_STRATEGY):
    """Stitch the raster in the base list into the existing target.

    Args:
        base_raster_path_band_list (sequence): sequence of raster path/band
            tuples to stitch into target.
        resample_method_list (sequence): a sequence of resampling methods
            which map one-to-one to each path in
            ``base_raster_path_band_list`` during resizing. Each element
            must be one of
            'rms | mode | sum | q1 | near | q3 | average | cubicspline |
            bilinear | max | med | min | cubic | lanczos'
        target_stitch_raster_path_band (tuple): raster path/band tuple to an
            existing raster, values in ``base_raster_path_band_list`` will
            be stitched into this raster/band in the order they are in the
            list. The nodata value for the target band must be defined and
            will be written over with values from the base raster. Nodata
            values in the base rasters will not be written into the target.
            If the pixel size or projection are different between base and
            target, the base is warped to the target's cell size and
            projection with the resampling method provided. If any part of
            the base raster lies outside of the target, that part of the
            base is ignored. A warning is logged if the entire base raster
            is outside of the target bounds.
        overlap_algorithm (str): this value indicates which algorithm to use
            when a raster is stitched on non-nodata values in the target
            stitch raster. It can be one of the following:

            - 'etch': write a value to the target raster only if the target
              raster pixel is nodata. If the target pixel is non-nodata
              ignore any additional values to write on that pixel.
            - 'replace': write a value to the target raster irrespective
              of the value of the target raster
            - 'add': add the value to be written to the target raster to
              any existing value that is there. If the existing value is
              nodata, treat it as 0.0.

        area_weight_m2_to_wgs84 (bool): If ``True`` the stitched raster will
            be converted to a per-area value before reprojection to wgs84,
            then multiplied by the m^2 area per pixel in the wgs84
            coordinate space. This is useful when the quantity being
            stitched is a total quantity per pixel rather than a per unit
            area density. Note this assumes input rasters are in a projected
            space of meters, if they are not the stitched output will be
            nonsensical.
        osr_axis_mapping_strategy (int): OSR axis mapping strategy for
            ``SpatialReference`` objects. Defaults to
            ``geoprocessing.DEFAULT_OSR_AXIS_MAPPING_STRATEGY``. This
            parameter should not be changed unless you know what you are
            doing.

    Return:
        None.

    """
    valid_overlap_algorithms = ['etch', 'replace', 'add']
    if overlap_algorithm not in valid_overlap_algorithms:
        raise ValueError(
            f'overlap algorithm {overlap_algorithm} is not one of '
            f'{valid_overlap_algorithms}')

    if not _is_raster_path_band_formatted(target_stitch_raster_path_band):
        raise ValueError(
            f'Expected raster path/band tuple for '
            f'target_stitch_raster_path_band but got '
            f'"{target_stitch_raster_path_band}"')

    if len(base_raster_path_band_list) != len(resample_method_list):
        raise ValueError(
            f'Expected same number of elements in '
            f'`base_raster_path_band_list` as `resample_method_list` but '
            f'got {len(base_raster_path_band_list)} != '
            f'{len(resample_method_list)} respectively')

    if not os.path.exists(target_stitch_raster_path_band[0]):
        raise ValueError(
            f'Target stitch raster does not exist: '
            f'"{target_stitch_raster_path_band[0]}"')
    gis_type = get_gis_type(target_stitch_raster_path_band[0])
    from pygeoprocessing import RASTER_TYPE
    if gis_type != RASTER_TYPE:
        raise ValueError(
            f'Target stitch raster is not a raster. '
            f'Location: "{target_stitch_raster_path_band[0]}" '
            f'GIS type: {gis_type}')
    target_raster_info = get_raster_info(target_stitch_raster_path_band[0])
    if target_stitch_raster_path_band[1] > len(target_raster_info['nodata']):
        raise ValueError(
            f'target_stitch_raster_path_band refers to a band that exceeds '
            f'the number of bands in the raster:\n'
            f'target_stitch_raster_path_band[1]: '
            f'{target_stitch_raster_path_band[1]} '
            f'n bands: {len(target_raster_info["nodata"])}')
    target_nodata = target_raster_info['nodata'][
        target_stitch_raster_path_band[1]-1]
    if target_nodata is None:
        raise ValueError(
            f'target stitch raster at "{target_stitch_raster_path_band[0]}" '
            f'nodata value is `None`, expected non-`None` value')

    target_raster = gdal.OpenEx(
        target_stitch_raster_path_band[0], gdal.OF_RASTER | gdal.GA_Update)
    target_band = target_raster.GetRasterBand(
        target_stitch_raster_path_band[1])
    target_inv_gt = gdal.InvGeoTransform(target_raster_info['geotransform'])
    target_raster_x_size, target_raster_y_size = target_raster_info[
        'raster_size']

    for (raster_path, raster_band_id), resample_method in zip(
            base_raster_path_band_list, resample_method_list):
        LOGGER.info(
            f'stitching {(raster_path, raster_band_id)} into '
            f'{target_stitch_raster_path_band}')
        raster_info = get_raster_info(raster_path)
        projected_raster_bounding_box = transform_bounding_box(
            raster_info['bounding_box'], raster_info['projection_wkt'],
            target_raster_info['projection_wkt'])

        try:
            # merge the bounding boxes only to see if they don't intersect
            _ = merge_bounding_box_list(
                [projected_raster_bounding_box,
                 target_raster_info['bounding_box']], 'intersection')
        except ValueError:
            LOGGER.warning(
                f'the raster at "{raster_path}" does not intersect the '
                f'stitch raster at "{target_stitch_raster_path_band[0]}", '
                f'skipping...')
            continue

        # use this to determine if we need to warp and delete if we did at
        # the end
        if (raster_info['projection_wkt'] ==
                target_raster_info['projection_wkt'] and
                raster_info['pixel_size'] ==
                target_raster_info['pixel_size']):
            warped_raster = False
            base_stitch_raster_path = raster_path
        else:
            workspace_dir = tempfile.mkdtemp(
                dir=os.path.dirname(target_stitch_raster_path_band[0]),
                prefix='stitch_rasters_workspace')
            base_stitch_raster_path = os.path.join(
                workspace_dir, os.path.basename(raster_path))
            warp_raster(
                raster_path, target_raster_info['pixel_size'],
                base_stitch_raster_path, resample_method,
                target_projection_wkt=target_raster_info['projection_wkt'],
                working_dir=workspace_dir,
                osr_axis_mapping_strategy=osr_axis_mapping_strategy)
            warped_raster = True

        if warped_raster and area_weight_m2_to_wgs84:
            # determine base area per pixel currently and area per pixel
            # once it is projected to wgs84 pixel sizes
            base_pixel_area_m2 = abs(numpy.prod(raster_info['pixel_size']))
            base_stitch_raster_info = get_raster_info(
                base_stitch_raster_path)
            _, lat_min, _, lat_max = base_stitch_raster_info['bounding_box']
            n_rows = base_stitch_raster_info['raster_size'][1]
            # this column is a longitude-invariant, latitude-variant pixel
            # area for scaling area dependent values
            m2_area_per_lat = _create_latitude_m2_area_column(
                lat_min, lat_max, n_rows)

            def _mult_op(base_array, base_nodata, scale, datatype):
                """Scale non-nodata by scale."""
                result = base_array.astype(datatype)
                if base_nodata is not None:
                    valid_mask = ~numpy.isclose(base_array, base_nodata)
                else:
                    valid_mask = numpy.ones(
                        base_array.shape, dtype=bool)
                result[valid_mask] = result[valid_mask] * scale[valid_mask]
                return result

            base_stitch_nodata = base_stitch_raster_info['nodata'][0]
            scaled_raster_path = os.path.join(
                workspace_dir,
                f'scaled_{os.path.basename(base_stitch_raster_path)}')
            # the numpy equivalent of the target band's GDAL type, used so
            # ``_mult_op`` casts blocks to the type that will be written
            target_numpy_type = _gdal_to_numpy_type(
                target_band.DataType,
                target_band.GetMetadata('IMAGE_STRUCTURE'))
            # multiply the pixels in the resampled raster by the ratio of
            # the pixel area in the wgs84 units divided by the area of the
            # original pixel
            raster_calculator(
                [(base_stitch_raster_path, 1), (base_stitch_nodata, 'raw'),
                 m2_area_per_lat/base_pixel_area_m2,
                 (target_numpy_type, 'raw')], _mult_op,
                scaled_raster_path,
                target_raster_info['datatype'], base_stitch_nodata)

            # swap the result to base stitch so the rest of the function
            # operates on the area scaled raster
            os.remove(base_stitch_raster_path)
            base_stitch_raster_path = scaled_raster_path

        base_raster = gdal.OpenEx(base_stitch_raster_path, gdal.OF_RASTER)
        base_gt = base_raster.GetGeoTransform()
        base_band = base_raster.GetRasterBand(raster_band_id)
        base_nodata = base_band.GetNoDataValue()

        # Get the target upper left xoff/yoff w/r/t the stitch raster 0,0
        # coordinates
        target_to_base_xoff, target_to_base_yoff = [
            int(_) for _ in gdal.ApplyGeoTransform(
                target_inv_gt, *gdal.ApplyGeoTransform(base_gt, 0, 0))]
        for offset_dict in iterblocks(
                (base_stitch_raster_path, raster_band_id), offset_only=True):
            _offset_vars = {}
            overlap = True
            for (target_to_base_off, off_val, target_off_id,
                 off_clip_id, win_size_id, raster_size) in [
                    (target_to_base_xoff, offset_dict['xoff'], 'target_xoff',
                     'xoff_clip', 'win_xsize', target_raster_x_size),
                    (target_to_base_yoff, offset_dict['yoff'], 'target_yoff',
                     'yoff_clip', 'win_ysize', target_raster_y_size)]:
                _offset_vars[target_off_id] = (target_to_base_off+off_val)
                if _offset_vars[target_off_id] >= raster_size:
                    overlap = False
                    break
                # how far to move right to get in the target raster
                _offset_vars[off_clip_id] = 0
                if _offset_vars[target_off_id] < 0:
                    _offset_vars[off_clip_id] = -_offset_vars[target_off_id]
                _offset_vars[win_size_id] = offset_dict[win_size_id]
                if _offset_vars[off_clip_id] >= _offset_vars[win_size_id]:
                    # it's too far left for the whole window
                    overlap = False
                    break
                # make the _offset_vars[win_size_id] smaller if it shifts
                # off the target window
                if (_offset_vars[off_clip_id] +
                        _offset_vars[target_off_id] +
                        _offset_vars[win_size_id] >= raster_size):
                    _offset_vars[win_size_id] -= (
                        _offset_vars[off_clip_id] +
                        _offset_vars[target_off_id] +
                        _offset_vars[win_size_id] - raster_size)

            if not overlap:
                continue

            target_array = target_band.ReadAsArray(
                xoff=_offset_vars['target_xoff']+_offset_vars['xoff_clip'],
                yoff=_offset_vars['target_yoff']+_offset_vars['yoff_clip'],
                win_xsize=_offset_vars['win_xsize'],
                win_ysize=_offset_vars['win_ysize'])
            target_nodata_mask = numpy.isclose(target_array, target_nodata)
            base_array = base_band.ReadAsArray(
                xoff=offset_dict['xoff']+_offset_vars['xoff_clip'],
                yoff=offset_dict['yoff']+_offset_vars['yoff_clip'],
                win_xsize=_offset_vars['win_xsize'],
                win_ysize=_offset_vars['win_ysize'])

            if base_nodata is not None:
                base_nodata_mask = numpy.isclose(base_array, base_nodata)
            else:
                base_nodata_mask = numpy.zeros(
                    base_array.shape, dtype=bool)

            if overlap_algorithm == 'etch':
                # place values only where target is nodata
                valid_mask = ~base_nodata_mask & target_nodata_mask
                target_array[valid_mask] = base_array[valid_mask]
            elif overlap_algorithm == 'replace':
                # write valid values into the target -- disregard any
                # existing values in the target
                valid_mask = ~base_nodata_mask
                target_array[valid_mask] = base_array[valid_mask]
            elif overlap_algorithm == 'add':
                # add values to the target and treat target nodata as 0.
                valid_mask = ~base_nodata_mask
                masked_target_array = target_array[valid_mask]
                target_array_nodata_mask = numpy.isclose(
                    masked_target_array, target_nodata)
                target_array[valid_mask] = (
                    base_array[valid_mask] +
                    numpy.where(
                        target_array_nodata_mask, 0, masked_target_array))
            else:
                raise RuntimeError(
                    f'overlap_algorithm {overlap_algorithm} was not defined '
                    f'but also not detected earlier -- this should never '
                    f'happen')

            target_band.WriteArray(
                target_array,
                xoff=_offset_vars['target_xoff']+_offset_vars['xoff_clip'],
                yoff=_offset_vars['target_yoff']+_offset_vars['yoff_clip'])
        base_raster = None
        base_band = None
        if warped_raster:
            shutil.rmtree(workspace_dir)
    target_raster = None
    target_band = None

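
# A hedged usage sketch for ``stitch_rasters`` (not part of the original
# module). The paths are hypothetical, and ``global_mosaic.tif`` must
# already exist with a defined nodata value, per the function's contract.
def _example_stitch_rasters():
    """Sketch: etch two tiles into an existing mosaic raster."""
    stitch_rasters(
        [('tile_north.tif', 1), ('tile_south.tif', 1)],
        ['near', 'near'],
        ('global_mosaic.tif', 1),
        overlap_algorithm='etch')
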
[docs]def build_overviews(
        raster_path, internal=False, resample_method='near',
        overwrite=False, levels='auto'):
    """Build overviews for a raster dataset.

    Args:
        raster_path (str): A path to a raster on disk for which overviews
            should be built.
        internal=False (bool): Whether to modify the raster when building
            overviews. In GeoTiffs, this builds internal overviews when
            ``internal=True``, and external overviews when
            ``internal=False``.
        resample_method='near' (str): The resample method to use when
            building overviews. Must be a valid resampling method for
            ``gdal.GDALDataset.BuildOverviews``, one of
            'rms | mode | sum | q1 | near | q3 | average | cubicspline |
            bilinear | max | med | min | cubic | lanczos'.
        overwrite=False (bool): Whether to overwrite existing overviews, if
            any exist.
        levels='auto' (sequence): A sequence of integer overview levels. If
            ``'auto'``, overview levels will be determined by using factors
            of 2 for as long as the overview's x or y dimension is greater
            than 256.

    Example:
        Generate overviews, regardless of whether overviews already exist
        for the raster, letting the function determine the levels of
        overviews to generate::

            build_overviews(raster_path)

        Generate overviews for 4 levels, at 1/2, 1/4, 1/8 and 1/16 the
        resolution::

            build_overviews(raster_path, levels=[2, 4, 8, 16])

    Returns:
        ``None``

    """
    def overviews_progress(*args, **kwargs):
        pct_complete, name, other = args
        percent = round(pct_complete * 100, 2)
        if time.time() - overviews_progress.last_progress_report > 5.0:
            LOGGER.info(f"Overviews progress: {percent}%")
            overviews_progress.last_progress_report = time.time()
    overviews_progress.last_progress_report = time.time()

    open_flags = gdal.OF_RASTER
    if internal:
        open_flags |= gdal.GA_Update
        LOGGER.info(f"Building internal overviews on {raster_path}")
    else:
        LOGGER.info("Building external overviews.")

    raster = gdal.OpenEx(raster_path, open_flags)

    overview_count = 0
    for band_index in range(1, raster.RasterCount + 1):
        band = raster.GetRasterBand(band_index)
        overview_count += band.GetOverviewCount()

    if overview_count > 0:
        if overwrite:
            LOGGER.info(f"Clearing existing overviews from {raster_path}")
            result = raster.BuildOverviews(
                resampling=resample_method,
                overviewlist=[],
                callback=overviews_progress
            )
            LOGGER.info(f"Overviews cleared from {raster_path}")
        else:
            raise ValueError(
                f"Raster already has overviews. Use "
                "overwrite=True to override this and regenerate overviews on "
                f"{raster_path}")

    # This loop and limiting factor borrowed from gdaladdo.cpp.
    # Create overviews so long as the overviews are at least 256 pixels in
    # either x or y dimensions.
    if levels == 'auto':
        overview_scales = []
        factor = 2
        limiting_factor = 256
        while (math.ceil(raster.RasterXSize / factor) > limiting_factor
                or math.ceil(raster.RasterYSize / factor) > limiting_factor):
            overview_scales.append(factor)
            factor *= 2
    else:
        overview_scales = [int(level) for level in levels]

    LOGGER.debug(f"Using overviews {overview_scales}")
    result = raster.BuildOverviews(
        resampling=resample_method,
        overviewlist=overview_scales,
        callback=overviews_progress
    )
    LOGGER.info(f"Overviews completed for {raster_path}")
    if result:  # Result will be nonzero on error.
        raise RuntimeError(
            f"Building overviews failed or was interrupted for {raster_path}")

def _m2_area_of_wg84_pixel(pixel_size, center_lat):
    """Calculate m^2 area of a square wgs84 pixel.

    Adapted from: https://gis.stackexchange.com/a/127327/2397

    Args:
        pixel_size (float): length of side of a square pixel in degrees.
        center_lat (float): latitude of the center of the pixel. Note this
            value +/- half the ``pixel_size`` must not exceed 90/-90
            degrees latitude or an invalid area will be calculated.

    Returns:
        Area of square pixel of side length ``pixel_size`` centered at
        ``center_lat`` in m^2.

    """
    a = 6378137  # meters
    b = 6356752.3142  # meters
    e = math.sqrt(1 - (b/a)**2)
    area_list = []
    for f in [center_lat+pixel_size/2, center_lat-pixel_size/2]:
        zm = 1 - e*math.sin(math.radians(f))
        zp = 1 + e*math.sin(math.radians(f))
        area_list.append(
            math.pi * b**2 * (
                math.log(zp/zm) / (2*e) +
                math.sin(math.radians(f)) / (zp*zm)))
    return abs(pixel_size / 360. * (area_list[0] - area_list[1]))


def _create_latitude_m2_area_column(lat_min, lat_max, n_pixels):
    """Create a (n, 1) sized numpy array with m^2 areas in each element.

    Creates a per pixel m^2 area array that varies with changes in latitude.
    This array can be used to scale values by area when converting to or
    from a WGS84 projection to a projected one.

    Args:
        lat_min (float): minimum latitude in the bound
        lat_max (float): maximum latitude in the bound
        n_pixels (int): number of pixels to create for the column. The size
            of the target square pixels are (lat_max-lat_min)/n_pixels
            degrees per side.

    Return:
        A (n, 1) sized numpy array whose elements are the m^2 areas in each
        element estimated by the latitude value at the center of each pixel.

    """
    pixel_size = (lat_max - lat_min) / n_pixels
    center_lat_array = numpy.linspace(
        lat_min+pixel_size/2, lat_max-pixel_size/2, n_pixels)
    area_array = numpy.array([
        _m2_area_of_wg84_pixel(pixel_size, lat)
        for lat in reversed(center_lat_array)]).reshape((n_pixels, 1))
    return area_array

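
# An illustrative check of ``_m2_area_of_wg84_pixel`` (not part of the
# original module): a 1-degree pixel centered on the equator spans roughly
# 111.3 km east-west and 110.6 km north-south, so its area should be about
# 1.23e10 m^2; the tolerance here is deliberately loose.
def _example_equator_pixel_area():
    """Sketch: sanity-check the 1-degree pixel area at the equator."""
    area_m2 = _m2_area_of_wg84_pixel(1.0, 0.0)
    assert abs(area_m2 - 1.23e10) / 1.23e10 < 0.05
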