import numpy as np
from scipy.sparse import coo_matrix
from scipy.signal import convolve2d, convolve, gaussian
[docs]def fastkde(x, y, gridsize=(200, 200), extents=None, nocorrelation=False, weights=None, adjust=1.):
"""
A fft-based Gaussian kernel density estimate (KDE)
for computing the KDE on a regular grid
Note that this is a different use case than scipy's original
scipy.stats.kde.gaussian_kde
IMPLEMENTATION
--------------
Performs a gaussian kernel density estimate over a regular grid using a
convolution of the gaussian kernel with a 2D histogram of the data.
It computes the sparse bi-dimensional histogram of two data samples where
*x*, and *y* are 1-D sequences of the same length. If *weights* is None
(default), this is a histogram of the number of occurences of the
observations at (x[i], y[i]).
histogram of the data is a faster implementation than numpy.histogram as it
avoids intermediate copies and excessive memory usage!
This function is typically *several orders of magnitude faster* than
scipy.stats.kde.gaussian_kde. For large (>1e7) numbers of points, it
produces an essentially identical result.
Boundary conditions on the data is corrected by using a symmetric /
reflection condition. Hence the limits of the dataset does not affect the
pdf estimate.
INPUTS
------
x, y: ndarray[ndim=1]
The x-coords, y-coords of the input data points respectively
gridsize: tuple
A (nx,ny) tuple of the size of the output grid (default: 200x200)
extents: (xmin, xmax, ymin, ymax) tuple
tuple of the extents of output grid (default: extent of input data)
nocorrelation: bool
If True, the correlation between the x and y coords will be ignored
when preforming the KDE. (default: False)
weights: ndarray[ndim=1]
An array of the same shape as x & y that weights each sample (x_i,
y_i) by each value in weights (w_i). Defaults to an array of ones
the same size as x & y. (default: None)
adjust : float
An adjustment factor for the bw. Bandwidth becomes bw * adjust.
OUTPUTS
-------
g: ndarray[ndim=2]
A gridded 2D kernel density estimate of the input points.
e: (xmin, xmax, ymin, ymax) tuple
Extents of g
"""
# Variable check
x, y = np.asarray(x), np.asarray(y)
x, y = np.squeeze(x), np.squeeze(y)
if x.size != y.size:
raise ValueError('Input x & y arrays must be the same size!')
n = x.size
if weights is None:
# Default: Weight all points equally
weights = np.ones(n)
else:
weights = np.squeeze(np.asarray(weights))
if weights.size != x.size:
raise ValueError('Input weights must be an array of the same size as input x & y arrays!')
# Optimize gridsize ------------------------------------------------------
#Make grid and discretize the data and round it to the next power of 2
# to optimize with the fft usage
if gridsize is None:
gridsize = np.asarray([np.max((len(x), 512.)), np.max((len(y), 512.))])
gridsize = 2 ** np.ceil(np.log2(gridsize)) # round to next power of 2
nx, ny = gridsize
# Make the sparse 2d-histogram -------------------------------------------
# Default extents are the extent of the data
if extents is None:
xmin, xmax = x.min(), x.max()
ymin, ymax = y.min(), y.max()
else:
xmin, xmax, ymin, ymax = map(float, extents)
dx = (xmax - xmin) / (nx - 1)
dy = (ymax - ymin) / (ny - 1)
# Basically, this is just doing what np.digitize does with one less copy
# xyi contains the bins of each point as a 2d array [(xi,yi)]
xyi = np.vstack((x,y)).T
xyi -= [xmin, ymin]
xyi /= [dx, dy]
xyi = np.floor(xyi, xyi).T
# Next, make a 2D histogram of x & y.
# Exploit a sparse coo_matrix avoiding np.histogram2d due to excessive
# memory usage with many points
grid = coo_matrix((weights, xyi), shape=(nx, ny)).toarray()
# Kernel Preliminary Calculations ---------------------------------------
# Calculate the covariance matrix (in pixel coords)
cov = np.cov(xyi)
if nocorrelation:
cov[1,0] = 0
cov[0,1] = 0
# Scaling factor for bandwidth
scotts_factor = n ** (-1.0 / 6.) * adjust # For 2D
# Make the gaussian kernel ---------------------------------------------
# First, determine the bandwidth using Scott's rule
# (note that Silvermann's rule gives the # same value for 2d datasets)
std_devs = np.diag(np.sqrt(cov))
kern_nx, kern_ny = np.round(scotts_factor * 2 * np.pi * std_devs)
# Determine the bandwidth to use for the gaussian kernel
inv_cov = np.linalg.inv(cov * scotts_factor ** 2)
# x & y (pixel) coords of the kernel grid, with <x,y> = <0,0> in center
xx = np.arange(kern_nx, dtype=np.float) - kern_nx / 2.0
yy = np.arange(kern_ny, dtype=np.float) - kern_ny / 2.0
xx, yy = np.meshgrid(xx, yy)
# Then evaluate the gaussian function on the kernel grid
kernel = np.vstack((xx.flatten(), yy.flatten()))
kernel = np.dot(inv_cov, kernel) * kernel
kernel = np.sum(kernel, axis=0) / 2.0
kernel = np.exp(-kernel)
kernel = kernel.reshape((kern_ny, kern_nx))
#---- Produce the kernel density estimate --------------------------------
# Convolve the histogram with the gaussian kernel
# use boundary=symm to correct for data boundaries in the kde
grid = convolve2d(grid, kernel, mode='same', boundary='symm')
# Normalization factor to divide result by so that units are in the same
# units as scipy.stats.kde.gaussian_kde's output.
norm_factor = 2 * np.pi * cov * scotts_factor ** 2
norm_factor = np.linalg.det(norm_factor)
norm_factor = n * dx * dy * np.sqrt(norm_factor)
# Normalize the result
grid /= norm_factor
return grid, (xmin, xmax, ymin, ymax)
[docs]def fastkde1D(xin, gridsize=200, extents=None, weights=None, adjust=1.):
"""
A fft-based Gaussian kernel density estimate (KDE)
for computing the KDE on a regular grid
Note that this is a different use case than scipy's original
scipy.stats.kde.gaussian_kde
IMPLEMENTATION
--------------
Performs a gaussian kernel density estimate over a regular grid using a
convolution of the gaussian kernel with a 2D histogram of the data.
It computes the sparse bi-dimensional histogram of two data samples where
*x*, and *y* are 1-D sequences of the same length. If *weights* is None
(default), this is a histogram of the number of occurences of the
observations at (x[i], y[i]).
histogram of the data is a faster implementation than numpy.histogram as it
avoids intermediate copies and excessive memory usage!
This function is typically *several orders of magnitude faster* than
scipy.stats.kde.gaussian_kde. For large (>1e7) numbers of points, it
produces an essentially identical result.
**Example usage and timing**
from scipy.stats import gaussian_kde
def npkde(x, xe):
kde = gaussian_kde(x)
r = kde.evaluate(xe)
return r
x = np.random.normal(0, 1, 1e6)
%timeit fastkde1D(x)
10 loops, best of 3: 31.9 ms per loop
%timeit npkde(x, xe)
1 loops, best of 3: 11.8 s per loop
~ 1e4 speed up!!! However gaussian_kde is not optimized for this application
Boundary conditions on the data is corrected by using a symmetric /
reflection condition. Hence the limits of the dataset does not affect the
pdf estimate.
INPUTS
------
xin: ndarray[ndim=1]
The x-coords, y-coords of the input data points respectively
gridsize: int
A nx integer of the size of the output grid (default: 200x200)
extents: (xmin, xmax) tuple
tuple of the extents of output grid (default: extent of input data)
weights: ndarray[ndim=1]
An array of the same shape as x that weights each sample x_i
by w_i. Defaults to an array of ones the same size as x (default: None)
adjust : float
An adjustment factor for the bw. Bandwidth becomes bw * adjust.
OUTPUTS
-------
g: ndarray[ndim=2]
A gridded 2D kernel density estimate of the input points.
e: (xmin, xmax, ymin, ymax) tuple
Extents of g
"""
# Variable check
x = np.squeeze(np.asarray(xin))
# Default extents are the extent of the data
if extents is None:
xmin, xmax = x.min(), x.max()
else:
xmin, xmax = map(float, extents)
x = x[ (x <= xmax) & (x >= xmin) ]
n = x.size
if weights is None:
# Default: Weight all points equally
weights = np.ones(n)
else:
weights = np.squeeze(np.asarray(weights))
if weights.size != x.size:
raise ValueError('Input weights must be an array of the same size as input x & y arrays!')
# Optimize gridsize ------------------------------------------------------
#Make grid and discretize the data and round it to the next power of 2
# to optimize with the fft usage
if gridsize is None:
gridsize = np.max((len(x), 512.))
gridsize = 2 ** np.ceil(np.log2(gridsize)) # round to next power of 2
nx = gridsize
# Make the sparse 2d-histogram -------------------------------------------
dx = (xmax - xmin) / (nx - 1)
# Basically, this is just doing what np.digitize does with one less copy
# xyi contains the bins of each point as a 2d array [(xi,yi)]
xyi = x - xmin
xyi /= dx
xyi = np.floor(xyi, xyi)
xyi = np.vstack((xyi, np.zeros(n, dtype=int)))
# Next, make a 2D histogram of x & y.
# Exploit a sparse coo_matrix avoiding np.histogram2d due to excessive
# memory usage with many points
grid = coo_matrix((weights, xyi), shape=(nx, 1)).toarray()
# Kernel Preliminary Calculations ---------------------------------------
std_x = np.std(xyi[0])
# Scaling factor for bandwidth
scotts_factor = n ** (-1. / 5.) * adjust # For n ** (-1. / (d + 4))
#Silvermann n * (d + 2) / 4.)**(-1. / (d + 4)).
# Make the gaussian kernel ---------------------------------------------
# First, determine the bandwidth using Scott's rule
# (note that Silvermann's rule gives the # same value for 2d datasets)
kern_nx = np.round(scotts_factor * 2 * np.pi * std_x)
# Then evaluate the gaussian function on the kernel grid
kernel = np.reshape(gaussian(kern_nx, scotts_factor * std_x), (kern_nx, 1))
#---- Produce the kernel density estimate --------------------------------
# Convolve the histogram with the gaussian kernel
# use symmetric padding to correct for data boundaries in the kde
npad = np.min((nx, 2 * kern_nx))
grid = np.vstack( [grid[npad: 0: -1], grid, grid[nx: nx - npad: -1]] )
grid = convolve(grid, kernel, mode='same')[npad: npad + nx]
# Normalization factor to divide result by so that units are in the same
# units as scipy.stats.kde.gaussian_kde's output.
norm_factor = 2 * np.pi * std_x * std_x * scotts_factor ** 2
norm_factor = n * dx * np.sqrt(norm_factor)
# Normalize the result
grid /= norm_factor
return np.squeeze(grid), (xmin, xmax)