Modified GRAD_WINDOW_LENGTH calculation for sooner calculation of valid...

Merged Noah Wolfe requested to merge noah.wolfe/bilby:master into master
@@ -47,7 +47,7 @@ class Ptemcee(MCMCSampler):
list commonly used kwargs and the bilby defaults.
Parameters
-==========
+----------
nsamples: int, (5000)
The requested number of samples. Note, in cases where the
autocorrelation parameter is difficult to measure, it is possible to
@@ -116,7 +116,7 @@ class Ptemcee(MCMCSampler):
Other Parameters
-================
+----------------
nwalkers: int, (200)
The number of walkers
nsteps: int, (100)
@@ -296,7 +296,7 @@ class Ptemcee(MCMCSampler):
"""Draw the initial positions from the prior
Returns
-=======
+-------
pos0: list
The initial positions of the walkers, with shape (ntemps, nwalkers, ndim)
@@ -315,7 +315,7 @@ class Ptemcee(MCMCSampler):
See pos0 in the class initialization for details.
Returns
-=======
+-------
pos0: list
The initial positions of the walkers, with shape (ntemps, nwalkers, ndim)
@@ -504,6 +504,7 @@ class Ptemcee(MCMCSampler):
t0 = datetime.datetime.now()
logger.info("Starting to sample")
while True:
for (pos0, log_posterior, log_likelihood) in sampler.sample(
self.pos0,
@@ -531,6 +532,7 @@ class Ptemcee(MCMCSampler):
)
self.pos0 = pos0
self.chain_array[:, self.iteration, :] = pos0[0, :, :]
self.log_likelihood_array[:, :, self.iteration] = log_likelihood
self.log_posterior_array[:, :, self.iteration] = log_posterior
@@ -538,6 +540,9 @@ class Ptemcee(MCMCSampler):
self.log_posterior_array[:, :, : self.iteration], axis=1
)
# (ntemps, nwalkers, iterations)
# so mean_log_posterior is shaped (ntemps, iterations)
# Calculate time per iteration
self.time_per_check.append((datetime.datetime.now() - t0).total_seconds())
t0 = datetime.datetime.now()
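To make the bookkeeping in this hunk concrete: `log_posterior_array` holds one value per temperature, walker, and step, and averaging over axis=1 (the walkers) leaves one trace per temperature. A minimal sketch with invented sizes (the array names mirror the attributes above; the numbers are illustrative):

```python
import numpy as np

ntemps, nwalkers, max_steps = 5, 100, 500  # illustrative sizes
iteration = 200

# One log-posterior value per (temperature, walker, step)
log_posterior_array = np.zeros((ntemps, nwalkers, max_steps))

# Mean over axis=1 (the walkers) gives one trace per temperature
mean_log_posterior = np.mean(log_posterior_array[:, :, :iteration], axis=1)
assert mean_log_posterior.shape == (ntemps, iteration)
```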
@@ -725,25 +730,87 @@ def check_iteration(
beta_list,
tau_list,
tau_list_n,
-Q_list,
+gelman_rubin_list,
mean_log_posterior,
verbose=True,
):
"""Per-iteration logic to calculate the convergence check
"""Per-iteration logic to calculate the convergence check.
To check convergence, this function does the following:
1. Calculate the autocorrelation time (tau) for each dimension for each walker,
corresponding to those dimensions in search_parameter_keys that aren't
specifically excluded in ci.ignore_keys_for_tau.
a. Store the average tau for each dimension, averaged over each walker.
2. Calculate the Gelman-Rubin statistic (see `get_Q_convergence`), measuring
the convergence of the ensemble of walkers.
3. Calculate the number of effective samples: the total number of
post-burn-in samples across all walkers, divided by a multiple of the
current maximum average autocorrelation time, tuned by `ci.burn_in_nact`
and `ci.thin_by_nact`.
4. If the Gelman-Rubin statistic < `ci.Q_tol` and `ci.nsamples` < the
number of effective samples, we say that our ensemble is converged,
setting `converged = True`.
5. For some number of the latest steps (set by the autocorrelation time
and the GRAD_WINDOW_LENGTH parameter), we find the maximum gradient
of the autocorrelation time across all dimensions (the autocorrelation
time is already averaged over walkers), and the maximum gradient of the
mean log posterior over iterations, taken over the whole ensemble.
6. If the maximum gradient in tau is less than `ci.gradient_tau` and the
maximum gradient in the mean log posterior is less than
`ci.gradient_mean_log_posterior`, we set `tau_usable = True`.
7. If both `converged` and `tau_usable` are true, we return `stop = True`,
indicating that our ensemble is converged and burnt in on this
iteration.
8. Also prints progress! (see `print_progress`)
Notes
-----
The gradient of tau is computed with a Savitzky-Golay filter, over windows
of `GRAD_WINDOW_LENGTH` samples. This value must be an odd integer.
For `ndim > 3`, we calculate it as the nearest odd integer to ndim.
For `ndim <= 3`, we calculate it as the nearest odd integer to nwalkers, as
a window length much larger than the polynomial order (default 2) typically
gives more stable smoothing.
Parameters
-==========
+----------
iteration: int
Number indexing the current iteration, at which we are checking
convergence.
samples: np.ndarray
Array of ensemble MCMC samples, shaped like (number of walkers, number
of MCMC steps, number of dimensions).
sampler: bilby.core.sampler.Ptemcee
Bilby Ptemcee sampler object; in particular, this function uses the list
of walker temperatures stored in `sampler.betas`.
convergence_inputs: bilby.core.sampler.ptemcee.ConvergenceInputs
A named tuple of the convergence checking inputs
search_parameter_keys: list
A list of the search parameter keys
-time_per_check, tau_list, tau_list_n: list
-Lists used for tracking the run
beta_list: list
List of floats storing the walker inverse temperatures.
tau_list: list
List of average autocorrelation times for each dimension, averaged
over walkers, at each checked iteration. So, an effective shape
of (number of iterations so far, number of dimensions).
tau_list_n: list
List of iteration numbers, enumerating the first "axis" of tau_list.
E.g. if tau_list_n[1] = 5, this means that the list found at
tau_list[1] was calculated on iteration number 5.
gelman_rubin_list: list (floats)
List of values of the Gelman-Rubin statistic; the value calculated
in this call of check_iteration is appended to the gelman_rubin_list.
mean_log_posterior: np.ndarray
Float array shaped like (ntemps, number of MCMC steps), with the log
of the posterior averaged over the walkers at each temperature.
verbose: bool
Whether to print the output
Returns
-=======
+-------
stop: bool
A boolean flag, True if the stopping criterion has been met
burn: int
@@ -757,14 +824,9 @@ def check_iteration(
"""
ci = convergence_inputs
-# Note: nsteps is the number of steps in the samples while iteration is
-# the current iteration number, so iteration > nsteps by the number
-# of discards
nwalkers, nsteps, ndim = samples.shape
tau_array = calculate_tau_array(samples, search_parameter_keys, ci)
# Maximum over parameters, mean over walkers
tau = np.max(np.mean(tau_array, axis=0))
# Apply multiplicative safety factor
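As a toy example of the aggregation in the two lines above (numbers invented), tau ends up being the walker-averaged autocorrelation time of the slowest dimension; the comment notes that a multiplicative safety factor is then applied, though that line falls outside this hunk:

```python
import numpy as np

# One autocorrelation-time estimate per (walker, dimension); numbers invented
tau_array = np.array([[10.0, 14.0],
                      [12.0, 18.0],
                      [11.0, 16.0]])

tau = np.max(np.mean(tau_array, axis=0))  # mean over walkers -> [11., 16.]; max -> 16.0
```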
@@ -775,8 +837,8 @@ def check_iteration(
tau_list.append(list(np.mean(tau_array, axis=0)))
tau_list_n.append(iteration)
-Q = get_Q_convergence(samples)
-Q_list.append(Q)
+gelman_rubin_statistic = get_Q_convergence(samples)
+gelman_rubin_list.append(gelman_rubin_statistic)
if np.isnan(tau) or np.isinf(tau):
if verbose:
@@ -791,7 +853,7 @@ def check_iteration(
np.nan,
False,
convergence_inputs,
-Q,
+gelman_rubin_statistic,
)
return False, np.nan, np.nan, np.nan, np.nan
@@ -805,18 +867,24 @@ def check_iteration(
nsamples_effective = int(nwalkers * (nsteps - nburn) / thin)
# Calculate convergence boolean
-converged = Q < ci.Q_tol and ci.nsamples < nsamples_effective
+converged = gelman_rubin_statistic < ci.Q_tol and ci.nsamples < nsamples_effective
logger.debug(
f"Convergence: Q<Q_tol={Q < ci.Q_tol}, "
f"nsamples<nsamples_effective={ci.nsamples < nsamples_effective}"
"Convergence: Q<Q_tol={}, nsamples<nsamples_effective={}".format(
gelman_rubin_statistic < ci.Q_tol, ci.nsamples < nsamples_effective
)
)
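Steps 3 and 4 of the docstring amount to simple arithmetic. A worked sketch with invented numbers, assuming `nburn` and `thin` are simple multiples of the autocorrelation time as `ci.burn_in_nact` and `ci.thin_by_nact` suggest (only the `nsamples_effective` line is taken verbatim from the hunk above):

```python
nwalkers, nsteps = 100, 4000
tau_int = 20                        # integer autocorrelation time
burn_in_nact, thin_by_nact = 50, 1  # ci.burn_in_nact, ci.thin_by_nact

nburn = int(burn_in_nact * tau_int)                           # 1000 steps discarded
thin = max(1, int(thin_by_nact * tau_int))                    # keep every 20th step
nsamples_effective = int(nwalkers * (nsteps - nburn) / thin)  # 15000
```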
-GRAD_WINDOW_LENGTH = nwalkers + 1
+GRAD_WINDOW_LENGTH = 2 * ((ndim + 1) // 2) + 1
+if GRAD_WINDOW_LENGTH <= 3:
+GRAD_WINDOW_LENGTH = 2 * (nwalkers // 2) + 1
nsteps_to_check = ci.autocorr_tau * np.max([2 * GRAD_WINDOW_LENGTH, tau_int])
lower_tau_index = np.max([0, len(tau_list) - nsteps_to_check])
check_taus = np.array(tau_list[lower_tau_index:])
if not np.any(np.isnan(check_taus)) and check_taus.shape[0] > GRAD_WINDOW_LENGTH:
-gradient_tau = get_max_gradient(check_taus, axis=0, window_length=11)
+gradient_tau = get_max_gradient(
+check_taus, axis=0, window_length=GRAD_WINDOW_LENGTH
+)
if gradient_tau < ci.gradient_tau:
logger.debug(
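The replacement rule above is the subject of this MR: it picks an odd window length near ndim and falls back to an nwalkers-based window when that would be too short for the default polynomial order of 2. A standalone sketch of just that rule (the helper name is mine):

```python
def grad_window_length(ndim, nwalkers):
    """Odd Savitzky-Golay window length, per the hunk above."""
    window = 2 * ((ndim + 1) // 2) + 1  # odd integer just above ndim
    if window <= 3:
        # too short for a stable polyorder-2 fit; use the walker count instead
        window = 2 * (nwalkers // 2) + 1
    return window

assert grad_window_length(ndim=15, nwalkers=100) == 17
assert grad_window_length(ndim=2, nwalkers=100) == 101
```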
@@ -876,13 +944,51 @@ def check_iteration(
gradient_mean_log_posterior,
tau_usable,
convergence_inputs,
-Q,
+gelman_rubin_statistic,
)
stop = converged and tau_usable
return stop, nburn, thin, tau_int, nsamples_effective
def get_max_gradient(x, axis=0, window_length=11, polyorder=2, smooth=False):
"""Calculate the maximum value of the gradient in the input data.
Applies a Savitzky-Golay filter (`scipy.signal.savgol_filter`) to the input
data x, along a particular axis. This filter smooths the data and, as configured
in this function, simultaneously calculates the derivative of the smoothed data.
If smooth=True is provided, it will apply a Savitzky-Golay filter with a
polynomial order of 3 to the input data before applying this filter a second
time and calculating the derivative. This function will return the maximum value
of the derivative returned by the filter.
See https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.savgol_filter.html
for more information on the Savitzky-Golay filter that we use. Some parameter
documentation has been borrowed from this source.
Parameters
----------
x : np.ndarray
Array of input data (can be int or float, as `savgol_filter` casts to float).
axis : int, default = 0
The axis of the input array x over which to calculate the gradient.
window_length : int, default = 11
The length of the filter window (i.e., the number of coefficients to use
in approximating the data).
polyorder : int, default = 2
The order of the polynomial used to fit the samples. polyorder must be less
than window_length.
smooth : bool, default = False
If true, this will smooth the data with a Savitzky-Golay filter before
providing it to the Savitzky-Golay filter for calculating the derivative.
Probably useful if you think your input data is especially noisy.
Returns
-------
max_gradient : float
Maximum value of the gradient.
"""
from scipy.signal import savgol_filter
if smooth:
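The rest of the function body is cut off by the next hunk. For orientation, a self-contained sketch of what a savgol-based maximum gradient can look like; this is a reconstruction from the docstring above, not the MR's exact code:

```python
import numpy as np
from scipy.signal import savgol_filter

def max_gradient_sketch(x, axis=0, window_length=11, polyorder=2, smooth=False):
    if smooth:
        # optional pre-smoothing pass: polyorder 3, no derivative
        x = savgol_filter(x, window_length=window_length, polyorder=3, axis=axis)
    # deriv=1 returns the derivative of the locally fitted polynomial
    dx = savgol_filter(x, window_length=window_length, polyorder=polyorder,
                       deriv=1, axis=axis)
    return np.max(dx)
```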
@@ -895,12 +1001,58 @@ def get_max_gradient(x, axis=0, window_length=11, polyorder=2, smooth=False):
def get_Q_convergence(samples):
"""Calculate the Gelman-Rubin statistic as an estimate of convergence for
an ensemble of MCMC walkers.
Calculates the Gelman-Rubin statistic from Gelman and Rubin (1992); see
section 2.2 at https://doi.org/10.1214/ss/1177011136.
There is also a good description of this statistic in section 7.4.2
of "Advanced Statistical Computing" (Peng 2021), in-progress course notes,
currently found at
https://bookdown.org/rdpeng/advstatcomp/monitoring-convergence.html.
As of this writing, L in this resource refers to the number of sampling
steps remaining after some have been discarded to achieve burn-in,
equivalent to nsteps here. Paraphrasing, we compare the variance between
our walkers (chains) to the variance within each walker (compare
inter-walker vs. intra-walker variance). We do this because our walkers
should be indistinguishable from one another when they reach a steady-state,
i.e. convergence. Looking at V-hat in the definition of this function, we
can see that as nsteps -> infinity, the contribution of B (the inter-chain
variance) to V-hat vanishes and R -> 1; so, R close to 1 (from above) is a
good condition for the convergence of our ensemble.
In practice, this function calculates the Gelman-Rubin statistic for
each dimension, and then returns the largest of these values. This
means that we can be sure that, once the dimension with the largest such
value achieves a Gelman-Rubin statistic close to 1, the others have as well.
Parameters
----------
samples: np.ndarray
Array of ensemble MCMC samples, shaped like (number of walkers, number
of MCMC steps, number of dimensions).
Returns
-------
Q: float
The largest value of the Gelman-Rubin statistic, from those values
calculated for each dimension. If only one step is represented in
samples, this returns np.inf.
"""
nwalkers, nsteps, ndim = samples.shape
if nsteps > 1:
# within-chain variance W: variance over steps, averaged over walkers (per dim)
W = np.mean(np.var(samples, axis=1), axis=0)
# chain/walker mean
per_walker_mean = np.mean(samples, axis=1)
# grand mean (per dimension)
mean = np.mean(per_walker_mean, axis=0)
# variance b/t walkers
B = nsteps / (nwalkers - 1.0) * np.sum((per_walker_mean - mean) ** 2, axis=0)
Vhat = (nsteps - 1) / nsteps * W + (nwalkers + 1) / (nwalkers * nsteps) * B
Q_per_dim = np.sqrt(Vhat / W)
return np.max(Q_per_dim)
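A quick sanity check of this statistic on synthetic chains, using the (nwalkers, nsteps, ndim) layout documented above (data invented):

```python
import numpy as np

rng = np.random.default_rng(0)
nwalkers, nsteps, ndim = 8, 1000, 2

# Well-mixed walkers: inter- and intra-walker variance agree, so R ~ 1
mixed = rng.normal(size=(nwalkers, nsteps, ndim))
print(get_Q_convergence(mixed))   # close to 1.0

# Walkers stuck at different offsets: B dominates and R >> 1
stuck = mixed + 5 * np.arange(nwalkers)[:, None, None]
print(get_Q_convergence(stuck))   # much greater than 1
```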
@@ -977,7 +1129,31 @@ def print_progress(
def calculate_tau_array(samples, search_parameter_keys, ci):
"""Compute ACT tau for 0-temperature chains"""
"""Calculate the autocorrelation time for zero-temperature chains.
Calculates the autocorrelation time for each chain, for those parameters/
dimensions that are not explicitly excluded in ci.ignore_keys_for_tau.
Parameters
----------
samples: np.ndarray
Array of ensemble MCMC samples, shaped like (number of walkers, number
of MCMC steps, number of dimensions).
search_parameter_keys: list
A list of the search parameter keys
ci : collections.namedtuple
Collection of settings for convergence tests, including autocorrelation
calculation. If a value in search_parameter_keys is included in
ci.ignore_keys_for_tau, this function will not calculate an
autocorrelation time for any walker along that particular dimension.
Returns
-------
tau_array: np.ndarray
Float array shaped like (nwalkers, ndim) (with all np.inf for any
dimension that is excluded by ci.ignore_keys_for_tau).
"""
import emcee
nwalkers, nsteps, ndim = samples.shape
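The loop that fills the array is elided by the next hunk. A sketch of the per-walker, per-dimension calculation consistent with the docstring, using `emcee.autocorr.integrated_time`; the helper name, and spelling the relevant `ci` fields out as plain arguments, are my own choices for illustration:

```python
import emcee
import numpy as np

def tau_array_sketch(samples, search_parameter_keys, ignore_keys=(), autocorr_c=5):
    nwalkers, nsteps, ndim = samples.shape
    tau_array = np.full((nwalkers, ndim), np.inf)  # inf marks excluded dimensions
    for jj, key in enumerate(search_parameter_keys):
        if key in ignore_keys:
            continue
        for ii in range(nwalkers):
            # tol=0 forces an estimate even when the chain is still short
            tau_array[ii, jj] = emcee.autocorr.integrated_time(
                samples[ii, :, jj], c=autocorr_c, tol=0
            )[0]
    return tau_array
```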
@@ -1141,9 +1317,11 @@ def plot_tau(
def plot_mean_log_posterior(mean_log_posterior, outdir, label):
import matplotlib.pyplot as plt
mean_log_posterior[mean_log_posterior < -1e100] = np.nan
ntemps, nsteps = mean_log_posterior.shape
-ymax = np.max(mean_log_posterior)
-ymin = np.min(mean_log_posterior[:, -100:])
+ymax = np.nanmax(mean_log_posterior)
+ymin = np.nanmin(mean_log_posterior[:, -100:])
ymax += 0.1 * (ymax - ymin)
ymin -= 0.1 * (ymax - ymin)
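The switch to the nan-aware reductions matters because of the masking line added above: once the placeholder values are replaced with np.nan, the plain reductions would propagate nan into the axis limits:

```python
import numpy as np

mlp = np.array([[-1e300, 1.0, 2.0],
                [0.5, 1.5, 2.5]])
mlp[mlp < -1e100] = np.nan  # masking, as in the hunk above

np.max(mlp)     # nan -- plain max propagates the mask
np.nanmax(mlp)  # 2.5 -- ignores the masked entries
```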