diff --git a/AUTHORS.md b/AUTHORS.md
index 98fae5bce25a9dd071aa4fae634c4b3bb7ce6b97..afa1592dcb8ad3a95769e962cbf0c0ab63ac9c47 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -57,6 +57,7 @@ Moritz Huebner
 Nicola De Lillo
 Nikhil Sarin
 Nirban Bose
+Noah Wolfe
 Olivia Wilk
 Paul Easter
 Paul Lasky
diff --git a/bilby/core/sampler/ptemcee.py b/bilby/core/sampler/ptemcee.py
index 2534b0369d1de8d1b75e8561f3cafb4cda26ec73..85b253b1e97fd33d9a55b88bf68771697af029d6 100644
--- a/bilby/core/sampler/ptemcee.py
+++ b/bilby/core/sampler/ptemcee.py
@@ -47,7 +47,7 @@ class Ptemcee(MCMCSampler):
     list commonly used kwargs and the bilby defaults.
 
     Parameters
-    ==========
+    ----------
     nsamples: int, (5000)
         The requested number of samples. Note, in cases where the
         autocorrelation parameter is difficult to measure, it is possible to
@@ -116,7 +116,7 @@ class Ptemcee(MCMCSampler):
 
     Other Parameters
-    ================
+    ----------------
     nwalkers: int, (200)
         The number of walkers
     nsteps: int, (100)
@@ -296,7 +296,7 @@ class Ptemcee(MCMCSampler):
         """Draw the initial positions from the prior
 
         Returns
-        =======
+        -------
         pos0: list
             The initial postitions of the walkers, with shape (ntemps, nwalkers, ndim)
 
@@ -315,7 +315,7 @@ class Ptemcee(MCMCSampler):
         See pos0 in the class initialization for details.
 
         Returns
-        =======
+        -------
         pos0: list
             The initial postitions of the walkers, with shape (ntemps, nwalkers, ndim)
 
@@ -504,6 +504,7 @@ class Ptemcee(MCMCSampler):
 
         t0 = datetime.datetime.now()
         logger.info("Starting to sample")
+
         while True:
             for (pos0, log_posterior, log_likelihood) in sampler.sample(
                 self.pos0,
@@ -531,6 +532,7 @@ class Ptemcee(MCMCSampler):
             )
 
             self.pos0 = pos0
+
             self.chain_array[:, self.iteration, :] = pos0[0, :, :]
             self.log_likelihood_array[:, :, self.iteration] = log_likelihood
             self.log_posterior_array[:, :, self.iteration] = log_posterior
@@ -538,6 +540,9 @@ class Ptemcee(MCMCSampler):
                 self.log_posterior_array[:, :, : self.iteration], axis=1
             )
+            # log_posterior_array is shaped (ntemps, nwalkers, iterations),
+            # so mean_log_posterior is shaped (ntemps, iterations)
+
             # Calculate time per iteration
             self.time_per_check.append((datetime.datetime.now() - t0).total_seconds())
             t0 = datetime.datetime.now()
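As an illustrative aside (not part of the patch; the array sizes are made up): ptemcee yields a (ntemps, nwalkers) block of log-posterior values at each step, so averaging over the walker axis, as in the hunk above, leaves one curve per temperature.

import numpy as np

ntemps, nwalkers, max_steps, iteration = 4, 100, 500, 120
log_posterior_array = np.zeros((ntemps, nwalkers, max_steps))

# one sampler step: a (ntemps, nwalkers) block, random placeholders here
log_posterior_array[:, :, iteration] = np.random.randn(ntemps, nwalkers)

# average over the walker axis (axis=1): shape (ntemps, iteration)
mean_log_posterior = np.mean(log_posterior_array[:, :, :iteration], axis=1)
assert mean_log_posterior.shape == (ntemps, iteration)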
@@ -725,25 +730,87 @@ def check_iteration(
     beta_list,
     tau_list,
     tau_list_n,
-    Q_list,
+    gelman_rubin_list,
     mean_log_posterior,
     verbose=True,
 ):
-    """Per-iteration logic to calculate the convergence check
+    """Per-iteration logic to calculate the convergence check.
+
+    To check convergence, this function does the following:
+    1. Calculate the autocorrelation time (tau) for each dimension for each walker,
+       corresponding to those dimensions in search_parameter_keys that aren't
+       specifically excluded in ci.ignore_keys_for_tau.
+       a. Store the average tau for each dimension, averaged over each walker.
+    2. Calculate the Gelman-Rubin statistic (see `get_Q_convergence`), measuring
+       the convergence of the ensemble of walkers.
+    3. Calculate the number of effective samples; we aggregate the total number
+       of burned-in samples (amongst all walkers), divided by a multiple of the
+       current maximum average autocorrelation time. Tuned by `ci.burn_in_nact`
+       and `ci.thin_by_nact`.
+    4. If the Gelman-Rubin statistic < `ci.Q_tol` and `ci.nsamples` < the
+       number of effective samples, we say that our ensemble is converged,
+       setting `converged = True`.
+    5. For some number of the latest steps (set by the autocorrelation time
+       and the GRAD_WINDOW_LENGTH parameter), we find the maximum gradient
+       of the autocorrelation time over all of our dimensions, over all walkers
+       (autocorrelation time is already averaged over walkers) and the maximum
+       value of the gradient of the mean log posterior over iterations, over
+       all walkers.
+    6. If the maximum gradient in tau is less than `ci.gradient_tau` and the
+       maximum gradient in the mean log posterior is less than
+       `ci.gradient_mean_log_posterior`, we set `tau_usable = True`.
+    7. If both `converged` and `tau_usable` are true, we return `stop = True`,
+       indicating that our ensemble is converged + burnt in on this
+       iteration.
+    8. Also prints progress (see `print_progress`).
+
+    Notes
+    -----
+    The gradient of tau is computed with a Savitzky-Golay filter, over windows in
+    sample number of length `GRAD_WINDOW_LENGTH`. This value must be an odd integer.
+    For `ndim > 3`, we calculate this as the nearest odd integer to ndim.
+    For `ndim <= 3`, we calculate this as the nearest odd integer to nwalkers, as
+    typically a much larger window length than polynomial order (default 2) leads
+    to more stable smoothing.
 
     Parameters
-    ==========
+    ----------
+    iteration: int
+        Number indexing the current iteration, at which we are checking
+        convergence.
+    samples: np.ndarray
+        Array of ensemble MCMC samples, shaped like (number of walkers, number
+        of MCMC steps, number of dimensions).
+    sampler: bilby.core.sampler.Ptemcee
+        Bilby Ptemcee sampler object; in particular, this function uses the list
+        of walker temperatures stored in `sampler.betas`.
     convergence_inputs: bilby.core.sampler.ptemcee.ConvergenceInputs
         A named tuple of the convergence checking inputs
     search_parameter_keys: list
         A list of the search parameter keys
     time_per_check, tau_list, tau_list_n: list
         Lists used for tracking the run
+    beta_list: list
+        List of floats storing the walker inverse temperatures.
+    tau_list: list
+        List of average autocorrelation times for each dimension, averaged
+        over walkers, at each checked iteration. So, an effective shape
+        of (number of iterations so far, number of dimensions).
+    tau_list_n: list
+        List of iteration numbers, enumerating the first "axis" of tau_list.
+        E.g. if tau_list_n[1] = 5, this means that the list found at
+        tau_list[1] was calculated on iteration number 5.
+    gelman_rubin_list: list (floats)
+        List of values of the Gelman-Rubin statistic; the value calculated
+        in this call of check_iteration is appended to the gelman_rubin_list.
+    mean_log_posterior: np.ndarray
+        Float array shaped like (number of temperatures, number of MCMC steps),
+        with the log of the posterior, averaged over the walkers.
     verbose: bool
         Whether to print the output
 
     Returns
-    =======
+    -------
     stop: bool
         A boolean flag, True if the stopping criteria has been met
     burn: int
@@ -757,14 +824,9 @@ def check_iteration(
 
     """
     ci = convergence_inputs
-    # Note: nsteps is the number of steps in the samples while iterations is
-    # the current iteration number. So iteration > nsteps by the number of
-    # of discards
-    nwalkers, nsteps, ndim = samples.shape
+    nwalkers, nsteps, ndim = samples.shape
 
     tau_array = calculate_tau_array(samples, search_parameter_keys, ci)
-
-    # Maximum over parameters, mean over walkers
    tau = np.max(np.mean(tau_array, axis=0))
 
     # Apply multiplicitive safety factor
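To make steps 3 and 4 of the docstring above concrete, here is a minimal standalone sketch (not part of the patch): the numbers and the ci_* names are invented stand-ins for the ConvergenceInputs fields, and the nburn/thin formulas are simplified placeholders tuned by burn_in_nact and thin_by_nact.

import numpy as np

nwalkers, nsteps = 100, 2000          # walkers and steps in `samples`
tau = 25.0                            # max over dimensions of the walker-averaged ACT
ci_burn_in_nact, ci_thin_by_nact = 50.0, 1.0
ci_Q_tol, ci_nsamples = 1.02, 5000
gelman_rubin_statistic = 1.01

nburn = int(ci_burn_in_nact * tau)                # burn-in length, in steps
thin = max(1, int(ci_thin_by_nact * tau))         # thinning interval
nsamples_effective = int(nwalkers * (nsteps - nburn) / thin)

converged = gelman_rubin_statistic < ci_Q_tol and ci_nsamples < nsamples_effective
print(converged)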
@@ -775,8 +837,8 @@
     tau_list.append(list(np.mean(tau_array, axis=0)))
     tau_list_n.append(iteration)
 
-    Q = get_Q_convergence(samples)
-    Q_list.append(Q)
+    gelman_rubin_statistic = get_Q_convergence(samples)
+    gelman_rubin_list.append(gelman_rubin_statistic)
 
     if np.isnan(tau) or np.isinf(tau):
         if verbose:
@@ -791,7 +853,7 @@
             np.nan,
             False,
             convergence_inputs,
-            Q,
+            gelman_rubin_statistic,
         )
         return False, np.nan, np.nan, np.nan, np.nan
 
@@ -805,18 +867,24 @@
     nsamples_effective = int(nwalkers * (nsteps - nburn) / thin)
 
     # Calculate convergence boolean
-    converged = Q < ci.Q_tol and ci.nsamples < nsamples_effective
+    converged = gelman_rubin_statistic < ci.Q_tol and ci.nsamples < nsamples_effective
     logger.debug(
-        f"Convergence: Q<Q_tol={Q < ci.Q_tol}, "
-        f"nsamples<nsamples_effective={ci.nsamples < nsamples_effective}"
+        "Convergence: Q<Q_tol={}, nsamples<nsamples_effective={}".format(
+            gelman_rubin_statistic < ci.Q_tol, ci.nsamples < nsamples_effective
+        )
     )
 
-    GRAD_WINDOW_LENGTH = nwalkers + 1
+    GRAD_WINDOW_LENGTH = 2 * ((ndim + 1) // 2) + 1
+    if GRAD_WINDOW_LENGTH <= 3:
+        GRAD_WINDOW_LENGTH = 2 * (nwalkers // 2) + 1
+
     nsteps_to_check = ci.autocorr_tau * np.max([2 * GRAD_WINDOW_LENGTH, tau_int])
     lower_tau_index = np.max([0, len(tau_list) - nsteps_to_check])
     check_taus = np.array(tau_list[lower_tau_index:])
     if not np.any(np.isnan(check_taus)) and check_taus.shape[0] > GRAD_WINDOW_LENGTH:
-        gradient_tau = get_max_gradient(check_taus, axis=0, window_length=11)
+        gradient_tau = get_max_gradient(
+            check_taus, axis=0, window_length=GRAD_WINDOW_LENGTH
+        )
 
         if gradient_tau < ci.gradient_tau:
             logger.debug(
@@ -876,13 +944,51 @@
             gradient_mean_log_posterior,
             tau_usable,
             convergence_inputs,
-            Q,
+            gelman_rubin_statistic,
         )
+
     stop = converged and tau_usable
     return stop, nburn, thin, tau_int, nsamples_effective
 
 
 def get_max_gradient(x, axis=0, window_length=11, polyorder=2, smooth=False):
+    """Calculate the maximum value of the gradient in the input data.
+
+    Applies a Savitzky-Golay filter (`scipy.signal.savgol_filter`) to the input
+    data x, along a particular axis. This filter smooths the data and, as configured
+    in this function, simultaneously calculates the derivative of the smoothed data.
+    If smooth=True is provided, it will apply a Savitzky-Golay filter with a
+    polynomial order of 3 to the input data before applying this filter a second
+    time and calculating the derivative. This function will return the maximum value
+    of the derivative returned by the filter.
+
+    See https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.savgol_filter.html
+    for more information on the Savitzky-Golay filter that we use. Some parameter
+    documentation has been borrowed from this source.
+
+    Parameters
+    ----------
+    x : np.ndarray
+        Array of input data (can be int or float, as `savgol_filter` casts to float).
+    axis : int, default = 0
+        The axis of the input array x over which to calculate the gradient.
+    window_length : int, default = 11
+        The length of the filter window (i.e., the number of coefficients to use
+        in approximating the data).
+    polyorder : int, default = 2
+        The order of the polynomial used to fit the samples. polyorder must be less
+        than window_length.
+    smooth : bool, default = False
+        If true, this will smooth the data with a Savitzky-Golay filter before
+        providing it to the Savitzky-Golay filter for calculating the derivative.
+        Probably useful if you think your input data is especially noisy.
+
+    Returns
+    -------
+    max_gradient : float
+        Maximum value of the gradient.
+    """
+
     from scipy.signal import savgol_filter
 
     if smooth:
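A minimal standalone sketch of the smoothed-derivative idea described above (not part of the patch; the toy data are made up): scipy.signal.savgol_filter with deriv=1 returns the derivative of the locally fitted polynomials, and the maximum of that array is the reported gradient.

import numpy as np
from scipy.signal import savgol_filter

# toy "tau vs. check number" curves for 3 dimensions
x = np.cumsum(np.random.randn(200, 3), axis=0)

window_length, polyorder = 11, 2
# deriv=1: differentiate the smoothed data along axis 0
gradient = savgol_filter(x, window_length, polyorder, axis=0, deriv=1)
max_gradient = np.max(gradient)
print(max_gradient)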
@@ -895,12 +1001,54 @@
 
 def get_Q_convergence(samples):
+    """Calculate the Gelman-Rubin statistic as an estimate of convergence for
+    an ensemble of MCMC walkers.
+
+    Calculates the Gelman-Rubin statistic, from Gelman and Rubin (1992).
+    See section 2.2 of Gelman and Rubin (1992), at
+    https://doi.org/10.1214/ss/1177011136.
+
+    There is also a good description of this statistic in section 7.4.2
+    of "Advanced Statistical Computing" (Peng 2021), in-progress course notes,
+    currently found at
+    https://bookdown.org/rdpeng/advstatcomp/monitoring-convergence.html.
+    As of this writing, L in this resource refers to the number of sampling
+    steps remaining after some have been discarded to achieve burn-in,
+    equivalent to nsteps here. Paraphrasing, we compare the variance between
+    our walkers (chains) to the variance within each walker (compare
+    inter-walker vs. intra-walker variance). We do this because our walkers
+    should be indistinguishable from one another when they reach a steady
+    state, i.e. convergence. Looking at V-hat in the definition of this
+    function, we can see that as nsteps -> infinity, B (inter-chain variance)
+    -> 0, R -> 1; so, R >~ 1 is a good condition for the convergence of our
+    ensemble.
+
+    In practice, this function calculates the Gelman-Rubin statistic for
+    each dimension, and then returns the largest of these values. This
+    means that we can be sure that, once the dimension with the largest such
+    value achieves a Gelman-Rubin statistic of >~ 1, the others have as well.
+
+    Parameters
+    ----------
+    samples: np.ndarray
+        Array of ensemble MCMC samples, shaped like (number of walkers, number
+        of MCMC steps, number of dimensions).
+
+    Returns
+    -------
+    Q: float
+        The largest value of the Gelman-Rubin statistic, from those values
+        calculated for each dimension. If only one step is represented in
+        samples, this returns np.inf.
+    """
+
     nwalkers, nsteps, ndim = samples.shape
     if nsteps > 1:
         W = np.mean(np.var(samples, axis=1), axis=0)
+
         per_walker_mean = np.mean(samples, axis=1)
         mean = np.mean(per_walker_mean, axis=0)
         B = nsteps / (nwalkers - 1.0) * np.sum((per_walker_mean - mean) ** 2, axis=0)
+
         Vhat = (nsteps - 1) / nsteps * W + (nwalkers + 1) / (nwalkers * nsteps) * B
         Q_per_dim = np.sqrt(Vhat / W)
         return np.max(Q_per_dim)
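A quick toy check of the statistic defined above (standalone, not part of the patch; made-up data): for independent, well-mixed chains the per-dimension values land very close to 1.

import numpy as np

rng = np.random.default_rng(0)
nwalkers, nsteps, ndim = 8, 5000, 2
samples = rng.normal(size=(nwalkers, nsteps, ndim))   # already-converged toy chains

W = np.mean(np.var(samples, axis=1), axis=0)          # intra-walker variance
per_walker_mean = np.mean(samples, axis=1)
mean = np.mean(per_walker_mean, axis=0)
B = nsteps / (nwalkers - 1.0) * np.sum((per_walker_mean - mean) ** 2, axis=0)
Vhat = (nsteps - 1) / nsteps * W + (nwalkers + 1) / (nwalkers * nsteps) * B
print(np.max(np.sqrt(Vhat / W)))                      # ~1.0 for these chains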
@@ -977,7 +1125,31 @@ def print_progress(
 
 def calculate_tau_array(samples, search_parameter_keys, ci):
-    """Compute ACT tau for 0-temperature chains"""
+    """Calculate the autocorrelation time for zero-temperature chains.
+
+    Calculates the autocorrelation time for each chain, for those parameters/
+    dimensions that are not explicitly excluded in ci.ignore_keys_for_tau.
+
+    Parameters
+    ----------
+    samples: np.ndarray
+        Array of ensemble MCMC samples, shaped like (number of walkers, number
+        of MCMC steps, number of dimensions).
+    search_parameter_keys: list
+        A list of the search parameter keys
+    ci : collections.namedtuple
+        Collection of settings for convergence tests, including autocorrelation
+        calculation. If a value in search_parameter_keys is included in
+        ci.ignore_keys_for_tau, this function will not calculate an
+        autocorrelation time for any walker along that particular dimension.
+
+    Returns
+    -------
+    tau_array: np.ndarray
+        Float array shaped like (nwalkers, ndim) (with all np.inf for any
+        dimension that is excluded by ci.ignore_keys_for_tau).
+    """
+
     import emcee
 
     nwalkers, nsteps, ndim = samples.shape
@@ -1141,9 +1313,11 @@ def plot_tau(
 
 def plot_mean_log_posterior(mean_log_posterior, outdir, label):
     import matplotlib.pyplot as plt
 
+    mean_log_posterior[mean_log_posterior < -1e100] = np.nan
+
     ntemps, nsteps = mean_log_posterior.shape
-    ymax = np.max(mean_log_posterior)
-    ymin = np.min(mean_log_posterior[:, -100:])
+    ymax = np.nanmax(mean_log_posterior)
+    ymin = np.nanmin(mean_log_posterior[:, -100:])
     ymax += 0.1 * (ymax - ymin)
     ymin -= 0.1 * (ymax - ymin)
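Finally, an illustrative sketch of per-walker, per-dimension autocorrelation times in the spirit of calculate_tau_array above (not part of the patch): it uses emcee.autocorr.integrated_time on toy chains, with c=5 standing in for ci.autocorr_c; the exact call inside calculate_tau_array is not shown in this hunk.

import emcee
import numpy as np

nwalkers, nsteps, ndim = 4, 3000, 2
# toy correlated chains (random walks), shaped (nwalkers, nsteps, ndim)
samples = np.cumsum(np.random.randn(nwalkers, nsteps, ndim), axis=1)

tau_array = np.zeros((nwalkers, ndim))
for ii in range(nwalkers):
    for jj in range(ndim):
        # tol=0 disables emcee's "chain too short" error; the result has length 1 here
        tau_array[ii, jj] = emcee.autocorr.integrated_time(
            samples[ii, :, jj], c=5, tol=0
        )[0]
print(tau_array.shape)   # (nwalkers, ndim), as documented for calculate_tau_array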