Source code for TheCannon.dataset

from __future__ import (absolute_import, division, print_function)
import numpy as np
import sys
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib import rc
rc('text', usetex=True)
rc('font', family='serif')
from .helpers.corner import corner
from .helpers import Table
from .find_continuum_pixels import * 
from .normalization import \
    (_cont_norm_gaussian_smooth,
     _cont_norm_running_quantile,
     _cont_norm_running_quantile_regions,
     _cont_norm_running_quantile_mp,
     _cont_norm_running_quantile_regions_mp,
     _find_cont_fitfunc,
     _find_cont_fitfunc_regions,
     _cont_norm,
     _cont_norm_regions)
from .find_continuum_pixels import _find_contpix,_find_contpix_regions
from multiprocessing import cpu_count
from astropy.io import fits
from astropy.table import Table

n_cpus = cpu_count()

PY3 = sys.version_info[0] > 2

if PY3:
    basestring = (str, bytes)
else:
    basestring = (str, unicode)


[docs]class Dataset(object): """ A class to represent Cannon input: a dataset of spectra and labels """ def __init__(self, wl, tr_ID, tr_flux, tr_ivar, tr_label, test_ID, test_flux, test_ivar): """ Initiate a Dataset object Parameters ---------- wl: grid of wavelength values, onto which all spectra are mapped tr_ID: array of IDs of training objects tr_flux: array of flux values for training objects, [nobj, npix] tr_ivar: array [nobj, npix] of inverse variance values for training objects tr_label: array [nobj, nlabel] test_ID: array [nobj[ of IDs of test objects test_flux: array [nobj, npix] of flux values for test objects test_ivar: array [nobj, npix] of inverse variance values for test objects """ print("Loading dataset") print("This may take a while...") self.wl = wl self.tr_ID = tr_ID self.tr_flux = tr_flux self.tr_ivar = tr_ivar self.tr_label = tr_label self.test_ID = test_ID self.test_flux = test_flux self.test_ivar = test_ivar self._label_names = None self.ranges = None # calculate SNR self.tr_SNR = np.array( [self._SNR(*s) for s in zip(tr_flux, tr_ivar)]) self.test_SNR = np.array( [self._SNR(*s) for s in zip(test_flux, test_ivar)]) def _SNR(self, flux, ivar): """ Calculate the SNR of a spectrum, ignoring bad pixels Parameters ---------- flux: numpy ndarray pixel intensities ivar: numpy ndarray inverse variances corresponding to flux Returns ------- SNR: float """ take = ivar > 0 SNR = float(np.median(flux[take]*(ivar[take]**0.5))) return SNR
[docs] def set_label_names(self, names): """ Set the label names for plotting Parameters ---------- names: ndarray or list The names of the labels used for plotting, ex. in LaTeX syntax """ self._label_names = names
[docs] def get_plotting_labels(self): """ Return the label names used make plots Returns ------- label_names: ndarray The label names """ if self._label_names is None: print("No label names yet!") return None else: return self._label_names
[docs] def bin_flux(flux, ivar): """ bin two neighboring flux values """ if np.sum(ivar)==0: return np.sum(flux)/2. return np.average(flux, weights=ivar)
[docs] def smooth_spectrum(wl, flux, ivar): """ Bins down one spectrum Parameters ---------- wl: numpy ndarray wavelengths flux: numpy ndarray flux values ivar: numpy ndarray inverse variances associated with fluxes Returns ------- wl: numpy ndarray updated binned pixel wavelengths flux: numpy ndarray updated binned flux values ivar: numpy ndarray updated binned inverse variances """ # if odd, discard the last point if len(wl)%2 == 1: wl = np.delete(wl, -1) flux = np.delete(flux, -1) ivar = np.delete(ivar, -1) wl = wl.reshape(-1,2) ivar = ivar.reshape(-1,2) flux = flux.reshape(-1,2) wl_binned = np.mean(wl, axis=1) ivar_binned = np.sqrt(np.sum(ivar**2, axis=1)) flux_binned = np.array([bin_flux(f,w) for f,w in zip(flux, ivar)]) return wl_binned, flux_binned, ivar_binned
[docs] def smooth_spectra(wl, fluxes, ivars): """ Bins down a block of spectra """ output = np.asarray( [smooth_spectrum(wl, flux, ivar) for flux,ivar in zip(fluxes, ivars)]) return output
[docs] def smooth_dataset(self): """ Bins down all of the spectra and updates the dataset """ output = smooth_spectra(self.wl, self.tr_flux, self.tr_ivar) self.wl = output[:,0,:] self.tr_flux = output[:,1,:] self.tr_ivar = output[:,2,:] output = smooth_spectra(self.wl, self.test_flux, self.test_ivar) self.test_flux = output[:,1,:] self.test_ivar = output[:,2,:]
[docs] def diagnostics_SNR(self): """ Plots SNR distributions of ref and test object spectra """ print("Diagnostic for SNRs of reference and survey objects") fig = plt.figure() data = self.test_SNR plt.hist(data, bins=int(np.sqrt(len(data))), alpha=0.5, facecolor='r', label="Survey Objects") data = self.tr_SNR plt.hist(data, bins=int(np.sqrt(len(data))), alpha=0.5, color='b', label="Ref Objects") plt.legend(loc='upper right') #plt.xscale('log') plt.title("SNR Comparison Between Reference and Survey Objects") #plt.xlabel("log(Formal SNR)") plt.xlabel("Formal SNR") plt.ylabel("Number of Objects") return fig
[docs] def diagnostics_ref_labels(self): """ Plots all training labels against each other """ return self._label_triangle_plot(self.tr_label)
def _label_triangle_plot(self, label_vals): """Make a triangle plot for the selected labels Parameters ---------- label_vals: numpy ndarray values of the labels """ labels = [r"$%s$" % l for l in self.get_plotting_labels()] print("Plotting every label against every other") fig = corner(label_vals, labels=labels, show_titles=True, title_args={"fontsize":12}) return fig
[docs] def make_contmask(self, fluxes, ivars, frac): """ Identify continuum pixels using training spectra Does this for each region of the spectrum if dataset.ranges is not None Parameters ---------- fluxes: ndarray Flux data values ivars: ndarray Inverse variances corresponding to flux data values frac: float The fraction of pixels that should be identified as continuum Returns ------- contmask: ndarray Mask with True indicating that the pixel is continuum """ print("Finding continuum pixels...") if self.ranges is None: print("assuming continuous spectra") contmask = _find_contpix(self.wl, fluxes, ivars, frac) else: print("taking spectra in %s regions" %len(self.ranges)) contmask = _find_contpix_regions( self.wl, fluxes, ivars, frac, self.ranges) print("%s pixels returned as continuum" %sum(contmask)) return contmask
[docs] def set_continuum(self, contmask): """ Set the contmask attribute Parameters ---------- contmask: ndarray Mask with True indicating that the pixel is continuum """ self.contmask = contmask
[docs] def fit_continuum(self, deg, ffunc, n_proc=1): """ Fit a continuum to the continuum pixels Parameters ---------- deg: int Degree of the fitting function ffunc: str Type of fitting function, 'sinusoid' or 'chebyshev' Returns ------- tr_cont: ndarray Flux values corresponding to the fitted continuum of training objects test_cont: ndarray Flux values corresponding to the fitted continuum of test objects """ print("Fitting Continuum...") if self.ranges == None: tr_cont = _find_cont_fitfunc( self.tr_flux, self.tr_ivar, self.contmask, deg, ffunc, n_proc=n_proc) test_cont = _find_cont_fitfunc( self.test_flux, self.test_ivar, self.contmask, deg, ffunc, n_proc=n_proc) else: print("Fitting Continuum in %s Regions..." %len(self.ranges)) tr_cont = _find_cont_fitfunc_regions( self.tr_flux, self.tr_ivar, self.contmask, deg, self.ranges, ffunc, n_proc=n_proc) test_cont = _find_cont_fitfunc_regions( self.test_flux, self.test_ivar, self.contmask, deg, self.ranges, ffunc, n_proc=n_proc) return tr_cont, test_cont
[docs] def continuum_normalize_training_q(self, q, delta_lambda, n_proc=1, verbose=True): """ Continuum normalize the training set using a running quantile Parameters ---------- q: float The quantile cut delta_lambda: float The width of the pixel range used to calculate the median Modified by: [08 Jun 2016] Bo Zhang (NAOC): add multiprocessing option Note: Multiprocessing calls for more memory on computer. """ # don't use too many process assert n_proc >= 1 and n_proc <= 100*n_cpus if n_proc > n_cpus: print('@Bo Zhang: n_proc is larger than your numbers of physical CPUs!') print("Continuum normalizing the tr set using running quantile...") if n_proc == 1: # use old version (single process) print('##########################################################') print('@Bo Zhang: you will use only 1 process ...') print(' i.e., the original TheCannon version') print('##########################################################') if self.ranges is None: return _cont_norm_running_quantile( self.wl, self.tr_flux, self.tr_ivar, q=q, delta_lambda=delta_lambda, verbose=verbose) else: return _cont_norm_running_quantile_regions( self.wl, self.tr_flux, self.tr_ivar, q=q, delta_lambda=delta_lambda, ranges=self.ranges, verbose=verbose) else: # use new version (multi process) print('##########################################################') print('@Bo Zhang: you will use ** %d ** processes ... ' % n_proc) print('Note: Multiprocessing calls for more memory on computer!') print('##########################################################') if self.ranges is None: return _cont_norm_running_quantile_mp( self.wl, self.tr_flux, self.tr_ivar, q=q, delta_lambda=delta_lambda, n_proc=n_proc, verbose=verbose) else: return _cont_norm_running_quantile_regions_mp( self.wl, self.tr_flux, self.tr_ivar, q=q, delta_lambda=delta_lambda, ranges=self.ranges, n_proc=n_proc, verbose=verbose)
[docs] def continuum_normalize(self, cont): """ Continuum normalize spectra, in chunks if spectrum has regions Parameters ---------- cont: ndarray Flux values corresponding to the continuum Returns ------- norm_tr_flux: ndarray Normalized flux values for the training objects norm_tr_ivar: ndarray Rescaled inverse variance values for the training objects norm_test_flux: ndarray Normalized flux values for the test objects norm_test_ivar: numpy ndarray Rescaled inverse variance values for the test objects """ tr_cont, test_cont = cont if self.ranges is None: print("assuming continuous spectra") norm_tr_flux, norm_tr_ivar = _cont_norm( self.tr_flux, self.tr_ivar, tr_cont) norm_test_flux, norm_test_ivar = _cont_norm( self.test_flux, self.test_ivar, test_cont) else: print("taking spectra in %s regions" %(len(self.ranges))) norm_tr_flux, norm_tr_ivar = _cont_norm_regions( self.tr_flux, self.tr_ivar, tr_cont, self.ranges) norm_test_flux, norm_test_ivar = _cont_norm_regions( self.test_flux, self.test_ivar, test_cont, self.ranges) return norm_tr_flux, norm_tr_ivar, norm_test_flux, norm_test_ivar
[docs] def continuum_normalize_gaussian_smoothing(self, L): """ Continuum normalize using a Gaussian-weighted smoothed spectrum Parameters ---------- dataset: Dataset the dataset to continuum normalize L: float the width of the Gaussian used for weighting """ norm_tr_flux, norm_tr_ivar, norm_test_flux, norm_test_ivar = \ _cont_norm_gaussian_smooth(self, L) self.tr_flux = norm_tr_flux self.tr_ivar = norm_tr_ivar self.test_flux = norm_test_flux self.test_ivar = norm_test_ivar
[docs] def diagnostics_test_step_flagstars(self): """ Write files listing stars whose inferred labels lie outside 2 standard deviations from the reference label space """ label_names = self.get_plotting_labels() nlabels = len(label_names) reference_labels = self.tr_label test_labels = self.test_label_vals test_IDs = np.array(self.test_ID) mean = np.mean(reference_labels, 0) stdev = np.std(reference_labels, 0) lower = mean - 2 * stdev upper = mean + 2 * stdev for i in range(nlabels): label_name = label_names[i] test_vals = test_labels[:,i] warning = np.logical_or(test_vals < lower[i], test_vals > upper[i]) filename = "flagged_stars_%s.txt" % i with open(filename, 'w') as output: for star in test_IDs[warning]: output.write('{0:s}\n'.format(star)) print("Reference label %s" % label_name) print("flagged %s stars beyond 2-sig of ref labels" % sum(warning)) print("Saved list %s" % filename)
[docs] def diagnostics_survey_labels(self): """ Plot all survey labels against each other """ fig = self._label_triangle_plot(self.test_label_vals)
[docs] def diagnostics_1to1(self, figname="1to1_label"): """ Plots survey labels vs. training labels, color-coded by survey SNR """ snr = self.test_SNR label_names = self.get_plotting_labels() nlabels = len(label_names) reference_labels = self.tr_label test_labels = self.test_label_vals for i in range(nlabels): name = label_names[i] orig = reference_labels[:,i] cannon = test_labels[:,i] # calculate bias and scatter scatter = np.round(np.std(orig-cannon),5) bias = np.round(np.mean(orig-cannon),5) low = np.minimum(min(orig), min(cannon)) high = np.maximum(max(orig), max(cannon)) fig = plt.figure(figsize=(10,6)) gs = gridspec.GridSpec(1,2,width_ratios=[2,1], wspace=0.3) ax1 = plt.subplot(gs[0]) ax2 = plt.subplot(gs[1]) ax1.plot([low, high], [low, high], 'k-', linewidth=2.0, label="x=y") ax1.set_xlim(low, high) ax1.set_ylim(low, high) ax1.legend(fontsize=14, loc='lower right') pl = ax1.scatter(orig, cannon, marker='x', c=snr, vmin=50, vmax=200, alpha=0.7) cb = plt.colorbar(pl, ax=ax1, orientation='horizontal') cb.set_label('SNR from Test Set', fontsize=12) textstr = 'Scatter: %s \nBias: %s' %(scatter, bias) ax1.text(0.05, 0.95, textstr, transform=ax1.transAxes, fontsize=14, verticalalignment='top') ax1.tick_params(axis='x', labelsize=14) ax1.tick_params(axis='y', labelsize=14) ax1.set_xlabel("Reference Value", fontsize=14) ax1.set_ylabel("Cannon Test Value", fontsize=14) ax1.set_title("1-1 Plot of Label " + r"$%s$" % name) diff = cannon-orig npoints = len(diff) mu = np.mean(diff) sig = np.std(diff) #ax2.hist(diff, orientation='horizontal') ax2.hist(diff, range=[-3*sig,3*sig], color='k', bins=int(np.sqrt(npoints)), orientation='horizontal', alpha=0.3, histtype='stepfilled') ax2.tick_params(axis='x', labelsize=14) ax2.tick_params(axis='y', labelsize=14) ax2.set_xlabel("Count", fontsize=14) ax2.set_ylabel("Difference", fontsize=14) ax2.axhline(y=0, c='k', lw=3, label='Difference=0') ax2.set_title("Training Versus Test Labels for $%s$" %name, fontsize=14) ax2.legend(fontsize=14) figname_full = "%s_%s.png" %(figname, i) plt.savefig(figname_full) print("Diagnostic for label output vs. input") print("Saved fig %s" % figname_full) plt.close()
[docs] def set_test_label_vals(self, vals): """ Set test label values Parameters ---------- vals: ndarray Test label values """ self.test_label_vals = vals
[docs] def diagnostics_best_fit_spectra(self, model): """ Plot results of best-fit spectra for ten random test objects """ overlay_spectra(model, self)
def write_to_fits(self, filepath, **kwargs): hl = convert_to_hdulist(self) print('@Bo Zhang: writting to fits [%s] ...' % filepath) hl.writeto(filepath, **kwargs) print('@Bo Zhang: ---------------------------------------------------')
def convert_to_hdulist(ds): """ transform TheCannon dataset into fits HDU list """ print('@Bo Zhang: ---------------------------------------------------') print('@Bo Zhang: transforming TheCannon data cubes into HDU list ...') print('@Bo Zhang: ---------------------------------------------------') # initialize data list data_list = [ds.wl, ds.ranges, ds.contmask*1, ds.tr_ID, ds.tr_SNR, ds.tr_flux, ds.tr_ivar, ds.tr_label, ds.test_ID, ds.test_SNR, ds.test_flux, ds.test_ivar] name_list = ['wl', 'ranges', 'contmask', 'tr_ID', 'tr_SNR', 'tr_flux', 'tr_ivar', 'tr_label', 'test_ID', 'test_SNR', 'test_flux', 'test_ivar'] hdu_format_list_rw = ['image', 'image', 'image', 'table', 'image', 'image', 'image', 'image', 'table', 'image', 'image', 'image'] # construct Primary header header = fits.Header() header['author'] = 'Bo Zhang (@NAOC)' header['version'] = 'v1.0' header['last_modified'] = 'Sat 11 Jun 2016' header['data'] = 'TheCannon dataset object' # initialize HDU list print('@Bo Zhang: initializing the HDU list ...') hl = [fits.hdu.PrimaryHDU(header=header)] # construct HDU list assert len(data_list) == len(name_list) n_hdus = len(data_list) for i in xrange(n_hdus): print('@Bo Zhang: transforming HDU [%d/%d]: %s ...' % (i+1, n_hdus, name_list[i])) if hdu_format_list_rw[i] == 'table': data = Table(data=data_list[i].reshape((-1, 1)), names=[name_list[i]]) hl.append(fits.BinTableHDU(data.as_array())) else: hl.append(fits.ImageHDU(data=np.array(data_list[i]), name=name_list[i])) print('@Bo Zhang: ---------------------------------------------------') return fits.HDUList(hl)