Source code for snipar.correlate

import numpy.ma as ma
from numba import njit, prange
import gzip
from snipar.ld import ldscores_from_bed
from snipar.utilities import make_id_dict
import numpy as np

class sumstats(object):
    def __init__(self, chrom, sid, pos, A1, A2, freqs, direct, direct_SE, avg_NTC, avg_NTC_SE,
                 population, population_SE, r_direct_avg_NTC, r_direct_pop, ldscores=None, map=None):
        sizes = np.array([sid.shape[0], pos.shape[0], A1.shape[0], A2.shape[0], freqs.shape[0],
                          direct.shape[0], avg_NTC.shape[0], population.shape[0],
                          r_direct_avg_NTC.shape[0], r_direct_pop.shape[0]])
        if np.unique(sizes).shape[0] > 1:
            raise ValueError('All inputs to sumstats class must have same size')
        self.chrom = np.zeros(sid.shape, dtype=int)
        self.chrom[:] = int(chrom)
        self.sid = np.array(sid, dtype=str)
        self.sid_dict = make_id_dict(self.sid)
        self.pos = np.array(pos, dtype=int)
        self.A1 = np.array(A1, dtype=str)
        self.A2 = np.array(A2, dtype=str)
        # Store estimates as masked arrays, masking missing (NaN) values
        self.freqs = ma.array(freqs, dtype=float)
        self.freqs.mask = np.isnan(self.freqs)
        self.direct = ma.array(direct, dtype=float)
        self.direct.mask = np.isnan(self.direct)
        self.direct_SE = ma.array(direct_SE, dtype=float)
        self.direct_SE.mask = np.isnan(self.direct_SE)
        self.avg_NTC = ma.array(avg_NTC, dtype=float)
        self.avg_NTC.mask = np.isnan(self.avg_NTC)
        self.avg_NTC_SE = ma.array(avg_NTC_SE, dtype=float)
        self.avg_NTC_SE.mask = np.isnan(self.avg_NTC_SE)
        self.population = ma.array(population, dtype=float)
        self.population.mask = np.isnan(self.population)
        self.population_SE = ma.array(population_SE, dtype=float)
        self.population_SE.mask = np.isnan(self.population_SE)
        self.r_direct_avg_NTC = ma.array(r_direct_avg_NTC, dtype=float)
        self.r_direct_avg_NTC.mask = np.isnan(self.r_direct_avg_NTC)
        self.r_direct_pop = ma.array(r_direct_pop, dtype=float)
        self.r_direct_pop.mask = np.isnan(self.r_direct_pop)
        if ldscores is not None:
            if not ldscores.shape[0] == sid.shape[0]:
                raise ValueError('LD scores must have same size as other sumstats')
            self.ldscores = ma.array(ldscores, dtype=float)
            self.ldscores.mask = np.isnan(self.ldscores)
        else:
            self.ldscores = None
        if map is not None:
            if not map.shape[0] == sid.shape[0]:
                raise ValueError('Map must have same size as other sumstats')
            self.map = ma.array(map, dtype=float)
            self.map.mask = np.isnan(self.map)
        else:
            self.map = None
    def concatenate(self, s2):
        # Append the sumstats in s2 to this object's sumstats
        self.chrom = np.hstack((self.chrom, s2.chrom))
        self.sid = np.hstack((self.sid, s2.sid))
        self.sid_dict = make_id_dict(self.sid)
        self.pos = np.hstack((self.pos, s2.pos))
        self.A1 = np.hstack((self.A1, s2.A1))
        self.A2 = np.hstack((self.A2, s2.A2))
        self.freqs = ma.concatenate([self.freqs, s2.freqs])
        self.direct = ma.concatenate([self.direct, s2.direct])
        self.direct_SE = ma.concatenate([self.direct_SE, s2.direct_SE])
        self.avg_NTC = ma.concatenate([self.avg_NTC, s2.avg_NTC])
        self.avg_NTC_SE = ma.concatenate([self.avg_NTC_SE, s2.avg_NTC_SE])
        self.population = ma.concatenate([self.population, s2.population])
        self.population_SE = ma.concatenate([self.population_SE, s2.population_SE])
        self.r_direct_avg_NTC = ma.concatenate([self.r_direct_avg_NTC, s2.r_direct_avg_NTC])
        self.r_direct_pop = ma.concatenate([self.r_direct_pop, s2.r_direct_pop])
        if self.ldscores is not None and s2.ldscores is not None:
            self.ldscores = ma.concatenate([self.ldscores, s2.ldscores])
        if self.map is not None and s2.map is not None:
            self.map = ma.concatenate([self.map, s2.map])
    def filter(self, filter_pass):
        # Retain only the SNPs indicated by the boolean array filter_pass
        self.chrom = self.chrom[filter_pass]
        self.sid = self.sid[filter_pass]
        self.sid_dict = make_id_dict(self.sid)
        self.pos = self.pos[filter_pass]
        self.A1 = self.A1[filter_pass]
        self.A2 = self.A2[filter_pass]
        self.freqs = self.freqs[filter_pass]
        self.direct = self.direct[filter_pass]
        self.direct_SE = self.direct_SE[filter_pass]
        self.avg_NTC = self.avg_NTC[filter_pass]
        self.avg_NTC_SE = self.avg_NTC_SE[filter_pass]
        self.population = self.population[filter_pass]
        self.population_SE = self.population_SE[filter_pass]
        self.r_direct_avg_NTC = self.r_direct_avg_NTC[filter_pass]
        self.r_direct_pop = self.r_direct_pop[filter_pass]
        if self.ldscores is not None:
            self.ldscores = self.ldscores[filter_pass]
        if self.map is not None:
            self.map = self.map[filter_pass]
    def filter_NAs(self):
        # Remove SNPs with missing values in any required column
        no_NAs = (~self.freqs.mask) * (~self.direct.mask) * (~self.avg_NTC.mask) * \
                 (~self.population.mask) * (~self.r_direct_pop.mask) * (~self.r_direct_avg_NTC.mask)
        if self.ldscores is not None:
            no_NAs = no_NAs * (~self.ldscores.mask)
        if self.map is not None:
            no_NAs = no_NAs * (~self.map.mask)
        self.filter(no_NAs)
    def filter_maf(self, min_maf):
        # Remove SNPs with minor allele frequency below min_maf
        if min_maf < 0 or min_maf > 1:
            raise ValueError('MAF must be between 0 and 1')
        good_maf = np.logical_and(min_maf < self.freqs, self.freqs < (1 - min_maf))
        self.filter(good_maf)
    def filter_corrs(self, max_Z):
        # Remove SNPs whose sampling correlations are outliers (|Z-score| >= max_Z)
        r_dir_avg_NTC_Z = (self.r_direct_avg_NTC - np.mean(self.r_direct_avg_NTC)) / np.std(self.r_direct_avg_NTC)
        r_dir_pop_Z = (self.r_direct_pop - np.mean(self.r_direct_pop)) / np.std(self.r_direct_pop)
        good_corrs = np.logical_and(np.abs(r_dir_avg_NTC_Z) < max_Z, np.abs(r_dir_pop_Z) < max_Z)
        self.filter(good_corrs)
    def scores_from_ldsc(self, ld_files):
        # Read LDSC-format LD score files and match scores to SNPs by ID
        self.ldscores = ma.array(np.zeros(self.sid.shape[0]), mask=np.ones(self.sid.shape[0]))
        for i in range(ld_files.shape[0]):
            print('Reading LD scores from ' + ld_files[i])
            ld_chr = np.loadtxt(ld_files[i], dtype=str, usecols=(1, 3))
            in_sid_dict = np.array([x in self.sid_dict for x in ld_chr[:, 0]])
            if np.sum(in_sid_dict) > 0:
                ld_chr = ld_chr[in_sid_dict, :]
                ld_indices = np.array([self.sid_dict[x] for x in ld_chr[:, 0]])
                self.ldscores[ld_indices] = np.array(ld_chr[:, 1], dtype=float)
                self.ldscores.mask[ld_indices] = False
            else:
                raise ValueError('No SNPs in common between LD scores and summary statistics')
    def cor_direct_pop(self, n_blocks):
        print('Computing correlation between direct and population effects')
        r_dir_pop, r_dir_pop_SE, r_dir_pop_delete = jacknife_est(self.direct, self.population,
                                                                 np.power(self.direct_SE, 2),
                                                                 np.power(self.population_SE, 2),
                                                                 self.r_direct_pop, self.ldscores, n_blocks)
        return r_dir_pop, r_dir_pop_SE, r_dir_pop_delete
    def cor_direct_avg_NTC(self, n_blocks):
        print('Computing correlation between direct effects and average NTCs')
        r_dir_avg_NTC, r_dir_avg_NTC_SE, r_dir_avg_NTC_delete = jacknife_est(self.direct, self.avg_NTC,
                                                                             np.power(self.direct_SE, 2),
                                                                             np.power(self.avg_NTC_SE, 2),
                                                                             self.r_direct_avg_NTC,
                                                                             self.ldscores, n_blocks)
        return r_dir_avg_NTC, r_dir_avg_NTC_SE, r_dir_avg_NTC_delete
    def compute_ld_scores(self, bedfiles, chroms, ld_wind, ld_out=None):
        # Compute LD scores directly from .bed genotype files
        self.ldscores = ma.array(np.zeros(self.sid.shape[0]), mask=np.ones(self.sid.shape[0]))
        for i in range(bedfiles.shape[0]):
            print('Computing LD scores for chromosome ' + str(chroms[i]))
            ld_chr, ld_snps_chr = ldscores_from_bed(bedfiles[i], chroms[i], ld_wind, ld_out)
            in_sid_dict = np.array([x in self.sid_dict for x in ld_snps_chr])
            if np.sum(in_sid_dict) > 0:
                ld_indices = np.array([self.sid_dict[x] for x in ld_snps_chr[in_sid_dict]])
                self.ldscores[ld_indices] = np.array(ld_chr[in_sid_dict], dtype=float)
                self.ldscores.mask[ld_indices] = False
            else:
                raise ValueError('No SNPs in common between LD scores and sumstats')
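# A minimal usage sketch of the sumstats class on synthetic data. The values and
# the helper name _example_sumstats are hypothetical, invented purely for
# illustration; real inputs come from snipar's gzipped sumstats files via
# read_sumstats_file below.
def _example_sumstats():
    nsnp = 4
    rng = np.random.default_rng(0)
    s = sumstats(chrom=1,
                 sid=np.array(['rs1', 'rs2', 'rs3', 'rs4']),
                 pos=np.arange(1, nsnp + 1) * 1000,
                 A1=np.array(['A'] * nsnp), A2=np.array(['G'] * nsnp),
                 freqs=rng.uniform(0.05, 0.95, nsnp),
                 direct=rng.normal(0, 1, nsnp), direct_SE=np.ones(nsnp),
                 avg_NTC=rng.normal(0, 1, nsnp), avg_NTC_SE=np.ones(nsnp),
                 population=rng.normal(0, 1, nsnp), population_SE=np.ones(nsnp),
                 r_direct_avg_NTC=np.full(nsnp, 0.5), r_direct_pop=np.full(nsnp, 0.7))
    s.filter_NAs()       # drop SNPs with missing values
    s.filter_maf(0.01)   # drop SNPs with MAF below 1%
    return s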
def read_sumstats_file(sumstats_file, chrom):
    print('Reading sumstats from ' + sumstats_file)
    # Read header
    sumstats_file_o = gzip.open(sumstats_file, 'r')
    sumstats_header = sumstats_file_o.readline().decode('UTF-8').split(' ')
    sumstats_header[-1] = sumstats_header[-1].split('\n')[0]
    sumstats_header = np.array(sumstats_header)
    sumstats_file_o.close()
    ## Find columns
    # Keep backwards compatibility with old sumstats files
    if 'avg_parental_Beta' in sumstats_header:
        parname = 'avg_parental'
    else:
        parname = 'avg_NTC'
    # Find column indices
    colnames = ['SNP', 'pos', 'A1', 'A2', 'freq', 'direct_Beta', 'direct_SE',
                parname + '_Beta', parname + '_SE', 'population_Beta', 'population_SE',
                'r_direct_' + parname, 'r_direct_population']
    col_indices = tuple([np.where(sumstats_header == x)[0][0] for x in colnames])
    # Read summary statistics
    s = np.loadtxt(sumstats_file, dtype=str, skiprows=1, usecols=col_indices)
    # Return sumstats class
    return sumstats(chrom, s[:, 0], s[:, 1], s[:, 2], s[:, 3], s[:, 4], s[:, 5], s[:, 6],
                    s[:, 7], s[:, 8], s[:, 9], s[:, 10], s[:, 11], s[:, 12])
def read_sumstats_files(sumstats_files, chroms):
    # Read the first chromosome's sumstats, then append the rest
    s = read_sumstats_file(sumstats_files[0], chroms[0])
    if chroms.shape[0] > 1:
        for i in range(1, sumstats_files.shape[0]):
            s.concatenate(read_sumstats_file(sumstats_files[i], chroms[i]))
    return s
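# A minimal sketch of the intended read-and-correlate workflow. The file paths,
# n_blocks value, and the helper name _example_workflow are hypothetical
# placeholders, not part of snipar.
def _example_workflow():
    chroms = np.array([21, 22])
    sumstats_files = np.array(['chr_21.sumstats.gz', 'chr_22.sumstats.gz'])  # hypothetical paths
    ld_files = np.array(['21.l2.ldscore.gz', '22.l2.ldscore.gz'])            # hypothetical paths
    s = read_sumstats_files(sumstats_files, chroms)
    s.scores_from_ldsc(ld_files)  # or s.compute_ld_scores(bedfiles, chroms, ld_wind)
    s.filter_NAs()
    s.filter_maf(0.05)
    r, r_SE, r_delete = s.cor_direct_pop(n_blocks=100)
    return r, r_SE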
@njit
def estimate_var(z, v, w):
    # Weighted method-of-moments estimate of the variance of true effects:
    # E[z^2] = var + v, where v is the sampling variance of z
    return np.sum(w * (np.power(z, 2) - v))
@njit
def estimate_weights(v_est, v, l):
    # Inverse-variance weights given the current variance estimate,
    # downweighting SNPs with high LD scores
    w = np.power((v_est + v) * l, -1)
    w = w / np.sum(w)
    return w
@njit
def reweighted_estimate_var(z, v, l, tol=1e-8, max_iter=10**3):
    # Initialize weights
    w = np.power(v * l, -1)
    w = w / np.sum(w)
    # Initialize estimate
    v_est = estimate_var(z, v, w)
    # Iterate to convergence
    v_diff = np.inf
    n_iter = 0
    while v_diff > tol and n_iter < max_iter:
        w = estimate_weights(v_est, v, l)
        v_est_new = estimate_var(z, v, w)
        v_diff = np.abs(v_est_new - v_est)
        v_est = v_est_new
        n_iter += 1
    return v_est
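# A small simulation check of the iteratively reweighted variance estimator:
# simulate effect estimates with known true-effect variance and verify the
# estimate recovers it. A hypothetical sketch assuming unit LD scores and
# homogeneous sampling variance; the helper name is illustrative only.
def _example_reweighted_var(nsnp=100000, var_true=0.25, seed=42):
    rng = np.random.default_rng(seed)
    v = np.full(nsnp, 0.1)   # sampling variances of the estimates
    l = np.ones(nsnp)        # LD scores (all 1 in this sketch)
    # z = true effect + sampling noise
    z = rng.normal(0, np.sqrt(var_true), nsnp) + rng.normal(0, np.sqrt(v))
    v_est = reweighted_estimate_var(z, v, l)
    return v_est             # should be close to var_true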
@njit
def estimate_cov(z1, z2, v1, v2, r, w):
    # Weighted method-of-moments estimate of the covariance of true effects:
    # E[z1*z2] = cov + r*sqrt(v1*v2), where r is the sampling correlation
    return np.sum(w * (z1 * z2 - r * np.sqrt(v1 * v2)))
@njit
def estimate_cov_weights(v1, v2, r, var_1, var_2, c, l):
    # Weights inversely proportional to the sampling variance of z1*z2
    # given the current covariance estimate c
    w = np.power(((var_1 + v1) * (var_2 + v2) + np.power(c + r * np.sqrt(v1 * v2), 2)) * l, -1)
    w = w / np.sum(w)
    return w
@njit
def reweighted_estimate_cov(z1, z2, v1, v2, r, l, var_1, var_2, tol=1e-8, max_iter=10**3):
    # Initialize weights
    w = np.power(((var_1 + v1) * (var_2 + v2) + np.power(r, 2) * v1 * v2) * l, -1)
    w = w / np.sum(w)
    # Initialize estimate
    c_est = estimate_cov(z1, z2, v1, v2, r, w)
    # Iterate to convergence
    c_diff = np.inf
    n_iter = 0
    while c_diff > tol and n_iter < max_iter:
        w = estimate_cov_weights(v1, v2, r, var_1, var_2, c_est, l)
        c_est_new = estimate_cov(z1, z2, v1, v2, r, w)
        c_diff = np.abs(c_est_new - c_est)
        c_est = c_est_new
        n_iter += 1
    return c_est
@njit
def compute_corr(z1, z2, v1, v2, r, l):
    # Estimate the variances and covariance of the true effects, their
    # correlation, the regression of effect 2 on effect 1, and the fraction
    # of var(effect 2) not explained by effect 1
    var_1 = reweighted_estimate_var(z1, v1, l)
    var_2 = reweighted_estimate_var(z2, v2, l)
    cov_12 = reweighted_estimate_cov(z1, z2, v1, v2, r, l, var_1, var_2)
    corr = cov_12 / np.sqrt(var_1 * var_2)
    reg_21 = cov_12 / var_1
    resid_2 = z2 - reg_21 * z1
    v_resid_2 = v2 + (reg_21 ** 2) * v1 - 2 * reg_21 * r * np.sqrt(v1 * v2)
    v_s = reweighted_estimate_var(resid_2, v_resid_2, l)
    return np.array([var_1, var_2, cov_12, corr, reg_21, v_s / var_2], dtype=np.float64)
@njit(parallel=True)
def jacknife(z1, z2, v1, v2, r, l, n_blocks, block_size):
    # Construct jackknife blocks
    jack_delete = np.zeros((n_blocks, 6), dtype=np.float64)
    mask = np.ones((n_blocks, z1.shape[0]), dtype=np.bool_)
    for i in prange(n_blocks - 1):
        mask[i, (block_size * i):(block_size * (i + 1))] = False
    mask[n_blocks - 1, (block_size * (n_blocks - 1)):z1.shape[0]] = False
    # Compute jackknife leave-one-block-out estimates
    for i in prange(n_blocks):
        jack_delete[i, :] = compute_corr(z1[mask[i, :]], z2[mask[i, :]], v1[mask[i, :]],
                                         v2[mask[i, :]], r[mask[i, :]], l[mask[i, :]])
    return jack_delete
def jacknife_est(z1, z2, v1, v2, r, l, n_blocks):
    # Estimate on the full data
    est = compute_corr(z1, z2, v1, v2, r, l)
    # Calculate block size
    block_size = int(np.floor(z1.shape[0] / n_blocks))
    # Get jackknife leave-one-block-out estimates
    jack_delete = jacknife(z1, z2, v1, v2, r, l, int(n_blocks), block_size)
    n_blocks = jack_delete.shape[0]
    # Compute jackknife variance
    jack_vars = np.zeros((jack_delete.shape[1]), dtype=np.float64)
    for i in range(jack_delete.shape[1]):
        jack_vars[i] = ((n_blocks - 1) / n_blocks) * np.sum(np.power(jack_delete[:, i] - np.mean(jack_delete[:, i]), 2))
    # Adjust estimate of uncorrelated variance for error in regression coefficient
    est[5] = est[5] - est[0] * jack_vars[4]
    return est[3:6], np.sqrt(jack_vars[3:6]), jack_delete[:, 3:6]
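# A simulation sketch of the block-jackknife correlation estimator: simulate two
# sets of effects with a known correlation and recover it with jacknife_est.
# All parameter values and the helper name _example_jacknife_est are
# illustrative assumptions, not part of snipar's API.
def _example_jacknife_est(nsnp=50000, corr_true=0.8, seed=1):
    rng = np.random.default_rng(seed)
    l = np.ones(nsnp)             # LD scores
    v1 = np.full(nsnp, 0.05)      # sampling variances of z1
    v2 = np.full(nsnp, 0.05)      # sampling variances of z2
    r = np.zeros(nsnp)            # sampling correlations between z1 and z2
    # True effects with correlation corr_true
    b1 = rng.normal(0, 1, nsnp)
    b2 = corr_true * b1 + np.sqrt(1 - corr_true ** 2) * rng.normal(0, 1, nsnp)
    # Add independent sampling noise
    z1 = b1 + rng.normal(0, np.sqrt(v1))
    z2 = b2 + rng.normal(0, np.sqrt(v2))
    est, est_SE, delete_vals = jacknife_est(z1, z2, v1, v2, r, l, n_blocks=100)
    # est[0] is the correlation estimate; should be close to corr_true
    return est[0], est_SE[0]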