Source code for snipar.pgs

from snipar.gtarray import gtarray
import numpy as np
from snipar.read import get_gts_matrix
from snipar.utilities import *
from scipy.optimize import fmin_l_bfgs_b
from numba import njit, prange
import numpy.ma as ma
import snipar.slmm as slmm
from pysnptools.snpreader import Bed
import numdifftools as nd
from snipar.pgs_am import *

[docs]class pgs(object): """Define a polygenic score based on a set of SNPs with weights and ref/alt allele pairs. Args: snp_ids : :class:`~numpy:numpy.array` [L] vector of SNP ids weights : :class:`~numpy:numpy.array` [L] vector of weights of equal length to snp_ids alleles : :class:`~numpy:numpy.array` [L x 2] matrix of ref and alt alleles for the SNPs. L must match size of snp_ids Returns: pgs : :class:`snipar.pgs` """ def __init__(self,snp_ids,weights,alleles): if snp_ids.shape[0] == weights.shape[0] and alleles.shape[0] == weights.shape[0] and alleles.shape[1]==2: self.snp_ids = snp_ids self.snp_dict = make_id_dict(snp_ids) self.weights = weights self.alleles = alleles else: raise ValueError('All inputs must have the same dimension')
[docs] def remove_zeros(self): zero_weight = self.weights==0 n_zero = np.sum(zero_weight) if n_zero>0: print('Filtering '+str(n_zero)+' SNPs with zero weight') self.snp_ids = self.snp_ids[~zero_weight] self.snp_dict = make_id_dict(self.snp_ids) self.weights = self.weights[~zero_weight] self.alleles = self.alleles[~zero_weight] else: print('No zero weight SNPs found')
[docs] def compute(self, garray, cols=None): """Compute polygenic score values from a given genotype array. Finds the SNPs in the genotype array that have weights in the pgs and matching alleles, and computes the PGS based on these SNPs and the weights after allele-matching. Args: garray : :class:`sbreg.gtarray` genotype array to compute PGS values for cols : :class:`numpy:numpy.array` names to give the columns in the output gtarray Returns: pg : :class:`snipar.gtarray` 2d gtarray with PGS values. If a 3d gtarray is input, then each column corresponds to the second dimension on the input gtarray (for example, individual, paternal, maternal PGS). If a 2d gtarray is input, then there will be only one column in the output gtarray. The names given in 'cols' are stored in 'sid' attribute of the output. """ if not type(garray) == gtarray: raise ValueError('Must be of gtarray class') if garray.alleles is None: raise ValueError('Alleles of genotype matrix must be provided') # Match SNP IDs in_pgs_snps = np.array([x in self.snp_dict for x in garray.sid]) nmatch = np.sum(in_pgs_snps) if nmatch==0: print('No overlap between PGS SNPs and genotype SNPs') return None else: # Get weights matched_snps = garray.sid[in_pgs_snps] matched_alleles = garray.alleles[in_pgs_snps,:] snp_indices = np.zeros((nmatch),dtype=int) for i in range(0,nmatch): snp_indices[i] = self.snp_dict[matched_snps[i]] weights_compute = self.weights[snp_indices] alleles = self.alleles[snp_indices,:] # Match alleles and adjust weights a_match = np.logical_and(alleles[:,0] == matched_alleles[:, 0], alleles[:,1] == matched_alleles[:, 1]) a_reverse = np.logical_and(alleles[:,0] == matched_alleles[:, 1], alleles[:,1] == matched_alleles[:, 0]) a_nomatch = np.logical_and(np.logical_not(a_match), np.logical_not(a_reverse)) n_nomatch = np.sum(a_nomatch) if n_nomatch > 0: print('Removing ' + str(n_nomatch) + ' SNPs due to allele mismatch between genotypes and PGS alleles') weights_compute[a_nomatch] = 0 weights_compute[a_reverse] = -weights_compute[a_reverse] ### Compute PGS if garray.ndim == 2: geno_means = np.mean(garray.gts[:, in_pgs_snps],axis=0) garray.gts = garray.gts[:, in_pgs_snps]-geno_means pgs_val = ma.dot(garray.gts[:, in_pgs_snps],weights_compute) elif garray.ndim == 3: geno_means = np.mean(garray.gts[:, 0, in_pgs_snps], axis=0) pgs_val = np.zeros((garray.gts.shape[0], garray.gts.shape[1]), garray.dtype) for i in range(0, garray.gts.shape[1]): if cols[i]=='parental': sf = 2.0 else: sf = 1.0 garray.gts[:, i, in_pgs_snps] = garray.gts[:, i, in_pgs_snps]-sf*geno_means pgs_val[:, i] = ma.dot(garray.gts[:, i, in_pgs_snps], weights_compute) return pgarray(pgs_val, garray.ids, sid=cols, fams=garray.fams, par_status=garray.par_status, ped=garray.ped)
[docs]def read_weights(weights, SNP='SNP', beta_col='b', A1='A1', A2='A2', sep=None): if sep is None: weights = np.loadtxt(weights,dtype=str) else: weights = np.loadtxt(weights,dtype=str, delimiter=sep) colnames = weights[0,:] weights = weights[1:weights.shape[0],:] print('Read weights for '+str(weights.shape[0])+' variants') beta = np.array(weights[:,np.where(colnames == beta_col)[0][0]],dtype=np.float64) allele_indices = np.array([np.where(colnames==A1)[0][0],np.where(colnames==A2)[0][0]]) return pgs(weights[:,np.where(colnames==SNP)[0][0]], beta, weights[:,allele_indices])
[docs]def read_pgs(pgs_file): pgs_f = open(pgs_file,'r') pgs_header = pgs_f.readline().split(' ') pgs_header[len(pgs_header)-1] = pgs_header[len(pgs_header)-1].split('\n')[0] ncols = len(pgs_header) if pgs_header[0]=='FID' and pgs_header[1]=='IID': precols = 2 else: raise(ValueError('First two columns of PGS file must be FID and IID')) if pgs_header[2]=='FATHER_ID' and pgs_header[3]=='MOTHER_ID': precols = 4 pgs_cols = tuple([x for x in range(precols,ncols)]) # Read pedigree if precols==4: ped = np.loadtxt(pgs_file,usecols=tuple([x for x in range(4)]),dtype=str, skiprows=1) else: ped = None if ped is not None: par_status = np.array(ped[:,2:4]=='NA',dtype=int) else: par_status = None pg = pgarray(np.genfromtxt(pgs_file,usecols = pgs_cols, skip_header=1, missing_values=['nan','NA','N/A']), np.loadtxt(pgs_file,usecols = 1, dtype=str, skiprows=1), sid=np.array(pgs_header[precols:ncols]), fams=np.loadtxt(pgs_file,usecols = 0, dtype=str, skiprows=1), par_status = par_status, ped=ped) return pg
[docs]def compute(pgs, bedfile=None, bgenfile=None, par_gts_f=None, ped=None, sib=False, compute_controls=False, verbose=True, batch_size=None): """Compute a polygenic score (PGS) for the individuals with observed genotypes and observed/imputed parental genotypes. Args: par_gts_f : :class:`str` path to HDF5 file with imputed parental genotypes gts_f : :class:`str` path to bed file with observed genotypes pgs : :class:`snipar.pgs` the PGS, defined by the weights for a set of SNPs and the alleles of those SNPs sib : :class:`bool` Compute the PGS for genotyped individuals with at least one genotyped sibling and observed/imputed parental genotypes. Default False. compute_controls : :class:`bool` Compute polygenic scores for control families (families with observed parental genotypes set to missing). Default False. Returns: pg : :class:`snipar.gtarray` Return the polygenic score as a genotype array with columns: individual's PGS, mean of their siblings' PGS, observed/imputed paternal PGS, observed/imputed maternal PGS """ # Check for SNP overlap if bedfile is not None: bed = Bed(bedfile, count_A1=True) snp_ids = bed.sid if bgenfile is not None: bgen = open_bgen(bgenfile) snp_ids = bgen.ids if np.unique(snp_ids).shape[0] == 1: snp_ids = bgen.rsids pgs_snp_set = set(pgs.snp_ids) in_snp_set = np.array([x in pgs_snp_set for x in snp_ids]) if np.sum(in_snp_set)==0: print('No overlap between variants in weights file and observed genotypes') return None else: ## Get genotype matrix snps_in_pgs = snp_ids[in_snp_set] # Get batch size if batch_size is not None: batch_size = min(batch_size, snps_in_pgs.shape[0]) else: batch_size = snps_in_pgs.shape[0] # Set batch boundaries n_batches = int(np.ceil(snps_in_pgs.shape[0]/batch_size)) batch_boundaries = np.zeros((n_batches,2),dtype=int) for i in range(n_batches-1): batch_boundaries[i,:] = [i*batch_size,(i+1)*batch_size] batch_boundaries[n_batches-1,:] = [(n_batches-1)*batch_size,snps_in_pgs.shape[0]] ## Compute PGS, reading snps in batches # First batch G = get_gts_matrix(bedfile=bedfile, bgenfile=bgenfile, par_gts_f=par_gts_f, ped=ped, snp_ids=snps_in_pgs[batch_boundaries[0,0]:batch_boundaries[0,1]], sib=sib, compute_controls=compute_controls, verbose=verbose) if sib: if G.shape[1]==4: cols = np.array(['proband', 'sibling', 'paternal', 'maternal']) else: cols = np.array(['proband','sibling','parental']) else: if G.shape[1]==3: cols = np.array(['proband', 'paternal', 'maternal']) else: cols = np.array(['proband','parental']) if compute_controls: pgs_out = [pgs.compute(x,cols) for x in G[0:3]] if sib: o_cols = np.array(['proband', 'sibling', 'parental']) else: o_cols = np.array(['proband','parental']) pgs_out.append(pgs.compute(G[3], o_cols)) else: pgs_out = pgs.compute(G,cols) # Remaining batches for i in range(1,n_batches): del G G = get_gts_matrix(bedfile=bedfile, bgenfile=bgenfile, par_gts_f=par_gts_f, ped=ped, snp_ids=snps_in_pgs[batch_boundaries[i,0]:batch_boundaries[i,1]], sib=sib, compute_controls=compute_controls, verbose=False) if compute_controls: pgs_out_i = [pgs.compute(x,cols) for x in G[0:3]] if sib: o_cols = np.array(['proband', 'sibling', 'parental']) else: o_cols = np.array(['proband','parental']) pgs_out_i.append(pgs.compute(G[3], o_cols)) pgs_out = [pgs_out[x].add(pgs_out_i[x]) for x in range(0, len(pgs_out))] else: pgs_out = pgs_out.add(pgs.compute(G,cols)) return pgs_out
[docs]class pgarray(gtarray):
[docs] def add(self,garray): """ Adds another gtarray of the same dimension to this array and returns the sum. It matches IDs before summing. """ if type(garray)==pgarray: pass else: raise ValueError('Must add to another pgarray') if not self.gts.ndim == garray.gts.ndim: raise ValueError('Arrays must have same number of dimensions') if self.gts.ndim == 2: if not self.gts.shape[1] == garray.gts.shape[1]: raise ValueError('Arrays must have same dimensions (apart from first)') if self.gts.ndim == 3: if not self.gts.shape[1:3] == garray.gts.shape[1:3]: raise ValueError('Arrays must have same dimensions (apart from first)') # Match IDs common_ids = list(self.id_dict.keys() & garray.id_dict.keys()) if len(common_ids) == 0: raise ValueError('No IDs in common') self_index = np.array([self.id_dict[x] for x in common_ids]) other_index = np.array([garray.id_dict[x] for x in common_ids]) # Out if self.ids.ndim == 1: ids_out = self.ids[self_index] else: ids_out = self.ids[self_index, :] if self.gts.ndim ==2: add_gts = self.gts[self_index, :]+garray.gts[other_index, :] else: add_gts = self.gts[self_index, :, :] + garray.gts[other_index, :, :] if self.ped is None: ped = garray.ped else: ped = self.ped return pgarray(add_gts, ids_out, self.sid, alleles=self.alleles, fams=self.fams[self_index], par_status=self.par_status[self_index,:], ped=ped)
[docs] def filter_bpg(self): if self.par_status is None: raise(ValueError('Parental genotype status unknown so cannot restrict to both parents genotyped sample')) else: bpg_ids = self.ids[np.sum(self.par_status==0, axis=1)==2] self.filter_ids(bpg_ids)
[docs] def estimate_r(self, return_se=True, parents_only=False): # Check pgs columns if 'paternal' in self.sid: paternal_index = np.where(self.sid=='paternal')[0][0] if 'maternal' in self.sid: maternal_index = np.where(self.sid=='maternal')[0][0] # count fams fams = np.unique(self.fams, return_counts=True) # Mapping of sibships to pgs rows fam_dict = {} for fam in fams[0]: fam_dict[fam] = np.where(self.fams==fam)[0] # record genotype status of parents in each fam par_status_fams = np.zeros((fams[0].shape[0],2),dtype=int) for i in range(fams[0].shape[0]): par_status_fams[i,:] = self.par_status[fam_dict[fams[0][i]][0]] # get family sizes including parents fsizes = fams[1]+np.sum(par_status_fams==0,axis=1) ## pgs matrix with observed sibs and parents for each fam pgs_fam = np.zeros((fams[0].shape[0],np.max(fsizes))) pgs_fam[:] = np.nan # record whether sib or parent is_sib = np.zeros((fams[0].shape[0],np.max(fsizes)),dtype=bool) is_father = np.zeros((fams[0].shape[0],np.max(fsizes)),dtype=bool) is_mother = np.zeros((fams[0].shape[0],np.max(fsizes)),dtype=bool) # populate for i in range(fams[0].shape[0]): # Fill in sibs sib_indices = fam_dict[fams[0][i]] pgs_fam[i,0:sib_indices.shape[0]] = self.gts[sib_indices,0] is_sib[i,0:sib_indices.shape[0]] = True npar = 0 if par_status_fams[i,0] == 0: pgs_fam[i,sib_indices.shape[0]] = self.gts[sib_indices[0],paternal_index] is_father[i,sib_indices.shape[0]] = True npar = 1 if par_status_fams[i,1] == 0: pgs_fam[i,sib_indices.shape[0]+npar] = self.gts[sib_indices[0],maternal_index] is_mother[i,sib_indices.shape[0]+npar] = True is_parent = np.logical_or(is_father,is_mother) # normalize pgs_fam[is_sib] = (pgs_fam[is_sib]-np.mean(pgs_fam[is_sib]))/np.std(pgs_fam[is_sib]) if np.sum(is_mother)>0: pgs_fam[is_mother] = (pgs_fam[is_mother]-np.mean(pgs_fam[is_mother]))/np.std(pgs_fam[is_mother]) if np.sum(is_father)>0: pgs_fam[is_father] = (pgs_fam[is_father]-np.mean(pgs_fam[is_father]))/np.std(pgs_fam[is_father]) # fam size dict fsizes = dict(zip(list(fams[0]),list(fams[1]))) ### find correlation between maternal and paternal pgis print('Finding MLE for correlation between parents scores') ## Initialize with correlation from sibs and between parents # correlation between parents bpg = np.sum(self.par_status==0,axis=1)==2 n_bpg = np.sum(bpg) if n_bpg>0: r_bpg = np.corrcoef(self.gts[bpg,1],self.gts[bpg,2])[0,1] else: r_bpg = 0 if parents_only: if n_bpg>0: r_se = (1-r_bpg**2)/np.sqrt(n_bpg-1) return r_bpg, r_se, fsizes else: return np.nan, np.nan, fsizes else: # Correlation from sibs sib_2 = np.sum(is_sib,axis=1)>1 n_sib_2 = np.sum(sib_2) if n_sib_2>0: r_sib = 2*np.corrcoef(pgs_fam[sib_2,0],pgs_fam[sib_2,1])[0,1]-1 else: r_sib = 0 # Initialize at weighted average r_init = n_bpg*r_bpg+n_sib_2*r_sib r_init = r_init/(n_bpg+n_sib_2) # Find MLE optimized = fmin_l_bfgs_b(func=pgs_cor_lik, x0=r_init, args=(pgs_fam, is_sib, is_parent), approx_grad=True, bounds=[(-0.999,0.999)]) if optimized[2]['warnflag']==0: r=optimized[0][0] hess = nd.Hessian(pgs_cor_lik) hess = float(hess([r], pgs_fam, is_sib, is_parent)) r_se = np.sqrt(2/hess) else: print('Could not find MLE for correlation. Returning weighted average from siblings and parents.') r=r_init r_se = (1-r**2)/np.sqrt(n_bpg+n_sib_2-1) return r, r_se, fsizes
[docs] def am_adj(self): print('Estimating correlation between maternal and paternal PGSs assuming equilibrium') r, r_se, fsizes = self.estimate_r() r_z = r/r_se print('Estimated correlation between maternal and paternal PGSs: '+str(round(r,4))+' S.E.='+str(round(r_se,4))) if r_z>1.5: # Check pgs columns if 'paternal' in self.sid and 'maternal' in self.sid: paternal_index = np.where(self.sid=='paternal')[0][0] maternal_index = np.where(self.sid=='maternal')[0][0] parental_index = None elif 'parental' in self.sid: parental_index = np.where(self.sid=='parental')[0][0] else: raise(ValueError('No parental PGS values to adjust')) # Adjust imputed parental PGSs npar = np.sum(self.par_status==0,axis=1) print('Adjuting imputed PGSs for assortative mating') for i in range(self.gts.shape[0]): # No parents genotyped if npar[i]==0: if parental_index is None: self.gts[i,[paternal_index, maternal_index]] = npg_am_adj(r,fsizes[self.fams[i]])*self.gts[i,[paternal_index, maternal_index]] else: self.gts[i,parental_index] = npg_am_adj(r,fsizes[self.fams[i]])*self.gts[i,parental_index] # One parent genotyped if npar[i]==1: # Father imputed if self.par_status[i,0] == 1: self.gts[i,paternal_index] = opg_am_adj(self.gts[i,paternal_index],self.gts[i,maternal_index],r,fsizes[self.fams[i]]) # Mother imputed if self.par_status[i,1] == 1: self.gts[i,maternal_index] = opg_am_adj(self.gts[i,maternal_index],self.gts[i,paternal_index],r,fsizes[self.fams[i]]) else: print('Signal to noise ratio for correlation estimate too small, so not performing assortative mating adjustment. Use --force_am_adj to override') return r, r_se
[docs] def scale(self, sf=None): if sf is None: if 'proband' in self.sid: proband_index = np.where(self.sid=='proband')[0][0] else: raise(ValueError('Cannot scale as no proband column found and no scale factor provided')) sf = np.std(self.gts[:, proband_index]) # Rescale by observed proband PGS self.gts = self.gts / sf return sf
[docs] def write(self, filename, scale=False): if scale: self.scale() ####### Write PGS to file ######## parent_genotyped = self.par_status == 0 ped_dict = make_id_dict(self.ped,1) ped_indices = np.array([ped_dict[x] for x in self.ids]) parent_ids = self.ped[ped_indices,2:4] parent_ids[~parent_genotyped] = 'NA' pg_out = np.column_stack((self.fams,self.ids,parent_ids,self.gts)) pg_header = np.column_stack((np.array(['FID','IID','FATHER_ID','MOTHER_ID']).reshape(1,4),self.sid.reshape(1,self.sid.shape[0]))) pg_out = np.row_stack((pg_header,pg_out)) print('Writing PGS to ' + filename) np.savetxt(filename, pg_out, fmt='%s') return pg_out
[docs] def compute_grandpar(self, r): # Check we have required information if self.ped is None: raise(ValueError('Pedigree needed for grandparental model')) if self.par_status is None: raise(ValueError('Parental genotype status needed to compute grandparents')) # Check pgs columns if 'paternal' in self.sid: paternal_index = np.where(self.sid=='paternal')[0][0] else: raise(ValueError('No paternal PGS column found')) if 'maternal' in self.sid: maternal_index = np.where(self.sid=='maternal')[0][0] else: raise(ValueError('No maternal PGS column found')) # Find individuals with both parents genotyped bpg = np.sum(self.par_status==0,axis=1)==2 n_bpg = np.sum(bpg) print('Found '+str(n_bpg)+' individuals with both parents genotyped for which grandparental scores will be computed') ## Create grandparental PGS matrix gpar = np.zeros((self.gts.shape[0],4)) gpar[:] = np.nan if n_bpg>0: # Find their parents' IDs ped_dict = make_id_dict(self.ped,1) bpg_ped = self.ped[[ped_dict[x] for x in self.ids[bpg]],:] # Find the mean of the mothers and fathers parent_means = np.mean(self.gts[bpg,:],axis=0)[[paternal_index, maternal_index]] ## Fill for i in range(bpg_ped.shape[0]): i_index = self.id_dict[bpg_ped[i,1]] ## Paternal grandparental scores # Find imputed grandparents if bpg_ped[i,2] in self.id_dict: gpar[i_index, 0:2] = self.gts[self.id_dict[bpg_ped[i,2]],[paternal_index,maternal_index]] else: # linear imputation from father if no imputed gpar[i_index, 0:2] = parent_means[0]+(1+r)*(self.gts[i_index, paternal_index]-parent_means[0])/2.0 ## Maternal grandparental scores # Find imputed grandparents if bpg_ped[i,3] in self.id_dict: gpar[i_index, 2:4] = self.gts[self.id_dict[bpg_ped[i,3]],[paternal_index,maternal_index]] else: # linear imputation from mother if no imputed gpar[i_index, 2:4] = parent_means[1]+(1+r)*(self.gts[i_index, maternal_index]-parent_means[1])/2.0 ## Append to PGS matrix self.gts = np.hstack((self.gts,gpar)) self.sid = np.hstack((self.sid,np.array(['gpp','gpm','gmp','gmm']))) return bpg_ped
[docs]def opg_am_adj(pgi_imp, pgi_obs, r, n): rcoef = r/((2**n)*(1+r)-r) return rcoef*pgi_obs+(1+rcoef)*pgi_imp
[docs]def npg_am_adj(r,n): rcoef = (1+r)/(1+(1-(1/2)**(n-1))*r) return rcoef
[docs]@njit def pgs_corr_matrix(r,is_sib_fam,is_parent_fam): n_sib = np.sum(is_sib_fam) n_par = np.sum(is_parent_fam) # Full matrix R = np.zeros((n_sib+n_par,n_sib+n_par), dtype=np.float_) np.fill_diagonal(R, 1.0) # sib submatrix r_sib = (1+r)/2 R_sib = r_sib*np.ones((n_sib,n_sib),dtype=np.float_) np.fill_diagonal(R_sib,1.0) R[0:n_sib,0:n_sib] = R_sib # sib to parent if n_par>0: R[0:n_sib,n_sib:(n_sib+n_par)] = r_sib R[n_sib:(n_sib+n_par),0:n_sib] = r_sib # parent-to-parent if n_par>1: R[n_sib+n_par-1,n_sib+n_par-2] = r R[n_sib+n_par-2,n_sib+n_par-1] = r # return return R
[docs]@njit def pgs_corr_likelihood_fam(r,pg_fam,is_sib_fam,is_parent_fam): sib_or_parent = np.logical_or(is_sib_fam,is_parent_fam) R = pgs_corr_matrix(r,is_sib_fam,is_parent_fam) slogdet_R = np.linalg.slogdet(R) pg_vec = pg_fam[sib_or_parent].reshape((np.sum(sib_or_parent),1)) L_fam = slogdet_R[0]*slogdet_R[1]+pg_vec.T @ np.linalg.inv(R) @ pg_vec return L_fam[0,0]
[docs]@njit(parallel=True) def pgs_corr_likelihood(r,pgs_fam,is_sib,is_parent): L = 0.0 for i in prange(pgs_fam.shape[0]): L += pgs_corr_likelihood_fam(r, pgs_fam[i,:], is_sib[i,:], is_parent[i,:]) return L
[docs]def pgs_cor_lik(r, *args): pgs_fam, is_sib, is_parent = args return pgs_corr_likelihood(r[0], pgs_fam, is_sib, is_parent)
[docs]def simulate_r_inf(r, nfam, nsib, npar): is_sib = np.zeros((nfam,nsib+npar),dtype=np.bool_) is_sib[:,0:nsib] = True is_parent = np.zeros((nfam,nsib+npar),dtype=np.bool_) is_parent[:,nsib:(nsib+npar)] = True R = pgs_corr_matrix(r,is_sib[0,:],is_parent[0,:]) pgs_fam = np.random.multivariate_normal(np.zeros((nsib+npar)), R, size=nfam) # correlation between parents if npar==2: bpg = np.ones((nfam),dtype=bool) else: bpg = np.zeros((nfam),dtype=bool) n_bpg = np.sum(bpg) if n_bpg>0: r_bpg = np.corrcoef(pgs_fam[:,nsib],pgs_fam[:,nsib+1])[0,1] else: r_bpg = 0 # Correlation from sibs if nsib>1: sib_2 = np.ones((nfam),dtype=bool) else: sib_2 = np.zeros((nfam),dtype=bool) n_sib_2 = np.sum(sib_2) if n_sib_2>0: r_sib = 2*np.corrcoef(pgs_fam[sib_2,0],pgs_fam[sib_2,1])[0,1]-1 else: r_sib = 0 # Initialize at weighted average r_init = n_bpg*r_bpg+n_sib_2*r_sib r_init = r_init/(n_bpg+n_sib_2) # Find MLE optimized = fmin_l_bfgs_b(func=pgs_cor_lik, x0=r_init, args=(pgs_fam, is_sib, is_parent), approx_grad=True, bounds=[(-0.999,0.999)]) return np.array([r_init,optimized[0][0]])
[docs]def fit_pgs_model(y, pg, ngen, ibdrel_path=None, covariates=None, fit_sib=False, parsum=False, gparsum=False, outprefix=None, sparse_thresh=0.025): pg.gts = ma.array(pg.gts,fill_value=np.nan) pg.gts = pg.gts.filled() if ngen in [1,2,3]: pass else: raise(ValueError('ngen must be 1, 2, or 3')) # Check if IDs are aligned if not np.array_equal(y.ids,pg.ids): pg.filter_ids(y.ids) y.filter_ids(pg.ids) # Check covariate IDs are aligned if covariates is not None: if not np.array_equal(pg.ids,covariates.ids): covariates.filter_ids(pg.ids) ## Fit model if ngen==1: print('Fitting 1 generation model (proband only)') alpha, alpha_cols = make_and_fit_model(y, pg, ['proband'], ibdrel_path=ibdrel_path, covariates=covariates, sparse_thresh=sparse_thresh) elif ngen==2 or ngen==3: if fit_sib: if 'sib' in pg.sid: pg_cols = ['proband','sibling'] else: raise(ValueError('Sibling PGS not found (use --fit_sib when calculating PGS)')) else: pg_cols = ['proband'] if parsum: if 'maternal' in pg.sid and 'paternal' in pg.sid: parcols = np.sort(np.array([np.where(pg.sid=='maternal')[0][0],np.where(pg.sid=='paternal')[0][0]])) trans_matrix = np.identity(pg.gts.shape[1]) trans_matrix[:,parcols[0]] += trans_matrix[:,parcols[1]] trans_matrix = np.delete(trans_matrix,parcols[1],1) pg.gts = pg.gts.dot(trans_matrix) pg.sid = np.delete(pg.sid,parcols[1]) pg.sid[parcols[0]] = 'parental' elif 'parental' in pg.sid: pass else: raise(ValueError('Maternal and paternal PGS not found so cannot sum (--parsum option given)')) pg_cols.append('parental') else: pg_cols += ['paternal','maternal'] if ngen==2: print('Fitting 2 generation model (proband and observed/imputed parents)') alpha, alpha_cols = make_and_fit_model(y, pg, pg_cols, ibdrel_path=ibdrel_path, covariates=covariates, sparse_thresh=sparse_thresh) elif ngen==3: print('Fitting 3 generation model: observed proband and observed parents, and observed/imputed grandparents') if gparsum: if 'gpp' in pg.sid and 'gpm' in pg.sid and 'gmp' in pg.sid and 'gmm' in pg.sid: # Sum of paternal grandparents gparcols = np.sort(np.array([np.where(pg.sid==x)[0][0] for x in ['gpp','gpm']])) trans_matrix = np.identity(pg.gts.shape[1]) trans_matrix[:,gparcols[0]] += trans_matrix[:,gparcols[1]] trans_matrix = np.delete(trans_matrix,gparcols[1],1) pg.gts = pg.gts.dot(trans_matrix) pg.sid = np.delete(pg.sid,gparcols[1]) pg.sid[gparcols[0]] = 'gp' # Sum of maternal grandparents gparcols = np.sort(np.array([np.where(pg.sid==x)[0][0] for x in ['gmp','gmm']])) trans_matrix = np.identity(pg.gts.shape[1]) trans_matrix[:,gparcols[0]] += trans_matrix[:,gparcols[1]] trans_matrix = np.delete(trans_matrix,gparcols[1],1) pg.gts = pg.gts.dot(trans_matrix) pg.sid = np.delete(pg.sid,gparcols[1]) pg.sid[gparcols[0]] = 'gm' elif 'gp' in pg.sid and 'gm' in pg.sid: pass else: raise(ValueError('Grandparental PGSs not found so cannot sum (--gparsum option given)')) pg_cols += ['gp','gm'] else: pg_cols += ['gpp','gpm','gmp','gmm'] alpha, alpha_cols = make_and_fit_model(y, pg, pg_cols, ibdrel_path=ibdrel_path, covariates=covariates, sparse_thresh=sparse_thresh) # Save to file if outprefix is not None: write_estimates(outprefix+'.'+str(ngen), alpha, alpha_cols) return alpha, alpha_cols
[docs]def make_and_fit_model(y, pg, pg_cols, ibdrel_path=None, covariates=None, sparse_thresh=0.025): pg_col_indices = [np.where(pg.sid==x)[0][0] for x in pg_cols] if covariates is not None: X = np.hstack((covariates.gts,np.array(pg.gts[:,pg_col_indices]))) X_cols = np.hstack((np.array(['intercept']),covariates.sid,pg_cols)) else: X = np.array(pg.gts[:, pg_col_indices]) X_cols = np.hstack((np.array(['intercept']),pg_cols)) # Check for NAs no_NA = np.sum(np.isnan(X),axis=1)==0 print('Sample size: '+str(np.sum(no_NA))) ## Read GRM id_dict = make_id_dict(pg.ids[no_NA]) varcomp_lst = make_grms(pg.fams[no_NA], ibdrel_path=ibdrel_path, id_dict=id_dict, keep=pg.ids[no_NA], sparse_thresh=sparse_thresh) ## Define LMM slmm_model = slmm.LinearMixedModel(np.array(y.gts[no_NA,0],dtype=float), varcomp_arr_lst=varcomp_lst, covar_X=np.array(X[no_NA,:],dtype=float), add_intercept=True) # Optimize Lmm slmm_model.scipy_optimize() # Print variance components print(f'Variance components: {list(i for i in slmm_model.varcomps)}') ZT_Vinv_Z_imp = slmm_model.Z.T @ slmm_model.Vinv_Z alpha = [np.linalg.solve(ZT_Vinv_Z_imp, slmm_model.Z.T @ slmm_model.Vinv_y), np.linalg.inv(ZT_Vinv_Z_imp)] return alpha, X_cols
[docs]def write_estimates(outprefix, alpha, cols): cols = cols.reshape((cols.shape[0],1)) # Find cols alpha_out = np.zeros((alpha[0].shape[0], 2)) alpha_out[:, 0] = alpha[0] alpha_out[:, 1] = np.sqrt(np.diag(alpha[1])) print('Saving effect estimates to '+outprefix+ '.effects.txt') np.savetxt(outprefix + '.effects.txt', np.hstack((cols, np.array(alpha_out, dtype='S'))), delimiter='\t', fmt='%s') vcols = np.zeros((1,cols.shape[0]+1),dtype=cols.dtype) vcols[0,1:vcols.shape[1]] = cols[:,0] alpha_cov_out = np.vstack((vcols, np.hstack((cols,alpha[1])))) print('Saving sampling variance-covariance matrix to '+outprefix+ '.vcov.txt') np.savetxt(outprefix+ '.vcov.txt',alpha_cov_out, fmt='%s')
[docs]def make_grms(fams, ibdrel_path=None, id_dict=None, keep=None, sparse_thresh=0.05): if ibdrel_path is not None: grm_data, grm_row_ind, grm_col_ind = slmm.build_ibdrel_arr( ibdrel_path, id_dict=id_dict, keep=keep, thres=sparse_thresh) ## Build sparse sib GRM sib_data, sib_row_ind, sib_col_ind = slmm.build_sib_arr(fams) ## GRM list if 'grm_data' in locals(): varcomp_lst = ( (grm_data, grm_row_ind, grm_col_ind), (sib_data, sib_row_ind, sib_col_ind), ) else: varcomp_lst = ( (sib_data, sib_row_ind, sib_col_ind), ) return varcomp_lst
######## Assortative mating adjustment in two-generation model ###########
[docs]def h2f_parse(h2f_str): h2f_split = h2f_str.split(',') if not len(h2f_split)==2: raise(ValueError('Invalid h2f input. Please use estimate,se format, such as 0.5,0.01.')) h2f = float(h2f_split[0]) h2f_se = float(h2f_split[1].split('\n')[0]) return h2f, h2f_se
[docs]def am_adj_2gen_calc(delta, delta_se, ab, ab_se, r_delta_ab, h2f, h2f_se, rk, rk_se, is_beta=False, verbose=True): # Get estimates if is_beta: estimates = v_eta_delta(delta, delta_se, h2f, h2f_se, rk, beta=ab) estimates['beta'] = ab else: estimates = v_eta_delta(delta, delta_se, h2f, h2f_se, rk, alpha=ab) estimates['alpha'] = ab estimates['rk'] = rk estimates['delta'] = delta # Get ses ses = {'rk':rk_se, 'k':k_se(delta, delta_se, h2f, h2f_se, rk, rk_se), 'r':r_se(delta, delta_se, h2f, h2f_se, rk, rk_se), 'h2_eq':h2eq_se(delta, delta_se, h2f, h2f_se, rk, rk_se), 'rho':rho_se(delta, delta_se, h2f, h2f_se, rk, rk_se), 'delta':delta_se, 'v_eta_delta':se_v_eta_delta(delta, delta_se, ab, ab_se, r_delta_ab, h2f, h2f_se, rk, rk_se, is_beta=is_beta)} if is_beta: ses['alpha_delta'] = se_alpha_from_beta(delta, delta_se, ab, ab_se, r_delta_ab, h2f, h2f_se, rk, rk_se) ses['beta'] = ab_se ses['r_delta_beta'] = r_delta_ab else: ses['alpha_delta'] = se_alpha_from_alpha(delta, delta_se, ab, ab_se, r_delta_ab, h2f, h2f_se, rk, rk_se) ses['alpha'] = ab_se ses['r_delta_alpha'] = r_delta_ab # Print if verbose: for par in ['rk','k','r','h2_eq','rho','alpha_delta','v_eta_delta']: print(par+': '+str(round(estimates[par],4))+' ('+str(round(ses[par],4))+')') # Return return estimates, ses
[docs]def write_2gen_adj_ests(estimates,ses, outprefix=''): pars = np.array(['delta','alpha','rk','k','r','h2_eq','rho','alpha_delta','v_eta_delta']) outarray = np.zeros((pars.shape[0],2)) parcount = 0 for par in pars: outarray[parcount,:] = [estimates[par],ses[par]] parcount += 1 outarray = np.hstack((pars.reshape((pars.shape[0],1)),outarray)) outarray = np.vstack((np.array(['parameter','estimate','SE']).reshape((1,3)),outarray)) np.savetxt(outprefix+'.am_adj_pars.txt',outarray,fmt='%s') return outarray
[docs]def am_adj_2gen(estimates, estimate_cols, h2f, h2f_se, rk=None, rk_se=None, pg=None, y_std=1, pg_std=1): """ Adjust 2-generation model results for assortative mating (assuming equilibrium) """ ##### Check we have r, or a pg object to estimate r with ##### if pg is None: if rk is None: raise(ValueError('Need estimate of correlation between parents PGIs if not providing pg object to use to estimate')) if rk_se is None: raise(ValueError('Need standard error of correlation between parents PGIs if not providing pg object to use to estimate')) ##### Find estimates and their sampling variance-covariance ##### if 'proband' in estimate_cols: proband_index = np.where(estimate_cols=='proband')[0][0] else: raise(ValueError('Proband coefficient (direct effect) estimate needed for adjustment')) # If separate maternal and paternal NTC estimates, take average if 'paternal' in estimate_cols and 'maternal' in estimate_cols: paternal_index = np.where(estimate_cols=='paternal')[0][0] maternal_index = np.where(estimate_cols=='maternal')[0][0] est_cols = [proband_index, paternal_index, maternal_index] estimate = estimates[0][est_cols].reshape((3,1)) estimate_cov = estimates[1][np.ix_(est_cols,est_cols)] A = np.array([[1,0,0],[0,0.5,0.5]]) estimate = A @ estimate estimate_cov = A @ estimate_cov @ A.T elif 'parental' in estimate_cols: est_cols = [proband_index, np.where(estimate_cols=='parental')[0][0]] estimate = estimates[0][est_cols].reshape((2,1)) estimate_cov = estimates[1][np.ix_(est_cols,est_cols)] else: raise(ValueError('Need parental NTC estimate(s) for adjustment')) # Adjust for non-normalized pgs/phenotype estimate = estimate*pg_std/y_std estimate_cov = estimate_cov*((pg_std/y_std)**2) ##### Estimate correlation between parents' PGIs ###### if rk is None: rk, rk_se, fam_sizes = pg.estimate_r(parents_only=True) ##### Estimate equilibrium quantities ###### print('Computing equilibrium adjusted quantities') adj_estimates, adj_ses = am_adj_2gen_calc(estimate[0,0], np.sqrt(estimate_cov[0,0]), estimate[1,0], np.sqrt(estimate_cov[0,0]), estimate_cov[0,1]/np.sqrt(estimate_cov[0,0]*estimate_cov[1,1]), h2f, h2f_se, rk, rk_se, verbose=True) return adj_estimates, adj_ses