Source code for snipar.pgs

from snipar.gtarray import gtarray
import numpy as np
from snipar.read import get_gts_matrix
from snipar.utilities import *
from scipy.optimize import fmin_l_bfgs_b
from numba import njit, prange
import numpy.ma as ma
import snipar.slmm as slmm
from pysnptools.snpreader import Bed
import numdifftools as nd
from snipar.pgs_am import *

[docs]class pgs(object):
    """Define a polygenic score based on a set of SNPs with weights and ref/alt allele pairs.

    Args:
        snp_ids : :class:`~numpy:numpy.array`
            [L] vector of SNP ids
        weights : :class:`~numpy:numpy.array`
            [L] vector of weights of equal length to snp_ids
        alleles : :class:`~numpy:numpy.array`
            [L x 2] matrix of ref and alt alleles for the SNPs. L must match size of snp_ids

    Returns:
        pgs : :class:`snipar.pgs`

    """
    def __init__(self,snp_ids,weights,alleles):
        if snp_ids.shape[0] == weights.shape[0] and alleles.shape[0] == weights.shape[0] and alleles.shape[1]==2:
            self.snp_ids = snp_ids
            self.snp_dict = make_id_dict(snp_ids)
            self.weights = weights
            self.alleles = alleles
        else:
            raise ValueError('All inputs must have the same dimension')
    
[docs]    def remove_zeros(self):
        zero_weight = self.weights==0
        n_zero = np.sum(zero_weight)
        if n_zero>0:
            print('Filtering '+str(n_zero)+' SNPs with zero weight')
            self.snp_ids = self.snp_ids[~zero_weight]
            self.snp_dict = make_id_dict(self.snp_ids)
            self.weights = self.weights[~zero_weight]
            self.alleles = self.alleles[~zero_weight]
        else:
            print('No zero weight SNPs found')

[docs]    def compute(self, garray, cols=None):
        """Compute polygenic score values from a given genotype array. Finds the SNPs in the genotype array
        that have weights in the pgs and matching alleles, and computes the PGS based on these SNPs and the
        weights after allele-matching.


        Args:
            garray : :class:`sbreg.gtarray`
                genotype array to compute PGS values for
            cols : :class:`numpy:numpy.array`
                names to give the columns in the output gtarray

        Returns:
            pg : :class:`snipar.gtarray`
                2d gtarray with PGS values. If a 3d gtarray is input, then each column corresponds to
                the second dimension on the input gtarray (for example, individual, paternal, maternal PGS).
                If a 2d gtarray is input, then there will be only one column in the output gtarray. The
                names given in 'cols' are stored in 'sid' attribute of the output.

        """
        if not type(garray) == gtarray:
            raise ValueError('Must be of gtarray class')
        if garray.alleles is None:
            raise ValueError('Alleles of genotype matrix must be provided')
        # Match SNP IDs
        in_pgs_snps = np.array([x in self.snp_dict for x in garray.sid])
        nmatch = np.sum(in_pgs_snps)
        if nmatch==0:
            print('No overlap between PGS SNPs and genotype SNPs')
            return None
        else:
            # Get weights
            matched_snps = garray.sid[in_pgs_snps]
            matched_alleles = garray.alleles[in_pgs_snps,:]
            snp_indices = np.zeros((nmatch),dtype=int)
            for i in range(0,nmatch):
                snp_indices[i] = self.snp_dict[matched_snps[i]]
            weights_compute = self.weights[snp_indices]
            alleles = self.alleles[snp_indices,:]

            # Match alleles and adjust weights
            a_match = np.logical_and(alleles[:,0] == matched_alleles[:, 0], alleles[:,1] == matched_alleles[:, 1])
            a_reverse = np.logical_and(alleles[:,0] == matched_alleles[:, 1], alleles[:,1] == matched_alleles[:, 0])
            a_nomatch = np.logical_and(np.logical_not(a_match), np.logical_not(a_reverse))
            n_nomatch = np.sum(a_nomatch)
            if n_nomatch > 0:
                print('Removing ' + str(n_nomatch) + ' SNPs due to allele mismatch between genotypes and PGS alleles')
                weights_compute[a_nomatch] = 0
            weights_compute[a_reverse] = -weights_compute[a_reverse]

        ### Compute PGS
        if garray.ndim == 2:
            geno_means = np.mean(garray.gts[:, in_pgs_snps],axis=0)
            garray.gts = garray.gts[:, in_pgs_snps]-geno_means
            pgs_val = ma.dot(garray.gts[:, in_pgs_snps],weights_compute)
        elif garray.ndim == 3:
            geno_means = np.mean(garray.gts[:, 0, in_pgs_snps], axis=0)
            pgs_val = np.zeros((garray.gts.shape[0], garray.gts.shape[1]), garray.dtype)
            for i in range(0, garray.gts.shape[1]):
                if cols[i]=='parental':
                    sf = 2.0                
                else:
                    sf = 1.0
                garray.gts[:, i, in_pgs_snps] = garray.gts[:, i, in_pgs_snps]-sf*geno_means
                pgs_val[:, i] = ma.dot(garray.gts[:, i, in_pgs_snps], weights_compute)
        return pgarray(pgs_val, garray.ids, sid=cols, fams=garray.fams, par_status=garray.par_status, ped=garray.ped)
        
[docs]def read_weights(weights, SNP='SNP', beta_col='b', A1='A1', A2='A2', sep=None):
    if sep is None:
        weights = np.loadtxt(weights,dtype=str)
    else:
        weights = np.loadtxt(weights,dtype=str, delimiter=sep)
    colnames = weights[0,:]
    weights = weights[1:weights.shape[0],:]
    print('Read weights for '+str(weights.shape[0])+' variants')
    beta = np.array(weights[:,np.where(colnames == beta_col)[0][0]],dtype=np.float64)
    allele_indices = np.array([np.where(colnames==A1)[0][0],np.where(colnames==A2)[0][0]])
    return pgs(weights[:,np.where(colnames==SNP)[0][0]],
            beta,
            weights[:,allele_indices])

[docs]def read_pgs(pgs_file):
    pgs_f = open(pgs_file,'r')
    pgs_header = pgs_f.readline().split(' ')
    pgs_header[len(pgs_header)-1] = pgs_header[len(pgs_header)-1].split('\n')[0]
    ncols = len(pgs_header)
    if pgs_header[0]=='FID' and pgs_header[1]=='IID':
        precols = 2
    else:
        raise(ValueError('First two columns of PGS file must be FID and IID'))
    if pgs_header[2]=='FATHER_ID' and pgs_header[3]=='MOTHER_ID':
        precols = 4
    pgs_cols = tuple([x for x in range(precols,ncols)])
    # Read pedigree
    if precols==4:
        ped = np.loadtxt(pgs_file,usecols=tuple([x for x in range(4)]),dtype=str, skiprows=1)
    else:
        ped = None
    if ped is not None:
        par_status = np.array(ped[:,2:4]=='NA',dtype=int)
    else:
        par_status = None
    pg = pgarray(np.genfromtxt(pgs_file,usecols = pgs_cols, skip_header=1, missing_values=['nan','NA','N/A']),
                    np.loadtxt(pgs_file,usecols = 1, dtype=str, skiprows=1),
                    sid=np.array(pgs_header[precols:ncols]),
                    fams=np.loadtxt(pgs_file,usecols = 0, dtype=str, skiprows=1),
                    par_status = par_status,
                    ped=ped)
    return pg


[docs]def compute(pgs, bedfile=None, bgenfile=None, par_gts_f=None, ped=None, sib=False, compute_controls=False, verbose=True, batch_size=None):
    """Compute a polygenic score (PGS) for the individuals with observed genotypes and observed/imputed parental genotypes.

    Args:
        par_gts_f : :class:`str`
            path to HDF5 file with imputed parental genotypes
        gts_f : :class:`str`
            path to bed file with observed genotypes
        pgs : :class:`snipar.pgs`
            the PGS, defined by the weights for a set of SNPs and the alleles of those SNPs
        sib : :class:`bool`
            Compute the PGS for genotyped individuals with at least one genotyped sibling and observed/imputed parental genotypes. Default False.
        compute_controls : :class:`bool`
            Compute polygenic scores for control families (families with observed parental genotypes set to missing). Default False.

    Returns:
        pg : :class:`snipar.gtarray`
            Return the polygenic score as a genotype array with columns: individual's PGS, mean of their siblings' PGS, observed/imputed paternal PGS,
            observed/imputed maternal PGS

    """
    # Check for SNP overlap
    if bedfile is not None:
        bed = Bed(bedfile, count_A1=True)
        snp_ids = bed.sid
    if bgenfile is not None:
        bgen = open_bgen(bgenfile)
        snp_ids = bgen.ids
        if np.unique(snp_ids).shape[0] == 1:
            snp_ids = bgen.rsids
    pgs_snp_set = set(pgs.snp_ids)
    in_snp_set = np.array([x in pgs_snp_set for x in snp_ids])
    if np.sum(in_snp_set)==0:
        print('No overlap between variants in weights file and observed genotypes')
        return None
    else:
        ## Get genotype matrix
        snps_in_pgs = snp_ids[in_snp_set]
        # Get batch size
        if batch_size is not None:
            batch_size = min(batch_size, snps_in_pgs.shape[0])
        else: 
            batch_size = snps_in_pgs.shape[0]
        # Set batch boundaries 
        n_batches = int(np.ceil(snps_in_pgs.shape[0]/batch_size))
        batch_boundaries = np.zeros((n_batches,2),dtype=int)
        for i in range(n_batches-1):
            batch_boundaries[i,:] = [i*batch_size,(i+1)*batch_size]
        batch_boundaries[n_batches-1,:] = [(n_batches-1)*batch_size,snps_in_pgs.shape[0]]
        ## Compute PGS, reading snps in batches
        # First batch
        G = get_gts_matrix(bedfile=bedfile, bgenfile=bgenfile, par_gts_f=par_gts_f, ped=ped, 
                           snp_ids=snps_in_pgs[batch_boundaries[0,0]:batch_boundaries[0,1]], 
                           sib=sib, compute_controls=compute_controls, verbose=verbose)
        if sib:
            if G.shape[1]==4:
                cols = np.array(['proband', 'sibling', 'paternal', 'maternal'])
            else:
                cols = np.array(['proband','sibling','parental'])
        else:
            if G.shape[1]==3:
                cols = np.array(['proband', 'paternal', 'maternal'])
            else:
                cols = np.array(['proband','parental'])
        if compute_controls:
            pgs_out = [pgs.compute(x,cols) for x in G[0:3]]
            if sib:
                o_cols = np.array(['proband', 'sibling', 'parental'])
            else:
                o_cols = np.array(['proband','parental'])
            pgs_out.append(pgs.compute(G[3], o_cols))
        else:
            pgs_out = pgs.compute(G,cols)
        # Remaining batches
        for i in range(1,n_batches):
            del G
            G = get_gts_matrix(bedfile=bedfile, bgenfile=bgenfile, par_gts_f=par_gts_f, ped=ped, 
                               snp_ids=snps_in_pgs[batch_boundaries[i,0]:batch_boundaries[i,1]], 
                               sib=sib, compute_controls=compute_controls, verbose=False)
            if compute_controls:
                pgs_out_i = [pgs.compute(x,cols) for x in G[0:3]]
                if sib:
                    o_cols = np.array(['proband', 'sibling', 'parental'])
                else:
                    o_cols = np.array(['proband','parental'])
                pgs_out_i.append(pgs.compute(G[3], o_cols))
                pgs_out = [pgs_out[x].add(pgs_out_i[x]) for x in range(0, len(pgs_out))]
            else:
                pgs_out = pgs_out.add(pgs.compute(G,cols))
        return pgs_out
        

[docs]class pgarray(gtarray):

[docs]    def add(self,garray):
        """
        Adds another gtarray of the same dimension to this array and returns the sum. It matches IDs before summing.
        """
        if type(garray)==pgarray:
            pass
        else:
            raise ValueError('Must add to another pgarray')

        if not self.gts.ndim == garray.gts.ndim:
            raise ValueError('Arrays must have same number of dimensions')

        if self.gts.ndim == 2:
            if not self.gts.shape[1] == garray.gts.shape[1]:
                raise ValueError('Arrays must have same dimensions (apart from first)')

        if self.gts.ndim == 3:
            if not self.gts.shape[1:3] == garray.gts.shape[1:3]:
                raise ValueError('Arrays must have same dimensions (apart from first)')

        # Match IDs
        common_ids = list(self.id_dict.keys() & garray.id_dict.keys())
        if len(common_ids) == 0:
            raise ValueError('No IDs in common')
        self_index = np.array([self.id_dict[x] for x in common_ids])
        other_index = np.array([garray.id_dict[x] for x in common_ids])

        # Out
        if self.ids.ndim == 1:
            ids_out = self.ids[self_index]
        else:
            ids_out = self.ids[self_index, :]

        if self.gts.ndim ==2:
            add_gts = self.gts[self_index, :]+garray.gts[other_index, :]
        else:
            add_gts = self.gts[self_index, :, :] + garray.gts[other_index, :, :]
        if self.ped is None:
            ped = garray.ped
        else:
            ped = self.ped

        return pgarray(add_gts, ids_out, self.sid, alleles=self.alleles, fams=self.fams[self_index], par_status=self.par_status[self_index,:], ped=ped)

[docs]    def filter_bpg(self):
        if self.par_status is None:
            raise(ValueError('Parental genotype status unknown so cannot restrict to both parents genotyped sample'))
        else:
            bpg_ids = self.ids[np.sum(self.par_status==0, axis=1)==2]
            self.filter_ids(bpg_ids)

[docs]    def estimate_r(self, return_se=True, parents_only=False):
        # Check pgs columns
        if 'paternal' in self.sid:
            paternal_index = np.where(self.sid=='paternal')[0][0]
        if 'maternal' in self.sid:
            maternal_index = np.where(self.sid=='maternal')[0][0]
        # count fams
        fams = np.unique(self.fams, return_counts=True)
        # Mapping of sibships to pgs rows
        fam_dict = {}
        for fam in fams[0]:
            fam_dict[fam] = np.where(self.fams==fam)[0]
        # record genotype status of parents in each fam
        par_status_fams = np.zeros((fams[0].shape[0],2),dtype=int)
        for i in range(fams[0].shape[0]):
            par_status_fams[i,:] = self.par_status[fam_dict[fams[0][i]][0]]
        # get family sizes including parents
        fsizes = fams[1]+np.sum(par_status_fams==0,axis=1)
        ## pgs matrix with observed sibs and parents for each fam
        pgs_fam = np.zeros((fams[0].shape[0],np.max(fsizes)))
        pgs_fam[:] = np.nan
        # record whether sib or parent
        is_sib = np.zeros((fams[0].shape[0],np.max(fsizes)),dtype=bool)
        is_father = np.zeros((fams[0].shape[0],np.max(fsizes)),dtype=bool)
        is_mother = np.zeros((fams[0].shape[0],np.max(fsizes)),dtype=bool)
        # populate
        for i in range(fams[0].shape[0]):
            # Fill in sibs
            sib_indices = fam_dict[fams[0][i]]
            pgs_fam[i,0:sib_indices.shape[0]] = self.gts[sib_indices,0]
            is_sib[i,0:sib_indices.shape[0]] = True
            npar = 0
            if par_status_fams[i,0] == 0:
                pgs_fam[i,sib_indices.shape[0]] = self.gts[sib_indices[0],paternal_index]
                is_father[i,sib_indices.shape[0]] = True
                npar = 1
            if par_status_fams[i,1] == 0:
                pgs_fam[i,sib_indices.shape[0]+npar] = self.gts[sib_indices[0],maternal_index]
                is_mother[i,sib_indices.shape[0]+npar] = True
        is_parent = np.logical_or(is_father,is_mother)
        # normalize
        pgs_fam[is_sib] = (pgs_fam[is_sib]-np.mean(pgs_fam[is_sib]))/np.std(pgs_fam[is_sib])
        if np.sum(is_mother)>0:
            pgs_fam[is_mother] = (pgs_fam[is_mother]-np.mean(pgs_fam[is_mother]))/np.std(pgs_fam[is_mother])
        if np.sum(is_father)>0:
            pgs_fam[is_father] = (pgs_fam[is_father]-np.mean(pgs_fam[is_father]))/np.std(pgs_fam[is_father])
        # fam size dict
        fsizes = dict(zip(list(fams[0]),list(fams[1])))
        ### find correlation between maternal and paternal pgis
        print('Finding MLE for correlation between parents scores')
        ## Initialize with correlation from sibs and between parents
        # correlation between parents
        bpg = np.sum(self.par_status==0,axis=1)==2
        n_bpg = np.sum(bpg)
        if n_bpg>0:
            r_bpg = np.corrcoef(self.gts[bpg,1],self.gts[bpg,2])[0,1]
        else:
            r_bpg = 0
        if parents_only:
            if n_bpg>0:
                r_se = (1-r_bpg**2)/np.sqrt(n_bpg-1)
                return r_bpg, r_se, fsizes
            else:
                return np.nan, np.nan, fsizes
        else:
            # Correlation from sibs
            sib_2 = np.sum(is_sib,axis=1)>1
            n_sib_2 = np.sum(sib_2)
            if n_sib_2>0:
                r_sib = 2*np.corrcoef(pgs_fam[sib_2,0],pgs_fam[sib_2,1])[0,1]-1
            else:
                r_sib = 0
            # Initialize at weighted average
            r_init = n_bpg*r_bpg+n_sib_2*r_sib
            r_init = r_init/(n_bpg+n_sib_2)
            # Find MLE
            optimized = fmin_l_bfgs_b(func=pgs_cor_lik, x0=r_init,
                                    args=(pgs_fam, is_sib, is_parent),
                                    approx_grad=True,
                                    bounds=[(-0.999,0.999)])
            if optimized[2]['warnflag']==0:
                r=optimized[0][0]
                hess = nd.Hessian(pgs_cor_lik)
                hess = float(hess([r], pgs_fam, is_sib, is_parent))
                r_se = np.sqrt(2/hess)
            else:
                print('Could not find MLE for correlation. Returning weighted average from siblings and parents.')
                r=r_init
                r_se = (1-r**2)/np.sqrt(n_bpg+n_sib_2-1)
            return r, r_se, fsizes
    
[docs]    def am_adj(self):
        print('Estimating correlation between maternal and paternal PGSs assuming equilibrium')
        r, r_se, fsizes = self.estimate_r()
        r_z = r/r_se
        print('Estimated correlation between maternal and paternal PGSs: '+str(round(r,4))+' S.E.='+str(round(r_se,4)))
        if r_z>1.5:    
            # Check pgs columns
            if 'paternal' in self.sid and 'maternal' in self.sid:
                paternal_index = np.where(self.sid=='paternal')[0][0]
                maternal_index = np.where(self.sid=='maternal')[0][0]
                parental_index = None
            elif 'parental' in self.sid:
                parental_index = np.where(self.sid=='parental')[0][0]
            else:
                raise(ValueError('No parental PGS values to adjust'))
            # Adjust imputed parental PGSs
            npar = np.sum(self.par_status==0,axis=1)
            print('Adjuting imputed PGSs for assortative mating')
            for i in range(self.gts.shape[0]):
                # No parents genotyped
                if npar[i]==0:
                    if parental_index is None:
                        self.gts[i,[paternal_index, maternal_index]] = npg_am_adj(r,fsizes[self.fams[i]])*self.gts[i,[paternal_index, maternal_index]]
                    else:
                        self.gts[i,parental_index] = npg_am_adj(r,fsizes[self.fams[i]])*self.gts[i,parental_index] 
                # One parent genotyped
                if npar[i]==1:
                    # Father imputed
                    if self.par_status[i,0] == 1:
                        self.gts[i,paternal_index] = opg_am_adj(self.gts[i,paternal_index],self.gts[i,maternal_index],r,fsizes[self.fams[i]])
                    # Mother imputed
                    if self.par_status[i,1] == 1:
                        self.gts[i,maternal_index] = opg_am_adj(self.gts[i,maternal_index],self.gts[i,paternal_index],r,fsizes[self.fams[i]])
        else:
            print('Signal to noise ratio for correlation estimate too small, so not performing assortative mating adjustment. Use --force_am_adj to override')
        return r, r_se

[docs]    def scale(self, sf=None):
        if sf is None:
            if 'proband' in self.sid:
                proband_index = np.where(self.sid=='proband')[0][0]
            else:
                raise(ValueError('Cannot scale as no proband column found and no scale factor provided'))
            sf = np.std(self.gts[:, proband_index])
        # Rescale by observed proband PGS
        self.gts = self.gts / sf
        return sf

[docs]    def write(self, filename, scale=False):
        if scale:
            self.scale()
        ####### Write PGS to file ########  
        parent_genotyped = self.par_status == 0
        ped_dict = make_id_dict(self.ped,1)
        ped_indices = np.array([ped_dict[x] for x in self.ids])
        parent_ids = self.ped[ped_indices,2:4]
        parent_ids[~parent_genotyped] = 'NA'
        pg_out = np.column_stack((self.fams,self.ids,parent_ids,self.gts))
        pg_header = np.column_stack((np.array(['FID','IID','FATHER_ID','MOTHER_ID']).reshape(1,4),self.sid.reshape(1,self.sid.shape[0])))
        pg_out = np.row_stack((pg_header,pg_out))
        print('Writing PGS to ' + filename)
        np.savetxt(filename, pg_out, fmt='%s')
        return pg_out
    
[docs]    def compute_grandpar(self, r):
        # Check we have required information
        if self.ped is None:
            raise(ValueError('Pedigree needed for grandparental model'))
        if self.par_status is None:
            raise(ValueError('Parental genotype status needed to compute grandparents'))
        # Check pgs columns
        if 'paternal' in self.sid:
            paternal_index = np.where(self.sid=='paternal')[0][0]
        else:
            raise(ValueError('No paternal PGS column found'))
        if 'maternal' in self.sid:
            maternal_index = np.where(self.sid=='maternal')[0][0]
        else:
            raise(ValueError('No maternal PGS column found'))
        # Find individuals with both parents genotyped
        bpg = np.sum(self.par_status==0,axis=1)==2
        n_bpg = np.sum(bpg)
        print('Found '+str(n_bpg)+' individuals with both parents genotyped for which grandparental scores will be computed')
        ## Create grandparental PGS matrix
        gpar = np.zeros((self.gts.shape[0],4))
        gpar[:] = np.nan
        if n_bpg>0:
            # Find their parents' IDs
            ped_dict = make_id_dict(self.ped,1)
            bpg_ped = self.ped[[ped_dict[x] for x in self.ids[bpg]],:]
            # Find the mean of the mothers and fathers
            parent_means = np.mean(self.gts[bpg,:],axis=0)[[paternal_index, maternal_index]]
            ## Fill
            for i in range(bpg_ped.shape[0]):
                i_index = self.id_dict[bpg_ped[i,1]]
                ## Paternal grandparental scores
                # Find imputed grandparents
                if bpg_ped[i,2] in self.id_dict:
                    gpar[i_index, 0:2] = self.gts[self.id_dict[bpg_ped[i,2]],[paternal_index,maternal_index]]
                else:
                    # linear imputation from father if no imputed
                    gpar[i_index, 0:2] = parent_means[0]+(1+r)*(self.gts[i_index, paternal_index]-parent_means[0])/2.0
                ## Maternal grandparental scores
                # Find imputed grandparents
                if bpg_ped[i,3] in self.id_dict:
                    gpar[i_index, 2:4] = self.gts[self.id_dict[bpg_ped[i,3]],[paternal_index,maternal_index]]
                else:
                    # linear imputation from mother if no imputed
                    gpar[i_index, 2:4] = parent_means[1]+(1+r)*(self.gts[i_index, maternal_index]-parent_means[1])/2.0
        ## Append to PGS matrix
        self.gts = np.hstack((self.gts,gpar))
        self.sid = np.hstack((self.sid,np.array(['gpp','gpm','gmp','gmm'])))
        return bpg_ped
        
[docs]def opg_am_adj(pgi_imp, pgi_obs, r, n):
    rcoef = r/((2**n)*(1+r)-r)
    return rcoef*pgi_obs+(1+rcoef)*pgi_imp

[docs]def npg_am_adj(r,n):
  rcoef = (1+r)/(1+(1-(1/2)**(n-1))*r)
  return rcoef

[docs]@njit
def pgs_corr_matrix(r,is_sib_fam,is_parent_fam):
    n_sib = np.sum(is_sib_fam)
    n_par = np.sum(is_parent_fam)
    # Full matrix
    R = np.zeros((n_sib+n_par,n_sib+n_par), dtype=np.float_)
    np.fill_diagonal(R, 1.0)
    # sib submatrix
    r_sib = (1+r)/2
    R_sib = r_sib*np.ones((n_sib,n_sib),dtype=np.float_)
    np.fill_diagonal(R_sib,1.0)
    R[0:n_sib,0:n_sib] = R_sib
    # sib to parent
    if n_par>0:
        R[0:n_sib,n_sib:(n_sib+n_par)] = r_sib
        R[n_sib:(n_sib+n_par),0:n_sib] = r_sib
    # parent-to-parent
    if n_par>1:
        R[n_sib+n_par-1,n_sib+n_par-2] = r
        R[n_sib+n_par-2,n_sib+n_par-1] = r
    # return 
    return R

[docs]@njit
def pgs_corr_likelihood_fam(r,pg_fam,is_sib_fam,is_parent_fam):
    sib_or_parent = np.logical_or(is_sib_fam,is_parent_fam)
    R = pgs_corr_matrix(r,is_sib_fam,is_parent_fam)
    slogdet_R = np.linalg.slogdet(R)
    pg_vec = pg_fam[sib_or_parent].reshape((np.sum(sib_or_parent),1))
    L_fam = slogdet_R[0]*slogdet_R[1]+pg_vec.T @ np.linalg.inv(R) @ pg_vec
    return L_fam[0,0]

[docs]@njit(parallel=True)
def pgs_corr_likelihood(r,pgs_fam,is_sib,is_parent):
    L = 0.0
    for i in prange(pgs_fam.shape[0]):
        L += pgs_corr_likelihood_fam(r, pgs_fam[i,:], is_sib[i,:], is_parent[i,:])
    return L

[docs]def pgs_cor_lik(r, *args):
    pgs_fam, is_sib, is_parent = args
    return pgs_corr_likelihood(r[0], pgs_fam, is_sib, is_parent)

[docs]def simulate_r_inf(r, nfam, nsib, npar):
    is_sib = np.zeros((nfam,nsib+npar),dtype=np.bool_)
    is_sib[:,0:nsib] = True
    is_parent = np.zeros((nfam,nsib+npar),dtype=np.bool_)
    is_parent[:,nsib:(nsib+npar)] = True
    R = pgs_corr_matrix(r,is_sib[0,:],is_parent[0,:])
    pgs_fam = np.random.multivariate_normal(np.zeros((nsib+npar)), R, size=nfam)
    # correlation between parents
    if npar==2:
        bpg = np.ones((nfam),dtype=bool)
    else:
        bpg = np.zeros((nfam),dtype=bool)
    n_bpg = np.sum(bpg)
    if n_bpg>0:
        r_bpg = np.corrcoef(pgs_fam[:,nsib],pgs_fam[:,nsib+1])[0,1]
    else:
        r_bpg = 0
    # Correlation from sibs
    if nsib>1:
        sib_2 = np.ones((nfam),dtype=bool)
    else:
        sib_2 = np.zeros((nfam),dtype=bool)
    n_sib_2 = np.sum(sib_2)
    if n_sib_2>0:
        r_sib = 2*np.corrcoef(pgs_fam[sib_2,0],pgs_fam[sib_2,1])[0,1]-1
    else:
        r_sib = 0
    # Initialize at weighted average
    r_init = n_bpg*r_bpg+n_sib_2*r_sib
    r_init = r_init/(n_bpg+n_sib_2)
    # Find MLE
    optimized = fmin_l_bfgs_b(func=pgs_cor_lik, x0=r_init,
                            args=(pgs_fam, is_sib, is_parent),
                            approx_grad=True,
                            bounds=[(-0.999,0.999)])
    return np.array([r_init,optimized[0][0]])

[docs]def fit_pgs_model(y, pg, ngen, ibdrel_path=None, covariates=None, fit_sib=False, parsum=False, gparsum=False, outprefix=None, sparse_thresh=0.025):
    pg.gts = ma.array(pg.gts,fill_value=np.nan)
    pg.gts = pg.gts.filled()
    if ngen in [1,2,3]:
        pass
    else:
        raise(ValueError('ngen must be 1, 2, or 3'))
    # Check if IDs are aligned
    if not np.array_equal(y.ids,pg.ids):
        pg.filter_ids(y.ids)
        y.filter_ids(pg.ids)
    # Check covariate IDs are aligned
    if covariates is not None:
        if not np.array_equal(pg.ids,covariates.ids):
            covariates.filter_ids(pg.ids)
    ## Fit model
    if ngen==1:
        print('Fitting 1 generation model (proband only)')
        alpha, alpha_cols = make_and_fit_model(y, pg, ['proband'], ibdrel_path=ibdrel_path, covariates=covariates, sparse_thresh=sparse_thresh)
    elif ngen==2 or ngen==3:
        if fit_sib:
            if 'sib' in pg.sid:
                pg_cols = ['proband','sibling']
            else:
                raise(ValueError('Sibling PGS not found (use --fit_sib when calculating PGS)'))
        else:
            pg_cols = ['proband']
        if parsum:
            if 'maternal' in pg.sid and 'paternal' in pg.sid:
                parcols = np.sort(np.array([np.where(pg.sid=='maternal')[0][0],np.where(pg.sid=='paternal')[0][0]]))
                trans_matrix = np.identity(pg.gts.shape[1])
                trans_matrix[:,parcols[0]] += trans_matrix[:,parcols[1]]
                trans_matrix = np.delete(trans_matrix,parcols[1],1)
                pg.gts = pg.gts.dot(trans_matrix)
                pg.sid = np.delete(pg.sid,parcols[1])
                pg.sid[parcols[0]] = 'parental'
            elif 'parental' in pg.sid:
                pass
            else:
                raise(ValueError('Maternal and paternal PGS not found so cannot sum (--parsum option given)'))
            pg_cols.append('parental')
        else:
            pg_cols += ['paternal','maternal']
        if ngen==2:
            print('Fitting 2 generation model (proband and observed/imputed parents)')
            alpha, alpha_cols = make_and_fit_model(y, pg, pg_cols, ibdrel_path=ibdrel_path, covariates=covariates, sparse_thresh=sparse_thresh)
        elif ngen==3:
            print('Fitting 3 generation model: observed proband and observed parents, and observed/imputed grandparents')
            if gparsum:
                if 'gpp' in pg.sid and 'gpm' in pg.sid and 'gmp' in pg.sid and 'gmm' in pg.sid:
                    # Sum of paternal grandparents
                    gparcols = np.sort(np.array([np.where(pg.sid==x)[0][0] for x in ['gpp','gpm']]))
                    trans_matrix = np.identity(pg.gts.shape[1])
                    trans_matrix[:,gparcols[0]] += trans_matrix[:,gparcols[1]]
                    trans_matrix = np.delete(trans_matrix,gparcols[1],1)
                    pg.gts = pg.gts.dot(trans_matrix)
                    pg.sid = np.delete(pg.sid,gparcols[1])
                    pg.sid[gparcols[0]] = 'gp'
                    # Sum of maternal grandparents
                    gparcols = np.sort(np.array([np.where(pg.sid==x)[0][0] for x in ['gmp','gmm']]))
                    trans_matrix = np.identity(pg.gts.shape[1])
                    trans_matrix[:,gparcols[0]] += trans_matrix[:,gparcols[1]]
                    trans_matrix = np.delete(trans_matrix,gparcols[1],1)
                    pg.gts = pg.gts.dot(trans_matrix)
                    pg.sid = np.delete(pg.sid,gparcols[1])
                    pg.sid[gparcols[0]] = 'gm'
                elif 'gp' in pg.sid and 'gm' in pg.sid:
                    pass
                else:
                    raise(ValueError('Grandparental PGSs not found so cannot sum (--gparsum option given)'))
                pg_cols += ['gp','gm']
            else:
                pg_cols += ['gpp','gpm','gmp','gmm']
            alpha, alpha_cols = make_and_fit_model(y, pg, pg_cols, ibdrel_path=ibdrel_path, covariates=covariates, sparse_thresh=sparse_thresh)
    # Save to file
    if outprefix is not None:
        write_estimates(outprefix+'.'+str(ngen), alpha, alpha_cols)
    return alpha, alpha_cols

[docs]def make_and_fit_model(y, pg, pg_cols, ibdrel_path=None, covariates=None, sparse_thresh=0.025):
    pg_col_indices = [np.where(pg.sid==x)[0][0] for x in pg_cols]
    if covariates is not None:
        X = np.hstack((covariates.gts,np.array(pg.gts[:,pg_col_indices])))
        X_cols = np.hstack((np.array(['intercept']),covariates.sid,pg_cols))
    else:
        X = np.array(pg.gts[:, pg_col_indices])
        X_cols = np.hstack((np.array(['intercept']),pg_cols))
    # Check for NAs
    no_NA = np.sum(np.isnan(X),axis=1)==0
    print('Sample size: '+str(np.sum(no_NA)))
    ## Read GRM
    id_dict = make_id_dict(pg.ids[no_NA])
    varcomp_lst = make_grms(pg.fams[no_NA], ibdrel_path=ibdrel_path, id_dict=id_dict, keep=pg.ids[no_NA], sparse_thresh=sparse_thresh)
    ## Define LMM
    slmm_model = slmm.LinearMixedModel(np.array(y.gts[no_NA,0],dtype=float), 
                    varcomp_arr_lst=varcomp_lst, covar_X=np.array(X[no_NA,:],dtype=float), add_intercept=True)
    # Optimize Lmm
    slmm_model.scipy_optimize()
    # Print variance components
    print(f'Variance components: {list(i for i in slmm_model.varcomps)}')
    ZT_Vinv_Z_imp = slmm_model.Z.T @ slmm_model.Vinv_Z
    alpha = [np.linalg.solve(ZT_Vinv_Z_imp, slmm_model.Z.T @ slmm_model.Vinv_y), np.linalg.inv(ZT_Vinv_Z_imp)]
    return alpha, X_cols

[docs]def write_estimates(outprefix, alpha, cols):
    cols = cols.reshape((cols.shape[0],1))
    # Find cols
    alpha_out = np.zeros((alpha[0].shape[0], 2))
    alpha_out[:, 0] = alpha[0]
    alpha_out[:, 1] = np.sqrt(np.diag(alpha[1]))
    print('Saving effect estimates to '+outprefix+ '.effects.txt')
    np.savetxt(outprefix + '.effects.txt',
                np.hstack((cols, np.array(alpha_out, dtype='S'))),
                delimiter='\t', fmt='%s')
    vcols = np.zeros((1,cols.shape[0]+1),dtype=cols.dtype)
    vcols[0,1:vcols.shape[1]] = cols[:,0]
    alpha_cov_out = np.vstack((vcols, np.hstack((cols,alpha[1]))))
    print('Saving sampling variance-covariance matrix to '+outprefix+ '.vcov.txt')
    np.savetxt(outprefix+ '.vcov.txt',alpha_cov_out, fmt='%s')

[docs]def make_grms(fams, ibdrel_path=None, id_dict=None, keep=None, sparse_thresh=0.05):
    if ibdrel_path is not None:
        grm_data, grm_row_ind, grm_col_ind = slmm.build_ibdrel_arr(
            ibdrel_path, id_dict=id_dict, keep=keep, thres=sparse_thresh)
    ## Build sparse sib GRM
    sib_data, sib_row_ind, sib_col_ind = slmm.build_sib_arr(fams)
    ## GRM list
    if 'grm_data' in locals():
        varcomp_lst = (
            (grm_data, grm_row_ind, grm_col_ind),
            (sib_data, sib_row_ind, sib_col_ind),
        )
    else:
        varcomp_lst = (
            (sib_data, sib_row_ind, sib_col_ind),
        )
    return varcomp_lst

######## Assortative mating adjustment in two-generation model ###########

[docs]def h2f_parse(h2f_str):
    h2f_split = h2f_str.split(',')
    if not len(h2f_split)==2:
        raise(ValueError('Invalid h2f input. Please use estimate,se format, such as 0.5,0.01.'))
    h2f = float(h2f_split[0])
    h2f_se = float(h2f_split[1].split('\n')[0])
    return h2f, h2f_se

[docs]def am_adj_2gen_calc(delta, delta_se, ab, ab_se, r_delta_ab, h2f, h2f_se, rk, rk_se, is_beta=False, verbose=True):
    # Get estimates
    if is_beta:
        estimates = v_eta_delta(delta, delta_se, h2f, h2f_se, rk, beta=ab)
        estimates['beta'] = ab
    else:
        estimates = v_eta_delta(delta, delta_se, h2f, h2f_se, rk, alpha=ab)
        estimates['alpha'] = ab
    estimates['rk'] = rk
    estimates['delta'] = delta
    
    # Get ses
    ses = {'rk':rk_se,
           'k':k_se(delta, delta_se, h2f, h2f_se, rk, rk_se),
           'r':r_se(delta, delta_se, h2f, h2f_se, rk, rk_se),
           'h2_eq':h2eq_se(delta, delta_se, h2f, h2f_se, rk, rk_se),
           'rho':rho_se(delta, delta_se, h2f, h2f_se, rk, rk_se),
           'delta':delta_se,
           'v_eta_delta':se_v_eta_delta(delta, delta_se, ab, ab_se, r_delta_ab, h2f, h2f_se, rk, rk_se, is_beta=is_beta)}
    if is_beta:
        ses['alpha_delta'] = se_alpha_from_beta(delta, delta_se, ab, ab_se, r_delta_ab, h2f, h2f_se, rk, rk_se)
        ses['beta'] = ab_se
        ses['r_delta_beta'] = r_delta_ab
    else:
        ses['alpha_delta'] = se_alpha_from_alpha(delta, delta_se, ab, ab_se, r_delta_ab, h2f, h2f_se, rk, rk_se)
        ses['alpha'] = ab_se
        ses['r_delta_alpha'] = r_delta_ab
    # Print
    if verbose:
        for par in ['rk','k','r','h2_eq','rho','alpha_delta','v_eta_delta']:
            print(par+': '+str(round(estimates[par],4))+' ('+str(round(ses[par],4))+')')
    # Return
    return estimates, ses

[docs]def write_2gen_adj_ests(estimates,ses, outprefix=''):
    pars = np.array(['delta','alpha','rk','k','r','h2_eq','rho','alpha_delta','v_eta_delta'])
    outarray = np.zeros((pars.shape[0],2))
    parcount = 0
    for par in pars:
        outarray[parcount,:] = [estimates[par],ses[par]]
        parcount += 1
    outarray = np.hstack((pars.reshape((pars.shape[0],1)),outarray))
    outarray = np.vstack((np.array(['parameter','estimate','SE']).reshape((1,3)),outarray))
    np.savetxt(outprefix+'.am_adj_pars.txt',outarray,fmt='%s')
    return outarray

[docs]def am_adj_2gen(estimates, estimate_cols, h2f, h2f_se, rk=None, rk_se=None, pg=None, y_std=1, pg_std=1):
    """
    Adjust 2-generation model results for assortative mating (assuming equilibrium)
    """
    ##### Check we have r, or a pg object to estimate r with #####
    if pg is None:
        if rk is None:
            raise(ValueError('Need estimate of correlation between parents PGIs if not providing pg object to use to estimate'))
        if rk_se is None:
            raise(ValueError('Need standard error of correlation between parents PGIs if not providing pg object to use to estimate')) 
    ##### Find estimates and their sampling variance-covariance #####
    if 'proband' in estimate_cols:
        proband_index = np.where(estimate_cols=='proband')[0][0]
    else:
        raise(ValueError('Proband coefficient (direct effect) estimate needed for adjustment'))
    # If separate maternal and paternal NTC estimates, take average
    if 'paternal' in estimate_cols and 'maternal' in estimate_cols:
        paternal_index = np.where(estimate_cols=='paternal')[0][0]
        maternal_index = np.where(estimate_cols=='maternal')[0][0]
        est_cols = [proband_index, paternal_index, maternal_index]
        estimate = estimates[0][est_cols].reshape((3,1))
        estimate_cov = estimates[1][np.ix_(est_cols,est_cols)]
        A =  np.array([[1,0,0],[0,0.5,0.5]])
        estimate = A @ estimate
        estimate_cov = A @ estimate_cov @ A.T
    elif 'parental' in estimate_cols:
        est_cols = [proband_index, np.where(estimate_cols=='parental')[0][0]]
        estimate = estimates[0][est_cols].reshape((2,1))
        estimate_cov = estimates[1][np.ix_(est_cols,est_cols)]
    else:
        raise(ValueError('Need parental NTC estimate(s) for adjustment')) 
    # Adjust for non-normalized pgs/phenotype
    estimate = estimate*pg_std/y_std
    estimate_cov = estimate_cov*((pg_std/y_std)**2)
    ##### Estimate correlation between parents' PGIs ######
    if rk is None:
        rk, rk_se, fam_sizes = pg.estimate_r(parents_only=True)
    ##### Estimate equilibrium quantities ######
    print('Computing equilibrium adjusted quantities')
    adj_estimates, adj_ses = am_adj_2gen_calc(estimate[0,0], np.sqrt(estimate_cov[0,0]), 
                                              estimate[1,0], np.sqrt(estimate_cov[0,0]), 
                                              estimate_cov[0,1]/np.sqrt(estimate_cov[0,0]*estimate_cov[1,1]), 
                                              h2f, h2f_se, rk, rk_se, verbose=True)
    return adj_estimates, adj_ses