#!/usr/bin/env python
"""Infers direct effects, non-transmitted coefficients (NTCs), and population effects of genome-wide SNPs on a phenotype.
Minimally: the script requires observed genotypes on phenotyped individuals and their parents, and/or
parental genotypes imputed by snipar's impute.py script, along with a phenotype file.
Args:
@parser@
Results:
sumstats.gz
For each chromosome, a gzipped text file containing the SNP level summary statistics.
"""
import argparse, h5py
import snipar.read as read
import numpy as np
import snipar.lmm as lmm
from snipar.utilities import *
from snipar.gwas import *
from numba import set_num_threads
from numba import config as numba_config
from snipar.pedigree import get_sibpairs_from_ped
from snipar.utilities import get_parser_doc
######### Command line arguments #########
parser=argparse.ArgumentParser()
parser.add_argument('phenofile',type=str,help='Location of the phenotype file')
parser.add_argument('--bgen',
type=str,help='Address of the phased genotypes in .bgen format. If there is a @ in the address, @ is replaced by the chromosome numbers in the range of chr_range for each chromosome (chr_range is an optional parameters for this script).')
parser.add_argument('--bed',
type=str,help='Address of the unphased genotypes in .bed format. If there is a @ in the address, @ is replaced by the chromosome numbers in the range of chr_range for each chromosome (chr_range is an optional parameters for this script).')
parser.add_argument('--imp', type=str, help='Address of hdf5 files with imputed parental genotypes (without .hdf5 suffix). If there is a @ in the address, @ is replaced by the chromosome numbers in the range of chr_range (chr_range is an optional parameters for this script).', default = None)
parser.add_argument('--chr_range',
type=parseNumRange,
nargs='*',
action=NumRangeAction,
help='number of the chromosomes to be imputed. Should be a series of ranges with x-y format or integers.', default=None)
parser.add_argument('--out', type=str, help="The summary statistics will output to this path, one file for each chromosome. If the path contains '@', the '@' will be replaced with the chromosome number. Otherwise, the summary statistics will be output to the given path with file names chr_1.sumstats.gz, chr_2.sumstats.gz, etc. for the text sumstats, and chr_1.sumstats.hdf5, etc. for the HDF5 sumstats",default='./')
parser.add_argument('--pedigree',type=str,help='Address of pedigree file. Must be provided if not providing imputed parental genotypes.',default=None)
parser.add_argument('--parsum',action='store_true',help='Regress onto proband and sum of (imputed/observed) maternal and paternal genotypes. Default uses separate paternal and maternal genotypes when available.',default = False)
parser.add_argument('--fit_sib',action='store_true',help='Fit indirect effect from sibling ',default=False)
parser.add_argument('--covar',type=str,help='Path to file with covariates: plain text file with columns FID, IID, covar1, covar2, ..', default=None)
parser.add_argument('--phen_index',type=int,help='If the phenotype file contains multiple phenotypes, which phenotype should be analysed (default 1, first)',
default=1)
parser.add_argument('--min_maf',type=float,help='Ignore SNPs with minor allele frequency below min_maf (default 0.01)', default=0.01)
parser.add_argument('--threads',type=int,help='Number of threads to use for IBD inference. Uses all available by default.',default=None)
parser.add_argument('--max_missing',type=float,help='Ignore SNPs with greater percent missing calls than max_missing (default 5)', default=5)
parser.add_argument('--batch_size',type=int,help='Batch size of SNPs to load at a time (reduce to reduce memory requirements)',default=100000)
parser.add_argument('--no_hdf5_out',action='store_true',help='Suppress HDF5 output of summary statistics',default=False)
parser.add_argument('--no_txt_out',action='store_true',help='Suppress text output of summary statistics',default=False)
parser.add_argument('--missing_char',type=str,help='Missing value string in phenotype file (default NA)', default='NA')
parser.add_argument('--tau_init',type=float,help='Initial value for ratio between shared family environmental variance and residual variance',
default=1)
__doc__ = __doc__.replace("@parser@", get_parser_doc(parser))
# Set number of threads
[docs]def main(args):
""""Calling this function with args is equivalent to running this script from commandline with the same arguments.
Args:
args: list
list of all the desired options and arguments. The possible values are all the values you can pass this script from commandline.
"""
# Set number of threads
if args.threads is not None:
num_threads = min([args.threads, numba_config.NUMBA_NUM_THREADS])
else:
num_threads = numba_config.NUMBA_NUM_THREADS
set_num_threads(num_threads)
print('Number of threads: '+str(num_threads))
# Check arguments
if args.bed is None and args.bgen is None:
raise(ValueError('Must provide one of --bedfiles and --bgenfiles'))
if args.bed is not None and args.bgen is not None:
raise(ValueError('Both bed files and bgen files provided. Please provide only one'))
if args.imp is None and args.pedigree is None:
raise(ValueError('Must provide pedigree if not providing imputed parental genotypes file(s)'))
# Find observed and imputed files
if args.imp is None:
print('Warning: no imputed parental genotypes provided. Will analyse only individuals with both parents genotyped.')
if args.bed is not None:
bedfiles, chroms = parse_obsfiles(args.bed, 'bed', chromosomes=args.chr_range)
bgenfiles = [None for x in range(chroms.shape[0])]
elif args.bgen is not None:
bgenfiles, chroms = parse_obsfiles(args.bgen, 'bgen', chromosomes=args.chr_range)
bedfiles = [None for x in range(chroms.shape[0])]
pargts_list = [None for x in range(chroms.shape[0])]
else:
if args.bed is not None:
bedfiles, pargts_list, chroms = parse_filelist(args.bed, args.imp, 'bed', chromosomes=args.chr_range)
bgenfiles = [None for x in range(chroms.shape[0])]
elif args.bgen is not None:
bgenfiles, pargts_list, chroms = parse_filelist(args.bgen, args.imp, 'bgen', chromosomes=args.chr_range)
bedfiles = [None for x in range(chroms.shape[0])]
if chroms.shape[0]==0:
raise(ValueError('No input genotype files found'))
# Read phenotype and covariates
######### Read Phenotype ########
y = read.phenotype.read_phenotype(args.phenofile, missing_char=args.missing_char, phen_index=args.phen_index)
######## Read covariates ########
if args.covar is not None:
print('Reading covariates')
covariates = read.phenotype.read_covariates(args.covar, pheno_ids=y.ids, missing_char=args.missing_char)
# Match to pheno ids
covariates.filter_ids(y.ids)
else:
covariates = None
# Read pedigree
if args.imp is None:
print('Reading pedigree from '+str(args.pedigree))
ped = np.loadtxt(args.pedigree,dtype=str)
if ped.shape[1] < 4:
raise(ValueError('Not enough columns in pedigree file'))
elif ped.shape[1] > 4:
print('Warning: pedigree file has more than 4 columns. The first four columns only will be used')
# Remove rows with missing parents
sibpairs, ped = get_sibpairs_from_ped(ped)
if sibpairs is not None:
print('Found '+str(sibpairs.shape[0])+' sibling pairs in pedigree')
else:
print('Found 0 sibling pairs')
else:
# Read pedigree
par_gts_f = h5py.File(pargts_list[0],'r')
ped = convert_str_array(par_gts_f['pedigree'])
ped = ped[1:ped.shape[0]]
# Remove control fams
controls = np.array([x[0]=='_' for x in ped[:,0]])
ped = ped[~controls,:]
####### Fit null model ######
# Match to pedigree
ped_dict = make_id_dict(ped,1)
y.filter_ids(ped[:,1])
print(str(y.shape[0])+' individuals with phenotype values found in pedigree')
ped_indices = np.array([ped_dict[x] for x in y.ids])
y.fams = ped[ped_indices,0]
# Fit variance components
print('Fitting variance components')
if args.covar is not None:
# Match covariates
covariates.filter_ids(y.ids)
# Fit null model
null_model, sigma2, tau, null_alpha, null_alpha_cov = lmm.fit_model(y.gts[:,0], covariates.gts, y.fams, add_intercept=True,
tau_init=args.tau_init)
# Adjust for covariates
y.gts[:,0] = y.gts[:,0]-(null_alpha[0]+covariates.gts.dot(null_alpha[1:null_alpha.shape[0]]))
else:
# Fit null model
null_model, sigma2, tau = lmm.fit_model(y.gts[:,0], np.ones((y.shape[0], 1)), y.fams,
tau_init = args.tau_init, return_fixed = False)
y.gts[:,0] = y.gts[:,0]-np.mean(y.gts[:,0])
print('Family variance estimate: '+str(round(sigma2/tau,4)))
print('Residual variance estimate: ' + str(round(sigma2,4)))
# Diagonalize y
print('Transforming phenotype')
L = null_model.sigma_inv_root(tau, sigma2)
y.diagonalise(L)
for i in range(chroms.shape[0]):
if args.bed is not None:
print('Observed genotypes file: '+bedfiles[i])
if args.bgen is not None:
print('Observed genotypes file: '+bgenfiles[i])
if args.imp is not None:
print('Imputed genotypes file: '+pargts_list[i])
if chroms.shape[0]>1:
print('Estimating SNP effects for chromosome '+str(chroms[i]))
else:
print('Estimaing SNP effects')
process_chromosome(chroms[i], y, ped, tau, sigma2, args.out, bedfile=bedfiles[i], bgenfile=bgenfiles[i],
par_gts_f=pargts_list[i], fit_sib=args.fit_sib, parsum=args.parsum,
max_missing=args.max_missing, min_maf=args.min_maf, batch_size=args.batch_size,
no_hdf5_out=args.no_hdf5_out, no_txt_out=args.no_txt_out)
if __name__ == "__main__":
args=parser.parse_args()
main(args)