#!/usr/bin/env python
"""Infers direct effects, non-transmitted coefficients (NTCs), and population effects of genome-wide SNPs on a phenotype.
Minimally: the script requires observed genotypes on phenotyped individuals along with a phenotype file.
If no imputed parental genotypes are provided, a pedigree file is required, and the script will analyze
samples with siblings and/or both parents genotyped by default.
Args:
@parser@
Results:
sumstats.gz
For each chromosome, a gzipped text file containing the SNP level summary statistics.
"""
import argparse
from snipar.numrange import parseNumRange, NumRangeAction
from snipar.docgen import get_parser_doc
######### Command line arguments #########
parser = argparse.ArgumentParser(add_help=True)
parser.add_argument('phenofile',type=str,help='Location of the phenotype file')
parser.add_argument('--bgen', type=str,help='Address of the phased genotypes in .bgen format. If there is a @ in the address, @ is replaced by the chromosome numbers in the range of chr_range for each chromosome (chr_range is an optional parameters for this script).')
parser.add_argument('--bed', type=str,help='Address of the unphased genotypes in .bed format. If there is a @ in the address, @ is replaced by the chromosome numbers in the range of chr_range for each chromosome (chr_range is an optional parameters for this script).')
parser.add_argument('--imp', type=str, help='Address of hdf5 files with imputed parental genotypes (without .hdf5 suffix). If there is a @ in the address, @ is replaced by the chromosome numbers in the range of chr_range (chr_range is an optional parameters for this script).', default = None)
parser.add_argument('--pedigree',type=str,help='Address of pedigree file. Must be provided if not providing imputed parental genotypes.',default=None)
parser.add_argument('--covar',type=str,help='Path to file with covariates: plain text file with columns FID, IID, covar1, covar2, ..', default=None)
parser.add_argument('--chr_range', type=parseNumRange, nargs='*', action=NumRangeAction, help='Chromosomes to analyse. Should be a series of ranges with x-y format (e.g. 1-22) or integers.', default=None)
parser.add_argument('--out', type=str, help="The summary statistics will output to this path, one file for each chromosome. If the path contains '@', the '@' will be replaced with the chromosome number. Otherwise, the summary statistics will be output to the given path with file names chr_1.sumstats.gz, chr_2.sumstats.gz, etc. for the text sumstats, and chr_1.sumstats.hdf5, etc. for the HDF5 sumstats",default='./')
parser.add_argument('--grm', type=str, help='Path to GRM file giving pairwise relatednsss information. Designed to work with KING IBD segment inference output (.seg file).', default=None)
parser.add_argument('--grmgz', type=str, help='Path to GRM in GCTA grm.gz format (without .grm.gz suffix). Assumes .grm.id file with same root path also available.', default=None)
parser.add_argument('--sparse_thresh', type=float, help='Threshold of GRM sparsity — elements below this value are set to zero', default=0.05)
parser.add_argument('--impute_unrel', action='store_true', default=False, help='Whether to include unrelated individuals and impute their parental genotypes lineary or not. See Unified estimator in Guan et al.')
parser.add_argument('--robust', action='store_true', default=False, help='Use the robust estimator')
parser.add_argument('--sib_diff', action='store_true', default=False, help='Use the sibling difference method')
parser.add_argument('--parsum',action='store_true',help='Regress onto proband and sum of (imputed/observed) maternal and paternal genotypes. Default uses separate paternal and maternal genotypes when available.',default = False)
parser.add_argument('--fit_sib',action='store_true',help='Fit indirect effect from sibling ',default=False)
parser.add_argument('--phen',type=str,help='Name of the phenotype to be analysed — case sensitive. Default uses first phenotype in file.', default=None)
parser.add_argument('--phen_index',type=int,help='If the phenotype file contains multiple phenotypes, which phenotype should be analysed (default 1, first)', default=1)
parser.add_argument('--missing_char',type=str,help='Missing value string in phenotype file (default NA)', default='NA')
parser.add_argument('--min_maf',type=float,help='Ignore SNPs with minor allele frequency below min_maf (default 0.01)', default=0.01)
parser.add_argument('--max_missing',type=float,help='Ignore SNPs with greater percent missing calls than max_missing (default 5)', default=5)
parser.add_argument('--vc_out', type=str, help='Prefix of output filename for variance component array (without .npy).')
parser.add_argument('--vc_list', type=float, nargs='+', default=None, help='Pass in variance components as a list of floats.')
parser.add_argument('--no_sib_var', action='store_true', default=False, help='Do not fit sibling variance component. Not recommended for family-GWAS.')
parser.add_argument('--keep', default=None, type=str, help='Filename of IDs to be kept for analysis (No header).')
parser.add_argument('--cpus', type=int, help='Number of cpus to distribute batches across', default=1)
parser.add_argument('--threads',type=int,help='Number of threads to use per CPU. Uses all available by default.',default=1)
parser.add_argument('--no_hdf5_out',action='store_true',help='Suppress HDF5 output of summary statistics',default=False)
parser.add_argument('--batch_size',type=int,help='Batch size of SNPs to load at a time (reduce to reduce memory requirements)',default=100000)
__doc__ = __doc__.replace("@parser@", get_parser_doc(parser))
[docs]
def main(args):
"""Calling this function with args is equivalent to running this script from commandline with the same arguments.
Args:
args: list
list of all the desired options and arguments. The possible values are all the values you can pass this script from commandline.
"""
import os, time
num_threads = args.threads
print('Number of threads for numpy: '+str(num_threads))
# export OMP_NUM_THREADS=...
os.environ['OMP_NUM_THREADS'] = str(num_threads)
# export OPENBLAS_NUM_THREADS=...
os.environ['OPENBLAS_NUM_THREADS'] = str(num_threads)
# export MKL_NUM_THREADS=...
os.environ['MKL_NUM_THREADS'] = str(num_threads)
# export VECLIB_MAXIMUM_THREADS=...
os.environ['VECLIB_MAXIMUM_THREADS'] = str(num_threads)
# export NUMEXPR_NUM_THREADS=...
os.environ['NUMEXPR_NUM_THREADS'] = str(num_threads)
import numpy as np
from numba import set_num_threads
from numba import config as numba_config
import snipar.read as read
import snipar.slmm as slmm
from snipar.gwas import process_chromosome
from snipar.pedigree import get_sibpairs_from_ped
from snipar.preprocess import remove_sibs
from snipar.utilities import parse_obsfiles, parse_filelist, make_id_dict
# Check if mutually incompatible design arguments supplied
if int(args.robust) + int(args.sib_diff) + int(args.impute_unrel) > 1:
raise argparse.ArgumentTypeError('Only supply one of --robust, --sib_diff and --impute_unrel.')
if (args.robust or args.sib_diff) and args.fit_sib:
raise argparse.ArgumentTypeError('Cannot fit sib IGE while --robust or --sib_diff is supplied.')
if (args.sib_diff or args.robust) and args.parsum:
raise argparse.ArgumentTypeError('--parsum is ignored with --sib_diff and --robust estimator.')
# Check if GRM provided
if args.grm is None and args.grmgz is None:
print('No GRM provided.')
args.no_grm_var = True
else:
args.no_grm_var = False
if args.grm:
if os.path.exists(args.grm):
print('Using pairwise relationships from: '+args.grm+' for GRM variance component')
else:
raise argparse.ArgumentTypeError('GRM not found.')
elif args.grmgz:
if os.path.exists(args.grmgz+'.grm.gz'):
print('Using: '+args.grmgz+' for GRM variance component')
else:
raise argparse.ArgumentTypeError('GRM file not found.')
# Check if fitting sibling variance component
if args.no_sib_var:
if args.no_grm_var:
raise argparse.ArgumentTypeError('Need to fit at least one of sibling and GRM variance components.')
else:
print('Fitting GRM and residual variance components. Note this may result in statistically inefficient direct genetic effect estimates. It is advised to fit the sibling variance component.')
elif args.no_grm_var:
print('Fitting sibling and residual variance components')
else:
print('Fitting sibling, GRM, and residual variance components')
# Set number of threads for numba
if args.threads is not None:
num_threads = min([args.threads, numba_config.NUMBA_NUM_THREADS])
else:
num_threads = numba_config.NUMBA_NUM_THREADS
set_num_threads(num_threads)
print('Number of threads for numba: '+str(num_threads))
print('Number of processes: '+str(args.cpus))
# Print where output written to
if args.out == './':
print(f'Output will be written to chr_@')
elif '@' in args.out:
print(f'Output will be written to {args.out}')
else:
print(f'Output will be written to {args.out}_chr_@')
# Check arguments
if args.bed is None and args.bgen is None:
raise(ValueError('Must provide one of --bedfiles and --bgenfiles'))
if args.bed is not None and args.bgen is not None:
raise(ValueError('Both bed files and bgen files provided. Please provide only one'))
if args.imp is None and args.pedigree is None:
raise(ValueError('Must provide pedigree if not providing imputed parental genotypes file(s)'))
# Find observed and imputed files
if args.imp is None:
print("No imputation provided.")
if args.robust:
raise ValueError("The robust estimator requires imputed parental genotypes. If these are not available, remove the --robust flag and a meta analysis of trios and siblings will be performed.")
trios_sibs = True if not args.sib_diff else False
trios_only = False
# if trios_sibs and not args.impute_unrel:
# print('Defaulting to meta-analyzing trios and siblings using genetic differences between siblings.')
if args.bed is not None:
bedfiles, chroms = parse_obsfiles(args.bed, 'bed', chromosomes=args.chr_range)
bgenfiles = [None for x in range(chroms.shape[0])]
elif args.bgen is not None:
bgenfiles, chroms = parse_obsfiles(args.bgen, 'bgen', chromosomes=args.chr_range)
bedfiles = [None for x in range(chroms.shape[0])]
pargts_list = [None for x in range(chroms.shape[0])]
else:
trios_sibs = False
trios_only = False
if args.bed is not None:
bedfiles, pargts_list, chroms = parse_filelist(args.bed, args.imp, 'bed', chromosomes=args.chr_range)
bgenfiles = [None for x in range(chroms.shape[0])]
elif args.bgen is not None:
bgenfiles, pargts_list, chroms = parse_filelist(args.bgen, args.imp, 'bgen', chromosomes=args.chr_range)
bedfiles = [None for x in range(chroms.shape[0])]
if chroms.shape[0]==0:
raise(ValueError('No input genotype files found'))
# Read phenotype and covariates
######### Read Phenotype ########
y = read.phenotype.read_phenotype(args.phenofile, column=args.phen, column_index=args.phen_index, na_values=args.missing_char)
######## Read covariates ########
if args.covar is not None:
print('Reading covariates')
covariates = read.phenotype.read_covariates(args.covar, pheno_ids=y.ids, missing_char=args.missing_char)
# Match to pheno ids
# covariates.filter_ids(y.ids)
else:
covariates = None
########## Read pedigree ########
if args.imp is None:
# Read pedigree from file
print('Reading pedigree from '+str(args.pedigree))
ped = np.loadtxt(args.pedigree,dtype=str)[1:,]
if ped.shape[1] < 4:
raise(ValueError('Not enough columns in pedigree file'))
elif ped.shape[1] > 4:
print('WARNING: pedigree file has more than 4 columns. The first four columns only will be used.')
# Remove rows with missing parents
sibpairs, ped = get_sibpairs_from_ped(ped)
imp_fams = None
else:
ped, imp_fams = read.build_ped_from_par_gts(pargts_list[0])
if args.sib_diff:
imp_fams = None # imputation not used
# # Check if sibling pairs are present
# if sibpairs is not None:
# print('Found '+str(sibpairs.shape[0])+' sibling pairs in pedigree')
# else:
# print('Found 0 sibling pairs')
# if args.sib_diff:
# raise argparse.ArgumentTypeError('No sibling pairs found in pedigree. Cannot use sibling difference estimator.')
# if not args.no_sib_var:
# print('No sibling pairs found in pedigree. Dropping sibling variance component')
# args.no_sib_var = True
####### Print which estimator will be used #########
if args.robust:
print("== The robust estimator will be used.")
elif args.sib_diff:
print("== The sib-difference estimator will be used.")
elif trios_sibs:
if args.impute_unrel:
print("== The unified estimator will be used.")
else:
print("== Defaulting to meta-analyzing trios and siblings using genetic differences between siblings.")
elif args.impute_unrel:
print("== The unified estimator will be used.")
else:
print("== The Young estimator will be used.")
print("Check if the dataset is feasible for the selected estimator...")
## Find which individuals can be used for the selected estimator ##
if args.sib_diff:
# Find individuals with genotyped siblings
ids = y.ids
fam_labels = y.fams
ped_dict = make_id_dict(ped,1)
y.filter_ids(ped[:,1])
print(str(y.shape[0])+' individuals with phenotype values found in pedigree')
ped_indices = np.array([ped_dict[x] for x in y.ids])
y.fams = ped[ped_indices,0]
ids = y.ids
fam_labels = y.fams
ids, fam_labels = read.get_ids_with_sibs(bedfiles[0] if args.bed is not None else bgenfiles[0], ped,
y.ids, return_info=False, ibdrel_path=args.grm)
elif trios_sibs:
if args.impute_unrel:
# Remove siblings
ids = remove_sibs(
ped, bedfiles[0] if args.bed is not None else bgenfiles[0], y.ids
)
# Find trios
ids, fam_labels = read.get_ids_with_par(
bedfiles[0] if args.bed is not None else bgenfiles[0], ped, imp_fams,
ids, sib=args.fit_sib, include_unrel=args.impute_unrel, ibdrel_path=args.grm,
return_info=False
)
trios_sibs = False
else:
# Find individuals with genotyped siblings and/or both parents genotyped
ids, fam_labels, trios_only = read.get_ids_with_trios_sibs(
bedfiles[0] if args.bed is not None else bgenfiles[0], ped, y.ids, ibdrel_path=args.grm
)
else:
# Find individuals with observed and/or imputed parental genotypes
ids, fam_labels = read.get_ids_with_par(
bedfiles[0] if args.bed is not None else bgenfiles[0], ped, imp_fams,
y.ids, sib=args.fit_sib, include_unrel=args.impute_unrel, ibdrel_path=args.grm,
return_info=False
)
#### Filter phenotype to valid IDs ###
y.filter_ids(ids)
if args.sib_diff:
ids = y.ids
fam_labels = y.fams
np.testing.assert_array_equal(ids, y.ids)
if args.covar:
if not args.sib_diff:
covariates.filter_ids(ids)
np.testing.assert_array_equal(ids, covariates.ids)
else:
covariates.filter_ids(y.ids)
# Read GRM if provided
if args.grm is not None:
id_dict = make_id_dict(ids)
grm_data, grm_row_ind, grm_col_ind = slmm.build_ibdrel_arr(
args.grm, id_dict=id_dict, keep=ids, thres=args.sparse_thresh)
elif args.grmgz is not None:
ids, fam_labels = slmm.match_grm_ids(
ids, fam_labels, grm_path=args.grmgz, grm_source='gcta')
id_dict = make_id_dict(ids)
grm_data, grm_row_ind, grm_col_ind = slmm.build_grm_arr(
args.grmgz, id_dict=id_dict, thres=args.sparse_thresh)
# Build sparse silbship array if fitting sib variance component
if not args.no_sib_var:
sib_data, sib_row_ind, sib_col_ind = slmm.build_sib_arr(fam_labels)
# Set variance components
if not args.no_grm_var and not args.no_sib_var:
varcomp_lst = (
(grm_data, grm_row_ind, grm_col_ind),
(sib_data, sib_row_ind, sib_col_ind),
)
elif not args.no_grm_var:
varcomp_lst = (
(grm_data, grm_row_ind, grm_col_ind),
)
elif not args.no_sib_var:
varcomp_lst = (
(sib_data, sib_row_ind, sib_col_ind),
)
if args.vc_list:
varcomps = tuple(args.vc_list)
if len(varcomps) != len(varcomp_lst) + 1:
raise ValueError(f'Supplied varcomps length {len(varcomps)} does not match the number of variance components.')
else:
varcomps = None
# Define sparse linear mixed model for variance component estimation
if args.covar is None:
y.gts -= y.gts.mean()
model = slmm.LinearMixedModel(y.gts.reshape(-1).data, varcomps=varcomps, varcomp_arr_lst=varcomp_lst, covar_X=None, add_intercept=False, add_jitter=False)
else:
model = slmm.LinearMixedModel(y.gts.reshape(-1).data, varcomps=varcomps, varcomp_arr_lst=varcomp_lst, covar_X=covariates.gts.data, add_intercept=True, add_jitter=False)
if not varcomps:
print(f'Optimizing variance components...')
# print('Nonzero entries: ', model.V.nnz, 'Number of individuals: ', model.n, 'density: ', model.V.nnz / model.n / model.n)
start = time.time()
model.scipy_optimize()
print(f'Time used for variance component estimation: {time.time() - start:.2f}s')
else:
print('Variance components supplied.')
if args.vc_out:
np.savetxt(f'{args.vc_out}', np.array(model.varcomps))
print(f'Variance components saved to {args.vc_out}.')
## Print variance components
varcomps = list(i for i in model.varcomps)
vcomp_index=0
if not args.no_grm_var:
print(f'GRM variance component: {varcomps[vcomp_index]:.3f}')
print(f'Variance explained by GRM: {varcomps[vcomp_index] / np.sum(varcomps) * 100:.1f}%')
vcomp_index+=1
if not args.no_sib_var:
print(f'Sibling variance component: {varcomps[vcomp_index]:.3f}')
print(f'Variance explained by sibling component: {varcomps[vcomp_index] / np.sum(varcomps) * 100:.1f}%')
print(f'Total variance: {np.sum(varcomps):.3f}')
# Print implied sibling correlation
if args.no_grm_var:
print(f'Implied sibling correlation: {varcomps[0] / np.sum(varcomps):.3f}')
elif not args.no_sib_var:
print(f'Implied sibling correlation: {(0.5*varcomps[0] + varcomps[1]) / np.sum(varcomps):.3f}')
else:
print(f'Implied sibling correlation: {0.5*varcomps[0] / np.sum(varcomps):.3f}')
# Define variance components: does this need to be done twice?
sigmas = model.varcomps
if not args.no_grm_var:
if not args.no_sib_var:
varcomp_lst = (
(grm_data, grm_row_ind, grm_col_ind),
(sib_data, sib_row_ind, sib_col_ind),
)
else:
varcomp_lst = (
(grm_data, grm_row_ind, grm_col_ind),
)
elif not args.no_sib_var:
varcomp_lst = (
(sib_data, sib_row_ind, sib_col_ind),
)
######### Process chromosomes ###########
start = time.time()
for i in range(chroms.shape[0]):
if args.bed is not None:
print('Observed genotypes file: '+bedfiles[i])
if args.bgen is not None:
print('Observed genotypes file: '+bgenfiles[i])
if args.imp is not None:
print('Imputed genotypes file: '+pargts_list[i])
if chroms.shape[0]>1:
print('Estimating SNP effects for chromosome '+str(chroms[i]))
else:
print('Estimaing SNP effects')
process_chromosome(chroms[i], y, varcomp_lst,
ped, imp_fams, sigmas, args.out, covariates,
bedfile=bedfiles[i], bgenfile=bgenfiles[i],
par_gts_f=pargts_list[i], fit_sib=args.fit_sib, sib_diff=args.sib_diff, parsum=args.parsum, standard_gwas=False,
impute_unrel=args.impute_unrel, robust=args.robust, trios_sibs=trios_sibs, trios_only=trios_only,
max_missing=args.max_missing, min_maf=args.min_maf, batch_size=args.batch_size,
no_hdf5_out=args.no_hdf5_out, no_txt_out=False, cpus=args.cpus, add_jitter=False,
debug=False)
print(f'Time used: {time.time() - start:.2f}s')
if __name__ == "__main__":
args = parser.parse_args()
main(args)