Source code for snipar.scripts.correlate

#!/usr/bin/env python
"""Infers correlations between direct effects and population effects, and between direct effects and average non-transmitted coefficients (NTCs).
Minimally: the script requires summary statistics as output by snipar's gwas.py script, and
either LD-scores (as output by snipar's ibd.py script or LDSC) or .bed files from
which LD-scores can be computed
Args:
@parser@
Results:
    correlations
        A text file containing the estimated correlations and their standard errors. 
        
"""
import numpy as np
import argparse
from snipar.correlate import *
from numba import set_num_threads
from numba import config as numba_config
from snipar.utilities import *
from snipar.utilities import get_parser_doc

parser = argparse.ArgumentParser()
parser.add_argument('sumstats', type=str, help='Address of sumstats files in SNIPar sumstats.gz text format (without .sumstats.gz suffix). If there is a @ in the address, @ is replaced by the chromosome numbers in chr_range (optional argument)')
parser.add_argument('--chr_range',
                        type=parseNumRange,
                        nargs='*',
                        action=NumRangeAction,
                        help='number of the chromosomes to be imputed. Should be a series of ranges with x-y format or integers.', default=None)
parser.add_argument('out',type=str,help='Prefix for output file(s)')
parser.add_argument('--ldscores',type=str,help='Address of ldscores as output by LDSC',default=None)
parser.add_argument('--bed', type=str,
                    help='Address of observed genotype files in .bed format (without .bed suffix). If there is a # in the address, # is replaced by the chromosome numbers in the range of 1-22.',
                    default=None)
parser.add_argument('--threads',type=int,help='Number of threads to use for IBD inference. Uses all available by default.',default=None)
parser.add_argument('--min_maf',type=float,help='Ignore SNPs with minor allele frequency below min_maf (default 0.05)', default=0.05)
parser.add_argument('--corr_filter',type=float,help='Filter out SNPs with outlying sampling correlations more than corr_filter SDs from mean (default 6)',default=6.0)
parser.add_argument('--n_blocks',type=int,help='Number of blocks to use for block-jacknife variance estimate (default 200)',default=200)
parser.add_argument('--save_delete',action='store_true',help='Save jacknife delete values',default=False)
parser.add_argument('--ld_wind',type=float,help='The window, in cM, within which LD scores are computed (default 1cM)',default=1.0)
parser.add_argument('--ld_out',type=str,help='Output LD scores in LDSC format to this address',default=None)
__doc__ = __doc__.replace("@parser@", get_parser_doc(parser))
[docs]def main(args): """"Calling this function with args is equivalent to running this script from commandline with the same arguments. Args: args: list list of all the desired options and arguments. The possible values are all the values you can pass this script from commandline. """ # Set number of threads if args.threads is not None: num_threads = min([args.threads, numba_config.NUMBA_NUM_THREADS]) else: num_threads = numba_config.NUMBA_NUM_THREADS set_num_threads(num_threads) print('Number of threads: '+str(num_threads)) if args.ldscores is None and args.bed is None: raise(ValueError('Must provide either LD scores or genotypes to compute ld scores')) if args.ldscores is not None and args.bed is not None: raise(ValueError('Both LD scores and genotypes provided. Provided LD scores will be used')) # Find sumstats files sumstats_files, chroms = parse_obsfiles(args.sumstats, obsformat='sumstats.gz', chromosomes=args.chr_range) # Read sumstats s = read_sumstats_files(sumstats_files, chroms) # Filter print('Filtering on missing values') s.filter_NAs() print('Filtering out SNPs with MAF<'+str(args.min_maf)) s.filter_maf(args.min_maf) print('Filtering out SNPs with sampling correlations more than '+str(args.corr_filter)+' SDs from mean') s.filter_corrs(args.corr_filter) # Get LD scores if args.ldscores is not None: ld_files, chroms = parse_obsfiles(args.ldscores, obsformat='l2.ldscore.gz', chromosomes=[x for x in range(1,23)]) s.scores_from_ldsc(ld_files) s.filter_NAs() elif args.bed is not None: bedfiles, chroms = parse_obsfiles(args.bed, obsformat='bed', chromosomes=chroms) s.compute_ld_scores(bedfiles, chroms, args.ld_wind, args.ld_out) s.filter_NAs() # Compute correlations print('Using '+str(s.sid.shape[0])+' SNPs to compute correlations') r_dir_pop, r_dir_pop_SE, r_dir_pop_delete = s.cor_direct_pop(args.n_blocks) print('Correlation between direct and population effects: '+str(round(r_dir_pop[0],4))+' (S.E. '+str(round(r_dir_pop_SE[0],4))+')') print('Regression coefficient of population on direct effects: '+str(round(r_dir_pop[1],4))+' (S.E. '+str(round(r_dir_pop_SE[1],4))+')') print('Proportion of variance in population effects uncorrelated with direct effects: '+str(round(r_dir_pop[2],4))+' (S.E. '+str(round(r_dir_pop_SE[2],4))+')') r_dir_avg_NTC, r_dir_avg_NTC_SE, r_dir_avg_NTC_delete = s.cor_direct_avg_NTC(args.n_blocks) print('Correlation between direct and average NTCs: '+str(round(r_dir_avg_NTC[0],4))+' (S.E. '+str(round(r_dir_avg_NTC_SE[0],4))+')') print('Regression coefficient of average NTCs on direct effects: '+str(round(r_dir_avg_NTC[1],4))+' (S.E. '+str(round(r_dir_avg_NTC_SE[1],4))+')') print('Proportion of variance in average NTCs uncorrelated with direct effects: '+str(round(r_dir_avg_NTC[2],4))+' (S.E. '+str(round(r_dir_avg_NTC_SE[2],4))+')') outfile = str(args.out)+'_corrs.txt' print('Saving correlation estimates to '+outfile) first_col = np.array(['r_direct_population','reg_population_direct','v_population_uncorr_direct','r_direct_avg_NTC','reg_avg_NTC_direct','v_avg_NTC_uncorr_direct']).reshape((6,1)) header = np.array(['correlation','est','SE']).reshape((1,3)) cor_out = np.zeros((6,2)) cor_out[0:3,0] = r_dir_pop cor_out[0:3,1] = r_dir_pop_SE cor_out[3:6,0] = r_dir_avg_NTC cor_out[3:6,1] = r_dir_avg_NTC_SE outarray = np.vstack((header,np.hstack((first_col,cor_out)))) np.savetxt(outfile,outarray,fmt='%s') if args.save_delete: delete_outfile = args.out+'_delete.txt' print('Saving jacknife delete values to '+str(delete_outfile)) delete_out = np.vstack((r_dir_pop_delete,r_dir_avg_NTC_delete)).T delete_out = np.vstack((np.array(['r_direct_population','r_direct_avg_NTC']).reshape((1,2)),delete_out)) np.savetxt(delete_outfile, delete_out,fmt='%s')
if __name__ == "__main__": args=parser.parse_args() main(args)