#!/usr/bin/env python
"""Simulates genotype-phenotype data using forward simulation. Phenotypes can be affected
by direct genetic effects, indirect genetic effects (vertical transmission), and assortative mating.
Args:
@parser@
Results:
genotype data in .bed format; full pedigree including phenotype and genetic components for all generations
"""
import numpy as np
import h5py, argparse
from snipar.ibd import write_segs_from_matrix
from snipar.map import decode_map_from_pos
from snipar.utilities import *
from snipar.simulate import *
from pysnptools.snpreader import SnpData,Bed
from snipar.utilities import get_parser_doc
parser = argparse.ArgumentParser()
parser.add_argument('n_causal',type=int,help='Number of causal loci')
parser.add_argument('h2',type=float,help='Heritability due to direct effects in first generation',default=None)
parser.add_argument('outprefix',type=str,help='Prefix for simulation output files')
parser.add_argument('--bgen',
type=str,help='Address of the phased genotypes in .bgen format. If there is a @ in the address, @ is replaced by the chromosome numbers in the range of chr_range for each chromosome (chr_range is an optional parameters for this script).')
parser.add_argument('--chr_range',
type=parseNumRange,
nargs='*',
action=NumRangeAction,
help='number of the chromosomes to be imputed. Should be a series of ranges with x-y format or integers.', default=None)
parser.add_argument('--nfam',type=int,help='Number of families to simulate. If inputting bgen and not given, will be one half of samples in bgen',default=None)
parser.add_argument('--min_maf',type=float,help='Minimum minor allele frequency for simulated genotyped, which will be simulted from density proportional to 1/x',default=0.05)
parser.add_argument('--maf',type=float,help='Minor allele frequency for simulated genotypes (not needed when providing bgen files)',default=None)
parser.add_argument('--n_random',type=int,help='Number of generations of random mating',default=0)
parser.add_argument('--n_am',type=int,help='Number of generations of assortative mating',default=0)
parser.add_argument('--r_par',type=float,help='Phenotypic correlation of parents (for assortative mating)',default=None)
parser.add_argument('--v_indir',type=float,help='Variance explained by parental indirect genetic effects as a fraction of the heritability, e.g 0.5',default=0)
parser.add_argument('--r_dir_indir',type=float,help='Correlation between direct and indirect genetic effects',default=None)
parser.add_argument('--beta_vert',type=float,help='Vertical transmission coefficient',default=0)
parser.add_argument('--save_par_gts',action='store_true',help='Save the genotypes of the parents of the final generation',default=False)
parser.add_argument('--impute',action='store_true',help='Impute parental genotypes from phased sibling genotypes & IBD',default=False)
parser.add_argument('--unphased_impute',action='store_true',help='Impute parental genotypes from unphased sibling genotypes & IBD',default=False)
__doc__ = __doc__.replace("@parser@", get_parser_doc(parser))
[docs]def main(args):
""""Calling this function with args is equivalent to running this script from commandline with the same arguments.
Args:
args: list
list of all the desired options and arguments. The possible values are all the values you can pass this script from commandline.
"""
if args.beta_vert > 0:
if args.v_indir >0:
raise(ValueError('Cannot simulate both indirect effects and vertical transmission separately. Choose one'))
if args.beta_vert > np.power(2*(1+args.r_par),-0.5):
raise(ValueError('Vertical transmission parameter would lead to infinite variance. Reduce to below 1/sqrt(2(1+r_par))'))
if args.n_random == 0:
print('Adding an additional initial generation of random-mating in order to initialize vertical transmission model')
args.n_random = 1
print('Simulating an initial generation by random-mating')
print('Followed by '+str(args.n_random)+' generations of random-mating')
print('Followed by '+str(args.n_am)+' generations of assortative mating')
# Check parental correlation
if args.n_am > 0:
if args.r_par is not None:
if (args.r_par**2) <= 1:
r_y = args.r_par
else:
raise(ValueError('Parental correlation must be between -1 and 1'))
else:
raise(ValueError('Must specify parental phenotypic correlation for assortative mating'))
# Check heritability
if 0 <= args.h2 <= 1:
h2 = args.h2
else:
raise(ValueError('Heritability must be between 0 and 1'))
# Check v_indirect
if not args.v_indir==0:
if args.r_dir_indir is None:
raise(ValueError('Must specify correlation between direct and indirect genetic effects'))
else:
if (args.r_dir_indir**2) > 1:
raise(ValueError('Correlation between direct and indirect effects must be between -1 and 1'))
if args.v_indir < 0:
raise(ValueError('Heritability must be between 0 and 1'))
#### Obtain haplotypes for base generation ###
if args.bgen is None:
if args.nfam is None:
raise(ValueError('Must specify number of families to simulate'))
unlinked = True
haps, maps, snp_ids, alleles, positions, chroms = simulate_first_gen(args.nfam, args.n_causal, maf=None,min_maf=0.05)
else:
unlinked = False
haps, maps, snp_ids, alleles, positions, chroms = haps_from_bgen(args.bgen, args.chr_range)
######################## Perform simulation ##################
new_haps, haps, father_indices, mother_indices, ibd, ped, a, V = forward_sim(haps, maps, args.n_random, args.n_am, unlinked, args.n_causal, args.h2,
v_indirect=args.v_indir, r_direct_indirect=args.r_dir_indir, r_y=args.r_par, beta_vert=args.beta_vert)
##############################################################
### Save variance components ###
print('Saving variance components to '+args.outprefix+'VCs.txt')
np.savetxt(args.outprefix+'VCs.txt',V,fmt='%s')
### Save pedigree and fam files
print('Writing pedigree and fam files')
n_last = ped[ped.shape[0]-1,0].split('_')[0]
last_gen = [x.split('_')[0]==n_last for x in ped[:,0]]
np.savetxt(args.outprefix+'pedigree.txt',ped,fmt='%s')
#np.savetxt(args.outprefix+'sibs.fam',ped[last_gen,:],fmt='%s')
phen_out = ped[last_gen,:]
np.savetxt(args.outprefix+'phenotype.txt',phen_out[:,[0,1,5]],fmt='%s')
## Save to HDF5 file
print('Saving genotypes')
# save offspring genotypes
for i in range(len(new_haps)):
print('Writing genotypes for chromosome '+str(chroms[i]))
bim_i = np.vstack((np.repeat(chroms[i],snp_ids[i].shape[0]),snp_ids[i], maps[i], positions[i], alleles[i][:,0],alleles[i][:,1])).T
gts_chr = SnpData(iid=ped[last_gen,0:2],sid=snp_ids[i],pos=bim_i[:,[0,2,3]],
val=np.sum(new_haps[i], axis=3, dtype=np.uint8).reshape((new_haps[i].shape[0]*2,new_haps[i].shape[2])))
Bed.write(args.outprefix+'chr_'+str(chroms[i])+'.bed',gts_chr,count_A1=True,_require_float32_64=False)
np.savetxt(args.outprefix+'chr_'+str(chroms[i])+'.bim',bim_i,fmt='%s',delimiter='\t')
if args.save_par_gts:
par_gen = [x.split('_')[0]==str(int(n_last)-1) for x in ped[:,0]]
par_gts_chr = SnpData(iid=ped[par_gen,0:2],sid=snp_ids[i],pos=bim_i[:,[0,2,3]],
val=np.sum(haps[i], axis=3, dtype=np.uint8).reshape((haps[i].shape[0]*2,haps[i].shape[2])))
Bed.write(args.outprefix+'chr_'+str(chroms[i])+'_par.bed',par_gts_chr,count_A1=True,_require_float32_64=False)
del par_gts_chr
np.savetxt(args.outprefix+'chr_'+str(chroms[i])+'_par.bim',bim_i,fmt='%s',delimiter='\t')
# # Imputed parental genotypes
if args.impute or args.unphased_impute:
print('Imputing parental genotypes and saving')
freqs = np.mean(gts_chr.val, axis=0) / 2.0
imp_ped = ped[last_gen,0:4]
imp_ped = np.hstack((imp_ped,np.zeros((imp_ped.shape[0],2),dtype=bool)))
# Phased
if args.impute:
hf = h5py.File(args.outprefix+'phased_impute_chr_'+str(chroms[i])+'.hdf5','w')
phased_imp = impute_all_fams_phased(new_haps[i],freqs,ibd[i])
hf['imputed_par_gts'] = phased_imp
del phased_imp
hf['bim_values'] = encode_str_array(bim_i)
hf['bim_columns'] = encode_str_array(np.array(['rsid','map','position','allele1','allele2']))
hf['pedigree'] = encode_str_array(imp_ped)
hf['families'] = encode_str_array(imp_ped[0::2,0])
hf.close()
# Unphased
if args.unphased_impute:
hf = h5py.File(args.outprefix+'unphased_impute_chr_'+str(chroms[i])+'.hdf5','w')
ibd[i] = np.sum(ibd[i],axis=2)
imp = impute_all_fams(gts_chr, freqs, ibd[i])
hf['imputed_par_gts'] = imp
del imp
hf['bim_values'] = encode_str_array(bim_i)
hf['bim_columns'] = encode_str_array(np.array(['rsid','map','position','allele1','allele2']))
hf['pedigree'] = encode_str_array(imp_ped)
hf['families'] = encode_str_array(imp_ped[0::2,0])
hf.close()
del gts_chr
#### Write IBD segments
snp_count = 0
sibpairs = ped[last_gen,1]
sibpairs = sibpairs.reshape((int(sibpairs.shape[0]/2),2))
if args.v_indir==0:
causal_out = np.zeros((a.shape[0],5),dtype='U30')
else:
causal_out = np.zeros((a.shape[0],5),dtype='U30')
for i in range(len(haps)):
print('Writing IBD segments for chromosome '+str(chroms[i]))
# Segments
if not args.unphased_impute:
ibd[i] = np.sum(ibd[i],axis=2)
segs = write_segs_from_matrix(ibd[i], sibpairs,
snp_ids[i], positions[i],maps[i],chroms[i],
args.outprefix+'chr_'+str(chroms[i])+'.segments.gz')
# Causal effects
if args.v_indir==0:
a_chr = a[snp_count:(snp_count + snp_ids[i].shape[0])]
a_chr_v1 = a_chr+np.random.normal(0,np.std(a_chr),a_chr.shape)
causal_out[snp_count:(snp_count + snp_ids[i].shape[0]),:] = np.vstack((snp_ids[i],alleles[i][:,0],alleles[i][:,1],
a_chr,a_chr_v1)).T
if i==0:
causal_out = np.vstack((np.array(['SNP','A1','A2','direct','direct_v1']).reshape((1,5)),causal_out))
else:
a_chr = a[snp_count:(snp_count + snp_ids[i].shape[0]),:]
causal_out[snp_count:(snp_count + snp_ids[i].shape[0]),:] = np.vstack((snp_ids[i],alleles[i][:,0],alleles[i][:,1],
a_chr[:,0],a_chr[:,1])).T
if i==0:
causal_out = np.vstack((np.array(['SNP','A1','A2','direct','indirect']).reshape((1,5)),causal_out))
snp_count += snp_ids[i].shape[0]
# count
np.savetxt(args.outprefix+'causal_effects.txt',causal_out,fmt='%s')
if __name__ == "__main__":
args=parser.parse_args()
main(args)