import gzip
from numba import njit, prange
from snipar.map import *
from snipar.ld import compute_ld_scores
import numpy as np
from snipar.read.bed import read_sibs_from_bed
from snipar.read.bgen import read_sibs_from_bgen
from snipar.utilities import make_id_dict
from snipar.utilities import outfile_name
from snipar.utilities import open_bgen
####### Transition Matrix ######
[docs]@njit
def transition_matrix(cM):
"""Compute probabilities of transitioning between IBD states as a function of genetic distance.
Args:
cM : :class:`float`
genetic distance in centiMorgans (cM)
Returns:
log(P) : :class:`~numpy:numpy.array`
the natural logarithm (element-wise) of the matrix of transition probabilities
"""
P = np.identity(3)
r_array = np.array([[-1.0,1.0,0.0],[0.5,-1.0,0.5],[0.0,1.0,-1.0]],dtype=np.float64)
r = 1.0 - np.power((1.0 + np.exp(-cM / 50)) / 2.0, 4)
P += r_array*r
return np.log(P)
#
[docs]@njit
def p_ibd_0(f):
"""Compute Joint-PMF for sibling pair genotypes given IBD0.
Args:
f : :class:`float`
allele frequency
Returns:
P : :class:`~numpy:numpy.array`
matrix of probabilities
"""
P_vec = np.array([(1.0-f)**2.0,2.0*f*(1-f),f**2.0]).reshape((3,1))
return P_vec @ P_vec.T
[docs]@njit
def p_ibd_1(f):
"""Compute Joint-PMF for sibling pair genotypes given IBD1.
Args:
f : :class:`float`
allele frequency
Returns:
P : :class:`~numpy:numpy.array`
matrix of probabilities
"""
fmatrix = np.array([[0.0,1.0,0.0],[1.0,1.0,2.0],[0.0,2.0,3.0]])
minus_fmatrix = np.array([[3.0,2.0,0.0],[2.0,1.0,1.0],[0.0,1.0,0.0]])
P = np.exp(fmatrix*np.log(f)+minus_fmatrix*np.log(1-f))
P[0,2] = 0
P[2,0] = 0
return P
[docs]@njit
def p_ibd_2(f):
"""Compute Joint-PMF for sibling pair genotypes given IBD2.
Args:
f : :class:`float`
allele frequency
Returns:
P : :class:`~numpy:numpy.array`
matrix of probabilities
"""
P = np.zeros((3,3),dtype=np.float64)
fdiag = np.array([(1.0-f)**2.0,2.0*f*(1.0-f),f**2.0])
np.fill_diagonal(P,fdiag.reshape((3,1)))
return P
[docs]@njit
def p_obs_given_IBD(g1_obs,g2_obs,f,p):
"""Compute Joint-PMF for sibling pair genotypes given IBD0.
Args:
g1_obs : :class:`integer`
observed genotype for sibling 1
g2_obs : :class:`integer`
observed genotype for sibling 2
f : :class:`float`
allele frequency
p : :class:'float'
genotyping error probability
Returns:
log(P) : :class:`~numpy:numpy.array`
vector giving log-probabilities of observing g1_obs,g2_obs give IBD 0,1,2
"""
P_out = np.zeros((3))
# Get probabilities of observed genotypes given true
err_vectors = np.array([[1.0 - p, p / 2.0, 0.0],
[p, 1.0 - p, p],
[0.0, p / 2.0, 1.0 - p]])
# Get probabilities of true genotypes given IBD
probs = np.zeros((3, 3, 3))
probs[0, ...] = p_ibd_0(f)
probs[1, ...] = p_ibd_1(f)
probs[2, ...] = p_ibd_2(f)
# Compute probabilities conditional on IBD
for i in range(3):
P_out[i] = err_vectors[g1_obs, ...].T @ probs[i,...] @ err_vectors[g2_obs, ...]
# Return
return np.log(P_out)
[docs]@njit
def make_dynamic(g1, g2, freqs, map, weights, error_probs):
"""Make state-matrix and pointer matrix for a sibling pair by dynamic programming
Args:
g1 : :class:`~numpy:numpy.array`
integer vector of first sibling's genotypes
g2 : :class:`~numpy:numpy.array`
integer vector of first sibling's genotypes
freqs : :class:`~numpy:numpy.array`
floating point vector of allele frequencies
map : :class:`~numpy:numpy.array`
floating point vector of genetic positions in cM
weights : :class:`~numpy:numpy.array`
floating point vector of SNP weights (usually inverse LD-scores)
p : :class:'float'
genotyping error probability
Returns:
state_matrix : :class:`~numpy:numpy.array`
matrix where each column gives the prob of max prob path to that state, where each row is IBD 0,1,2
pointers : :class:`~numpy:numpy.array`
integer vector giving the pointer to which state from previous position lead to max at this position
"""
state_matrix = np.zeros((3, g1.shape[0]), dtype=np.float64)
pointers = np.zeros((3, g1.shape[0]), dtype=np.int8)
# Check for nans
not_nan = np.logical_not(np.logical_or(np.isnan(g1), np.isnan(g2)))
# Initialise
state_matrix[:,0] = np.log(np.array([0.25, 0.5, 0.25],dtype=np.float64))
if not_nan[0]:
state_matrix[:, 0] += weights[0]*p_obs_given_IBD(np.int8(g1[0]), np.int8(g2[0]), freqs[0], error_probs[0])
# Compute
for l in range(1, g1.shape[0]):
if not_nan[l]:
probs = weights[l]*p_obs_given_IBD(np.int8(g1[l]), np.int8(g2[l]), freqs[l], error_probs[l])
else:
probs = np.zeros((3))
tmatrix = transition_matrix(map[l]-map[l-1])
for i in range(3):
tprobs = tmatrix[:, i]+state_matrix[:, l-1]
state_matrix[i, l] = np.max(tprobs)+probs[i]
pointers[i, l] = np.argmax(tprobs)
return state_matrix, pointers
[docs]@njit
def viterbi(state_matrix,pointers):
"""Get viterbi path from state_matrix and pointers output of make_dynamic
Args:
state_matrix : :class:`~numpy:numpy.array`
matrix where each column gives the prob of max prob path to that state, where each row is IBD 0,1,2
pointers : :class:`~numpy:numpy.array`
integer vector giving the pointer to which state from previous position lead to max at this position
Returns:
path : :class:`~numpy:numpy.array`
integer vector giving the Viterbi path through the IBD states
"""
path = np.zeros(state_matrix.shape[1],dtype=np.int8)
path[path.shape[0]-1] = np.argmax(state_matrix[:, state_matrix.shape[1]-1])
for i in range(1,path.shape[0]):
path[path.shape[0]-(i+1)] = pointers[path[path.shape[0]-i], path.shape[0]-i]
return path
[docs]@njit(parallel=True)
def infer_ibd(sibpairs, gts, freqs, map, weights, error_probs):
ibd = np.zeros((sibpairs.shape[0], gts.shape[1]), dtype=np.int8)
for i in prange(sibpairs.shape[0]):
sibpair = sibpairs[i, :]
state_matrix, pointers = make_dynamic(gts[sibpair[0], :], gts[sibpair[1], :], freqs, map, weights, error_probs)
ibd[i, ...] = viterbi(state_matrix, pointers)
return ibd
[docs]class segment(object):
def __init__(self,start_index,end_index,start_bp,end_bp,start_snp,end_snp,length,state):
self.start = start_index
self.end = end_index
self.start_bp = start_bp
self.end_bp = end_bp
self.start_snp = start_snp
self.end_snp = end_snp
self.length = length
self.state = state
[docs] def to_text(self,id1,id2,chr,end=False):
seg_txt = str(id1)+'\t'+str(id2)+'\t'+str(self.state)+'\t'+str(chr)+'\t'+str(self.start_bp)+'\t'+str(self.end_bp)+'\t'+str(self.start_snp)+'\t'+str(self.end_snp)+'\t'+str(self.length)
if end:
return seg_txt
else:
return seg_txt+'\n'
[docs]def find_segments(path,map,snps,pos):
segments = []
ibd_start = path[0]
ibd_start_index = 0
for i in range(1,path.shape[0]):
if not path[i] == ibd_start:
segments.append(segment(ibd_start_index,i,
pos[ibd_start_index],pos[i-1],
snps[ibd_start_index],snps[i-1],
map[i-1]-map[ibd_start_index],ibd_start))
ibd_start_index = i
ibd_start = path[i]
segments.append(segment(ibd_start_index,i,
pos[ibd_start_index],pos[i],
snps[ibd_start_index],snps[i],
map[i]-map[ibd_start_index],ibd_start))
return segments
[docs]def smooth_segments(path,map,snps,pos,min_length):
# Smooth path
segments = find_segments(path,map,snps,pos)
if len(segments)>1:
for i in range(len(segments)):
if segments[i].length < min_length:
if i==0:
if segments[i].start == segments[i].end:
path[segments[i].start] = segments[i+1].state
else:
path[segments[i].start:segments[i].end] = segments[i+1].state
elif i<(len(segments)-1):
if segments[i-1].state == segments[i+1].state:
if segments[i].start == segments[i].end:
path[segments[i].start] = segments[i+1].state
else:
path[segments[i].start:segments[i].end] = segments[i+1].state
else:
if segments[i].start == segments[i].end:
path[segments[i].start] = segments[i-1].state
else:
path[segments[i].start:segments[i].end] = segments[i - 1].state
segments = find_segments(path,map,snps,pos)
return path, segments
[docs]def smooth_ibd(ibd,map,snps,pos,min_length):
allsegs = []
for i in range(ibd.shape[0]):
ibd_path, segments = smooth_segments(ibd[i,:],map,snps,pos,min_length)
ibd[i,:] = ibd_path
allsegs.append(segments)
return ibd, allsegs
[docs]def write_segs(sibpairs,allsegs,chr,outfile):
seg_out = gzip.open(outfile,'wb')
# Header
seg_out.write('ID1\tID2\tIBDType\tChr\tstart_coordinate\tstop_coordinate\tstartSNP\tstopSNP\tlength\n'.encode())
# Write segment lines
for i in range(0,sibpairs.shape[0]):
nseg = len(allsegs[i])
for j in range(nseg):
if i == (sibpairs.shape[0]-1) and j == (nseg-1):
seg_out.write(allsegs[i][j].to_text(sibpairs[i, 0], sibpairs[i, 1], chr, end=True).encode())
else:
seg_out.write(allsegs[i][j].to_text(sibpairs[i, 0], sibpairs[i, 1], chr, end=False).encode())
seg_out.close()
[docs]def write_segs_from_matrix(ibd,sibpairs,snps,pos,map,chrom,outfile):
# Get segments
allsegs = []
for i in range(sibpairs.shape[0]):
allsegs.append(find_segments(ibd[i,:],map,snps,pos))
# Write segments
write_segs(sibpairs,allsegs,chrom,outfile)
return allsegs
[docs]def infer_ibd_chr(sibpairs, error_prob, error_probs, outprefix, bedfile=None, bgenfile=None, chrom=None, min_length=0.01, mapfile=None, ibdmatrix=False, ld_out=False, min_maf=0.01, max_missing=5, max_error=0.01):
if bedfile is None and bgenfile is None:
raise(ValueError('Must provide either bed file or bgenfile'))
if bedfile is not None and bgenfile is not None:
raise(ValueError('Provide either bed file or bgen file. Not both.'))
if bedfile is not None:
## Read bed
print('Reading genotypes from ' + bedfile)
bimfile = bedfile.split('.bed')[0] + '.bim'
# Determine chromosome
if chrom is None:
chrom = np.loadtxt(bimfile, usecols=0, dtype=str)
chrom = np.unique(chrom)
if chrom.shape[0] > 1:
raise (ValueError('More than 1 chromosome in input bedfile'))
else:
chrom = chrom[0]
# Read sibling genotypes from bed file
gts = read_sibs_from_bed(bedfile, sibpairs)
print('Inferring IBD for chromosome ' + str(chrom))
elif bgenfile is not None:
## Read bed
print('Reading genotypes from ' + bgenfile)
# Determine chromosome
if chrom is None:
bgen = open_bgen(bgenfile,verbose=False)
chrom = bgen.chromosomes
chrom = np.unique(chrom)
if chrom.shape[0] > 1:
raise (ValueError('More than 1 chromosome in input bgenfile'))
else:
chrom = chrom[0]
if chrom=='':
chrom = 0
# Read sibling genotypes from bed file
gts = read_sibs_from_bgen(bgenfile, sibpairs)
print('Inferring IBD for chromosome ' + str(chrom))
# Calculate allele frequencies
print('Calculating allele frequencies')
gts.compute_freqs()
# Check which sibling pairs have genotypes
sibpair_indices = np.zeros((sibpairs.shape), dtype=bool)
sibpair_indices[:, 0] = np.array([x in gts.id_dict for x in sibpairs[:, 0]])
sibpair_indices[:, 1] = np.array([x in gts.id_dict for x in sibpairs[:, 1]])
sibpairs = sibpairs[np.sum(sibpair_indices, axis=1) == 2, :]
if sibpairs.shape[0] == 0:
raise (ValueError('No genotyped sibling pairs found'))
print(str(np.sum(sibpairs.shape[0])) + ' sibpairs have genotypes')
# Find indices of sibpairs
sibpair_indices = np.zeros((sibpairs.shape), dtype=int)
sibpair_indices[:, 0] = np.array([gts.id_dict[x] for x in sibpairs[:, 0]])
sibpair_indices[:, 1] = np.array([gts.id_dict[x] for x in sibpairs[:, 1]])
# Filtering on MAF, LD score, and genotyping error
# Find error probabilities
p_error = np.zeros((gts.sid.shape[0]))
p_error[:] = error_prob
if error_probs is not None:
in_error_probs = np.array([x in error_probs.sid_dict for x in gts.sid])
error_index = np.array([error_probs.sid_dict[x] for x in gts.sid[in_error_probs]])
p_error[in_error_probs] = error_probs.error_ests[error_index]
gts.error_probs = p_error
# Filter
print('Before filtering on MAF, missingness, and genotyping error, there were ' + str(gts.shape[1]) + ' SNPs')
gts.filter_maf(min_maf)
gts.filter_missingness(max_missing)
gts.filter(gts.error_probs < max_error)
print('After filtering, there are ' + str(gts.shape[1]) + ' SNPs')
# Read map file
if mapfile is None and bedfile is not None:
print('Separate genetic map not provided, so attempting to read map from ' + bimfile)
map = np.loadtxt(bimfile, usecols=2)
map_snp_dict = make_id_dict(np.loadtxt(bimfile, usecols=1, dtype=str))
# Check for NAs
if np.var(map) == 0:
print('Map information not found in bim file.')
print('Using default map (decode sex averaged map on GRCh38 coordinates)')
gts.map = decode_map_from_pos(chrom, gts.pos)
pc_mapped = str(round(100*(1-np.mean(np.isnan(gts.map))),2))
print('Found map positions for '+str(pc_mapped)+'% of SNPs')
gts.filter(~np.isnan(gts.map))
else:
if np.sum(np.isnan(map)) > 0:
raise (ValueError('Map cannot have NAs'))
if np.min(map) < 0:
raise (ValueError('Map file cannot have negative values'))
# Check ordering
ordered_map = np.sort(map)
if np.array_equal(map, ordered_map):
pass
else:
raise (ValueError('Map not monotonic. Please make sure input is ordered correctly'))
# Check scale
if np.max(map) > 5000:
raise (ValueError('Maximum value of map too large'))
gts.filter(np.array([x in map_snp_dict for x in gts.sid]))
gts.map = map[[map_snp_dict[x] for x in gts.sid]]
elif mapfile is None and bgenfile is not None:
print('Map file not provided.')
print('Using default map (decode sex averaged map on Hg19 coordinates)')
gts.map = decode_map_from_pos(chrom, gts.pos)
pc_mapped = 100*(1-np.mean(np.isnan(gts.map)))
if pc_mapped < 50:
print('Warning: map positions not found for the majority of SNPs. Consider providing a genetic map using --map')
print('Found map positions for '+str(round(pc_mapped,2))+'% of SNPs')
gts.filter(~np.isnan(gts.map))
else:
print('Reading map from ' + str(mapfile))
gts.map = get_map_positions(mapfile, gts)
print('Read map')
# Weights
print('Computing LD weights')
ld_n = np.min([1000, gts.shape[0]])
ld_sample = np.random.randint(0, gts.shape[0], size=ld_n)
ld = compute_ld_scores(np.array(gts.gts[ld_sample,:], dtype=np.float_), gts.map, max_dist=1)
gts.weights = np.power(ld, -1)
# IBD
print('Inferring IBD')
ibd = infer_ibd(sibpair_indices, np.array(gts.gts,dtype=np.float_), gts.freqs, gts.map, gts.weights, gts.error_probs)
ibd, allsegs = smooth_ibd(ibd, gts.map, gts.sid, gts.pos, min_length)
## Write output
# Write segments
segs_outfile = outfile_name(outprefix,'.ibd.segments.gz', chrom)
print('Writing segments to ' + segs_outfile)
write_segs(sibpairs, allsegs, chrom, segs_outfile)
# Write matrix
if ibdmatrix:
outfile = outfile_name(outprefix,'.ibdmatrix.gz', chrom)
print('Writing matrix output to ' + str(outfile))
ibd = np.row_stack(
(np.column_stack((np.array(['sib1', 'sib2']).reshape((1, 2)), gts.sid.reshape(1, gts.shape[1]))),
np.column_stack((sibpairs, ibd))))
np.savetxt(outfile, ibd, fmt='%s')
if ld_out:
ld_outfile = outfile_name(outprefix,'.l2.ldscore.gz', chrom)
print('Writing LD-scores to '+ld_outfile)
ld_out = np.vstack((np.array(['CHR', 'SNP', 'BP', 'L2']).reshape((1,4)),np.vstack((np.array([chrom for x in gts.sid]), gts.sid, gts.pos, ld)).T))
np.savetxt(ld_outfile, ld_out, fmt='%s')