Source code for snipar.ibd

import gzip
from numba import njit, prange
from snipar.map import *
from snipar.ld import compute_ld_scores
import numpy as np
from snipar.read.bed import read_sibs_from_bed
from snipar.read.bgen import read_sibs_from_bgen
from snipar.utilities import make_id_dict
from snipar.utilities import outfile_name
from snipar.utilities import open_bgen

####### Transition Matrix ######
@njit
def transition_matrix(cM):
    """Compute probabilities of transitioning between IBD states as a function of genetic distance.

    Args:
        cM : :class:`float`
            genetic distance in centiMorgans (cM)

    Returns:
        log(P) : :class:`~numpy:numpy.array`
            the natural logarithm (element-wise) of the matrix of transition probabilities

    """
    P = np.identity(3)
    r_array = np.array([[-1.0, 1.0, 0.0],
                        [0.5, -1.0, 0.5],
                        [0.0, 1.0, -1.0]], dtype=np.float64)
    r = 1.0 - np.power((1.0 + np.exp(-cM / 50)) / 2.0, 4)
    P += r_array * r
    return np.log(P)

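# Illustrative sketch (not part of the snipar source): since transition_matrix returns
# element-wise logs, exponentiating it should recover a stochastic matrix whose rows sum
# to 1, with off-diagonal entries proportional to r = 1 - ((1 + exp(-cM/50))/2)**4.
# The function name _example_transition_matrix is hypothetical, for demonstration only.
def _example_transition_matrix(cM=1.0):
    P = np.exp(transition_matrix(cM))
    assert np.allclose(P.sum(axis=1), 1.0)
    return P
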
#
@njit
def p_ibd_0(f):
    """Compute joint-PMF for sibling pair genotypes given IBD0.

    Args:
        f : :class:`float`
            allele frequency

    Returns:
        P : :class:`~numpy:numpy.array`
            matrix of probabilities

    """
    P_vec = np.array([(1.0 - f) ** 2.0, 2.0 * f * (1 - f), f ** 2.0]).reshape((3, 1))
    return P_vec @ P_vec.T

@njit
def p_ibd_1(f):
    """Compute joint-PMF for sibling pair genotypes given IBD1.

    Args:
        f : :class:`float`
            allele frequency

    Returns:
        P : :class:`~numpy:numpy.array`
            matrix of probabilities

    """
    fmatrix = np.array([[0.0, 1.0, 0.0], [1.0, 1.0, 2.0], [0.0, 2.0, 3.0]])
    minus_fmatrix = np.array([[3.0, 2.0, 0.0], [2.0, 1.0, 1.0], [0.0, 1.0, 0.0]])
    P = np.exp(fmatrix * np.log(f) + minus_fmatrix * np.log(1 - f))
    P[0, 2] = 0
    P[2, 0] = 0
    return P

@njit
def p_ibd_2(f):
    """Compute joint-PMF for sibling pair genotypes given IBD2.

    Args:
        f : :class:`float`
            allele frequency

    Returns:
        P : :class:`~numpy:numpy.array`
            matrix of probabilities

    """
    P = np.zeros((3, 3), dtype=np.float64)
    fdiag = np.array([(1.0 - f) ** 2.0, 2.0 * f * (1.0 - f), f ** 2.0])
    np.fill_diagonal(P, fdiag.reshape((3, 1)))
    return P

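# Illustrative sketch (not part of the snipar source): each conditional joint PMF above
# should sum to 1, and mixing them with the prior IBD-state probabilities (0.25, 0.5, 0.25)
# used in make_dynamic below gives the unconditional joint PMF of a sibling pair's
# genotypes. The function name _example_ibd_pmfs is hypothetical.
def _example_ibd_pmfs(f=0.3):
    pmfs = [p_ibd_0(f), p_ibd_1(f), p_ibd_2(f)]
    for P in pmfs:
        assert np.isclose(P.sum(), 1.0)
    # Unconditional sibling-pair genotype distribution
    return 0.25 * pmfs[0] + 0.5 * pmfs[1] + 0.25 * pmfs[2]
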
@njit
def p_obs_given_IBD(g1_obs, g2_obs, f, p):
    """Compute log-probabilities of observed sibling pair genotypes given each IBD state, allowing for genotyping error.

    Args:
        g1_obs : :class:`integer`
            observed genotype for sibling 1
        g2_obs : :class:`integer`
            observed genotype for sibling 2
        f : :class:`float`
            allele frequency
        p : :class:`float`
            genotyping error probability

    Returns:
        log(P) : :class:`~numpy:numpy.array`
            vector giving log-probabilities of observing g1_obs, g2_obs given IBD 0, 1, 2

    """
    P_out = np.zeros((3))
    # Get probabilities of observed genotypes given true genotypes
    err_vectors = np.array([[1.0 - p, p / 2.0, 0.0],
                            [p, 1.0 - p, p],
                            [0.0, p / 2.0, 1.0 - p]])
    # Get probabilities of true genotypes given IBD
    probs = np.zeros((3, 3, 3))
    probs[0, ...] = p_ibd_0(f)
    probs[1, ...] = p_ibd_1(f)
    probs[2, ...] = p_ibd_2(f)
    # Compute probabilities conditional on IBD
    for i in range(3):
        P_out[i] = err_vectors[g1_obs, ...].T @ probs[i, ...] @ err_vectors[g2_obs, ...]
    # Return
    return np.log(P_out)

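# Illustrative sketch (not part of the snipar source): the log-likelihood vector returned
# by p_obs_given_IBD orders states as IBD0, IBD1, IBD2, so argmax gives the single-SNP
# maximum-likelihood IBD state. For example, two siblings both homozygous for a rare allele
# are far more likely to be IBD2 than IBD0. The name _example_obs_likelihood is hypothetical.
def _example_obs_likelihood(f=0.05, error=0.01):
    loglik = p_obs_given_IBD(np.int8(2), np.int8(2), f, error)
    return np.argmax(loglik)  # expected to be 2 (IBD2) for a rare allele
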
@njit
def make_dynamic(g1, g2, freqs, map, weights, error_probs):
    """Make the state matrix and pointer matrix for a sibling pair by dynamic programming.

    Args:
        g1 : :class:`~numpy:numpy.array`
            integer vector of first sibling's genotypes
        g2 : :class:`~numpy:numpy.array`
            integer vector of second sibling's genotypes
        freqs : :class:`~numpy:numpy.array`
            floating point vector of allele frequencies
        map : :class:`~numpy:numpy.array`
            floating point vector of genetic positions in cM
        weights : :class:`~numpy:numpy.array`
            floating point vector of SNP weights (usually inverse LD-scores)
        error_probs : :class:`~numpy:numpy.array`
            floating point vector of genotyping error probabilities

    Returns:
        state_matrix : :class:`~numpy:numpy.array`
            matrix where each column gives the log-probability of the most probable path ending in each state (rows are IBD 0, 1, 2)
        pointers : :class:`~numpy:numpy.array`
            integer matrix giving, for each state at each position, the state at the previous position on the most probable path

    """
    state_matrix = np.zeros((3, g1.shape[0]), dtype=np.float64)
    pointers = np.zeros((3, g1.shape[0]), dtype=np.int8)
    # Check for nans
    not_nan = np.logical_not(np.logical_or(np.isnan(g1), np.isnan(g2)))
    # Initialise
    state_matrix[:, 0] = np.log(np.array([0.25, 0.5, 0.25], dtype=np.float64))
    if not_nan[0]:
        state_matrix[:, 0] += weights[0] * p_obs_given_IBD(np.int8(g1[0]), np.int8(g2[0]), freqs[0], error_probs[0])
    # Compute
    for l in range(1, g1.shape[0]):
        if not_nan[l]:
            probs = weights[l] * p_obs_given_IBD(np.int8(g1[l]), np.int8(g2[l]), freqs[l], error_probs[l])
        else:
            probs = np.zeros((3))
        tmatrix = transition_matrix(map[l] - map[l - 1])
        for i in range(3):
            tprobs = tmatrix[:, i] + state_matrix[:, l - 1]
            state_matrix[i, l] = np.max(tprobs) + probs[i]
            pointers[i, l] = np.argmax(tprobs)
    return state_matrix, pointers

@njit
def viterbi(state_matrix, pointers):
    """Get the Viterbi path from the state_matrix and pointers output of make_dynamic.

    Args:
        state_matrix : :class:`~numpy:numpy.array`
            matrix where each column gives the log-probability of the most probable path ending in each state (rows are IBD 0, 1, 2)
        pointers : :class:`~numpy:numpy.array`
            integer matrix giving, for each state at each position, the state at the previous position on the most probable path

    Returns:
        path : :class:`~numpy:numpy.array`
            integer vector giving the Viterbi path through the IBD states

    """
    path = np.zeros(state_matrix.shape[1], dtype=np.int8)
    path[path.shape[0] - 1] = np.argmax(state_matrix[:, state_matrix.shape[1] - 1])
    for i in range(1, path.shape[0]):
        path[path.shape[0] - (i + 1)] = pointers[path[path.shape[0] - i], path.shape[0] - i]
    return path

@njit(parallel=True)
def infer_ibd(sibpairs, gts, freqs, map, weights, error_probs):
    ibd = np.zeros((sibpairs.shape[0], gts.shape[1]), dtype=np.int8)
    for i in prange(sibpairs.shape[0]):
        sibpair = sibpairs[i, :]
        state_matrix, pointers = make_dynamic(gts[sibpair[0], :], gts[sibpair[1], :],
                                              freqs, map, weights, error_probs)
        ibd[i, ...] = viterbi(state_matrix, pointers)
    return ibd

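# Illustrative sketch (not part of the snipar source): run the Viterbi IBD inference
# end-to-end on a tiny simulated sibling pair. Unit SNP weights and a uniform genotyping
# error rate are assumed; the names _example_infer_ibd and nsnp are hypothetical.
def _example_infer_ibd(nsnp=200, seed=1):
    rng = np.random.default_rng(seed)
    freqs = np.full(nsnp, 0.5)
    # Two siblings simulated as IBD2 across the whole segment (identical genotypes)
    g = rng.binomial(2, freqs).astype(np.float64)
    gts = np.vstack((g, g))
    sibpairs = np.array([[0, 1]], dtype=np.int_)
    map = np.arange(nsnp, dtype=np.float64) * 0.01  # genetic positions in cM
    weights = np.ones(nsnp)
    error_probs = np.full(nsnp, 0.01)
    ibd = infer_ibd(sibpairs, gts, freqs, map, weights, error_probs)
    return ibd  # expected to be mostly 2s (IBD2) for identical genotypes
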
class segment(object):
    def __init__(self, start_index, end_index, start_bp, end_bp, start_snp, end_snp, length, state):
        self.start = start_index
        self.end = end_index
        self.start_bp = start_bp
        self.end_bp = end_bp
        self.start_snp = start_snp
        self.end_snp = end_snp
        self.length = length
        self.state = state

    def to_text(self, id1, id2, chr, end=False):
        seg_txt = str(id1) + '\t' + str(id2) + '\t' + str(self.state) + '\t' + str(chr) + '\t' + str(self.start_bp) + '\t' + str(self.end_bp) + '\t' + str(self.start_snp) + '\t' + str(self.end_snp) + '\t' + str(self.length)
        if end:
            return seg_txt
        else:
            return seg_txt + '\n'

def find_segments(path, map, snps, pos):
    segments = []
    ibd_start = path[0]
    ibd_start_index = 0
    for i in range(1, path.shape[0]):
        if not path[i] == ibd_start:
            segments.append(segment(ibd_start_index, i,
                                    pos[ibd_start_index], pos[i - 1],
                                    snps[ibd_start_index], snps[i - 1],
                                    map[i - 1] - map[ibd_start_index], ibd_start))
            ibd_start_index = i
            ibd_start = path[i]
    segments.append(segment(ibd_start_index, i,
                            pos[ibd_start_index], pos[i],
                            snps[ibd_start_index], snps[i],
                            map[i] - map[ibd_start_index], ibd_start))
    return segments

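# Illustrative sketch (not part of the snipar source): find_segments turns a per-SNP IBD
# path into contiguous segments with bp/cM coordinates and flanking SNP IDs. The name
# _example_find_segments is hypothetical, and the toy map, positions, and SNP IDs are
# made up for demonstration.
def _example_find_segments():
    path = np.array([0, 0, 0, 1, 1, 2, 2, 2], dtype=np.int8)
    map = np.arange(path.shape[0], dtype=np.float64)   # cM positions
    pos = np.arange(path.shape[0]) * 1000 + 500000     # bp positions
    snps = np.array(['rs%d' % i for i in range(path.shape[0])])
    segs = find_segments(path, map, snps, pos)
    # Expect three segments: IBD0, IBD1, IBD2
    return [(s.state, s.start_snp, s.end_snp, s.length) for s in segs]
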
def smooth_segments(path, map, snps, pos, min_length):
    # Smooth path
    segments = find_segments(path, map, snps, pos)
    if len(segments) > 1:
        for i in range(len(segments)):
            if segments[i].length < min_length:
                if i == 0:
                    if segments[i].start == segments[i].end:
                        path[segments[i].start] = segments[i + 1].state
                    else:
                        path[segments[i].start:segments[i].end] = segments[i + 1].state
                elif i < (len(segments) - 1):
                    if segments[i - 1].state == segments[i + 1].state:
                        if segments[i].start == segments[i].end:
                            path[segments[i].start] = segments[i + 1].state
                        else:
                            path[segments[i].start:segments[i].end] = segments[i + 1].state
                    else:
                        if segments[i].start == segments[i].end:
                            path[segments[i].start] = segments[i - 1].state
                        else:
                            path[segments[i].start:segments[i].end] = segments[i - 1].state
        segments = find_segments(path, map, snps, pos)
    return path, segments

def smooth_ibd(ibd, map, snps, pos, min_length):
    allsegs = []
    for i in range(ibd.shape[0]):
        ibd_path, segments = smooth_segments(ibd[i, :], map, snps, pos, min_length)
        ibd[i, :] = ibd_path
        allsegs.append(segments)
    return ibd, allsegs

def write_segs(sibpairs, allsegs, chr, outfile):
    seg_out = gzip.open(outfile, 'wb')
    # Header
    seg_out.write('ID1\tID2\tIBDType\tChr\tstart_coordinate\tstop_coordinate\tstartSNP\tstopSNP\tlength\n'.encode())
    # Write segment lines
    for i in range(0, sibpairs.shape[0]):
        nseg = len(allsegs[i])
        for j in range(nseg):
            if i == (sibpairs.shape[0] - 1) and j == (nseg - 1):
                seg_out.write(allsegs[i][j].to_text(sibpairs[i, 0], sibpairs[i, 1], chr, end=True).encode())
            else:
                seg_out.write(allsegs[i][j].to_text(sibpairs[i, 0], sibpairs[i, 1], chr, end=False).encode())
    seg_out.close()

def write_segs_from_matrix(ibd, sibpairs, snps, pos, map, chrom, outfile):
    # Get segments
    allsegs = []
    for i in range(sibpairs.shape[0]):
        allsegs.append(find_segments(ibd[i, :], map, snps, pos))
    # Write segments
    write_segs(sibpairs, allsegs, chrom, outfile)
    return allsegs

def infer_ibd_chr(sibpairs, error_prob, error_probs, outprefix, bedfile=None, bgenfile=None, chrom=None,
                  min_length=0.01, mapfile=None, ibdmatrix=False, ld_out=False, min_maf=0.01,
                  max_missing=5, max_error=0.01):
    if bedfile is None and bgenfile is None:
        raise(ValueError('Must provide either bed file or bgenfile'))
    if bedfile is not None and bgenfile is not None:
        raise(ValueError('Provide either bed file or bgen file. Not both.'))
    if bedfile is not None:
        ## Read bed
        print('Reading genotypes from ' + bedfile)
        bimfile = bedfile.split('.bed')[0] + '.bim'
        # Determine chromosome
        if chrom is None:
            chrom = np.loadtxt(bimfile, usecols=0, dtype=str)
            chrom = np.unique(chrom)
            if chrom.shape[0] > 1:
                raise(ValueError('More than 1 chromosome in input bedfile'))
            else:
                chrom = chrom[0]
        # Read sibling genotypes from bed file
        gts = read_sibs_from_bed(bedfile, sibpairs)
        print('Inferring IBD for chromosome ' + str(chrom))
    elif bgenfile is not None:
        ## Read bgen
        print('Reading genotypes from ' + bgenfile)
        # Determine chromosome
        if chrom is None:
            bgen = open_bgen(bgenfile, verbose=False)
            chrom = bgen.chromosomes
            chrom = np.unique(chrom)
            if chrom.shape[0] > 1:
                raise(ValueError('More than 1 chromosome in input bgenfile'))
            else:
                chrom = chrom[0]
            if chrom == '':
                chrom = 0
        # Read sibling genotypes from bgen file
        gts = read_sibs_from_bgen(bgenfile, sibpairs)
        print('Inferring IBD for chromosome ' + str(chrom))
    # Calculate allele frequencies
    print('Calculating allele frequencies')
    gts.compute_freqs()
    # Check which sibling pairs have genotypes
    sibpair_indices = np.zeros((sibpairs.shape), dtype=bool)
    sibpair_indices[:, 0] = np.array([x in gts.id_dict for x in sibpairs[:, 0]])
    sibpair_indices[:, 1] = np.array([x in gts.id_dict for x in sibpairs[:, 1]])
    sibpairs = sibpairs[np.sum(sibpair_indices, axis=1) == 2, :]
    if sibpairs.shape[0] == 0:
        raise(ValueError('No genotyped sibling pairs found'))
    print(str(np.sum(sibpairs.shape[0])) + ' sibpairs have genotypes')
    # Find indices of sibpairs
    sibpair_indices = np.zeros((sibpairs.shape), dtype=int)
    sibpair_indices[:, 0] = np.array([gts.id_dict[x] for x in sibpairs[:, 0]])
    sibpair_indices[:, 1] = np.array([gts.id_dict[x] for x in sibpairs[:, 1]])
    # Filtering on MAF, LD score, and genotyping error
    # Find error probabilities
    p_error = np.zeros((gts.sid.shape[0]))
    p_error[:] = error_prob
    if error_probs is not None:
        in_error_probs = np.array([x in error_probs.sid_dict for x in gts.sid])
        error_index = np.array([error_probs.sid_dict[x] for x in gts.sid[in_error_probs]])
        p_error[in_error_probs] = error_probs.error_ests[error_index]
    gts.error_probs = p_error
    # Filter
    print('Before filtering on MAF, missingness, and genotyping error, there were ' + str(gts.shape[1]) + ' SNPs')
    gts.filter_maf(min_maf)
    gts.filter_missingness(max_missing)
    gts.filter(gts.error_probs < max_error)
    print('After filtering, there are ' + str(gts.shape[1]) + ' SNPs')
    # Read map file
    if mapfile is None and bedfile is not None:
        print('Separate genetic map not provided, so attempting to read map from ' + bimfile)
        map = np.loadtxt(bimfile, usecols=2)
        map_snp_dict = make_id_dict(np.loadtxt(bimfile, usecols=1, dtype=str))
        # Check for NAs
        if np.var(map) == 0:
            print('Map information not found in bim file.')
            print('Using default map (decode sex averaged map on GRCh38 coordinates)')
            gts.map = decode_map_from_pos(chrom, gts.pos)
            pc_mapped = str(round(100 * (1 - np.mean(np.isnan(gts.map))), 2))
            print('Found map positions for ' + str(pc_mapped) + '% of SNPs')
            gts.filter(~np.isnan(gts.map))
        else:
            if np.sum(np.isnan(map)) > 0:
                raise(ValueError('Map cannot have NAs'))
            if np.min(map) < 0:
                raise(ValueError('Map file cannot have negative values'))
            # Check ordering
            ordered_map = np.sort(map)
            if np.array_equal(map, ordered_map):
                pass
            else:
                raise(ValueError('Map not monotonic. Please make sure input is ordered correctly'))
            # Check scale
            if np.max(map) > 5000:
                raise(ValueError('Maximum value of map too large'))
            gts.filter(np.array([x in map_snp_dict for x in gts.sid]))
            gts.map = map[[map_snp_dict[x] for x in gts.sid]]
    elif mapfile is None and bgenfile is not None:
        print('Map file not provided.')
        print('Using default map (decode sex averaged map on Hg19 coordinates)')
        gts.map = decode_map_from_pos(chrom, gts.pos)
        pc_mapped = 100 * (1 - np.mean(np.isnan(gts.map)))
        if pc_mapped < 50:
            print('Warning: map positions not found for the majority of SNPs. Consider providing a genetic map using --map')
        print('Found map positions for ' + str(round(pc_mapped, 2)) + '% of SNPs')
        gts.filter(~np.isnan(gts.map))
    else:
        print('Reading map from ' + str(mapfile))
        gts.map = get_map_positions(mapfile, gts)
        print('Read map')
    # Weights
    print('Computing LD weights')
    ld_n = np.min([1000, gts.shape[0]])
    ld_sample = np.random.randint(0, gts.shape[0], size=ld_n)
    ld = compute_ld_scores(np.array(gts.gts[ld_sample, :], dtype=np.float_), gts.map, max_dist=1)
    gts.weights = np.power(ld, -1)
    # IBD
    print('Inferring IBD')
    ibd = infer_ibd(sibpair_indices, np.array(gts.gts, dtype=np.float_), gts.freqs, gts.map, gts.weights, gts.error_probs)
    ibd, allsegs = smooth_ibd(ibd, gts.map, gts.sid, gts.pos, min_length)
    ## Write output
    # Write segments
    segs_outfile = outfile_name(outprefix, '.ibd.segments.gz', chrom)
    print('Writing segments to ' + segs_outfile)
    write_segs(sibpairs, allsegs, chrom, segs_outfile)
    # Write matrix
    if ibdmatrix:
        outfile = outfile_name(outprefix, '.ibdmatrix.gz', chrom)
        print('Writing matrix output to ' + str(outfile))
        ibd = np.row_stack((np.column_stack((np.array(['sib1', 'sib2']).reshape((1, 2)), gts.sid.reshape(1, gts.shape[1]))),
                            np.column_stack((sibpairs, ibd))))
        np.savetxt(outfile, ibd, fmt='%s')
    if ld_out:
        ld_outfile = outfile_name(outprefix, '.l2.ldscore.gz', chrom)
        print('Writing LD-scores to ' + ld_outfile)
        ld_out = np.vstack((np.array(['CHR', 'SNP', 'BP', 'L2']).reshape((1, 4)),
                            np.vstack((np.array([chrom for x in gts.sid]), gts.sid, gts.pos, ld)).T))
        np.savetxt(ld_outfile, ld_out, fmt='%s')
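
# Illustrative usage sketch (not part of the snipar source): infer_ibd_chr is the
# per-chromosome driver, which reads sibling genotypes from a .bed or .bgen file, filters
# SNPs, infers and smooths IBD, and writes gzipped segment (and optionally matrix and
# LD-score) output named from outprefix via outfile_name. The file paths and the sibpairs
# array below are hypothetical placeholders, shown commented out since they require real
# genotype files.
#
#   sibpairs = np.array([['FAM1_sib1', 'FAM1_sib2'],
#                        ['FAM2_sib1', 'FAM2_sib2']])
#   infer_ibd_chr(sibpairs, error_prob=1e-4, error_probs=None, outprefix='ibd_out',
#                 bedfile='chr22.bed', min_length=0.01, ibdmatrix=True, ld_out=False)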