Source code for snipar.pedigree

import numpy as np
import pandas as pd
import logging
from snipar.utilities import make_id_dict

[docs]def get_sibpairs_from_ped(ped):
    # Remove rows with missing parents
    parent_missing = np.array([ped[i,2]=='0' or ped[i,3]=='0' for i in range(ped.shape[0])])
    #print('Removing '+str(np.sum(parent_missing))+' rows from pedigree due to missing parent(s)')
    ped = ped[np.logical_not(parent_missing),:]
    # Remove control families
    controls = np.array([x[0]=='_' for x in ped[:,0]])
    ped = ped[~controls,:]
    parent_pairs = np.array([ped[i,2]+ped[i,3] for i in range(ped.shape[0])])
    # Find unique parent-pairs
    unique_pairs, sib_counts = np.unique(parent_pairs, return_counts=True)
    fam_dict = dict()
    for i in range(ped.shape[0]):
        if parent_pairs[i] in fam_dict:
            fam_dict[parent_pairs[i]] = np.append(fam_dict[parent_pairs[i]],i)
        else:
            fam_dict[parent_pairs[i]] = np.array([i])
    # Fill in sibships
    for i in range(unique_pairs.shape[0]):
        ped[fam_dict[unique_pairs[i]],0] = i 
    # Parent pairs with more than one offspring
    sib_parent_pairs = unique_pairs[sib_counts>1]
    fam_sizes = np.array([x*(x-1)/2 for x in sib_counts[sib_counts>1]])
    npairs = int(np.sum(fam_sizes))
    if npairs==0:
        return None, ped
    else:    
        # Find all sibling pairs
        sibpairs = np.zeros((npairs,2),dtype=ped.dtype)
        paircount = 0
        for ppair in sib_parent_pairs:
            sib_indices = fam_dict[ppair]
            for i in range(0,sib_indices.shape[0]-1):
                for j in range(i+1,sib_indices.shape[0]):
                    sibpairs[paircount,:] = np.array([ped[sib_indices[i],1],ped[sib_indices[j],1]])
                    paircount += 1
        return sibpairs, ped

[docs]class Person:
    """Just a simple data structure representing individuals

    Args:
        id : str
            IID of the individual.
        fid : str
            FID of the individual.
        pid : str
            IID of the father of that individual.
        mid : str
            IID of the mother of that individual.
    """
    def __init__(self, id, fid=None, pid=None, mid=None):
        self.id = id
        self.fid = fid
        self.pid = pid
        self.mid = mid


[docs]def create_pedigree(king_address, agesex_address, same_parents_in_ped=True):
    """Creates pedigree table from agesex file and kinship file in KING format.
    
    Args:
        king_address : str
            Address of a kinship file in KING format. kinship file is a '\t' seperated csv with columns "FID1", "ID1", "FID2", "ID2, "InfType".
            Each row represents a relationship between two individuals. InfType column states the relationship between two individuals.
            The only relationships that matter for this script are full sibling and parent-offspring which are shown by 'FS' and 'PO' respectively.
            This file is used in creating a pedigree file and can be generated using KING.
            As fids starting with '_' are reserved for control there should be no fids starting with '_'.

        agesex_address : str
            Address of the agesex file. This is a " " seperated CSV with columns "FID", "IID", "FATHER_ID", "MOTHER_ID", "sex", "age".
            Each row contains the age and sex of one individual. Male and Female sex should be represented with 'M' and 'F'.
            Age column is used for distinguishing between parent and child in a parent-offspring relationship inferred from the kinship file.
            ID1 is a parent of ID2 if there is a 'PO' relationship between them and 'ID1' is at least 12 years older than ID2.
    
    Returns:
        pd.DataFrame:
            A pedigree table with 'FID', 'IID', 'FATHER_ID', 'MOTHER_ID'. Each row represents an individual.
    """

    kinship = pd.read_csv(king_address, delimiter="\t").astype(str)
    logging.info("loaded kinship file")
    # Filter MZ
    mz_kin = kinship.loc[kinship['InfType']=='Dup/MZ']
    if mz_kin.shape[0]>0:
        print('Found '+str(mz_kin.shape[0])+' duplicates/MZ twin pairs. Removing one from each pair')
        mz_id1 = set(np.unique(mz_kin.loc[:,['ID1']]))
        id1_in_mz = np.array([x in mz_id1 for x in kinship.ID1])
        id2_in_mz = np.array([x in mz_id1 for x in kinship.ID2])
        in_mz = np.logical_or(id1_in_mz,id2_in_mz)
        kinship = kinship.drop(kinship[in_mz].index)
    agesex = pd.read_csv(agesex_address, delim_whitespace=True)
    agesex["IID"] = agesex["IID"].astype(str)
    agesex["FID"] = agesex["FID"].astype(str)
    logging.info("loaded agesex file")
    agesex = agesex.set_index("IID")
    logging.info("creating age and sex dictionaries")
    kinship = pd.merge(kinship, agesex.rename(columns={"sex":"sex1", "age":"age1"}), left_on="ID1", right_index=True)
    kinship = pd.merge(kinship, agesex.rename(columns={"sex":"sex2", "age":"age2"}), left_on="ID2", right_index=True)
    logging.info("dictionaries created")
    people = {}
    fid_counter = 0
    dropouts = []
    kinship_cols = kinship.columns.tolist()
    index_id1 = kinship_cols.index("ID1")
    index_id2 = kinship_cols.index("ID2")
    index_sex1 = kinship_cols.index("sex1")
    index_sex2 = kinship_cols.index("sex2")
    index_age1 = kinship_cols.index("age1")
    index_age2 = kinship_cols.index("age2")
    index_inftype = kinship_cols.index("InfType")
    logging.info("creating pedigree objects")
    pop_size = kinship.values.shape[0]
    t = kinship.values.tolist()
    for row in range(pop_size):
        relation = t[row][index_inftype]
        id1 = t[row][index_id1]
        id2 = t[row][index_id2]
        age1 = t[row][index_age1]
        age2 = t[row][index_age2]
        sex1 = t[row][index_sex1]
        sex2 = t[row][index_sex2]
        p1 = people.get(id1)
        if p1 is None:
            p1 = Person(id1)
            people[id1] = p1
        
        p2 = people.get(id2)
        if p2 is None:
            p2 = Person(id2)
            people[id2] = p2

        if relation == "PO":
            if age1 >  age2+12:
                if sex1 == "F":
                    p2.mid = p1.id
                if sex1 == "M":
                    p2.pid = p1.id

            if age2 > age1+12:
                if sex2 == "F":
                    p1.mid = p2.id
                if sex2 == "M":
                    p1.pid = p2.id
        if relation == "FS":
            if p1.fid is None and p2.fid is None:
                p1.fid = str(fid_counter)
                p2.fid = str(fid_counter)
                fid_counter += 1
            
            if p1.fid is None and p2.fid is not None:
                p1.fid = p2.fid

            if p1.fid is not None and p2.fid is None:
                p2.fid = p1.fid

    for excess in dropouts:
        people.pop(excess)

    data = []
    for p in people.values():
        if p.fid is None:
            p.fid = str(fid_counter)
            fid_counter += 1

        if p.mid is None:
            #default mother id
            p.mid = p.fid + "___M"

        if p.pid is None:
            #default father ir
            p.pid = p.fid + "___P"
        
        data.append((p.fid, p.id, p.pid, p.mid))

    data = pd.DataFrame(data, columns = ['FID' , 'IID', 'FATHER_ID' , 'MOTHER_ID']).astype(str)
    
    # Check if parents are the same in ped for each sibship
    if same_parents_in_ped:
        fam_dict = dict()
        for i in range(data.shape[0]):
            if data.FID[i] in fam_dict:
                fam_dict[data.FID[i]] = np.append(fam_dict[data.FID[i]],i)
            else:
                fam_dict[data.FID[i]] = np.array([i])
        sibships, fam_sizes = np.unique(data.FID, return_counts=True)
        sibships = sibships[fam_sizes>1]
        inconsistent_fams = set()
        for sibship in sibships:
            sibship_ped = data.loc[fam_dict[sibship],:].values
            parent_pairs = np.array([sibship_ped[i,2]+sibship_ped[i,3] for i in range(sibship_ped.shape[0])])
            unique_pairs = np.unique(parent_pairs)
            if unique_pairs.shape[0]>1:
                inconsistent_fams.add(sibship)
                data.loc[fam_dict[sibship],['FATHER_ID','MOTHER_ID']] = np.array([sibship+'___P',sibship+'___M'])
        if len(inconsistent_fams)>0:
            print('Warning: found '+str(len(inconsistent_fams))+' sibships with different parents when creating pedigree. Removing the parents from pedigree.')
    
    return data

[docs]def find_individuals_with_sibs(ids, ped, gts_ids, return_ids_only = False):
    """
    Used in get_gts_matrix and get_fam_means to find the individuals in ids that have genotyped siblings.
    """
    ## Find genotyped sibships of size > 1
    ped_dict = make_id_dict(ped, 1)
    ids_in_ped = np.array([x in ped_dict for x in gts_ids])
    indices_in_ped = np.array([ped_dict[x] for x in gts_ids[ids_in_ped]])
    gts_fams = np.zeros((gts_ids.shape[0]),dtype=gts_ids.dtype)
    gts_fams[ids_in_ped] = ped[indices_in_ped,0]
    ped_gts = ped[indices_in_ped, :]
    fams, counts = np.unique(ped_gts[:,0], return_counts=True)
    sibships = set(fams[counts > 1])
    # Check if parents are the same in ped for each sibship
    fam_dict = dict()
    for i in range(ped_gts.shape[0]):
        if ped_gts[i,0] in sibships:
            if ped_gts[i,0] in fam_dict:
                fam_dict[ped_gts[i,0]] = np.append(fam_dict[ped_gts[i,0]],i)
            else:
                fam_dict[ped_gts[i,0]] = np.array([i])
    inconsistent_sibships = set()
    for sibship in sibships:
        sibship_ped = ped_gts[fam_dict[sibship],:]
        parent_pairs = np.array([sibship_ped[i,2]+sibship_ped[i,3] for i in range(sibship_ped.shape[0])])
        unique_pairs = np.unique(parent_pairs)
        if unique_pairs.shape[0]>1:
            inconsistent_sibships.add(sibship)
    if len(inconsistent_sibships)>0:
        sibships = sibships.difference(inconsistent_sibships)
    ## Find individuals with genotyped siblings
    ids_in_ped = np.array([x in ped_dict for x in ids])
    ids = ids[ids_in_ped]
    ids_fams = np.array([ped[ped_dict[x], 0] for x in ids])
    ids_with_sibs = np.array([x in sibships for x in ids_fams])
    ids = ids[ids_with_sibs]
    ids_fams = ids_fams[ids_with_sibs]
    if return_ids_only:
        return ids
    else:
        return ids, ids_fams, gts_fams

[docs]def get_sibpairs_from_king(kinfile):
    kin_header = np.array(open(kinfile,'r').readline().split('\t'))
    inf_type_index = np.where(np.array([x[0:7]=='InfType' for x in kin_header]))[0][0]
    id1_index = np.where(np.array(kin_header)=='ID1')[0][0]
    id2_index = np.where(np.array(kin_header)=='ID2')[0][0]
    sibpairs = np.loadtxt(kinfile,dtype=str,skiprows=1,usecols=(id1_index,id2_index,inf_type_index))
    sibpairs = sibpairs[sibpairs[:,2]=='FS',0:2]
    return sibpairs