Source code for snipar.read.phenotype

from pysnptools.snpreader import Pheno
import numpy as np
from snipar.gtarray import gtarray
from snipar.utilities import make_id_dict

[docs]def read_phenotype(phenofile, missing_char = 'NA', phen_index = 1):
    """Read a phenotype file and remove missing values.

    Args:
        phenofile : :class:`str`
            path to plain text phenotype file with columns FID, IID, phenotype1, phenotype2, ...
        missing_char : :class:`str`
            The character that denotes a missing phenotype value; 'NA' by default.
        phen_index : :class:`int`
           The index of the phenotype (counting from 1) if multiple phenotype columns present in phenofile

    Returns:
        y : :class:`~numpy:numpy.array`
            vector of non-missing phenotype values from specified column of phenofile
        pheno_ids: :class:`~numpy:numpy.array`
            corresponding vector of individual IDs (IID)
    """
    pheno = Pheno(phenofile, missing=missing_char)[:,phen_index-1].read()
    y = np.array(pheno.val)
    y.reshape((y.shape[0],1))
    pheno_ids = np.array(pheno.iid)[:,1]
    # Remove y NAs
    y_not_nan = np.logical_not(np.isnan(y[:,0]))
    if np.sum(y_not_nan) < y.shape[0]:
        y = y[y_not_nan,:]
        pheno_ids = pheno_ids[y_not_nan]
    print('Number of non-missing phenotype observations: ' + str(y.shape[0]))
    return gtarray(y,ids=pheno_ids)

[docs]def match_phenotype(G,y,pheno_ids):
    """Match a phenotype to a genotype array by individual IDs.

    Args:
        G : :class:`gtarray`
            genotype array to match phenotype to
        y : :class:`~numpy:numpy.array`
            vector of phenotype values
        pheno_ids: :class:`~numpy:numpy.array`
            vector of individual IDs corresponding to phenotype vector, y

    Returns:
       y : :class:`~numpy:numpy.array`
            vector of phenotype values matched by individual IDs to the genotype array

    """
    in_G_dict = np.array([x in G.id_dict for x in pheno_ids])
    y = y[in_G_dict]
    pheno_ids = pheno_ids[in_G_dict]
    pheno_id_dict = make_id_dict(pheno_ids)
    y = y[[pheno_id_dict[x] for x in G.ids]]
    return y

[docs]def read_covariates(covar, pheno_ids=None, missing_char = 'NA'):
    covar = Pheno(covar, missing=missing_char).read()
    X = np.array(covar.val)
    X = gtarray(X, ids=np.array(covar.iid)[:,1], sid=covar.sid)
    if pheno_ids is not None:
        in_covar = np.array([x in X.id_dict for x in pheno_ids])
        if np.sum((~in_covar))>0:
            raise(ValueError('Missing covariate values for some phenotyped individuals'))
    X.fill_NAs()
    return X