Source code for snipar.read.phenotype

from pysnptools.snpreader import Pheno
import numpy as np
from snipar.gtarray import gtarray
from snipar.utilities import make_id_dict

[docs]def read_phenotype(phenofile, missing_char = 'NA', phen_index = 1): """Read a phenotype file and remove missing values. Args: phenofile : :class:`str` path to plain text phenotype file with columns FID, IID, phenotype1, phenotype2, ... missing_char : :class:`str` The character that denotes a missing phenotype value; 'NA' by default. phen_index : :class:`int` The index of the phenotype (counting from 1) if multiple phenotype columns present in phenofile Returns: y : :class:`~numpy:numpy.array` vector of non-missing phenotype values from specified column of phenofile pheno_ids: :class:`~numpy:numpy.array` corresponding vector of individual IDs (IID) """ pheno = Pheno(phenofile, missing=missing_char)[:,phen_index-1].read() y = np.array(pheno.val) y.reshape((y.shape[0],1)) pheno_ids = np.array(pheno.iid)[:,1] # Remove y NAs y_not_nan = np.logical_not(np.isnan(y[:,0])) if np.sum(y_not_nan) < y.shape[0]: y = y[y_not_nan,:] pheno_ids = pheno_ids[y_not_nan] print('Number of non-missing phenotype observations: ' + str(y.shape[0])) return gtarray(y,ids=pheno_ids)
[docs]def match_phenotype(G,y,pheno_ids): """Match a phenotype to a genotype array by individual IDs. Args: G : :class:`gtarray` genotype array to match phenotype to y : :class:`~numpy:numpy.array` vector of phenotype values pheno_ids: :class:`~numpy:numpy.array` vector of individual IDs corresponding to phenotype vector, y Returns: y : :class:`~numpy:numpy.array` vector of phenotype values matched by individual IDs to the genotype array """ in_G_dict = np.array([x in G.id_dict for x in pheno_ids]) y = y[in_G_dict] pheno_ids = pheno_ids[in_G_dict] pheno_id_dict = make_id_dict(pheno_ids) y = y[[pheno_id_dict[x] for x in G.ids]] return y
[docs]def read_covariates(covar, pheno_ids=None, missing_char = 'NA'): covar = Pheno(covar, missing=missing_char).read() X = np.array(covar.val) X = gtarray(X, ids=np.array(covar.iid)[:,1], sid=covar.sid) if pheno_ids is not None: in_covar = np.array([x in X.id_dict for x in pheno_ids]) if np.sum((~in_covar))>0: raise(ValueError('Missing covariate values for some phenotyped individuals')) X.fill_NAs() return X