Sourcepredict example 3: Segregating patients with or without Clostridium difficile infection (CDI) on the basis of the 16S microbiome

Source Article: Domestic canines do not display evidence of gut microbial dysbiosis in the presence of Clostridioides (Clostridium) difficile, despite cellular susceptibility to its toxins 10.1016/j.anaerobe.2019.03.017

Healthy human dataset: PRJNA386260
CDI human dataset: PRJNA307992
[1]:
import pandas as pd
import numpy as np
from plotnine import *
from ete3 import NCBITaxa
import multiprocessing
from functools import partial
import seaborn as sns
[2]:
ncbi = NCBITaxa()

Downloading data

[3]:
cdi_color = "#E7CE1A"
healthy_color = "grey"
[4]:
tax_level = ['genus','species']
[5]:
import multiprocessing
import subprocess

def dl(file, outdir):
    """Download one file with ``wget`` into ``outdir``.

    The command is handed to ``subprocess`` as an argument list (no shell),
    so characters in the URL or directory name are never interpreted by a
    shell.

    Args:
        file (str): URL of the file to download.
        outdir (str): directory passed to ``wget -P``.
    Returns:
        None. Failures are reported on stdout instead of being raised.
    """
    # Non-string entries (None/NaN, e.g. when a ';'-separated URL field has no
    # second item) cannot be downloaded: report them and move on.
    if not isinstance(file, str) or not file:
        print(f"Error downloading {file}")
        return
    cmd = ["wget", file, "-P", str(outdir)]
    print(" ".join(cmd))
    try:
        subprocess.check_output(cmd)
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: wget exited with a non-zero status.
        # OSError: the wget executable could not be started.
        print(f"Error downloading {file}")


def dl_multi(allfiles, outdir, process):
    """Download several files in parallel with a pool of worker processes.

    Args:
        allfiles (iterable): files (URLs) to download, each passed to ``dl``
        outdir (str): output directory passed to ``dl`` for every file
        process (int): number of worker processes in the pool
    """
    jobs = [(one_file, outdir) for one_file in allfiles]
    with multiprocessing.Pool(process) as pool:
        pool.starmap(dl, jobs)
[6]:
healthy_meta = pd.read_csv("healthy/PRJNA386260_metadata.txt", sep="\t", index_col='run_accession')
CDI_meta = pd.read_csv("CDI/PRJNA307992_metadata.txt", sep="\t", index_col='run_accession')
[7]:
healthy_meta['labels'] = ['healthy']*healthy_meta.shape[0]
CDI_meta['labels'] = ['CDI']*CDI_meta.shape[0]
[8]:
healthy_fastqs = list(healthy_meta['fastq_ftp'].str.split(";", expand=True)[0]) + list(healthy_meta['fastq_ftp'].str.split(";", expand=True)[1])
[9]:
CDI_fastqs = list(CDI_meta['fastq_ftp'].str.split(";", expand=True)[0]) + list(CDI_meta['fastq_ftp'].str.split(";", expand=True)[1])

Uncomment to download files

[10]:
#dl_multi(allfiles=healthy_fastqs, outdir="./healthy/", process=4)
#dl_multi(allfiles=CDI_fastqs, outdir="./CDI/", process=4)

Utility functions

Removing outlier samples (fewer than 10 species) and species present in fewer than 10 samples

[11]:
def remove_outlier(df, n=10):
    """Filter rows and columns of df by their number of distinct values.

    Args:
        df (pandas DataFrame): table to filter
        n (int): threshold; a row/column is kept only if it holds strictly
            more than n distinct values
    Returns:
        pandas DataFrame: the retained rows and columns of df
    """
    # NOTE(review): nunique counts distinct values (zero included), not the
    # number of non-zero entries - confirm this is the intended criterion.
    rows_kept = df.nunique(axis=1) > n
    cols_kept = df.nunique(axis=0) > n
    return df.loc[rows_kept, cols_kept]

Removing TAXID not in NCBI taxonomy

[12]:
def remove_not_taxo(df):
    """
    Keep only the rows of df whose TAXID has a rank in the NCBI taxonomy.

    df(pandas DataFrame) with TAXID in index, and samples in columns

    Returns the rows of df whose TAXID is a key of the mapping returned by
    ``ncbi.get_rank`` with a rank other than 'no rank', in the order of
    that mapping.
    """
    # Query the taxonomy once; the original issued the same lookup twice.
    ranks = ncbi.get_rank(df.index)
    ranked_taxids = [taxid for taxid, rank in ranks.items() if rank != 'no rank']
    return df.loc[ranked_taxids, :]

Normalization methods

[13]:
def gmpr_size_factor(col, ar):
    """Generate GMPR size factor
    Args:
        col (int): column index of the numpy array
        ar (numpy array): numpy array of TAXID counts,
            columns as Samples, Rows as TAXIDs
    Returns:
        float: GMPR size factor of column ``col``. NaN when the median ratio
            against at least one other column is undefined.
    """
    import warnings

    counts = np.asarray(ar)
    # Ratio of the reference column to every column, row by row. Divisions by
    # zero are expected here and are turned into NaN just below, so the
    # floating-point warnings are silenced for this statement only.
    with np.errstate(divide='ignore', invalid='ignore'):
        pr = counts[:, [col]] / counts
    pr[np.isinf(pr)] = np.nan
    # A zero ratio means the reference column has a zero count: not informative.
    pr[pr == 0] = np.nan
    with warnings.catch_warnings():
        # A column with no usable ratio yields an all-NaN slice; the NaN
        # median is the intended outcome, not something to warn about.
        warnings.simplefilter("ignore", category=RuntimeWarning)
        pr_median = np.nanmedian(pr, axis=0)
    # NOTE(review): a single NaN median makes the whole size factor NaN;
    # callers currently drop such samples afterwards.
    return np.exp(np.mean(np.log(pr_median)))


def GMPR_normalize(df, process=4):
    """Compute GMPR normalization
    Global Mean of Pairwise Ratios
    Chen, L., Reeve, J., Zhang, L., Huang, S., Wang, X., & Chen, J. (2018).
    GMPR: A robust normalization method for zero-inflated count data
    with application to microbiome sequencing data.
    PeerJ, 6, e4600.
    Args:
        df (pandas Dataframe): TAXID count dataframe,
            columns as Samples, Rows as TAXIDs
        process (int): number of process for parallelization
    Returns:
        pandas DataFrame: df with every column divided by its GMPR size
            factor, same index and columns as df
    """
    counts = np.asarray(df)
    n_samples = counts.shape[1]

    # One size factor per sample column, computed in a pool of workers.
    size_factor_of = partial(gmpr_size_factor, ar=counts)
    with multiprocessing.Pool(process) as pool:
        size_factors = pool.map(size_factor_of, list(range(n_samples)))

    normalized = np.divide(counts, size_factors)
    return pd.DataFrame(normalized, index=df.index, columns=df.columns)
[14]:
def RLE_normalize(pd_dataframe):
    """Normalize with Relative Log Expression
    Args:
        pd_dataframe (pandas DataFrame): TAXID count dataframe,
            columns as Samples, Rows as TAXIDs
    Returns:
        pandas DataFrame: RLE Normalized dataframe. Columns as Samples, Rows as TAXIDs
    Example:
        >>> RLE_normalize(pd.DataFrame)
    """
    # Log counts and their per-TAXID (row) mean.
    log_counts = pd_dataframe.apply(np.log, 0)
    row_log_means = log_counts.apply(np.average, 1)

    # Only rows with finite log values take part in the size factor estimate.
    finite_means = row_log_means[np.isfinite(row_log_means)]
    fully_finite = np.isfinite(log_counts).all(axis=1)
    finite_log_counts = log_counts[fully_finite]

    # Per-sample size factor: exp of the median log-ratio to the row mean.
    log_ratios = finite_log_counts.subtract(finite_means, axis=0)
    size_factors = log_ratios.apply(np.median, 0).apply(np.exp)

    scaled = pd_dataframe.divide(size_factors, axis=1)
    return scaled.apply(round, 1)
[15]:
def subsample_normalize_pd(pd_dataframe):
    """Normalize with Subsampling

    Every sample column is rescaled linearly from its own [min, max] range
    to [0, themax], where themax is the largest value of the whole
    dataframe, and the result is floored.

    Args:
        pd_dataframe (pandas DataFrame): TAXID count dataframe,
            columns as Samples, Rows as TAXIDs
    Returns:
       pandas DataFrame: Subsample Normalized dataframe. Columns as Samples, Rows as TAXIDs
    """

    def subsample_normalize(serie, omax):
        """Subsample normalization column wise

        imin: minimum of input range
        imax: maximum of input range
        omin: minimum of output range
        omax: maximum of output range
        x in [imin, imax]
        f(x) in [omin, omax]

                 x - imin
        f(x) = ----------- * (omax - omin) + omin
               imax - imin

        Args:
            serie (pandas Series): Individual Sample Column
            omax (int): maximum of output range
        Returns:
            pandas Series: normalized pandas Series
        """
        imin = serie.min()
        imax = serie.max()
        omin = 0
        # A constant column (imax == imin) has an empty input range: rescaling
        # it would divide by zero and fill the column with NaN, so it is
        # returned unchanged, like the all-zero column already was.
        if imax > 0 and imax > imin:
            return (serie - imin) / (imax - imin) * (omax - omin) + omin
        return serie

    # Largest value of the whole table: the common upper bound of the output.
    themax = pd_dataframe.max().max()

    scaled = pd_dataframe.apply(
        subsample_normalize, axis=0, args=(themax,))
    return scaled.apply(np.floor, axis=1)

PLS-DA with sklearn

[16]:
class plsda:
    def __init__(self, X, Y, labels):
        """
        Fit a 2-component PLS regression of Y on X and expose scores and weights.

        X(pd DataFrame) normalized feature matrix with samples in index, and features (TAXIDs) in columns
        Y(np 1D array) binary response variable encoding the grouping for each sample
        labels(pd DataFrame) with a 'labels' column holding the group label of each sample

        Attributes set:
            plsr: the fitted PLSRegression model
            scores: sample scores (DIM1, DIM2) joined with the 'labels' column
            weights: feature weights (DIM1, DIM2) sorted by decreasing DIM1,
                plus a 'name' column with the taxon name of each TAXID
            top_weights: the 20 first and 20 last rows of weights, with
                'name' as an ordered-by-appearance categorical
        """
        from sklearn.cross_decomposition import PLSRegression
        dims = ['DIM1', 'DIM2']
        self.plsr = PLSRegression(n_components=2)
        # Coerce the response to float so '1'/'0' strings are accepted as well.
        self.plsr.fit(X, np.asarray(Y, dtype=float))
        self.scores = pd.DataFrame(self.plsr.x_scores_, index=X.index, columns=dims)
        self.scores = self.scores.join(labels['labels'])
        self.weights = pd.DataFrame(self.plsr.x_weights_, index=X.columns, columns=dims).sort_values('DIM1', ascending=False)
        # Look every TAXID up by key: the translator mapping is not ordered
        # like the DIM1-sorted index, so assigning its values positionally
        # could attach names to the wrong weights.
        taxid_to_name = ncbi.get_taxid_translator(self.weights.index)
        self.weights['name'] = [taxid_to_name.get(int(taxid), str(taxid)) for taxid in self.weights.index]
        # pd.concat replaces DataFrame.append (removed in pandas 2.0).
        top = pd.concat([self.weights.head(20), self.weights.tail(20)])
        # With fewer than 40 features head and tail overlap; keep each feature once.
        self.top_weights = top[~top.index.duplicated()].copy()
        self.top_weights['name'] = pd.Categorical(self.top_weights['name'], categories=self.top_weights['name'].unique())

MDS with sklearn

[17]:
class mds:
    def __init__(self, X, labels, metric='braycurtis', random_state=None):
        """
        Embed the samples of X in 2 dimensions with MDS on a pairwise distance matrix.

        X(pd DataFrame) normalized feature matrix with samples in index, and features in columns
        labels(named pd Series or pd DataFrame) of group label for each sample, joined on the sample index
        metric(str) distance metric passed to sklearn pairwise_distances
        random_state(int or None) seed passed to sklearn MDS; set it to get a reproducible embedding

        Attributes set:
            mds: the fitted sklearn MDS object
            embedding: sample coordinates (DIM1, DIM2) joined with labels
        """
        from sklearn.metrics import pairwise_distances
        from sklearn.manifold import MDS
        dist = pairwise_distances(X, metric=metric)
        # The distance matrix is computed above, hence dissimilarity='precomputed'.
        self.mds = MDS(n_components=2, dissimilarity='precomputed', random_state=random_state)
        self.mds.fit(X=dist)
        self.embedding = pd.DataFrame(self.mds.embedding_, columns=['DIM1','DIM2'], index=X.index)
        self.embedding = self.embedding.join(labels)

Reading the results of the dada2-nf pipeline

1- Species level

[18]:
healthy_otu_s = pd.read_csv("/projects1/users/borry/30_dada2-nf/results_healthy/merged/dada2_otu_table.csv", index_col=0)
CDI_otu_s = pd.read_csv("/projects1/users/borry/30_dada2-nf/results_CDI/merged/dada2_otu_table.csv", index_col=0)
[19]:
healthy_otu_s = healthy_otu_s.drop([0], axis=0)
CDI_otu_s = CDI_otu_s.drop([0], axis=0)
[20]:
all_otu_s = healthy_otu_s.merge(CDI_otu_s, left_index=True, right_index=True)
all_otu_s = remove_outlier(all_otu_s)
all_otu_s.shape
[20]:
(103, 305)
[21]:
all_otu_s.head()
[21]:
SRR5578998 SRR5579099 SRR5579045 SRR5578981 SRR5579095 SRR5579054 SRR5578909 SRR5578907 SRR5578965 SRR5579021 ... SRR3102417 SRR3102498 SRR3102362 SRR3102525 SRR3102486 SRR3102556 SRR3102547 SRR3102572 SRR3102515 SRR3102517
199 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
820 745.0 725.0 973.0 708.0 710.0 507.0 483.0 223.0 43.0 313.0 ... 0.0 6319.0 1663.0 0.0 0.0 0.0 0.0 350.0 2235.0 31.0
821 1103.0 742.0 2126.0 0.0 1502.0 1193.0 1316.0 52.0 462.0 947.0 ... 0.0 8.0 4899.0 0.0 0.0 0.0 7318.0 833.0 0.0 0.0
824 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
851 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 868.0 0.0 0.0 0.0 0.0 0.0 2190.0 0.0 0.0 0.0

5 rows × 305 columns

2- Genus Level

[22]:
healthy_otu_g = pd.read_csv("/projects1/users/borry/30_dada2-nf/results_healthy_genus/merged/dada2_otu_table.csv", index_col=0)
cdi_otu_g = pd.read_csv("/projects1/users/borry/30_dada2-nf/results_CDI_genus/merged/dada2_otu_table.csv", index_col=0)
[23]:
healthy_otu_g = healthy_otu_g.drop([0], axis=0)
cdi_otu_g = cdi_otu_g.drop([0], axis=0)
[24]:
all_otu_g = healthy_otu_g.merge(cdi_otu_g, left_index=True, right_index=True)
all_otu_g = remove_outlier(all_otu_g)
print(all_otu_g.shape)
all_otu_g.head()
(114, 405)
[24]:
SRR5578998 SRR5579099 SRR5579045 SRR5578981 SRR5579095 SRR5579054 SRR5578909 SRR5578907 SRR5578965 SRR5579115 ... SRR3102507 SRR3102439 SRR3102474 SRR3102487 SRR3102547 SRR3102572 SRR3102381 SRR3102515 SRR3102405 SRR3102517
194 0.0 0.0 0.0 0.0 0.0 6.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 2.0 2.0 0.0 0.0 0.0 0.0 0.0
286 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 17.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
469 0.0 0.0 0.0 0.0 0.0 17.0 0.0 0.0 0.0 0.0 ... 0.0 9.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
482 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0
544 0.0 0.0 1203.0 0.0 0.0 0.0 0.0 182.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 405 columns

Normalizing dataframes

[25]:
X_s = subsample_normalize_pd(all_otu_s).T
[26]:
X_g = GMPR_normalize(all_otu_g, 4).dropna(axis=1).T
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/ipykernel_launcher.py:10: RuntimeWarning: divide by zero encountered in true_divide
  # Remove the CWD from sys.path while we load stuff.
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/ipykernel_launcher.py:10: RuntimeWarning: invalid value encountered in true_divide
  # Remove the CWD from sys.path while we load stuff.
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/ipykernel_launcher.py:10: RuntimeWarning: invalid value encountered in true_divide
  # Remove the CWD from sys.path while we load stuff.
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/ipykernel_launcher.py:10: RuntimeWarning: divide by zero encountered in true_divide
  # Remove the CWD from sys.path while we load stuff.
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/ipykernel_launcher.py:10: RuntimeWarning: divide by zero encountered in true_divide
  # Remove the CWD from sys.path while we load stuff.
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/ipykernel_launcher.py:10: RuntimeWarning: invalid value encountered in true_divide
  # Remove the CWD from sys.path while we load stuff.
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/ipykernel_launcher.py:10: RuntimeWarning: divide by zero encountered in true_divide
  # Remove the CWD from sys.path while we load stuff.
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/ipykernel_launcher.py:10: RuntimeWarning: invalid value encountered in true_divide
  # Remove the CWD from sys.path while we load stuff.
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/numpy/lib/nanfunctions.py:1115: RuntimeWarning: All-NaN slice encountered
  overwrite_input=overwrite_input)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/numpy/lib/nanfunctions.py:1115: RuntimeWarning: All-NaN slice encountered
  overwrite_input=overwrite_input)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/numpy/lib/nanfunctions.py:1115: RuntimeWarning: All-NaN slice encountered
  overwrite_input=overwrite_input)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/numpy/lib/nanfunctions.py:1115: RuntimeWarning: All-NaN slice encountered
  overwrite_input=overwrite_input)

Creating labels dataframe and response variable array

[27]:
# Stack the healthy and CDI label frames (pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0).
labels = pd.concat([healthy_meta['labels'].to_frame(), CDI_meta['labels'].to_frame()])
[28]:
# Labels restricted to, and ordered like, the species-level samples.
labels_s = labels.loc[X_s.index, :]
# Numeric response for the PLS regression: 1 = healthy, 0 = CDI.
Y_s = np.where(labels_s['labels']=='healthy', 1, 0)
[29]:
# Labels restricted to, and ordered like, the genus-level samples.
labels_g = labels.loc[X_g.index, :]
# Numeric response for the PLS regression: 1 = healthy, 0 = CDI.
Y_g = np.where(labels_g['labels']=='healthy', 1, 0)

Exploring the dataset

[30]:
import seaborn as sns

PLS-DA with Python

[31]:
plsda_s = plsda(X_s, Y_s, labels_s)
[32]:
plsda_s.scores
[32]:
DIM1 DIM2 labels
SRR5578998 0.935718 0.126756 healthy
SRR5579099 2.614041 0.075360 healthy
SRR5579045 -0.424088 0.029255 healthy
SRR5578981 1.068008 0.688370 healthy
SRR5579095 -0.458098 0.082241 healthy
... ... ... ...
SRR3102556 -5.292148 -2.756433 CDI
SRR3102547 -1.763199 0.052028 CDI
SRR3102572 -1.139385 -1.983172 CDI
SRR3102515 -3.229330 -3.165894 CDI
SRR3102517 -2.452133 -0.887940 CDI

305 rows × 3 columns

[33]:
g = ggplot(plsda_s.scores, aes(x='DIM1',y='DIM2', color='labels'))
g += geom_point()
g += scale_color_manual(name='Status',values = {"CDI":cdi_color, "healthy":healthy_color})
g += theme_classic()
g += theme(plot_background=element_blank(),
           panel_background=element_blank(),
           legend_background=element_blank(),
           axis_line=element_line(color='black'),
           legend_text=element_text(color='black', weight='bold'),
          axis_text=element_text(color='black', weight='bold'),
          axis_title=element_text(color='black', weight='bold'),
          legend_title=element_text(color='black', weight='bold'))
g.save('results/PLS-DA.png', format='png', dpi=300, transparent=True)
g
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:729: PlotnineWarning: Saving 6.4 x 4.8 in image.
  from_inches(height, units), units), PlotnineWarning)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:730: PlotnineWarning: Filename: results/PLS-DA.png
  warn('Filename: {}'.format(filename), PlotnineWarning)
_images/CDI_analysis_50_1.png
[33]:
<ggplot: (-9223363254321219135)>

The separation appears clearly in the first latent variable (DIM1)

[34]:
g = ggplot(plsda_s.top_weights, aes(x='name',y='DIM1', fill='DIM1'))
g += geom_bar(stat='identity', width=0.7)
g += coord_flip()
g += scale_fill_gradient(name = 'weight', low=healthy_color, high=cdi_color)
g += xlab('species')
g += ylab('weight in PC1')
g += theme_classic()
g += theme(plot_background=element_blank(),
           panel_background=element_blank(),
           legend_background=element_blank(),
           axis_line=element_line(color='black'),
           legend_text=element_text(color='black', weight='bold'),
          axis_text=element_text(color='black', weight='bold'),
          axis_title=element_text(color='black', weight='bold'),
          legend_title=element_text(color='black', weight='bold'))
g.save('results/weight_CDI.png', format='png', dpi=300, transparent=True)
g
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:729: PlotnineWarning: Saving 6.4 x 4.8 in image.
  from_inches(height, units), units), PlotnineWarning)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:730: PlotnineWarning: Filename: results/weight_CDI.png
  warn('Filename: {}'.format(filename), PlotnineWarning)
_images/CDI_analysis_52_1.png
[34]:
<ggplot: (8782533497590)>
[35]:
# Species table with samples ordered by group label.
X_s_heatmap = X_s.merge(labels_s, left_index=True, right_index=True).sort_values('labels').drop('labels', axis=1)
# Rename every TAXID column by key lookup; the translator mapping is not
# guaranteed to be ordered like the columns.
taxid_to_name_s = ncbi.get_taxid_translator(X_s_heatmap.columns)
X_s_heatmap.columns = [taxid_to_name_s.get(int(taxid), str(taxid)) for taxid in X_s_heatmap.columns]
# Rows were re-ordered by the sort above, so the row colours must be built in
# the heatmap's row order, not in the original order of labels_s.
samp_colors = list(np.where(labels_s.loc[X_s_heatmap.index, 'labels'] == 'CDI', cdi_color,healthy_color))
sns.clustermap(X_s_heatmap.loc[:,plsda_s.top_weights['name']], row_colors=samp_colors, metric='braycurtis')
[35]:
<seaborn.matrix.ClusterGrid at 0x7fcd7ca32278>
_images/CDI_analysis_53_1.png
[36]:
labels_s
[36]:
labels
SRR5578998 healthy
SRR5579099 healthy
SRR5579045 healthy
SRR5578981 healthy
SRR5579095 healthy
... ...
SRR3102556 CDI
SRR3102547 CDI
SRR3102572 CDI
SRR3102515 CDI
SRR3102517 CDI

305 rows × 1 columns

[37]:
mds_s = mds(X_s, labels_s['labels'], metric='euclidean')
[38]:
g = ggplot(mds_s.embedding, aes(x='DIM1',y='DIM2', color='labels'))
g += geom_point()
g += scale_color_manual(name='Status',values = {"CDI":cdi_color, "healthy":healthy_color})
g += theme_classic()
g += theme(plot_background=element_blank(),
           panel_background=element_blank(),
           legend_background=element_blank(),
           axis_line=element_line(color='black'),
           legend_text=element_text(color='black', weight='bold'),
          axis_text=element_text(color='black', weight='bold'),
          axis_title=element_text(color='black', weight='bold'),
          legend_title=element_text(color='black', weight='bold'))
g.save('results/PCoA.png', format='png', dpi=300, transparent=True)
g
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:729: PlotnineWarning: Saving 6.4 x 4.8 in image.
  from_inches(height, units), units), PlotnineWarning)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:730: PlotnineWarning: Filename: results/PCoA.png
  warn('Filename: {}'.format(filename), PlotnineWarning)
_images/CDI_analysis_56_1.png
[38]:
<ggplot: (-9223363254388601077)>
[39]:
plsda_g = plsda(X_g, Y_g, labels_g)
[40]:
g = ggplot(plsda_g.scores, aes(x='DIM1',y='DIM2', color='labels'))
g += geom_point()
g += scale_color_manual(name='Status',values = {"CDI":cdi_color, "healthy":healthy_color})
g += theme_classic()
g += theme(plot_background=element_blank(),
           panel_background=element_blank(),
           legend_background=element_blank(),
           axis_line=element_line(color='black'),
           legend_text=element_text(color='black', weight='bold'),
          axis_text=element_text(color='black', weight='bold'),
          axis_title=element_text(color='black', weight='bold'),
          legend_title=element_text(color='black', weight='bold'))
g.save('results/PLS-DA_genus.png', format='png', dpi=300, transparent=True)
g
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:729: PlotnineWarning: Saving 6.4 x 4.8 in image.
  from_inches(height, units), units), PlotnineWarning)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:730: PlotnineWarning: Filename: results/PLS-DA_genus.png
  warn('Filename: {}'.format(filename), PlotnineWarning)
_images/CDI_analysis_59_1.png
[40]:
<ggplot: (8782466170155)>
[41]:
# Bar plot of the DIM1 weights of the top genera of the genus-level PLS-DA.
g = ggplot(plsda_g.top_weights, aes(x='name',y='DIM1', fill='DIM1'))
g += geom_bar(stat='identity', width=0.7)
g += coord_flip()
g += scale_fill_gradient(name = 'weight', low=healthy_color, high=cdi_color)
g += xlab('Genus')
g += ylab('weight in PC1')
g += theme_classic()
# Axis and legend text in black, consistent with the other figures
# (healthy_color is a data colour, not a text colour).
g += theme(plot_background=element_blank(),
           panel_background=element_blank(),
           legend_background=element_blank(),
           axis_line=element_line(color='black'),
           legend_text=element_text(color='black', weight='bold'),
           axis_text=element_text(color='black', weight='bold'),
           axis_title=element_text(color='black', weight='bold'),
           legend_title=element_text(color='black', weight='bold'))
g.save('results/weight_CDI_genus.png', format='png', dpi=300, transparent=True)
g
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:729: PlotnineWarning: Saving 6.4 x 4.8 in image.
  from_inches(height, units), units), PlotnineWarning)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:730: PlotnineWarning: Filename: results/weight_CDI_genus.png
  warn('Filename: {}'.format(filename), PlotnineWarning)
_images/CDI_analysis_60_1.png
[41]:
<ggplot: (8782466099411)>
[42]:
# Genus table with samples ordered by group label.
X_g_heatmap = X_g.merge(labels_g, left_index=True, right_index=True).sort_values('labels').drop('labels', axis=1)
# Rename every TAXID column by key lookup; the translator mapping is not
# guaranteed to be ordered like the columns.
taxid_to_name_g = ncbi.get_taxid_translator(X_g_heatmap.columns)
X_g_heatmap.columns = [taxid_to_name_g.get(int(taxid), str(taxid)) for taxid in X_g_heatmap.columns]
# Rows were re-ordered by the sort above, so the row colours must be built in
# the heatmap's row order, not in the original order of labels_g.
samp_colors = list(np.where(labels_g.loc[X_g_heatmap.index, 'labels'] == 'CDI', cdi_color,healthy_color))
sns.clustermap(X_g_heatmap.loc[:,plsda_g.top_weights['name']], row_colors=samp_colors, metric='braycurtis')
[42]:
<seaborn.matrix.ClusterGrid at 0x7fcd3c30a198>
_images/CDI_analysis_61_1.png

MDS

[43]:
mds_g = mds(X_g, labels_g)
[44]:
# Scatter plot of the genus-level MDS embedding, coloured by status.
g = ggplot(mds_g.embedding, aes(x='DIM1',y='DIM2', color='labels'))
g += geom_point()
g += scale_color_manual(name='Status',values = {"CDI":cdi_color, "healthy":healthy_color})
g += theme_classic()
g += theme(plot_background=element_blank(),
           panel_background=element_blank(),
           legend_background=element_blank(),
           axis_line=element_line(color='black'),
           legend_text=element_text(color='black', weight='bold'),
           axis_text=element_text(color='black', weight='bold'),
           axis_title=element_text(color='black', weight='bold'),
           legend_title=element_text(color='black', weight='bold'))
# Dedicated file name: 'results/PCoA.png' is already written by the
# species-level MDS plot and would be overwritten here.
g.save('results/PCoA_genus.png', format='png', dpi=300, transparent=True)
g
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:729: PlotnineWarning: Saving 6.4 x 4.8 in image.
  from_inches(height, units), units), PlotnineWarning)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:730: PlotnineWarning: Filename: results/PCoA.png
  warn('Filename: {}'.format(filename), PlotnineWarning)
_images/CDI_analysis_64_1.png
[44]:
<ggplot: (8782533528931)>

Pre-analysis conclusion:

Based on the clustering (heatmap) and the MDS, there is a better separation of the two classes at the genus level.

Preparing data for sourcepredict

on species

[45]:
train_species = X_s.T.sample(frac=0.8, axis=1, random_state=2)
[46]:
test_species = X_s.T.drop(train_species.columns, axis=1)
[47]:
train_labels_species = labels_s.loc[train_species.columns,:]
[48]:
test_labels_species = labels_s.loc[test_species.columns,:]
[49]:
train_species.to_csv("source_species.csv")
test_species.to_csv("sink_species.csv")
train_labels_species.to_csv("source_labels_species.csv")
test_labels_species.to_csv("sink_labels_species.csv")

on genus

[50]:
train_genus = X_g.T.sample(frac=0.8, axis=1, random_state=2)
[51]:
test_genus = X_g.T.drop(train_genus.columns, axis=1)
[52]:
train_labels_genus = labels_g.loc[train_genus.columns,:]
[53]:
test_labels_genus = labels_g.loc[test_genus.columns,:]
[54]:
train_genus.to_csv("source_genus.csv")
test_genus.to_csv("sink_genus.csv")
train_labels_genus.to_csv("source_labels_genus.csv")
test_labels_genus.to_csv("sink_labels_genus.csv")

Running sourcepredict

on species

[55]:
%%time
! /projects1/users/borry/18_sourcepredict/sourcepredict -s source_species.csv -l source_labels_species.csv sink_species.csv -n None -me tsne -di 2 -t 6 -e embedding_species.csv
Step 1: Checking for unknown proportion
  == Sample: SRR5578937 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578937
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5578953 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578953
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5578924 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578924
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5578990 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578990
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579006 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579006
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579106 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579106
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579003 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579003
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579074 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579074
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5578943 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578943
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579072 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579072
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579036 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579036
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5578962 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578962
                 known:97.63%
                 unknown:2.37%
  == Sample: SRR5578931 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578931
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579039 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579039
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579061 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579061
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5578942 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578942
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5578919 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578919
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5578913 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578913
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5578991 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578991
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579100 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579100
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579037 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579037
                 known:97.72%
                 unknown:2.28%
  == Sample: SRR5579048 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579048
                 known:97.76%
                 unknown:2.24%
  == Sample: SRR5578952 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578952
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5578930 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578930
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579017 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579017
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579005 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579005
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579055 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579055
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579002 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579002
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5578968 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578968
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579116 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579116
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5578910 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578910
                 known:97.66%
                 unknown:2.34%
  == Sample: SRR5578947 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578947
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579080 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579080
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579103 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579103
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579035 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579035
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579085 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579085
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579079 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579079
                 known:97.63%
                 unknown:2.37%
  == Sample: SRR5579018 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579018
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579001 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579001
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5578938 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578938
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR5579108 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579108
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR3102551 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102551
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR3102397 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102397
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR3102539 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102539
                 known:96.97%
                 unknown:3.03%
  == Sample: SRR3102516 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102516
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR3102401 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102401
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR3102398 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102398
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR3102573 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102573
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR3102518 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102518
                 known:74.16%
                 unknown:25.84%
  == Sample: SRR3102463 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102463
                 known:97.68%
                 unknown:2.32%
  == Sample: SRR3102581 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102581
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR3102359 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102359
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR3102427 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102427
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR3102369 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102369
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR3102356 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102356
                 known:64.17%
                 unknown:35.83%
  == Sample: SRR3102561 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102561
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR3102374 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102374
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR3102372 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102372
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR3102386 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102386
                 known:97.62%
                 unknown:2.38%
  == Sample: SRR3102486 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102486
                 known:70.98%
                 unknown:29.02%
  == Sample: SRR3102556 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102556
                 known:69.48%
                 unknown:30.52%
Step 2: Checking for source proportion
        Computing weighted_unifrac distance on species rank
         Warning: ``tree`` must be rooted.
        There is a polytomy ar the root of this taxonomic tree.
        Unifrac distances wont't  work properly.
        Computing  Bray-Curtis distance instead.

        TSNE embedding in 2 dimensions
        KNN machine learning
        Performing 5 fold cross validation on 6 cores...
        Trained KNN classifier with 10 neighbors
        -> Testing Accuracy: 0.9
        ----------------------
        - Sample: SRR5578937
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR5578953
                 CDI:15.66%
                 healthy:84.34%
        - Sample: SRR5578924
                 CDI:11.87%
                 healthy:88.13%
        - Sample: SRR5578990
                 CDI:11.32%
                 healthy:88.68%
        - Sample: SRR5579006
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR5579106
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR5579003
                 CDI:7.22%
                 healthy:92.78%
        - Sample: SRR5579074
                 CDI:7.25%
                 healthy:92.75%
        - Sample: SRR5578943
                 CDI:92.09%
                 healthy:7.91%
        - Sample: SRR5579072
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR5579036
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR5578962
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR5578931
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR5579039
                 CDI:15.35%
                 healthy:84.65%
        - Sample: SRR5579061
                 CDI:11.1%
                 healthy:88.9%
        - Sample: SRR5578942
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR5578919
                 CDI:68.11%
                 healthy:31.89%
        - Sample: SRR5578913
                 CDI:8.83%
                 healthy:91.17%
        - Sample: SRR5578991
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR5579100
                 CDI:13.42%
                 healthy:86.58%
        - Sample: SRR5579037
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR5579048
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR5578952
                 CDI:14.35%
                 healthy:85.65%
        - Sample: SRR5578930
                 CDI:9.87%
                 healthy:90.13%
        - Sample: SRR5579017
                 CDI:9.84%
                 healthy:90.16%
        - Sample: SRR5579005
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR5579055
                 CDI:9.51%
                 healthy:90.49%
        - Sample: SRR5579002
                 CDI:92.09%
                 healthy:7.91%
        - Sample: SRR5578968
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR5579116
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR5578910
                 CDI:8.34%
                 healthy:91.66%
        - Sample: SRR5578947
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR5579080
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR5579103
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR5579035
                 CDI:9.01%
                 healthy:90.99%
        - Sample: SRR5579085
                 CDI:7.16%
                 healthy:92.84%
        - Sample: SRR5579079
                 CDI:7.01%
                 healthy:92.99%
        - Sample: SRR5579018
                 CDI:7.37%
                 healthy:92.63%
        - Sample: SRR5579001
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR5578938
                 CDI:9.36%
                 healthy:90.64%
        - Sample: SRR5579108
                 CDI:10.13%
                 healthy:89.87%
        - Sample: SRR3102551
                 CDI:10.39%
                 healthy:89.61%
        - Sample: SRR3102397
                 CDI:86.75%
                 healthy:13.25%
        - Sample: SRR3102539
                 CDI:88.59%
                 healthy:11.41%
        - Sample: SRR3102516
                 CDI:79.7%
                 healthy:20.3%
        - Sample: SRR3102401
                 CDI:87.61%
                 healthy:12.39%
        - Sample: SRR3102398
                 CDI:92.09%
                 healthy:7.91%
        - Sample: SRR3102573
                 CDI:92.09%
                 healthy:7.91%
        - Sample: SRR3102518
                 CDI:83.99%
                 healthy:16.01%
        - Sample: SRR3102463
                 CDI:78.19%
                 healthy:21.81%
        - Sample: SRR3102581
                 CDI:5.35%
                 healthy:94.65%
        - Sample: SRR3102359
                 CDI:92.09%
                 healthy:7.91%
        - Sample: SRR3102427
                 CDI:92.09%
                 healthy:7.91%
        - Sample: SRR3102369
                 CDI:92.09%
                 healthy:7.91%
        - Sample: SRR3102356
                 CDI:82.45%
                 healthy:17.55%
        - Sample: SRR3102561
                 CDI:78.45%
                 healthy:21.55%
        - Sample: SRR3102374
                 CDI:86.09%
                 healthy:13.91%
        - Sample: SRR3102372
                 CDI:70.12%
                 healthy:29.88%
        - Sample: SRR3102386
                 CDI:88.18%
                 healthy:11.82%
        - Sample: SRR3102486
                 CDI:86.38%
                 healthy:13.62%
        - Sample: SRR3102556
                 CDI:88.76%
                 healthy:11.24%
Sourcepredict result written to sink_species.sourcepredict.csv
Embedding coordinates written to embedding_species.csv
CPU times: user 896 ms, sys: 320 ms, total: 1.22 s
Wall time: 1min 1s

on genus

[56]:
%%time
# Run the sourcepredict command-line tool on the genus-level tables and let
# %%time report how long the whole cell takes.
# Settings, as echoed by the tool's own log output:
#   -r genus              rank used for the analysis
#   -n None               no normalization ("Normalizing (no normalization)")
#   -me tsne -di 2        t-SNE embedding in 2 dimensions
#   -dt weighted_unifrac  requested distance metric
#   -t 6                  work is spread over 6 cores
#   -e embedding_genus.csv  file receiving the embedding coordinates
# Presumably -s / -l are the source table and its labels, and sink_genus.csv
# is the sink table to classify -- confirm against the sourcepredict docs.
# NOTE(review): the executable is referenced by a machine-specific absolute
# path; adjust it when running this notebook elsewhere.
! /projects1/users/borry/18_sourcepredict/sourcepredict -s source_genus.csv -l source_labels_genus.csv sink_genus.csv -r genus -n None -me tsne -di 2 -dt weighted_unifrac -t 6 -e embedding_genus.csv
Step 1: Checking for unknown proportion
  == Sample: SRR5578953 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578953
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5578924 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578924
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579094 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579094
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579106 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579106
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579030 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579030
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579113 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579113
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579077 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579077
                 known:97.43%
                 unknown:2.57%
  == Sample: SRR5578962 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578962
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579050 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579050
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5578957 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578957
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579028 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579028
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579071 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579071
                 known:97.38%
                 unknown:2.62%
  == Sample: SRR5578993 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578993
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5578963 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578963
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579100 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579100
                 known:97.49%
                 unknown:2.51%
  == Sample: SRR5578906 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578906
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579112 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579112
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579013 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579013
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579042 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579042
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5578912 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578912
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579002 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579002
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579089 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579089
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579087 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579087
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579102 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579102
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579049 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 0.99
        ----------------------
        - Sample: SRR5579049
                 known:98.22%
                 unknown:1.78%
  == Sample: SRR5579029 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579029
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5578910 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578910
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579010 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579010
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579064 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579064
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5579051 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579051
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5578984 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578984
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5578949 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578949
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5578975 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578975
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5578940 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578940
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5578925 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578925
                 known:97.41%
                 unknown:2.59%
  == Sample: SRR5579015 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5579015
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR5578927 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578927
                 known:97.49%
                 unknown:2.51%
  == Sample: SRR5578923 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR5578923
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR3102557 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102557
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR3102424 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 0.99
        ----------------------
        - Sample: SRR3102424
                 known:92.66%
                 unknown:7.34%
  == Sample: SRR3102366 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102366
                 known:93.41%
                 unknown:6.59%
  == Sample: SRR3102422 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 0.99
        ----------------------
        - Sample: SRR3102422
                 known:89.06%
                 unknown:10.94%
  == Sample: SRR3102462 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 0.98
        ----------------------
        - Sample: SRR3102462
                 known:69.99%
                 unknown:30.01%
  == Sample: SRR3102449 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102449
                 known:96.44%
                 unknown:3.56%
  == Sample: SRR3102526 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102526
                 known:96.96%
                 unknown:3.04%
  == Sample: SRR3102497 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102497
                 known:95.78%
                 unknown:4.22%
  == Sample: SRR3102402 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102402
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR3102440 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102440
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR3102463 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102463
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR3102379 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102379
                 known:91.82%
                 unknown:8.18%
  == Sample: SRR3102529 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102529
                 known:91.89%
                 unknown:8.11%
  == Sample: SRR3102427 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102427
                 known:97.39%
                 unknown:2.61%
  == Sample: SRR3102550 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102550
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR3102410 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 0.99
        ----------------------
        - Sample: SRR3102410
                 known:94.87%
                 unknown:5.13%
  == Sample: SRR3102376 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102376
                 known:92.88%
                 unknown:7.12%
  == Sample: SRR3102533 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102533
                 known:97.39%
                 unknown:2.61%
  == Sample: SRR3102489 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102489
                 known:95.74%
                 unknown:4.26%
  == Sample: SRR3102490 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102490
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR3102580 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102580
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR3102375 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102375
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR3102483 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102483
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR3102535 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102535
                 known:96.31%
                 unknown:3.69%
  == Sample: SRR3102446 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102446
                 known:97.41%
                 unknown:2.59%
  == Sample: SRR3102527 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102527
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR3102409 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102409
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR3102362 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102362
                 known:97.37%
                 unknown:2.63%
  == Sample: SRR3102487 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 0.99
        ----------------------
        - Sample: SRR3102487
                 known:96.66%
                 unknown:3.34%
  == Sample: SRR3102517 ==
        Adding unknown
        Normalizing (no normalization)
        Computing Bray-Curtis distance
        Performing MDS embedding in 2 dimensions
        KNN machine learning
        Training KNN classifier on 6 cores...
        -> Testing Accuracy: 1.0
        ----------------------
        - Sample: SRR3102517
                 known:97.37%
                 unknown:2.63%
Step 2: Checking for source proportion
        Computing weighted_unifrac distance on genus rank
        TSNE embedding in 2 dimensions
        KNN machine learning
        Performing 5 fold cross validation on 6 cores...
        Trained KNN classifier with 10 neighbors
        -> Testing Accuracy: 0.85
        ----------------------
        - Sample: SRR5578953
                 CDI:2.9%
                 healthy:97.1%
        - Sample: SRR5578924
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5579094
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5579106
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5579030
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5579113
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5579077
                 CDI:13.41%
                 healthy:86.59%
        - Sample: SRR5578962
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5579050
                 CDI:2.26%
                 healthy:97.74%
        - Sample: SRR5578957
                 CDI:2.82%
                 healthy:97.18%
        - Sample: SRR5579028
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5579071
                 CDI:2.77%
                 healthy:97.23%
        - Sample: SRR5578993
                 CDI:2.78%
                 healthy:97.22%
        - Sample: SRR5578963
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5579100
                 CDI:6.98%
                 healthy:93.02%
        - Sample: SRR5578906
                 CDI:3.1%
                 healthy:96.9%
        - Sample: SRR5579112
                 CDI:2.95%
                 healthy:97.05%
        - Sample: SRR5579013
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5579042
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5578912
                 CDI:95.31%
                 healthy:4.69%
        - Sample: SRR5579002
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5579089
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5579087
                 CDI:88.23%
                 healthy:11.77%
        - Sample: SRR5579102
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5579049
                 CDI:9.53%
                 healthy:90.47%
        - Sample: SRR5579029
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5578910
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5579010
                 CDI:13.19%
                 healthy:86.81%
        - Sample: SRR5579064
                 CDI:2.48%
                 healthy:97.52%
        - Sample: SRR5579051
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5578984
                 CDI:44.06%
                 healthy:55.94%
        - Sample: SRR5578949
                 CDI:3.11%
                 healthy:96.89%
        - Sample: SRR5578975
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5578940
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5578925
                 CDI:2.55%
                 healthy:97.45%
        - Sample: SRR5579015
                 CDI:1.71%
                 healthy:98.29%
        - Sample: SRR5578927
                 CDI:2.93%
                 healthy:97.07%
        - Sample: SRR5578923
                 CDI:4.14%
                 healthy:95.86%
        - Sample: SRR3102557
                 CDI:95.31%
                 healthy:4.69%
        - Sample: SRR3102424
                 CDI:92.07%
                 healthy:7.93%
        - Sample: SRR3102366
                 CDI:95.31%
                 healthy:4.69%
        - Sample: SRR3102422
                 CDI:90.09%
                 healthy:9.91%
        - Sample: SRR3102462
                 CDI:93.16%
                 healthy:6.84%
        - Sample: SRR3102449
                 CDI:92.87%
                 healthy:7.13%
        - Sample: SRR3102526
                 CDI:95.31%
                 healthy:4.69%
        - Sample: SRR3102497
                 CDI:95.31%
                 healthy:4.69%
        - Sample: SRR3102402
                 CDI:95.31%
                 healthy:4.69%
        - Sample: SRR3102440
                 CDI:95.31%
                 healthy:4.69%
        - Sample: SRR3102463
                 CDI:95.31%
                 healthy:4.69%
        - Sample: SRR3102379
                 CDI:94.18%
                 healthy:5.82%
        - Sample: SRR3102529
                 CDI:93.06%
                 healthy:6.94%
        - Sample: SRR3102427
                 CDI:55.04%
                 healthy:44.96%
        - Sample: SRR3102550
                 CDI:95.31%
                 healthy:4.69%
        - Sample: SRR3102410
                 CDI:95.31%
                 healthy:4.69%
        - Sample: SRR3102376
                 CDI:2.71%
                 healthy:97.29%
        - Sample: SRR3102533
                 CDI:93.57%
                 healthy:6.43%
        - Sample: SRR3102489
                 CDI:75.7%
                 healthy:24.3%
        - Sample: SRR3102490
                 CDI:95.31%
                 healthy:4.69%
        - Sample: SRR3102580
                 CDI:95.31%
                 healthy:4.69%
        - Sample: SRR3102375
                 CDI:95.31%
                 healthy:4.69%
        - Sample: SRR3102483
                 CDI:45.14%
                 healthy:54.86%
        - Sample: SRR3102535
                 CDI:92.53%
                 healthy:7.47%
        - Sample: SRR3102446
                 CDI:95.31%
                 healthy:4.69%
        - Sample: SRR3102527
                 CDI:95.31%
                 healthy:4.69%
        - Sample: SRR3102409
                 CDI:95.31%
                 healthy:4.69%
        - Sample: SRR3102362
                 CDI:95.31%
                 healthy:4.69%
        - Sample: SRR3102487
                 CDI:95.31%
                 healthy:4.69%
        - Sample: SRR3102517
                 CDI:95.31%
                 healthy:4.69%
Sourcepredict result written to sink_genus.sourcepredict.csv
Embedding coordinates written to embedding_genus.csv
CPU times: user 1.18 s, sys: 368 ms, total: 1.55 s
Wall time: 1min 19s

Reading Sourcepredict results

[57]:
from sklearn.metrics import accuracy_score
[58]:
# Load the Sourcepredict result table and the sink sample labels,
# using the first column of each file as the index.
pred_genus, test_labels_genus = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("sink_genus.sourcepredict.csv", "sink_labels_genus.csv")
)
[59]:
# For every column of the prediction table keep the row label holding the
# largest value, then attach the sink labels by matching on the index.
conf_table_genus = (
    pred_genus
    .idxmax(axis=0)
    .to_frame(name='predicted')
    .merge(test_labels_genus, left_index=True, right_index=True)
)
[60]:
# Discard every row that has at least one missing value.
conf_table_genus = conf_table_genus.dropna(axis=0, how='any')
[61]:
# Display the (rows, columns) dimensions of the prediction/label table.
conf_table_genus.shape
[61]:
(68, 2)
[62]:
# Count the occurrences of each class, separately for every column of the
# table. Series.value_counts is called directly because the top-level
# pd.value_counts function is deprecated in recent pandas releases.
conf_table_genus.apply(lambda column: column.value_counts(), axis=0)
[62]:
predicted labels
healthy 38 38
CDI 30 30
[63]:
# Share of samples whose predicted class equals the recorded label.
acc_genus = accuracy_score(
    conf_table_genus['labels'],     # y_true
    conf_table_genus['predicted'],  # y_pred
)
print("Accuracy: {}".format(round(acc_genus, 2)))
Accuracy: 0.94
[64]:
from plotnine import *
[65]:
# Load the embedding coordinates written by Sourcepredict.
embed = pd.read_csv("embedding_genus.csv", index_col=0)
# The file's 'labels' column is kept as 'type', with both source classes
# collapsed to the single value 'source'. One exact-match mapping is used
# for both classes instead of mixing a substring replacement
# (.str.replace) with an exact-match one (.replace).
embed = embed.rename(columns={'labels': 'type'})
embed['type'] = embed['type'].replace({'CDI': 'source', 'healthy': 'source'})
# Attach the class label of every sample as 'actual' ...
embed = embed.join(labels_g['labels']).rename(columns={'labels': 'actual'})
# ... and, per column of the prediction table, the row label with the
# highest value as 'predicted'.
embed = embed.join(pred_genus.idxmax().rename('predicted'))
[66]:
# Scatter plot of the embedding restricted to the rows tagged 'source',
# coloured by class.
# `embed` has no 'labels' column at this point: the original one was renamed
# to 'type' and the joined class labels were renamed to 'actual'. The colour
# is therefore mapped to 'actual' rather than relying on whatever object
# named `labels` happens to exist in the notebook namespace.
g = ggplot(embed.query("type == 'source'"), aes(x='PC1', y='PC2', color='actual'))
g += geom_point(size=3, stroke=1, alpha=0.5)
g += scale_color_manual(name='Reference', values={"CDI": cdi_color, "healthy": healthy_color})
g += xlab('DIM1')
g += ylab('DIM2')
g += theme_classic()
# Transparent backgrounds and bold black text for all labels.
g += theme(plot_background=element_blank(),
           panel_background=element_blank(),
           legend_background=element_blank(),
           axis_line=element_line(color=healthy_color),
           legend_text=element_text(color='black', weight='bold'),
           axis_text=element_text(color='black', weight='bold'),
           axis_title=element_text(color='black', weight='bold'),
           legend_title=element_text(color='black', weight='bold'))
g.save('results/train_embedding.png', format='png', dpi=300, transparent=True)
g
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:729: PlotnineWarning: Saving 6.4 x 4.8 in image.
  from_inches(height, units), units), PlotnineWarning)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:730: PlotnineWarning: Filename: results/train_embedding.png
  warn('Filename: {}'.format(filename), PlotnineWarning)
_images/CDI_analysis_95_1.png
[66]:
<ggplot: (8782530729613)>
[67]:
# Scatter plot of the embedding restricted to the rows tagged 'sink'.
# Each sample is drawn twice: a larger point whose outline colour is mapped to
# 'predicted', overlaid with a smaller black-outlined point whose fill is
# mapped to 'actual'.
g = ggplot(embed.query("type == 'sink'"), aes(x='PC1', y='PC2', color='predicted'))
g += geom_point(size=4, shape='o', fill='black', stroke=2)
g += geom_point(data=embed.query("type == 'sink'"), mapping=aes(x='PC1', y='PC2', fill='actual'), size=3, color='black', stroke=1)
# Legend titles follow the mapped variable: the colour scale shows the
# prediction and the fill scale shows the reference label (the two titles
# were previously swapped).
g += scale_color_manual(name='Prediction', values={"CDI": cdi_color, "healthy": healthy_color})
g += scale_fill_manual(name='Reference', values={"CDI": cdi_color, "healthy": healthy_color})
g += xlab('DIM1')
g += ylab('DIM2')
g += theme_classic()
# Transparent backgrounds and bold black text for all labels.
g += theme(plot_background=element_blank(),
           panel_background=element_blank(),
           legend_background=element_blank(),
           axis_line=element_line(color='black'),
           legend_text=element_text(color='black', weight='bold'),
           axis_text=element_text(color='black', weight='bold'),
           axis_title=element_text(color='black', weight='bold'),
           legend_title=element_text(color='black', weight='bold'))
g.save('results/test_embedding.png', format='png', dpi=300, transparent=True)
g
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:729: PlotnineWarning: Saving 6.4 x 4.8 in image.
  from_inches(height, units), units), PlotnineWarning)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:730: PlotnineWarning: Filename: results/test_embedding.png
  warn('Filename: {}'.format(filename), PlotnineWarning)
_images/CDI_analysis_96_1.png
[67]:
<ggplot: (-9223363254324085430)>

Comparing with Sourcetracker 2

Generating the data for Sourcetracker 2

[68]:
# Write the transposed table as tab-separated text, with the index column
# labelled TAXID.
X_g.transpose().to_csv(
    "st_genus.txt",
    sep="\t",
    index_label="TAXID",
)

test_labels_genus['SourceSink'] = ['sink']*test_labels_genus.shape[0]

[69]:
# Tag every row of the training labels as a source (scalar is broadcast
# to all rows).
train_labels_genus['SourceSink'] = 'source'
[70]:
# Build the metadata table: one row per sample, with its role
# ('SourceSink') and its class ('Env', taken from the 'labels' column).
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
st_metadata = (
    pd.concat([train_labels_genus, test_labels_genus])
    .rename(columns={"labels": "Env"})
)
# Assign the roles with a single .loc call each: the chained form
# df['col'][rows] = ... may write to a temporary copy and leave the frame
# unchanged. .loc also creates the column if neither input carried it.
st_metadata.loc[train_labels_genus.index, 'SourceSink'] = 'source'
st_metadata.loc[test_labels_genus.index, 'SourceSink'] = 'sink'
st_metadata = st_metadata[['SourceSink', 'Env']]
st_metadata
[70]:
SourceSink Env
SRR5579101 source healthy
SRR3102473 source CDI
SRR3102501 source CDI
SRR5578964 source healthy
SRR5579017 source healthy
... ... ...
SRR3102527 sink CDI
SRR3102409 sink CDI
SRR3102362 sink CDI
SRR3102487 sink CDI
SRR3102517 sink CDI

340 rows × 2 columns

[71]:
# Write the metadata table; note the file is tab-separated despite its
# .csv extension, and the index column is labelled '#SampleID'.
st_metadata.to_csv(
    "st_genus_metadata.csv",
    sep="\t",
    index_label='#SampleID',
)
sourcetracker2 gibbs -i st_genus.biom -m st_genus_metadata.csv -o st_genus_out --jobs 6 --source_rarefaction_depth 0 --sink_rarefaction_depth 0

Reading sourcetracker results

[72]:
# Load the tab-separated mixing proportions, using the first column as
# the index.
st_pred = pd.read_csv(
    "st_genus_out/mixing_proportions.txt",
    sep="\t",
    index_col=0,
)
[73]:
# For every row of the mixing proportions keep the column label with the
# largest value, attach the sink labels by index and expose them as 'actual'.
st2_pred = (
    st_pred
    .idxmax(axis=1)
    .to_frame(name='predicted')
    .merge(test_labels_genus, left_index=True, right_index=True)
    .rename(columns={'labels': 'actual'})
)
[74]:
# Preview the first rows of the predicted/actual table.
st2_pred.head()
[74]:
predicted actual
SRR5578953 healthy healthy
SRR5578924 healthy healthy
SRR5579094 healthy healthy
SRR5579106 healthy healthy
SRR5579030 healthy healthy
[75]:
# Share of samples whose predicted class equals the recorded label.
st_acc_genus = accuracy_score(
    st2_pred['actual'],     # y_true
    st2_pred['predicted'],  # y_pred
)
print("Accuracy: {}".format(round(st_acc_genus, 2)))
Accuracy: 0.8