Sourcepredict example3: Segregating patients with or without Clostridium difficile infection (CDI) on the basis of 16s microbiome¶
Source Article: Domestic canines do not display evidence of gut microbial dysbiosis in the presence of Clostridioides (Clostridium) difficile, despite cellular susceptibility to its toxins 10.1016/j.anaerobe.2019.03.017
[1]:
import pandas as pd
import numpy as np
from plotnine import *
from ete3 import NCBITaxa
import multiprocessing
from functools import partial
import seaborn as sns
[2]:
# Shared handle on the ete3 NCBI taxonomy; the helpers and plotting cells below
# use it for rank lookups (get_rank) and taxid -> name translation
# (get_taxid_translator).
ncbi = NCBITaxa()
Downloading data¶
[3]:
# Colours used throughout the figures: one per sample class (CDI vs healthy).
# They are also the end points of the weight-plot fill gradient.
cdi_color = "#E7CE1A"
healthy_color = "grey"
[4]:
# Taxonomic levels analysed in this notebook.
# NOTE(review): not referenced anywhere in the visible part of the notebook.
tax_level = ['genus','species']
[5]:
import multiprocessing
import subprocess
def dl(file, outdir):
    """Download one remote file into ``outdir`` with ``wget``.

    Args:
        file (str): URL / FTP path handed to ``wget``.
        outdir (str): target directory (``wget -P``).

    Returns:
        None. Failures are reported on stdout instead of being raised, so one
        bad URL does not abort a whole batch.
    """
    # Echo the equivalent command line, as before.
    print(f"wget {file} -P {outdir}")
    # Use an argument list instead of a shell string: URLs containing shell
    # metacharacters (&, ;, spaces, ...) are then passed through verbatim and
    # cannot be interpreted by a shell.
    cmd = ["wget", str(file), "-P", str(outdir)]
    try:
        subprocess.check_output(cmd)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing ``wget`` executable, which the shell-based
        # version surfaced as a non-zero exit status.
        print(f"Error downloading {file}")
def dl_multi(allfiles, outdir, process):
    """Download several files in parallel.

    Args:
        allfiles (iterable of str): URLs to fetch, each passed to ``dl``.
        outdir (str): directory every file is written to.
        process (int): number of worker processes in the pool.
    """
    jobs = [(url, outdir) for url in allfiles]
    with multiprocessing.Pool(process) as pool:
        # One dl(url, outdir) call per job; the results are discarded.
        pool.starmap(dl, jobs)
[6]:
# Per-run metadata of the two cohorts (tab-separated), indexed by run accession.
# The 'fastq_ftp' column of these tables is used below to build the download lists.
healthy_meta = pd.read_csv("healthy/PRJNA386260_metadata.txt", sep="\t", index_col='run_accession')
CDI_meta = pd.read_csv("CDI/PRJNA307992_metadata.txt", sep="\t", index_col='run_accession')
[7]:
# Tag every run with the class of the cohort it belongs to (scalar broadcast
# to all rows).
healthy_meta['labels'] = 'healthy'
CDI_meta['labels'] = 'CDI'
[8]:
# 'fastq_ftp' holds the ';'-separated URLs of a run. Split once, keep the first
# two files per run, and list all first files followed by all second files.
# Missing entries (runs with a single file) are dropped so they never reach wget.
healthy_fastqs = [
    url
    for _, mate_urls in healthy_meta['fastq_ftp'].str.split(";", expand=True).iloc[:, :2].items()
    for url in mate_urls.dropna()
]
[9]:
# 'fastq_ftp' holds the ';'-separated URLs of a run. Split once, keep the first
# two files per run, and list all first files followed by all second files.
# Missing entries (runs with a single file) are dropped so they never reach wget.
CDI_fastqs = [
    url
    for _, mate_urls in CDI_meta['fastq_ftp'].str.split(";", expand=True).iloc[:, :2].items()
    for url in mate_urls.dropna()
]
Uncomment to download files
[10]:
#dl_multi(allfiles=healthy_fastqs, outdir="./healthy/", process=4)
#dl_multi(allfiles=CDI_fastqs, outdir="./CDI/", process=4)
Utility functions¶
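Removing outlier samples (fewer than 10 species) and species present in fewer than 10 samples. Note: the implementation approximates this by keeping only the rows and columns that contain more than 10 distinct count values.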
[11]:
def remove_outlier(df, n=10):
    """Drop rows and columns that carry too few distinct values.

    Args:
        df (pandas DataFrame): count table.
        n (int): threshold; a row/column is kept only when it holds strictly
            more than ``n`` distinct values.

    Returns:
        pandas DataFrame: the sub-table of retained rows and columns. Both
        filters are evaluated on the full input table.

    NOTE(review): ``nunique`` counts distinct values, not non-zero entries, so
    this is only a proxy for "present in more than n samples/taxa".
    """
    distinct_per_row = df.nunique(axis=1)
    distinct_per_col = df.nunique(axis=0)
    keep_rows = distinct_per_row > n
    keep_cols = distinct_per_col > n
    return df.loc[keep_rows, keep_cols]
Removing TAXID not in NCBI taxonomy
[12]:
def remove_not_taxo(df):
    """Keep only the rows whose TAXID has a proper rank in the NCBI taxonomy.

    Args:
        df (pandas DataFrame): TAXIDs in the index, samples in columns.

    Returns:
        pandas DataFrame: rows of ``df`` whose TAXID is returned by
        ``ncbi.get_rank`` with a rank other than ``'no rank'``, in the order
        of that lookup result.
    """
    # Query the taxonomy once (the lookup was previously issued twice).
    ranks = ncbi.get_rank(df.index)
    ranked_taxids = [taxid for taxid, rank in ranks.items() if rank != 'no rank']
    # Index with a plain list rather than a dict view.
    return df.loc[ranked_taxids, :]
Normalization methods
[13]:
def gmpr_size_factor(col, ar):
    """Generate the GMPR size factor of one sample.

    Args:
        col (int): column index (sample) of the numpy array
        ar (numpy array): numpy array of TAXID counts,
            columns as Samples, Rows as TAXIDs
    Returns:
        float: GMPR size factor of column ``col``. The result is NaN when the
        sample shares no non-zero TAXID with at least one other sample.
    """
    # Zero counts make x/0 and 0/0 unavoidable here; they are masked just
    # below, so the corresponding floating-point warnings are only noise.
    with np.errstate(divide='ignore', invalid='ignore'):
        # pr[:, j] = counts of sample `col` / counts of sample j, via
        # broadcasting instead of a per-column Python callback.
        pr = ar[:, [col]] / ar
        # Only ratios between two non-zero counts are informative.
        pr[~np.isfinite(pr) | (pr == 0)] = np.nan
    # Median ratio against every sample, ignoring masked entries.
    pr_median = np.nanmedian(pr, axis=0)
    # Geometric mean of the pairwise medians.
    return np.exp(np.mean(np.log(pr_median)))
def GMPR_normalize(df, process=4):
    """Normalize a count table with GMPR (Geometric Mean of Pairwise Ratios).

    Reference:
        Chen, L., Reeve, J., Zhang, L., Huang, S., Wang, X., & Chen, J. (2018).
        GMPR: A robust normalization method for zero-inflated count data
        with application to microbiome sequencing data.
        PeerJ, 6, e4600.

    Args:
        df (pandas DataFrame): TAXID count dataframe,
            columns as Samples, Rows as TAXIDs
        process (int): number of worker processes used to compute the
            per-sample size factors

    Returns:
        pandas DataFrame: counts divided by the size factor of their sample,
        with the index and columns of ``df``.
    """
    counts = np.asarray(df)
    n_samples = counts.shape[1]
    size_factor_of = partial(gmpr_size_factor, ar=counts)
    with multiprocessing.Pool(process) as pool:
        # One size factor per sample (column), computed in parallel.
        size_factors = pool.map(size_factor_of, range(n_samples))
    # Broadcasting divides every column by its own size factor.
    normalized = counts / size_factors
    return pd.DataFrame(normalized, index=df.index, columns=df.columns)
[14]:
def RLE_normalize(pd_dataframe):
    """Normalize a count table with Relative Log Expression (RLE).

    Args:
        pd_dataframe (pandas DataFrame): TAXID count dataframe,
            columns as Samples, Rows as TAXIDs
    Returns:
        pandas DataFrame: RLE normalized, rounded dataframe.
            Columns as Samples, Rows as TAXIDs
    Example:
        >>> RLE_normalize(pd.DataFrame)
    """
    log_counts = pd_dataframe.apply(np.log, axis=0)

    # Per-TAXID mean of the log counts; only finite means are usable
    # (a zero count anywhere in the row makes the mean infinite).
    row_log_means = log_counts.apply(np.average, axis=1)
    finite_means = row_log_means[np.isfinite(row_log_means)]

    # Rows whose log counts are finite in every sample.
    finite_rows = log_counts[np.isfinite(log_counts).all(axis=1)]

    # Size factor of a sample = exp(median of its centred log counts).
    centred = finite_rows.subtract(finite_means, axis=0)
    size_factors = np.exp(centred.apply(np.median, axis=0))

    scaled = pd_dataframe.divide(size_factors, axis=1)
    return scaled.apply(round, axis=1)
[15]:
def subsample_normalize_pd(pd_dataframe):
    """Rescale every sample to the range [0, largest count of the table].

    Each column is min-max scaled so that its smallest value maps to 0 and
    its largest value maps to the global maximum of the table; the result is
    floored.

    Args:
        pd_dataframe (pandas DataFrame): TAXID count dataframe,
            columns as Samples, Rows as TAXIDs
    Returns:
        pandas DataFrame: rescaled dataframe. Columns as Samples, Rows as TAXIDs
    """
    def subsample_normalize(serie, omax):
        """Min-max rescale one sample column to [0, omax].

                     x - imin
            f(x) = ------------ * (omax - omin) + omin
                    imax - imin

        Args:
            serie (pandas Series): individual sample column
            omax (int): maximum of output range
        Returns:
            pandas Series: rescaled column. Columns that cannot be rescaled
            (maximum not positive, or no spread at all) are returned unchanged.
        """
        imin = serie.min()
        imax = serie.max()
        omin = 0
        # imax == imin would be a 0/0 division and turn the whole column into
        # NaN; treat it like the all-zero case and leave the column as is.
        if imax > 0 and imax > imin:
            return (serie - imin) / (imax - imin) * (omax - omin) + omin
        return serie

    # Largest count anywhere in the table: common upper bound of all samples.
    themax = pd_dataframe.max().max()
    rescaled = pd_dataframe.apply(subsample_normalize, axis=0, args=(themax,))
    return np.floor(rescaled)
PLS-DA with sklearn
[16]:
class plsda:
    """Two-component PLS-DA of a feature matrix against a class response.

    Attributes:
        plsr: the fitted sklearn ``PLSRegression`` model.
        scores: sample scores (``DIM1``, ``DIM2``) joined with the ``labels`` column.
        weights: feature weights (``DIM1``, ``DIM2``) sorted by decreasing
            ``DIM1``, plus a ``name`` column with the taxon name.
        top_weights: the 20 largest and 20 smallest ``DIM1`` weights, with
            ``name`` as an ordered categorical (keeps the bar order in plots).
    """

    def __init__(self, X, Y, labels):
        """
        X(pd DataFrame) normalized feature matrix with samples in index, and
            features (TAXIDs) in columns
        Y(np 1D array) binary response variable encoding the grouping for each sample
        labels(pd DataFrame) with a 'labels' column giving the group of each sample
        """
        from sklearn.cross_decomposition import PLSRegression

        dims = ['DIM1', 'DIM2']
        self.plsr = PLSRegression(n_components=2)
        self.plsr.fit(X, Y)

        self.scores = pd.DataFrame(self.plsr.x_scores_, index=X.index, columns=dims)
        self.scores = self.scores.join(labels['labels'])

        self.weights = pd.DataFrame(
            self.plsr.x_weights_, index=X.columns, columns=dims
        ).sort_values('DIM1', ascending=False)

        # Look every name up by its TAXID. The translator returns a dict whose
        # order is unrelated to the weight-sorted index, so assigning its
        # .values() positionally can attach names to the wrong taxa (and fails
        # outright when a TAXID is unknown).
        taxid_names = ncbi.get_taxid_translator(self.weights.index)
        self.weights['name'] = [
            taxid_names.get(int(taxid), str(taxid)) for taxid in self.weights.index
        ]

        # DataFrame.append no longer exists in pandas 2; concat is equivalent.
        top = pd.concat([self.weights.head(20), self.weights.tail(20)])
        # With fewer than 40 features head and tail overlap: keep each taxon once.
        top = top[~top.index.duplicated(keep='first')].copy()
        top['name'] = pd.Categorical(top['name'], categories=top['name'].unique())
        self.top_weights = top
MDS with sklearn
[17]:
class mds:
    """Two-dimensional MDS embedding of the samples of a feature matrix.

    Attributes:
        mds: the fitted sklearn ``MDS`` model.
        embedding: sample coordinates (``DIM1``, ``DIM2``) joined with ``labels``.
    """

    def __init__(self, X, labels, metric='braycurtis', random_state=None):
        """
        X(pd DataFrame) normalized feature matrix with samples in index, and features in columns
        labels(named pd Series or pd DataFrame) of group label for each sample
        metric(str) distance metric handed to sklearn pairwise_distances
        random_state(int or None) seed of the MDS initialisation; set it to get
            the same embedding on every run (default None keeps the previous,
            unseeded behaviour)
        """
        from sklearn.metrics import pairwise_distances
        from sklearn.manifold import MDS

        # MDS runs on the precomputed sample-to-sample distance matrix.
        dist = pairwise_distances(X, metric=metric)
        self.mds = MDS(n_components=2, dissimilarity='precomputed', random_state=random_state)
        self.mds.fit(X=dist)
        self.embedding = pd.DataFrame(self.mds.embedding_, columns=['DIM1', 'DIM2'], index=X.index)
        self.embedding = self.embedding.join(labels)
Reading the results of the dada2-nf pipeline¶
1- Species level¶
[18]:
# Species-level OTU tables of each cohort; the first CSV column becomes the index.
# NOTE(review): absolute, machine-specific paths — adjust before re-running elsewhere.
healthy_otu_s = pd.read_csv("/projects1/users/borry/30_dada2-nf/results_healthy/merged/dada2_otu_table.csv", index_col=0)
CDI_otu_s = pd.read_csv("/projects1/users/borry/30_dada2-nf/results_CDI/merged/dada2_otu_table.csv", index_col=0)
[19]:
# Remove the row labelled 0 from both tables.
# NOTE(review): presumably the bucket of reads without a taxonomic assignment — confirm.
healthy_otu_s = healthy_otu_s.drop([0], axis=0)
CDI_otu_s = CDI_otu_s.drop([0], axis=0)
[20]:
# Join both cohorts on the row index (only index values present in both tables
# are kept), then filter rows/columns with remove_outlier.
all_otu_s = healthy_otu_s.merge(CDI_otu_s, left_index=True, right_index=True)
all_otu_s = remove_outlier(all_otu_s)
# Display the resulting (rows, columns) shape.
all_otu_s.shape
[20]:
(103, 305)
[21]:
all_otu_s.head()
[21]:
SRR5578998 | SRR5579099 | SRR5579045 | SRR5578981 | SRR5579095 | SRR5579054 | SRR5578909 | SRR5578907 | SRR5578965 | SRR5579021 | ... | SRR3102417 | SRR3102498 | SRR3102362 | SRR3102525 | SRR3102486 | SRR3102556 | SRR3102547 | SRR3102572 | SRR3102515 | SRR3102517 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
199 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
820 | 745.0 | 725.0 | 973.0 | 708.0 | 710.0 | 507.0 | 483.0 | 223.0 | 43.0 | 313.0 | ... | 0.0 | 6319.0 | 1663.0 | 0.0 | 0.0 | 0.0 | 0.0 | 350.0 | 2235.0 | 31.0 |
821 | 1103.0 | 742.0 | 2126.0 | 0.0 | 1502.0 | 1193.0 | 1316.0 | 52.0 | 462.0 | 947.0 | ... | 0.0 | 8.0 | 4899.0 | 0.0 | 0.0 | 0.0 | 7318.0 | 833.0 | 0.0 | 0.0 |
824 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
851 | 0.0 | 0.0 | 0.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 868.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2190.0 | 0.0 | 0.0 | 0.0 |
5 rows × 305 columns
2- Genus Level¶
[22]:
# Genus-level OTU tables of each cohort; the first CSV column becomes the index.
# NOTE(review): absolute, machine-specific paths — adjust before re-running elsewhere.
healthy_otu_g = pd.read_csv("/projects1/users/borry/30_dada2-nf/results_healthy_genus/merged/dada2_otu_table.csv", index_col=0)
cdi_otu_g = pd.read_csv("/projects1/users/borry/30_dada2-nf/results_CDI_genus/merged/dada2_otu_table.csv", index_col=0)
[23]:
# Remove the row labelled 0 from both tables.
# NOTE(review): presumably the bucket of reads without a taxonomic assignment — confirm.
healthy_otu_g = healthy_otu_g.drop([0], axis=0)
cdi_otu_g = cdi_otu_g.drop([0], axis=0)
[24]:
# Join both cohorts on the row index (only index values present in both tables
# are kept), then filter rows/columns with remove_outlier.
all_otu_g = healthy_otu_g.merge(cdi_otu_g, left_index=True, right_index=True)
all_otu_g = remove_outlier(all_otu_g)
# Report the resulting shape and preview the first rows.
print(all_otu_g.shape)
all_otu_g.head()
(114, 405)
[24]:
SRR5578998 | SRR5579099 | SRR5579045 | SRR5578981 | SRR5579095 | SRR5579054 | SRR5578909 | SRR5578907 | SRR5578965 | SRR5579115 | ... | SRR3102507 | SRR3102439 | SRR3102474 | SRR3102487 | SRR3102547 | SRR3102572 | SRR3102381 | SRR3102515 | SRR3102405 | SRR3102517 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
194 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
286 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 17.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
469 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 17.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 9.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
482 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 0.0 | 0.0 | 0.0 |
544 | 0.0 | 0.0 | 1203.0 | 0.0 | 0.0 | 0.0 | 0.0 | 182.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 405 columns
Normalizing dataframes¶
[25]:
# Species level: rescale each sample with subsample_normalize_pd, then transpose
# so that samples are rows and TAXIDs are columns (layout expected by plsda/mds).
X_s = subsample_normalize_pd(all_otu_s).T
[26]:
# Genus level: GMPR normalization on 4 processes. dropna(axis=1) discards every
# sample column that contains NaN (e.g. samples whose size factor is NaN), then
# the table is transposed so that samples are rows and TAXIDs are columns.
X_g = GMPR_normalize(all_otu_g, 4).dropna(axis=1).T
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/ipykernel_launcher.py:10: RuntimeWarning: divide by zero encountered in true_divide
# Remove the CWD from sys.path while we load stuff.
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/ipykernel_launcher.py:10: RuntimeWarning: invalid value encountered in true_divide
# Remove the CWD from sys.path while we load stuff.
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/ipykernel_launcher.py:10: RuntimeWarning: invalid value encountered in true_divide
# Remove the CWD from sys.path while we load stuff.
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/ipykernel_launcher.py:10: RuntimeWarning: divide by zero encountered in true_divide
# Remove the CWD from sys.path while we load stuff.
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/ipykernel_launcher.py:10: RuntimeWarning: divide by zero encountered in true_divide
# Remove the CWD from sys.path while we load stuff.
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/ipykernel_launcher.py:10: RuntimeWarning: invalid value encountered in true_divide
# Remove the CWD from sys.path while we load stuff.
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/ipykernel_launcher.py:10: RuntimeWarning: divide by zero encountered in true_divide
# Remove the CWD from sys.path while we load stuff.
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/ipykernel_launcher.py:10: RuntimeWarning: invalid value encountered in true_divide
# Remove the CWD from sys.path while we load stuff.
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/numpy/lib/nanfunctions.py:1115: RuntimeWarning: All-NaN slice encountered
overwrite_input=overwrite_input)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/numpy/lib/nanfunctions.py:1115: RuntimeWarning: All-NaN slice encountered
overwrite_input=overwrite_input)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/numpy/lib/nanfunctions.py:1115: RuntimeWarning: All-NaN slice encountered
overwrite_input=overwrite_input)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/numpy/lib/nanfunctions.py:1115: RuntimeWarning: All-NaN slice encountered
overwrite_input=overwrite_input)
Creating labels dataframe and response variable array¶
[27]:
# One-column ('labels') table covering every run of both cohorts, healthy runs
# first. pd.concat replaces DataFrame.append, which was removed in pandas 2.
labels = pd.concat([healthy_meta[['labels']], CDI_meta[['labels']]])
[28]:
# Labels restricted to, and ordered like, the samples of the species matrix.
labels_s = labels.loc[X_s.index, :]
# Response for PLS-DA: '1' for healthy samples, '0' otherwise.
# NOTE(review): encoded as strings, so the model has to cast them to numbers.
Y_s = np.where(labels_s['labels']=='healthy', '1','0')
[29]:
# Labels restricted to, and ordered like, the samples of the genus matrix.
labels_g = labels.loc[X_g.index, :]
# Response for PLS-DA: '1' for healthy samples, '0' otherwise.
# NOTE(review): encoded as strings, so the model has to cast them to numbers.
Y_g = np.where(labels_g['labels']=='healthy', '1','0')
Exploring the dataset¶
[30]:
import seaborn as sns
[31]:
# Species-level PLS-DA: scores per sample and weights per TAXID.
plsda_s = plsda(X_s, Y_s, labels_s)
[32]:
plsda_s.scores
[32]:
DIM1 | DIM2 | labels | |
---|---|---|---|
SRR5578998 | 0.935718 | 0.126756 | healthy |
SRR5579099 | 2.614041 | 0.075360 | healthy |
SRR5579045 | -0.424088 | 0.029255 | healthy |
SRR5578981 | 1.068008 | 0.688370 | healthy |
SRR5579095 | -0.458098 | 0.082241 | healthy |
... | ... | ... | ... |
SRR3102556 | -5.292148 | -2.756433 | CDI |
SRR3102547 | -1.763199 | 0.052028 | CDI |
SRR3102572 | -1.139385 | -1.983172 | CDI |
SRR3102515 | -3.229330 | -3.165894 | CDI |
SRR3102517 | -2.452133 | -0.887940 | CDI |
305 rows × 3 columns
[33]:
# Scatter plot of the species-level PLS-DA scores, one point per sample,
# coloured by class.
g = ggplot(plsda_s.scores, aes(x='DIM1',y='DIM2', color='labels'))
g += geom_point()
g += scale_color_manual(name='Status',values = {"CDI":cdi_color, "healthy":healthy_color})
g += theme_classic()
# Blank backgrounds (the PNG is saved transparent) and bold black text.
g += theme(plot_background=element_blank(),
panel_background=element_blank(),
legend_background=element_blank(),
axis_line=element_line(color='black'),
legend_text=element_text(color='black', weight='bold'),
axis_text=element_text(color='black', weight='bold'),
axis_title=element_text(color='black', weight='bold'),
legend_title=element_text(color='black', weight='bold'))
# Write the figure to disk, then display it as the cell output.
g.save('results/PLS-DA.png', format='png', dpi=300, transparent=True)
g
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:729: PlotnineWarning: Saving 6.4 x 4.8 in image.
from_inches(height, units), units), PlotnineWarning)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:730: PlotnineWarning: Filename: results/PLS-DA.png
warn('Filename: {}'.format(filename), PlotnineWarning)
[33]:
<ggplot: (-9223363254321219135)>
The separation appears clearly in the first latent variable (DIM1)
[34]:
# Horizontal bar chart of the strongest species weights on the first PLS-DA
# dimension (plsda_s.top_weights), filled on a healthy -> CDI colour gradient.
g = ggplot(plsda_s.top_weights, aes(x='name',y='DIM1', fill='DIM1'))
g += geom_bar(stat='identity', width=0.7)
# Flip so that taxon names are read on the vertical axis.
g += coord_flip()
g += scale_fill_gradient(name = 'weight', low=healthy_color, high=cdi_color)
g += xlab('species')
# NOTE(review): DIM1 is a PLS latent variable, although the label says "PC1".
g += ylab('weight in PC1')
g += theme_classic()
# Blank backgrounds (the PNG is saved transparent) and bold black text.
g += theme(plot_background=element_blank(),
panel_background=element_blank(),
legend_background=element_blank(),
axis_line=element_line(color='black'),
legend_text=element_text(color='black', weight='bold'),
axis_text=element_text(color='black', weight='bold'),
axis_title=element_text(color='black', weight='bold'),
legend_title=element_text(color='black', weight='bold'))
# Write the figure to disk, then display it as the cell output.
g.save('results/weight_CDI.png', format='png', dpi=300, transparent=True)
g
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:729: PlotnineWarning: Saving 6.4 x 4.8 in image.
from_inches(height, units), units), PlotnineWarning)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:730: PlotnineWarning: Filename: results/weight_CDI.png
warn('Filename: {}'.format(filename), PlotnineWarning)
[34]:
<ggplot: (8782533497590)>
[35]:
# Clustered heatmap of the species with the strongest PLS-DA weights.
# Samples are ordered by class; the row colours are built from that same
# ordering. seaborn applies a list of row colours positionally, so colours
# derived from the unsorted labels would be attached to the wrong samples.
labels_sorted = labels_s['labels'].sort_values()
X_s_heatmap = X_s.loc[labels_sorted.index, :]
# Translate TAXIDs to names by lookup; the translator's dict order is not
# guaranteed to follow the column order.
taxid_names = ncbi.get_taxid_translator(X_s_heatmap.columns)
X_s_heatmap.columns = [taxid_names.get(int(taxid), str(taxid)) for taxid in X_s_heatmap.columns]
# Select the top-weight taxa through their TAXIDs (index of top_weights).
top_names = [taxid_names.get(int(taxid), str(taxid)) for taxid in plsda_s.top_weights.index]
samp_colors = list(np.where(labels_sorted == 'CDI', cdi_color, healthy_color))
sns.clustermap(X_s_heatmap.loc[:, top_names], row_colors=samp_colors, metric='braycurtis')
[35]:
<seaborn.matrix.ClusterGrid at 0x7fcd7ca32278>
[36]:
labels_s
[36]:
labels | |
---|---|
SRR5578998 | healthy |
SRR5579099 | healthy |
SRR5579045 | healthy |
SRR5578981 | healthy |
SRR5579095 | healthy |
... | ... |
SRR3102556 | CDI |
SRR3102547 | CDI |
SRR3102572 | CDI |
SRR3102515 | CDI |
SRR3102517 | CDI |
305 rows × 1 columns
[37]:
# Species-level MDS embedding on Euclidean distances.
# NOTE(review): the genus-level MDS further down uses the default 'braycurtis'
# metric, so the two embeddings are not computed on the same distance.
mds_s = mds(X_s, labels_s['labels'], metric='euclidean')
[38]:
# Scatter plot of the species-level MDS embedding, one point per sample,
# coloured by class.
g = ggplot(mds_s.embedding, aes(x='DIM1',y='DIM2', color='labels'))
g += geom_point()
g += scale_color_manual(name='Status',values = {"CDI":cdi_color, "healthy":healthy_color})
g += theme_classic()
# Blank backgrounds (the PNG is saved transparent) and bold black text.
g += theme(plot_background=element_blank(),
panel_background=element_blank(),
legend_background=element_blank(),
axis_line=element_line(color='black'),
legend_text=element_text(color='black', weight='bold'),
axis_text=element_text(color='black', weight='bold'),
axis_title=element_text(color='black', weight='bold'),
legend_title=element_text(color='black', weight='bold'))
# Write the figure to disk, then display it as the cell output.
g.save('results/PCoA.png', format='png', dpi=300, transparent=True)
g
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:729: PlotnineWarning: Saving 6.4 x 4.8 in image.
from_inches(height, units), units), PlotnineWarning)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:730: PlotnineWarning: Filename: results/PCoA.png
warn('Filename: {}'.format(filename), PlotnineWarning)
[38]:
<ggplot: (-9223363254388601077)>
[39]:
# Genus-level PLS-DA: scores per sample and weights per TAXID.
plsda_g = plsda(X_g, Y_g, labels_g)
[40]:
# Scatter plot of the genus-level PLS-DA scores, one point per sample,
# coloured by class.
g = ggplot(plsda_g.scores, aes(x='DIM1',y='DIM2', color='labels'))
g += geom_point()
g += scale_color_manual(name='Status',values = {"CDI":cdi_color, "healthy":healthy_color})
g += theme_classic()
# Blank backgrounds (the PNG is saved transparent) and bold black text.
g += theme(plot_background=element_blank(),
panel_background=element_blank(),
legend_background=element_blank(),
axis_line=element_line(color='black'),
legend_text=element_text(color='black', weight='bold'),
axis_text=element_text(color='black', weight='bold'),
axis_title=element_text(color='black', weight='bold'),
legend_title=element_text(color='black', weight='bold'))
# Write the figure to disk, then display it as the cell output.
g.save('results/PLS-DA_genus.png', format='png', dpi=300, transparent=True)
g
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:729: PlotnineWarning: Saving 6.4 x 4.8 in image.
from_inches(height, units), units), PlotnineWarning)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:730: PlotnineWarning: Filename: results/PLS-DA_genus.png
warn('Filename: {}'.format(filename), PlotnineWarning)
[40]:
<ggplot: (8782466170155)>
[41]:
# Horizontal bar chart of the strongest genus weights on the first PLS-DA
# dimension (plsda_g.top_weights), filled on a healthy -> CDI colour gradient.
g = ggplot(plsda_g.top_weights, aes(x='name', y='DIM1', fill='DIM1'))
g += geom_bar(stat='identity', width=0.7)
# Flip so that taxon names are read on the vertical axis.
g += coord_flip()
g += scale_fill_gradient(name='weight', low=healthy_color, high=cdi_color)
g += xlab('Genus')
g += ylab('weight in PC1')
g += theme_classic()
# Axis and legend text in black, like every other figure of the notebook.
# They were tied to healthy_color (a class colour), which made this figure
# differ from the others and would restyle it if the class colour changed.
g += theme(plot_background=element_blank(),
           panel_background=element_blank(),
           legend_background=element_blank(),
           axis_line=element_line(color='black'),
           legend_text=element_text(color='black', weight='bold'),
           axis_text=element_text(color='black', weight='bold'),
           axis_title=element_text(color='black', weight='bold'),
           legend_title=element_text(color='black', weight='bold'))
# Write the figure to disk, then display it as the cell output.
g.save('results/weight_CDI_genus.png', format='png', dpi=300, transparent=True)
g
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:729: PlotnineWarning: Saving 6.4 x 4.8 in image.
from_inches(height, units), units), PlotnineWarning)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:730: PlotnineWarning: Filename: results/weight_CDI_genus.png
warn('Filename: {}'.format(filename), PlotnineWarning)
[41]:
<ggplot: (8782466099411)>
[42]:
# Clustered heatmap of the genera with the strongest PLS-DA weights.
# Samples are ordered by class; the row colours are built from that same
# ordering. seaborn applies a list of row colours positionally, so colours
# derived from the unsorted labels would be attached to the wrong samples.
labels_sorted = labels_g['labels'].sort_values()
X_g_heatmap = X_g.loc[labels_sorted.index, :]
# Translate TAXIDs to names by lookup; the translator's dict order is not
# guaranteed to follow the column order.
taxid_names = ncbi.get_taxid_translator(X_g_heatmap.columns)
X_g_heatmap.columns = [taxid_names.get(int(taxid), str(taxid)) for taxid in X_g_heatmap.columns]
# Select the top-weight taxa through their TAXIDs (index of top_weights).
top_names = [taxid_names.get(int(taxid), str(taxid)) for taxid in plsda_g.top_weights.index]
samp_colors = list(np.where(labels_sorted == 'CDI', cdi_color, healthy_color))
sns.clustermap(X_g_heatmap.loc[:, top_names], row_colors=samp_colors, metric='braycurtis')
[42]:
<seaborn.matrix.ClusterGrid at 0x7fcd3c30a198>
MDS
[43]:
# Genus-level MDS embedding with the class default metric ('braycurtis').
# NOTE(review): the species-level MDS above was run with metric='euclidean',
# so the two embeddings are not computed on the same distance.
mds_g = mds(X_g, labels_g)
[44]:
# Scatter plot of the genus-level MDS embedding, one point per sample,
# coloured by class.
g = ggplot(mds_g.embedding, aes(x='DIM1', y='DIM2', color='labels'))
g += geom_point()
g += scale_color_manual(name='Status', values={"CDI": cdi_color, "healthy": healthy_color})
g += theme_classic()
# Blank backgrounds (the PNG is saved transparent) and bold black text.
g += theme(plot_background=element_blank(),
           panel_background=element_blank(),
           legend_background=element_blank(),
           axis_line=element_line(color='black'),
           legend_text=element_text(color='black', weight='bold'),
           axis_text=element_text(color='black', weight='bold'),
           axis_title=element_text(color='black', weight='bold'),
           legend_title=element_text(color='black', weight='bold'))
# Use a genus-specific file name, following the other *_genus.png figures.
# The previous name, results/PCoA.png, overwrote the species-level plot.
g.save('results/PCoA_genus.png', format='png', dpi=300, transparent=True)
g
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:729: PlotnineWarning: Saving 6.4 x 4.8 in image.
from_inches(height, units), units), PlotnineWarning)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:730: PlotnineWarning: Filename: results/PCoA.png
warn('Filename: {}'.format(filename), PlotnineWarning)
[44]:
<ggplot: (8782533528931)>
Pre-analysis conclusion:¶
Based on the clustering (heatmap) and the MDS, there is a better separation of the two classes at the genus level.
Preparing data for sourcepredict¶
on species¶
[45]:
# Back to TAXIDs-as-rows layout, then draw 80% of the sample columns at random
# (fixed seed) as the training ("source") set.
train_species = X_s.T.sample(frac=0.8, axis=1, random_state=2)
[46]:
# The sample columns not drawn for training form the test ("sink") set.
test_species = X_s.T.drop(train_species.columns, axis=1)
[47]:
# Class labels of the training samples, in the column order of train_species.
train_labels_species = labels_s.loc[train_species.columns,:]
[48]:
# Class labels of the test samples, in the column order of test_species.
test_labels_species = labels_s.loc[test_species.columns,:]
[49]:
# Write the species-level inputs of sourcepredict to the working directory:
# training set -> "source" files, test set -> "sink" files.
train_species.to_csv("source_species.csv")
test_species.to_csv("sink_species.csv")
train_labels_species.to_csv("source_labels_species.csv")
test_labels_species.to_csv("sink_labels_species.csv")
on genus¶
[50]:
# Back to TAXIDs-as-rows layout, then draw 80% of the sample columns at random
# (fixed seed) as the training ("source") set.
train_genus = X_g.T.sample(frac=0.8, axis=1, random_state=2)
[51]:
# The sample columns not drawn for training form the test ("sink") set.
test_genus = X_g.T.drop(train_genus.columns, axis=1)
[52]:
# Class labels of the training samples, in the column order of train_genus.
train_labels_genus = labels_g.loc[train_genus.columns,:]
[53]:
# Class labels of the test samples, in the column order of test_genus.
test_labels_genus = labels_g.loc[test_genus.columns,:]
[54]:
# Write the genus-level inputs of sourcepredict to the working directory:
# training set -> "source" files, test set -> "sink" files.
train_genus.to_csv("source_genus.csv")
test_genus.to_csv("sink_genus.csv")
train_labels_genus.to_csv("source_labels_genus.csv")
test_labels_genus.to_csv("sink_labels_genus.csv")
Running sourcepredict¶
on species¶
[55]:
%%time
! /projects1/users/borry/18_sourcepredict/sourcepredict -s source_species.csv -l source_labels_species.csv sink_species.csv -n None -me tsne -di 2 -t 6 -e embedding_species.csv
Step 1: Checking for unknown proportion
== Sample: SRR5578937 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578937
known:97.62%
unknown:2.38%
== Sample: SRR5578953 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578953
known:97.62%
unknown:2.38%
== Sample: SRR5578924 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578924
known:97.62%
unknown:2.38%
== Sample: SRR5578990 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578990
known:97.62%
unknown:2.38%
== Sample: SRR5579006 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579006
known:97.62%
unknown:2.38%
== Sample: SRR5579106 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579106
known:97.62%
unknown:2.38%
== Sample: SRR5579003 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579003
known:97.62%
unknown:2.38%
== Sample: SRR5579074 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579074
known:97.62%
unknown:2.38%
== Sample: SRR5578943 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578943
known:97.62%
unknown:2.38%
== Sample: SRR5579072 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579072
known:97.62%
unknown:2.38%
== Sample: SRR5579036 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579036
known:97.62%
unknown:2.38%
== Sample: SRR5578962 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578962
known:97.63%
unknown:2.37%
== Sample: SRR5578931 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578931
known:97.62%
unknown:2.38%
== Sample: SRR5579039 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579039
known:97.62%
unknown:2.38%
== Sample: SRR5579061 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579061
known:97.62%
unknown:2.38%
== Sample: SRR5578942 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578942
known:97.62%
unknown:2.38%
== Sample: SRR5578919 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578919
known:97.62%
unknown:2.38%
== Sample: SRR5578913 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578913
known:97.62%
unknown:2.38%
== Sample: SRR5578991 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578991
known:97.62%
unknown:2.38%
== Sample: SRR5579100 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579100
known:97.62%
unknown:2.38%
== Sample: SRR5579037 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579037
known:97.72%
unknown:2.28%
== Sample: SRR5579048 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579048
known:97.76%
unknown:2.24%
== Sample: SRR5578952 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578952
known:97.62%
unknown:2.38%
== Sample: SRR5578930 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578930
known:97.62%
unknown:2.38%
== Sample: SRR5579017 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579017
known:97.62%
unknown:2.38%
== Sample: SRR5579005 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579005
known:97.62%
unknown:2.38%
== Sample: SRR5579055 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579055
known:97.62%
unknown:2.38%
== Sample: SRR5579002 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579002
known:97.62%
unknown:2.38%
== Sample: SRR5578968 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578968
known:97.62%
unknown:2.38%
== Sample: SRR5579116 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579116
known:97.62%
unknown:2.38%
== Sample: SRR5578910 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578910
known:97.66%
unknown:2.34%
== Sample: SRR5578947 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578947
known:97.62%
unknown:2.38%
== Sample: SRR5579080 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579080
known:97.62%
unknown:2.38%
== Sample: SRR5579103 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579103
known:97.62%
unknown:2.38%
== Sample: SRR5579035 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579035
known:97.62%
unknown:2.38%
== Sample: SRR5579085 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579085
known:97.62%
unknown:2.38%
== Sample: SRR5579079 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579079
known:97.63%
unknown:2.37%
== Sample: SRR5579018 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579018
known:97.62%
unknown:2.38%
== Sample: SRR5579001 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579001
known:97.62%
unknown:2.38%
== Sample: SRR5578938 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578938
known:97.62%
unknown:2.38%
== Sample: SRR5579108 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579108
known:97.62%
unknown:2.38%
== Sample: SRR3102551 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102551
known:97.62%
unknown:2.38%
== Sample: SRR3102397 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102397
known:97.62%
unknown:2.38%
== Sample: SRR3102539 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102539
known:96.97%
unknown:3.03%
== Sample: SRR3102516 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102516
known:97.62%
unknown:2.38%
== Sample: SRR3102401 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102401
known:97.62%
unknown:2.38%
== Sample: SRR3102398 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102398
known:97.62%
unknown:2.38%
== Sample: SRR3102573 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102573
known:97.62%
unknown:2.38%
== Sample: SRR3102518 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102518
known:74.16%
unknown:25.84%
== Sample: SRR3102463 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102463
known:97.68%
unknown:2.32%
== Sample: SRR3102581 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102581
known:97.62%
unknown:2.38%
== Sample: SRR3102359 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102359
known:97.62%
unknown:2.38%
== Sample: SRR3102427 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102427
known:97.62%
unknown:2.38%
== Sample: SRR3102369 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102369
known:97.62%
unknown:2.38%
== Sample: SRR3102356 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102356
known:64.17%
unknown:35.83%
== Sample: SRR3102561 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102561
known:97.62%
unknown:2.38%
== Sample: SRR3102374 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102374
known:97.62%
unknown:2.38%
== Sample: SRR3102372 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102372
known:97.62%
unknown:2.38%
== Sample: SRR3102386 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102386
known:97.62%
unknown:2.38%
== Sample: SRR3102486 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102486
known:70.98%
unknown:29.02%
== Sample: SRR3102556 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102556
known:69.48%
unknown:30.52%
Step 2: Checking for source proportion
Computing weighted_unifrac distance on species rank
Warning: ``tree`` must be rooted.
There is a polytomy ar the root of this taxonomic tree.
Unifrac distances wont't work properly.
Computing Bray-Curtis distance instead.
TSNE embedding in 2 dimensions
KNN machine learning
Performing 5 fold cross validation on 6 cores...
Trained KNN classifier with 10 neighbors
-> Testing Accuracy: 0.9
----------------------
- Sample: SRR5578937
CDI:5.35%
healthy:94.65%
- Sample: SRR5578953
CDI:15.66%
healthy:84.34%
- Sample: SRR5578924
CDI:11.87%
healthy:88.13%
- Sample: SRR5578990
CDI:11.32%
healthy:88.68%
- Sample: SRR5579006
CDI:5.35%
healthy:94.65%
- Sample: SRR5579106
CDI:5.35%
healthy:94.65%
- Sample: SRR5579003
CDI:7.22%
healthy:92.78%
- Sample: SRR5579074
CDI:7.25%
healthy:92.75%
- Sample: SRR5578943
CDI:92.09%
healthy:7.91%
- Sample: SRR5579072
CDI:5.35%
healthy:94.65%
- Sample: SRR5579036
CDI:5.35%
healthy:94.65%
- Sample: SRR5578962
CDI:5.35%
healthy:94.65%
- Sample: SRR5578931
CDI:5.35%
healthy:94.65%
- Sample: SRR5579039
CDI:15.35%
healthy:84.65%
- Sample: SRR5579061
CDI:11.1%
healthy:88.9%
- Sample: SRR5578942
CDI:5.35%
healthy:94.65%
- Sample: SRR5578919
CDI:68.11%
healthy:31.89%
- Sample: SRR5578913
CDI:8.83%
healthy:91.17%
- Sample: SRR5578991
CDI:5.35%
healthy:94.65%
- Sample: SRR5579100
CDI:13.42%
healthy:86.58%
- Sample: SRR5579037
CDI:5.35%
healthy:94.65%
- Sample: SRR5579048
CDI:5.35%
healthy:94.65%
- Sample: SRR5578952
CDI:14.35%
healthy:85.65%
- Sample: SRR5578930
CDI:9.87%
healthy:90.13%
- Sample: SRR5579017
CDI:9.84%
healthy:90.16%
- Sample: SRR5579005
CDI:5.35%
healthy:94.65%
- Sample: SRR5579055
CDI:9.51%
healthy:90.49%
- Sample: SRR5579002
CDI:92.09%
healthy:7.91%
- Sample: SRR5578968
CDI:5.35%
healthy:94.65%
- Sample: SRR5579116
CDI:5.35%
healthy:94.65%
- Sample: SRR5578910
CDI:8.34%
healthy:91.66%
- Sample: SRR5578947
CDI:5.35%
healthy:94.65%
- Sample: SRR5579080
CDI:5.35%
healthy:94.65%
- Sample: SRR5579103
CDI:5.35%
healthy:94.65%
- Sample: SRR5579035
CDI:9.01%
healthy:90.99%
- Sample: SRR5579085
CDI:7.16%
healthy:92.84%
- Sample: SRR5579079
CDI:7.01%
healthy:92.99%
- Sample: SRR5579018
CDI:7.37%
healthy:92.63%
- Sample: SRR5579001
CDI:5.35%
healthy:94.65%
- Sample: SRR5578938
CDI:9.36%
healthy:90.64%
- Sample: SRR5579108
CDI:10.13%
healthy:89.87%
- Sample: SRR3102551
CDI:10.39%
healthy:89.61%
- Sample: SRR3102397
CDI:86.75%
healthy:13.25%
- Sample: SRR3102539
CDI:88.59%
healthy:11.41%
- Sample: SRR3102516
CDI:79.7%
healthy:20.3%
- Sample: SRR3102401
CDI:87.61%
healthy:12.39%
- Sample: SRR3102398
CDI:92.09%
healthy:7.91%
- Sample: SRR3102573
CDI:92.09%
healthy:7.91%
- Sample: SRR3102518
CDI:83.99%
healthy:16.01%
- Sample: SRR3102463
CDI:78.19%
healthy:21.81%
- Sample: SRR3102581
CDI:5.35%
healthy:94.65%
- Sample: SRR3102359
CDI:92.09%
healthy:7.91%
- Sample: SRR3102427
CDI:92.09%
healthy:7.91%
- Sample: SRR3102369
CDI:92.09%
healthy:7.91%
- Sample: SRR3102356
CDI:82.45%
healthy:17.55%
- Sample: SRR3102561
CDI:78.45%
healthy:21.55%
- Sample: SRR3102374
CDI:86.09%
healthy:13.91%
- Sample: SRR3102372
CDI:70.12%
healthy:29.88%
- Sample: SRR3102386
CDI:88.18%
healthy:11.82%
- Sample: SRR3102486
CDI:86.38%
healthy:13.62%
- Sample: SRR3102556
CDI:88.76%
healthy:11.24%
Sourcepredict result written to sink_species.sourcepredict.csv
Embedding coordinates written to embedding_species.csv
CPU times: user 896 ms, sys: 320 ms, total: 1.22 s
Wall time: 1min 1s
on genus¶
[56]:
%%time
# Run the sourcepredict command-line tool on the genus-level tables and time the whole cell.
# Flag meanings, as reflected by the log this cell prints:
#   -r genus              rank used for the distance ("Computing weighted_unifrac distance on genus rank")
#   -n None               no normalization ("Normalizing (no normalization)")
#   -me tsne -di 2        embedding method and dimensionality ("TSNE embedding in 2 dimensions")
#   -dt weighted_unifrac  distance requested for the source-proportion step
#   -t 6                  worker count ("Training KNN classifier on 6 cores...")
# Presumably -s is the source table, -l the source labels, the positional sink_genus.csv the
# samples to classify, and -e the output file for the embedding coordinates -- TODO confirm
# against `sourcepredict --help`.
# NOTE(review): the executable path is absolute and specific to the authoring machine; adjust it
# (or rely on PATH) when re-running this notebook elsewhere.
! /projects1/users/borry/18_sourcepredict/sourcepredict -s source_genus.csv -l source_labels_genus.csv sink_genus.csv -r genus -n None -me tsne -di 2 -dt weighted_unifrac -t 6 -e embedding_genus.csv
Step 1: Checking for unknown proportion
== Sample: SRR5578953 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578953
known:97.37%
unknown:2.63%
== Sample: SRR5578924 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578924
known:97.37%
unknown:2.63%
== Sample: SRR5579094 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579094
known:97.37%
unknown:2.63%
== Sample: SRR5579106 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579106
known:97.37%
unknown:2.63%
== Sample: SRR5579030 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579030
known:97.37%
unknown:2.63%
== Sample: SRR5579113 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579113
known:97.37%
unknown:2.63%
== Sample: SRR5579077 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579077
known:97.43%
unknown:2.57%
== Sample: SRR5578962 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578962
known:97.37%
unknown:2.63%
== Sample: SRR5579050 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579050
known:97.37%
unknown:2.63%
== Sample: SRR5578957 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578957
known:97.37%
unknown:2.63%
== Sample: SRR5579028 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579028
known:97.37%
unknown:2.63%
== Sample: SRR5579071 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579071
known:97.38%
unknown:2.62%
== Sample: SRR5578993 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578993
known:97.37%
unknown:2.63%
== Sample: SRR5578963 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578963
known:97.37%
unknown:2.63%
== Sample: SRR5579100 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579100
known:97.49%
unknown:2.51%
== Sample: SRR5578906 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578906
known:97.37%
unknown:2.63%
== Sample: SRR5579112 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579112
known:97.37%
unknown:2.63%
== Sample: SRR5579013 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579013
known:97.37%
unknown:2.63%
== Sample: SRR5579042 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579042
known:97.37%
unknown:2.63%
== Sample: SRR5578912 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578912
known:97.37%
unknown:2.63%
== Sample: SRR5579002 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579002
known:97.37%
unknown:2.63%
== Sample: SRR5579089 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579089
known:97.37%
unknown:2.63%
== Sample: SRR5579087 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579087
known:97.37%
unknown:2.63%
== Sample: SRR5579102 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579102
known:97.37%
unknown:2.63%
== Sample: SRR5579049 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 0.99
----------------------
- Sample: SRR5579049
known:98.22%
unknown:1.78%
== Sample: SRR5579029 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579029
known:97.37%
unknown:2.63%
== Sample: SRR5578910 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578910
known:97.37%
unknown:2.63%
== Sample: SRR5579010 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579010
known:97.37%
unknown:2.63%
== Sample: SRR5579064 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579064
known:97.37%
unknown:2.63%
== Sample: SRR5579051 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579051
known:97.37%
unknown:2.63%
== Sample: SRR5578984 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578984
known:97.37%
unknown:2.63%
== Sample: SRR5578949 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578949
known:97.37%
unknown:2.63%
== Sample: SRR5578975 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578975
known:97.37%
unknown:2.63%
== Sample: SRR5578940 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578940
known:97.37%
unknown:2.63%
== Sample: SRR5578925 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578925
known:97.41%
unknown:2.59%
== Sample: SRR5579015 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5579015
known:97.37%
unknown:2.63%
== Sample: SRR5578927 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578927
known:97.49%
unknown:2.51%
== Sample: SRR5578923 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR5578923
known:97.37%
unknown:2.63%
== Sample: SRR3102557 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102557
known:97.37%
unknown:2.63%
== Sample: SRR3102424 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 0.99
----------------------
- Sample: SRR3102424
known:92.66%
unknown:7.34%
== Sample: SRR3102366 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102366
known:93.41%
unknown:6.59%
== Sample: SRR3102422 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 0.99
----------------------
- Sample: SRR3102422
known:89.06%
unknown:10.94%
== Sample: SRR3102462 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 0.98
----------------------
- Sample: SRR3102462
known:69.99%
unknown:30.01%
== Sample: SRR3102449 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102449
known:96.44%
unknown:3.56%
== Sample: SRR3102526 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102526
known:96.96%
unknown:3.04%
== Sample: SRR3102497 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102497
known:95.78%
unknown:4.22%
== Sample: SRR3102402 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102402
known:97.37%
unknown:2.63%
== Sample: SRR3102440 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102440
known:97.37%
unknown:2.63%
== Sample: SRR3102463 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102463
known:97.37%
unknown:2.63%
== Sample: SRR3102379 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102379
known:91.82%
unknown:8.18%
== Sample: SRR3102529 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102529
known:91.89%
unknown:8.11%
== Sample: SRR3102427 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102427
known:97.39%
unknown:2.61%
== Sample: SRR3102550 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102550
known:97.37%
unknown:2.63%
== Sample: SRR3102410 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 0.99
----------------------
- Sample: SRR3102410
known:94.87%
unknown:5.13%
== Sample: SRR3102376 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102376
known:92.88%
unknown:7.12%
== Sample: SRR3102533 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102533
known:97.39%
unknown:2.61%
== Sample: SRR3102489 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102489
known:95.74%
unknown:4.26%
== Sample: SRR3102490 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102490
known:97.37%
unknown:2.63%
== Sample: SRR3102580 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102580
known:97.37%
unknown:2.63%
== Sample: SRR3102375 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102375
known:97.37%
unknown:2.63%
== Sample: SRR3102483 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102483
known:97.37%
unknown:2.63%
== Sample: SRR3102535 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102535
known:96.31%
unknown:3.69%
== Sample: SRR3102446 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102446
known:97.41%
unknown:2.59%
== Sample: SRR3102527 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102527
known:97.37%
unknown:2.63%
== Sample: SRR3102409 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102409
known:97.37%
unknown:2.63%
== Sample: SRR3102362 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102362
known:97.37%
unknown:2.63%
== Sample: SRR3102487 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 0.99
----------------------
- Sample: SRR3102487
known:96.66%
unknown:3.34%
== Sample: SRR3102517 ==
Adding unknown
Normalizing (no normalization)
Computing Bray-Curtis distance
Performing MDS embedding in 2 dimensions
KNN machine learning
Training KNN classifier on 6 cores...
-> Testing Accuracy: 1.0
----------------------
- Sample: SRR3102517
known:97.37%
unknown:2.63%
Step 2: Checking for source proportion
Computing weighted_unifrac distance on genus rank
TSNE embedding in 2 dimensions
KNN machine learning
Performing 5 fold cross validation on 6 cores...
Trained KNN classifier with 10 neighbors
-> Testing Accuracy: 0.85
----------------------
- Sample: SRR5578953
CDI:2.9%
healthy:97.1%
- Sample: SRR5578924
CDI:1.71%
healthy:98.29%
- Sample: SRR5579094
CDI:1.71%
healthy:98.29%
- Sample: SRR5579106
CDI:1.71%
healthy:98.29%
- Sample: SRR5579030
CDI:1.71%
healthy:98.29%
- Sample: SRR5579113
CDI:1.71%
healthy:98.29%
- Sample: SRR5579077
CDI:13.41%
healthy:86.59%
- Sample: SRR5578962
CDI:1.71%
healthy:98.29%
- Sample: SRR5579050
CDI:2.26%
healthy:97.74%
- Sample: SRR5578957
CDI:2.82%
healthy:97.18%
- Sample: SRR5579028
CDI:1.71%
healthy:98.29%
- Sample: SRR5579071
CDI:2.77%
healthy:97.23%
- Sample: SRR5578993
CDI:2.78%
healthy:97.22%
- Sample: SRR5578963
CDI:1.71%
healthy:98.29%
- Sample: SRR5579100
CDI:6.98%
healthy:93.02%
- Sample: SRR5578906
CDI:3.1%
healthy:96.9%
- Sample: SRR5579112
CDI:2.95%
healthy:97.05%
- Sample: SRR5579013
CDI:1.71%
healthy:98.29%
- Sample: SRR5579042
CDI:1.71%
healthy:98.29%
- Sample: SRR5578912
CDI:95.31%
healthy:4.69%
- Sample: SRR5579002
CDI:1.71%
healthy:98.29%
- Sample: SRR5579089
CDI:1.71%
healthy:98.29%
- Sample: SRR5579087
CDI:88.23%
healthy:11.77%
- Sample: SRR5579102
CDI:1.71%
healthy:98.29%
- Sample: SRR5579049
CDI:9.53%
healthy:90.47%
- Sample: SRR5579029
CDI:1.71%
healthy:98.29%
- Sample: SRR5578910
CDI:1.71%
healthy:98.29%
- Sample: SRR5579010
CDI:13.19%
healthy:86.81%
- Sample: SRR5579064
CDI:2.48%
healthy:97.52%
- Sample: SRR5579051
CDI:1.71%
healthy:98.29%
- Sample: SRR5578984
CDI:44.06%
healthy:55.94%
- Sample: SRR5578949
CDI:3.11%
healthy:96.89%
- Sample: SRR5578975
CDI:1.71%
healthy:98.29%
- Sample: SRR5578940
CDI:1.71%
healthy:98.29%
- Sample: SRR5578925
CDI:2.55%
healthy:97.45%
- Sample: SRR5579015
CDI:1.71%
healthy:98.29%
- Sample: SRR5578927
CDI:2.93%
healthy:97.07%
- Sample: SRR5578923
CDI:4.14%
healthy:95.86%
- Sample: SRR3102557
CDI:95.31%
healthy:4.69%
- Sample: SRR3102424
CDI:92.07%
healthy:7.93%
- Sample: SRR3102366
CDI:95.31%
healthy:4.69%
- Sample: SRR3102422
CDI:90.09%
healthy:9.91%
- Sample: SRR3102462
CDI:93.16%
healthy:6.84%
- Sample: SRR3102449
CDI:92.87%
healthy:7.13%
- Sample: SRR3102526
CDI:95.31%
healthy:4.69%
- Sample: SRR3102497
CDI:95.31%
healthy:4.69%
- Sample: SRR3102402
CDI:95.31%
healthy:4.69%
- Sample: SRR3102440
CDI:95.31%
healthy:4.69%
- Sample: SRR3102463
CDI:95.31%
healthy:4.69%
- Sample: SRR3102379
CDI:94.18%
healthy:5.82%
- Sample: SRR3102529
CDI:93.06%
healthy:6.94%
- Sample: SRR3102427
CDI:55.04%
healthy:44.96%
- Sample: SRR3102550
CDI:95.31%
healthy:4.69%
- Sample: SRR3102410
CDI:95.31%
healthy:4.69%
- Sample: SRR3102376
CDI:2.71%
healthy:97.29%
- Sample: SRR3102533
CDI:93.57%
healthy:6.43%
- Sample: SRR3102489
CDI:75.7%
healthy:24.3%
- Sample: SRR3102490
CDI:95.31%
healthy:4.69%
- Sample: SRR3102580
CDI:95.31%
healthy:4.69%
- Sample: SRR3102375
CDI:95.31%
healthy:4.69%
- Sample: SRR3102483
CDI:45.14%
healthy:54.86%
- Sample: SRR3102535
CDI:92.53%
healthy:7.47%
- Sample: SRR3102446
CDI:95.31%
healthy:4.69%
- Sample: SRR3102527
CDI:95.31%
healthy:4.69%
- Sample: SRR3102409
CDI:95.31%
healthy:4.69%
- Sample: SRR3102362
CDI:95.31%
healthy:4.69%
- Sample: SRR3102487
CDI:95.31%
healthy:4.69%
- Sample: SRR3102517
CDI:95.31%
healthy:4.69%
Sourcepredict result written to sink_genus.sourcepredict.csv
Embedding coordinates written to embedding_genus.csv
CPU times: user 1.18 s, sys: 368 ms, total: 1.55 s
Wall time: 1min 19s
Reading Sourcepredict results¶
[57]:
from sklearn.metrics import accuracy_score
[58]:
# Load the Sourcepredict result table and the sink label table; in both files
# the first CSV column is used as the index.
pred_genus = pd.read_csv(
    "sink_genus.sourcepredict.csv",
    index_col=0,
)
test_labels_genus = pd.read_csv(
    "sink_labels_genus.csv",
    index_col=0,
)
[59]:
# For every column of pred_genus take the row label holding the largest value
# (presumably one column per sink sample and one row per source -- confirm
# against the CSV layout), then align it with the recorded labels on the
# shared index.
conf_table_genus = pd.merge(
    pred_genus.idxmax(axis=0).to_frame(name='predicted'),
    test_labels_genus,
    left_index=True,
    right_index=True,
)
[60]:
# Discard every row in which the prediction or the label is missing.
conf_table_genus = conf_table_genus.dropna(axis=0, how='any')
[61]:
# Display the (rows, columns) dimensions of the prediction/label table.
conf_table_genus.shape
[61]:
(68, 2)
[62]:
# Tally how often each class occurs in every column of the table.
# Series.value_counts is used instead of the top-level pd.value_counts, which
# is deprecated in recent pandas releases; the resulting counts are the same.
conf_table_genus.apply(pd.Series.value_counts, axis=0)
[62]:
predicted | labels | |
---|---|---|
healthy | 38 | 38 |
CDI | 30 | 30 |
[63]:
# Share of samples whose predicted class equals the recorded label.
acc_genus = accuracy_score(
    conf_table_genus['labels'],     # y_true
    conf_table_genus['predicted'],  # y_pred
)
print(f"Accuracy: {round(acc_genus,2)}")
Accuracy: 0.94
[64]:
from plotnine import *
[65]:
# Read the embedding coordinates written by Sourcepredict and keep its
# 'labels' column under the name 'type'.
embed = (
    pd.read_csv("embedding_genus.csv", index_col=0)
    .rename(columns={'labels': 'type'})
)
# Collapse both class names into the single role 'source'; any other value
# of 'type' is left as it is.
embed['type'] = embed['type'].str.replace('CDI','source').replace('healthy','source')
# Attach the labels from labels_g as 'actual' (joined on the index).
embed = embed.join(labels_g['labels']).rename(columns={'labels': 'actual'})
# Attach the top-scoring row label of each pred_genus column as 'predicted'.
embed = embed.join(pred_genus.idxmax().rename('predicted'))
[66]:
# Scatter plot of the rows of `embed` tagged as 'source', one point per
# sample, coloured by class.
# The colour aesthetic uses 'actual': when `embed` was built, its original
# 'labels' column was renamed to 'type' and the joined labels were renamed to
# 'actual', so `embed` has no 'labels' column left to map.
g = ggplot(embed.query("type == 'source'"), aes(x='PC1',y='PC2', color='actual'))
g += geom_point(size=3, stroke=1, alpha=0.5)
g += scale_color_manual(name = 'Reference', values = {"CDI":cdi_color, "healthy":healthy_color})
g += xlab('DIM1')
g += ylab('DIM2')
g += theme_classic()
# Blank backgrounds so that the transparent PNG export has nothing behind it.
g += theme(plot_background=element_blank(),
panel_background=element_blank(),
legend_background=element_blank(),
axis_line=element_line(color=healthy_color),
legend_text=element_text(color='black', weight='bold'),
axis_text=element_text(color='black', weight='bold'),
axis_title=element_text(color='black', weight='bold'),
legend_title=element_text(color='black', weight='bold'))
g.save('results/train_embedding.png', format='png', dpi=300, transparent=True)
# Trailing expression so the notebook renders the figure inline.
g
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:729: PlotnineWarning: Saving 6.4 x 4.8 in image.
from_inches(height, units), units), PlotnineWarning)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:730: PlotnineWarning: Filename: results/train_embedding.png
warn('Filename: {}'.format(filename), PlotnineWarning)
[66]:
<ggplot: (8782530729613)>
[67]:
# Scatter plot of the rows of `embed` tagged as 'sink'. Each sample is drawn
# twice: a larger marker whose outline colour encodes the predicted class,
# and a smaller marker on top whose fill encodes the actual class.
g = ggplot(embed.query("type == 'sink'"), aes(x='PC1',y='PC2', color='predicted'))
g += geom_point(size=4, shape='o', fill = 'black', stroke=2)
g += geom_point(data=embed.query("type == 'sink'"), mapping=aes(x='PC1',y='PC2', fill='actual'), size=3, color='black', stroke=1)
# Legend titles follow the mapped columns: the colour scale shows 'predicted'
# and the fill scale shows 'actual' (the two titles were previously swapped).
g += scale_color_manual(name = 'Prediction', values = {"CDI":cdi_color, "healthy":healthy_color})
g += scale_fill_manual(name = 'Reference', values = {"CDI":cdi_color, "healthy":healthy_color})
g += xlab('DIM1')
g += ylab('DIM2')
g += theme_classic()
# Blank backgrounds so that the transparent PNG export has nothing behind it.
g += theme(plot_background=element_blank(),
panel_background=element_blank(),
legend_background=element_blank(),
axis_line=element_line(color='black'),
legend_text=element_text(color='black', weight='bold'),
axis_text=element_text(color='black', weight='bold'),
axis_title=element_text(color='black', weight='bold'),
legend_title=element_text(color='black', weight='bold'))
g.save('results/test_embedding.png', format='png', dpi=300, transparent=True)
# Trailing expression so the notebook renders the figure inline.
g
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:729: PlotnineWarning: Saving 6.4 x 4.8 in image.
from_inches(height, units), units), PlotnineWarning)
/projects1/users/borry/15_miniconda3/envs/sourcepredict/lib/python3.6/site-packages/plotnine/ggplot.py:730: PlotnineWarning: Filename: results/test_embedding.png
warn('Filename: {}'.format(filename), PlotnineWarning)
[67]:
<ggplot: (-9223363254324085430)>
Comparing with Sourcetracker 2¶
Generating the data for Sourcetracker 2
[68]:
# Write the transposed genus table as a tab-separated file whose index column
# is labelled TAXID.
X_g.T.to_csv("st_genus.txt", sep="\t", index_label="TAXID")
# Tag every test sample as a sink. Plain ASCII quotes are required here:
# typographic quotes are not valid Python string delimiters.
test_labels_genus['SourceSink'] = ['sink']*test_labels_genus.shape[0]
[69]:
# Tag every training sample as a source.
train_labels_genus['SourceSink'] = ['source'] * len(train_labels_genus)
[70]:
# Assemble the Sourcetracker metadata table: training rows followed by test
# rows, with 'labels' exposed as 'Env' and only the two needed columns kept.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
st_metadata = pd.concat([train_labels_genus, test_labels_genus]).rename(columns = {"labels":"Env"})[['SourceSink','Env']]
# Set the role of each row with a single .loc assignment. The chained form
# st_metadata['SourceSink'][idx] = ... may write to a temporary copy and leave
# st_metadata unchanged.
st_metadata.loc[train_labels_genus.index, 'SourceSink'] = 'source'
st_metadata.loc[test_labels_genus.index, 'SourceSink'] = 'sink'
# Trailing expression so the notebook displays the table.
st_metadata
[70]:
SourceSink | Env | |
---|---|---|
SRR5579101 | source | healthy |
SRR3102473 | source | CDI |
SRR3102501 | source | CDI |
SRR5578964 | source | healthy |
SRR5579017 | source | healthy |
... | ... | ... |
SRR3102527 | sink | CDI |
SRR3102409 | sink | CDI |
SRR3102362 | sink | CDI |
SRR3102487 | sink | CDI |
SRR3102517 | sink | CDI |
340 rows × 2 columns
[71]:
# Write the metadata table; note the file is tab-separated despite its .csv
# extension, and the index column is labelled '#SampleID'.
st_metadata.to_csv(
    "st_genus_metadata.csv",
    sep="\t",
    index_label='#SampleID',
)
sourcetracker2 gibbs -i st_genus.biom -m st_genus_metadata.csv -o st_genus_out --jobs 6 --source_rarefaction_depth 0 --sink_rarefaction_depth 0
Reading Sourcetracker 2 results
[72]:
# Load the tab-separated mixing proportions from the Sourcetracker output
# directory, using the first column as the index.
st_pred = pd.read_csv(
    "st_genus_out/mixing_proportions.txt",
    sep="\t",
    index_col=0,
)
[73]:
# For every row of st_pred take the column label holding the largest value,
# align it with the recorded test labels on the shared index, and expose the
# 'labels' column as 'actual'.
st2_pred = pd.merge(
    st_pred.idxmax(axis=1).to_frame(name='predicted'),
    test_labels_genus,
    left_index=True,
    right_index=True,
).rename(columns={'labels': 'actual'})
[74]:
# Preview the first rows of the predicted-versus-actual table.
st2_pred.head()
[74]:
predicted | actual | |
---|---|---|
SRR5578953 | healthy | healthy |
SRR5578924 | healthy | healthy |
SRR5579094 | healthy | healthy |
SRR5579106 | healthy | healthy |
SRR5579030 | healthy | healthy |
[75]:
# Share of samples whose predicted class equals the actual class.
st_acc_genus = accuracy_score(
    st2_pred['actual'],     # y_true
    st2_pred['predicted'],  # y_pred
)
print(f"Accuracy: {round(st_acc_genus,2)}")
Accuracy: 0.8