Compute Pearson correlation between 2 groups of samples for each feature
Author: Andrei Todor
import pandas as pd
import numpy as np
from pylab import *
from scipy.stats.stats import pearsonr
from statsmodels.sandbox.stats.multicomp import multipletests
infile = "../results/features_avg_pos.txt"
mydata = pd.read_table(infile)
header = [x for x in mydata]
g = []
for type in ['mock','yf']:
g.append([sample for sample in header if type in sample])
corr = []
for idx in range(mydata.shape[0]):
corr.append(pearsonr(mydata.iloc[idx][g[0]], mydata.iloc[idx][g[1]]))
bh = multipletests([x[1] for x in corr], method = 'fdr_bh')
#Type I Manhattan plot
figure(figsize = (15,10))
x = [mydata.iloc[i].mz for i in range(len(corr)) if corr[i][0] >= 0]
y = [bh[1][i] for i in range(len(corr)) if corr[i][0] >= 0]
matplotlib.pyplot.scatter(x, -np.log10(y), c = 'red', label = "Positive")
x = [mydata.iloc[i].mz for i in range(len(corr)) if corr[i][0] < 0]
y = [bh[1][i] for i in range(len(corr)) if corr[i][0] < 0]
matplotlib.pyplot.scatter(x, -np.log10(y), c = 'blue', label = "Negative")
xlabel("m/z")
ylabel("-log p")
legend()
show()
#Type II Manhattan plot
figure(figsize = (15,10))
x = [mydata.iloc[i].time for i in range(len(corr)) if corr[i][0] >= 0]
y = [bh[1][i] for i in range(len(corr)) if corr[i][0] >= 0]
matplotlib.pyplot.scatter(x, -np.log10(y), c = 'red', label = "Positive")
x = [mydata.iloc[i].time for i in range(len(corr)) if corr[i][0] < 0]
y = [bh[1][i] for i in range(len(corr)) if corr[i][0] < 0]
matplotlib.pyplot.scatter(x, -np.log10(y), c = 'blue', label = "Negative")
xlabel("m/z")
ylabel("-log p")
legend()
show()