Each biological sample was run in triplicate; we take the geometric mean of the technical replicates.
Example data are from Li et al. (2013), "Predicting network activity from high throughput metabolomics," PLoS Computational Biology 9(7): e1003123.
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
INDIR = "../input_data/"
datafile = "modc_ae_2012.txt"
data = pd.read_table(os.path.join(INDIR, datafile))
data.head()
This is a dataset of human monocyte-derived dendritic cells (moDC). The three experimental groups are baseline control, mock infection at 6 hours, and YFV infection at 6 hours. Each group contains three biological replicates, and each biological replicate was analyzed by LC-MS in three technical replicates. The table above was extracted using apLCMS.
new_header = [x for x in data]
# if the data need to be transformed to log2, shift by +1 to avoid log2(0)
# new_data = np.log2(data[new_header[4:]] + 1)
new_data = data[new_header[4:]]
mean_TIC = new_data.mean(0)
# plot bars for the first N columns; useful when the number of samples is large
N = 27
ind = np.arange(N)
plt.figure(figsize=(15,3))
plt.bar(ind, mean_TIC[:N], width=0.7, color="purple", alpha=0.4)
plt.ylabel("Average log2(intensity)")
plt.title("Ave Intensity [%d:]" %N)
plt.xticks(ind, new_data.columns[:N], rotation=-90)
plt.show()
# One should be alarmed if irregularities are spotted; replicates of low intensity may be candidates for removal.
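As a rough sketch (the two-standard-deviation threshold and the name low_intensity_replicates are assumptions, not part of the original analysis), low-intensity replicates can be flagged from the per-sample means computed above and later fed into the black_list used below:
# Hypothetical threshold: flag replicates whose mean intensity falls far below the rest.
# Inspect the bar plot above before settling on a value.
intensity_CUTOFF = mean_TIC.mean() - 2 * mean_TIC.std()
low_intensity_replicates = [c for c in new_data.columns if mean_TIC[c] < intensity_CUTOFF]
print("Flagged low-intensity replicates:", low_intensity_replicates)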
# Alternatively, pandas dataframe has built-in plotting too (also via matplotlib)
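For example, the same bar chart can be drawn with the pandas plotting interface (a sketch equivalent to the matplotlib code above):
# Same plot via pandas; labels and styling mirror the matplotlib version above
mean_TIC[:N].plot.bar(figsize=(15, 3), color="purple", alpha=0.4, title="Ave Intensity [:%d]" % N)
plt.ylabel("Average log2(intensity)")
plt.show()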
# We are making a new dictionary to map samples to replicates
def remove_tail(x):
    return "_".join(x.split("_")[:-1])
samples = [remove_tail(x) for x in new_header[4:]]
print("After remove_tail: ", samples[:6])
samples = set(samples)
samDict = {}
for x in new_header[4:]:
    k = remove_tail(x)
    if k in samDict:
        samDict[k].append(x)
    else:
        samDict[k] = [x, ]
print("Example sample dictionary:")
print(list(samDict.items())[:3])
# Pearson correlation between technical replicates
# plot first before deciding the CUTOFF value
def get_flat_corr(CorrDataFrame):
    return [CorrDataFrame.values[0, 1], CorrDataFrame.values[0, 2], CorrDataFrame.values[1, 2]]
all_Pearson_coefficients = []
for k, v in samDict.items():
    if len(v) == 3:    # only do if three technical replicates are present
        Pr = new_data[v].corr()
        all_Pearson_coefficients.append((v, Pr))
all_corr_values = []
for X in all_Pearson_coefficients:
    all_corr_values += get_flat_corr(X[1])
# This is useful when the sample number is large
all_corr_values.sort()
plt.figure(figsize=(8,4))
plt.plot(range(len(all_corr_values)), all_corr_values, 'bo', markersize=1)
plt.title("Distribution of replicate correlation")
plt.xlabel("Feature")
plt.ylabel("Pearson Correlation Coeff")
plt.show()
# This block removes bad replicates.
#
# The CUTOFF is based on the plot above; correlations are usually dragged down by too many zeros.
# Nothing is removed here, because the worst correlation is 0.78 in the figure above.
corr_CUTOFF = 0.7
# Determine which replicate to remove
def call_bad_replicate(CorrDataFrame, corr_CUTOFF):
    # assuming three replicates here;
    # return list of bad replicates
    names = [x for x in CorrDataFrame]
    good = []
    if CorrDataFrame.values[0, 1] > corr_CUTOFF:
        good += [names[0], names[1]]
    if CorrDataFrame.values[0, 2] > corr_CUTOFF:
        good += [names[0], names[2]]
    if CorrDataFrame.values[1, 2] > corr_CUTOFF:
        good += [names[1], names[2]]
    return [x for x in names if x not in good]
bad_replicates = []
for X in all_Pearson_coefficients:
    bad = call_bad_replicate(X[1], corr_CUTOFF)
    if bad:
        bad_replicates += bad
        print("- Bad corr - %s" % str(bad))
    # uncomment below to see the whole corr matrix
    # print(X[1])
# In some datasets, many Qstd samples share the same name; they are skipped for now.
# An alternative method can use scipy.stats.pearsonr and itertools.combinations, as sketched below.
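A sketch of that alternative, assuming scipy is installed; it yields the same pairwise correlations and generalizes to any number of technical replicates per sample:
from itertools import combinations
from scipy import stats

# Pairwise Pearson correlations via scipy.stats.pearsonr and itertools.combinations
alt_corr_values = []
for k, v in samDict.items():
    for a, b in combinations(v, 2):
        r, p = stats.pearsonr(new_data[a], new_data[b])
        alt_corr_values.append(r)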
Since the data are already in log2 scale, averaging with the myave function below corresponds to taking geometric means of the raw intensities.
def myave(L):
    # average the values above the noise floor; return 0 if nothing is detected
    new = [x for x in L if x > 1]
    if new:
        return np.mean(new)
    else:
        return 0
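A quick check of the statement above, using made-up raw intensities: the mean of log2 values equals the log2 of the geometric mean.
# Illustration only; the intensities below are made up
raw = np.array([1000.0, 1200.0, 900.0])
geometric_mean = raw.prod() ** (1.0 / len(raw))
print(myave(list(np.log2(raw))), np.log2(geometric_mean))  # the two numbers agree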
# update samDict
# this is used when bad replicates need to be excluded
# black_list = set(low_intensity_replicates + bad_replicates)
#
black_list = []
samDict2 = {}
for k, v in samDict.items():
    new = [x for x in v if x not in black_list]
    if new:
        # NIST and Q-std samples are excluded from the averaged table,
        # though one may want to keep them for other purposes, including downstream QC
        if 'nist' not in k and 'q3' not in k:
            samDict2[k] = new
# Write ave_log2_ data
use_samples = sorted(samDict2.keys())
s = 'mz\tretention_time\t' + '\t'.join(use_samples) + '\n'
# data contains mz and retention time; the intensities are already in log2
print(data.shape)
num_features = data.shape[0]
print("use_samples", use_samples[:3])
print(samDict2[use_samples[2]])
# a column-index mapper is faster than repeated DataFrame label lookup
new_data_header = [x for x in new_data]
mapper = []
for x in use_samples:
    mapper.append([new_data_header.index(ii) for ii in samDict2[x]])
for ii in range(num_features):
    mz, rtime = data.values[ii, :2]
    d = new_data.values[ii, :]
    ave_data = [myave([d[j] for j in x]) for x in mapper]
    s += '\t'.join([str(mz), str(int(rtime))] + [str(round(x, 2)) for x in ave_data]) + '\n'
new_file = "ave_log2_" + datafile
with open(new_file, 'w') as f:
    f.write(s)
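Optionally, the new file can be read back as a quick sanity check on its dimensions and header (a sketch):
# Read the averaged table back to confirm shape and column names
ave_check = pd.read_table(new_file)
print(ave_check.shape)
print(list(ave_check.columns[:5]))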
The code above illustrates simple QC and averaging of technical replicates.
Not covered here, but filtering, normalization, and imputation are often needed as well. Missing values can be problematic for some statistical methods, including linear regression, so the data should be filtered to reduce the number of missing values. Remaining missing values can be imputed by replacing them with the minimal value of the same feature (i.e., the minimal detected value in that feature row); other imputation methods are available as well. Imputation may be performed prior to normalization, as the normalization method may be sensitive to missing data.
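A minimal sketch of such filtering and minimum-value imputation, applied to the averaged table written above; the 50% detection threshold is an assumption, and zeros are treated as missing because myave returns 0 for undetected features.
# Hypothetical follow-up: filter features by missingness, then impute by per-feature minimum
ave = pd.read_table(new_file)
intensity_cols = ave.columns[2:]                      # skip mz and retention_time
values = ave[intensity_cols]

# keep features detected (non-zero) in at least half of the samples
missing_fraction = (values == 0).sum(axis=1) / float(len(intensity_cols))
filtered = ave[missing_fraction <= 0.5].copy()

# replace remaining zeros with the minimal detected value of the same feature
def impute_row_min(row):
    detected = row[row > 0]
    return row.replace(0, detected.min()) if len(detected) else row

filtered[intensity_cols] = filtered[intensity_cols].apply(impute_row_min, axis=1)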