A metabolic model describes the relationship between metabolites (compounds) through metabolic reactions.
A reaction is the conversion from reactants to products. These reactions may be catalyzed by enzymes, and they may belong to predefined metabolic pathways. Although broadly speaking, there's a class of transport reactions, which move metabolites across cellular compartments by the help of transporter proteins.
Because the products of one reaction are often the reactants of another, most reactions are connected into a large metabolic network. Reactants and products can also be shared or competed between reactions. The metabolic network is often described via a bipartite graph, with metabolites and enzymes as the two node types.
These data for a metabolic model can be from a database, e.g. BioCyc or other published models, e.g. RECON2. This notebook shows how the model from mummichog 1 is converted to JSON format for mummichog 2. The design change in version 2 is to separate the model more cleanly from the algorithm, easier to do web integration and model update.
By Shuzhao Li, 2017-07-17
import json
# this human_model_mfn model is from mummichog version 1
# Currently, mummichog is a statistical approach that does not take into account of reaction rates.
import human_model_mfn as model
dir(model)
for x in [
model.dict_cpds_def,
model.cpd_edges,
model.dict_cpds_mass,
model.metabolic_rxns,
model.metabolic_pathways,
]:
print (len(x))
# also see what's in cpd def
print(model.dict_cpds_def.items()[:3])
# will need fix a bug when cpd name doesn't show...
No formula was in the MFN model in mummichog 1. Let's get from RECON2 and HMDB. RECON2 is a compartmentalized model, where ''' c = cytoplasm e = extracellular space g = Golgi apparatus l = lysosome m = mitochondrion n = nucleus r = endoplasmic reticulum x = peroxisome '''
# get relevant compounds in model
cpdsInModel = []
for x in model.cpd_edges:
cpdsInModel += x
cpds2include = model.cpd2pathways.keys() + cpdsInModel
cpds2include = set(cpds2include)
print( len(cpds2include) )
# These are previously parsed data.
hmdb = 'parsed_HMDBmetabolites3.5.txt'
recon2 = 'parsed_species_recon2.txt'
hmdb = open(hmdb).read().splitlines()
recon2 = open(recon2).read().splitlines()
h = hmdb[0].split('\t')
print(h)
print("chemical_formula", h.index("chemical_formula"))
print("kegg_id", h.index("kegg_id"))
print(recon2[0].split('\t'))
# build dictionary, {KEGG-like ID: chemical_formula}
dict_formula = {}
for line in hmdb:
a = line.split('\t')
dict_formula[a[21]] = a[2]
# RECON2 as id format like `M_34dhoxmand_c`
for line in recon2:
a = line.split('\t')
if a[0][0] == "M":
dict_formula[a[0].split('_')[1]] = a[3]
print(len(dict_formula))
print("Overlap number of cpds = ", len(cpds2include.intersection( set(dict_formula.keys()) )))
print(len( set(model.dict_cpds_mass.keys()).intersection(cpds2include) ))
print(len( set(model.dict_cpds_mass.keys()).intersection(set(dict_formula.keys())) ))
print(len( set(model.dict_cpds_mass.keys()).intersection(set(dict_formula.keys())).intersection(cpds2include) ))
# How many cpds don't have mass but have formula?
len([x for x in cpds2include if x not in model.dict_cpds_mass.keys() and x in dict_formula.keys()])
The dict_cpds_mass may originally come from KEGG, not sure if mass deficiency from chemical bonding was considered.
# new mass calculation
# using pychemy module. It failed to install using pip because of `Open Babel` binding.
# But the mass calcuation function is not depdendent on `Open Babel`,
# thus pychemy folder is copied here and it's usable as below
from pychemy.molmass import Formula
massdict_cpds2include = {}
unassigned = []
for x in cpds2include:
if model.dict_cpds_mass.has_key(x):
massdict_cpds2include[x] = model.dict_cpds_mass[x]
elif dict_formula.has_key(x):
try:
massdict_cpds2include[x] = Formula( dict_formula[x] ).isotope.mass
except:
print(dict_formula[x])
unassigned.append(x)
else:
unassigned.append(x)
print(len(massdict_cpds2include))
print("unassigned, ", len(unassigned))
The above mass calculation raised the number of compounds with mass from 2019 to 2447.
Next, compute adducts.
# computing adducts filtered by formula
# if no formula is available, should use default adduct calculation
# make sure mono mass is present
import re
def parse_chemformula(x):
'''This does not deal with nested groups in chemical formula.
Formula from HMDB 3.5 are compatible.
'''
p = re.findall(r'([A-Z][a-z]*)(\d*)', x)
d = {}
for pair in p: d[pair[0]] = int( pair[1] or 1 )
return d
def check_sub(Fm1, Fm2):
'''Check if Fm2 a subunit of Fm1; Dictionary representation of chem formula
'''
Fm2_in_Fm1 = True
Elements1 = Fm1.keys()
Elements2 = Fm2.keys()
if [x for x in Elements2 if x not in Elements1]: #element not in Fm1
return False
else:
for e in Elements2:
if Fm2[e] > Fm1[e]: Fm2_in_Fm1 = False
return Fm2_in_Fm1
def compute_adducts(mw, cFormula):
'''
# new function calculating adducts
# A few rules here:
# C13, S34, Cl37 pos only applicable if the atom is in formula
# and others in `required subgroup`, as the last item in the tuples
# Better way is to based on chemical structure - future direction.
'''
PROTON = 1.00727646677
addList = [(mw, 'M[1+]', ''),
(mw + PROTON, 'M+H[1+]', ''),
(mw/2 + PROTON, 'M+2H[2+]', ''),
(mw/3 + PROTON, 'M+3H[3+]', ''),
(mw +1.0034 + PROTON, 'M(C13)+H[1+]', 'C'),
(mw/2 + 0.5017 + PROTON, 'M(C13)+2H[2+]', 'C'),
(mw/3 + 0.3344 + PROTON, 'M(C13)+3H[3+]', 'C'),
(mw +1.9958 + PROTON, 'M(S34)+H[1+]', 'S'),
(mw +1.9972 + PROTON, 'M(Cl37)+H[1+]', 'Cl'),
(mw + 21.9820 + PROTON, 'M+Na[1+]', ''),
(mw/2 + 10.991 + PROTON, 'M+H+Na[2+]', ''),
(mw + 37.9555 + PROTON, 'M+K[1+]', ''),
(mw + 18.0106 + PROTON, 'M+H2O+H[1+]', ''),
(mw - 18.0106 + PROTON, 'M-H2O+H[1+]', 'H2O'),
(mw - 36.0212 + PROTON, 'M-H4O2+H[1+]', 'H4O2'),
(mw - 17.0265 + PROTON, 'M-NH3+H[1+]', 'NH3'),
(mw - 27.9950 + PROTON, 'M-CO+H[1+]', 'CO'),
(mw - 43.9898 + PROTON, 'M-CO2+H[1+]', 'CO2'),
(mw - 46.0054 + PROTON, 'M-HCOOH+H[1+]', 'H2CO2'),
(mw + 67.9874 + PROTON, 'M+HCOONa[1+]', ''),
(mw - 67.9874 + PROTON, 'M-HCOONa+H[1+]', 'HCO2Na'),
(mw + 57.9586 + PROTON, 'M+NaCl[1+]', ''),
(mw - 72.0211 + PROTON, 'M-C3H4O2+H[1+]', 'C3H4O2'),
(mw + 83.9613 + PROTON, 'M+HCOOK[1+]', ''),
(mw - 83.9613 + PROTON, 'M-HCOOK+H[1+]', 'HCO2K'),
] + [(mw - PROTON, 'M-H[-]', ''),
(mw/2 - PROTON, 'M-2H[2-]', ''),
(mw + 1.0034 - PROTON, 'M(C13)-H[-]', 'C'),
(mw + 1.9958 - PROTON, 'M(S34)-H[-]', 'S'),
(mw + 1.9972 - PROTON, 'M(Cl37)-H[-]', 'Cl'),
(mw + 21.9820 - 2*PROTON, 'M+Na-2H[-]', ''),
(mw + 37.9555 - 2*PROTON, 'M+K-2H[-]', ''),
(mw - 18.0106 - PROTON, 'M-H2O-H[-]', 'H2O'),
(mw + 34.9689, 'M+Cl[-]', ''),
(mw + 36.9659, 'M+Cl37[-]', ''),
(mw + 78.9183, 'M+Br[-]', ''),
(mw + 80.9163, 'M+Br81[-]', ''),
(mw + 2*12 + 3*1.007825 + 14.00307 - PROTON, 'M+ACN-H[-]', ''),
(mw + 1.007825 + 12 + 2*15.99491, 'M+HCOO[-]', ''),
(mw + 3*1.007825 + 2*12 + 2*15.99491, 'M+CH3COO[-]', ''),
(mw - PROTON + 15.99491, 'M-H+O[-]', ''),
]
dict_cFormula = parse_chemformula(cFormula)
mydict = {}
for x in addList:
if check_sub(dict_cFormula, parse_chemformula(x[2])):
mydict[x[1]] = x[0]
return mydict
# test
print(dict_formula['C05587'])
print(model.dict_cpds_def['C05587'])
print(model.dict_cpds_mass['C05587'])
print (compute_adducts( model.dict_cpds_mass['C05587'], dict_formula['C05587'] ))
#
missing_names = [x for x in cpds2include if not model.dict_cpds_def.get(x, '')]
print (len(missing_names))
print(missing_names[:20])
# a few drugs are in missing, but can be left out, 3560
cpds2include = [x for x in cpds2include if x[1]!='D']
print(len(cpds2include))
# add name dict from RECON2
# RECON2 as id format like `M_34dhoxmand_c`
names_recon2 = {}
for line in recon2:
a = line.split('\t')
if a[0][0] == "M":
names_recon2[a[0].split('_')[1]] = a[1]
still_missing = [x for x in missing_names if x not in names_recon2.keys()]
print(len(still_missing))
#for x in still_missing:
# print x
Manually finding other names...
# OK add more name from KEGG adn EHMN model
names_extra = {}
# cpd:C01620 Threonate; L-Threonate; (2R,3S)-2,3,4-Trihydroxybutanoic acid
w = open('additional_KEGG_names.txt').read()
for line in w.splitlines():
if 'No such data' not in line:
names_extra[line[:13].split(':')[1].strip()] = line[13:].strip()
print(names_extra.items()[-2:])
# <species id="CE1941" name="isoputreanine"/>
for line in open('additional_EHMN_names.txt').readlines()[1:]:
a = line.split('" name="')
names_extra[a[0].split('id="')[1]] = a[1].split('"')[0]
print(len(names_extra))
print(names_extra.items()[-2:])
still_missing2 = [x for x in still_missing if x not in names_extra.keys()]
print(len(still_missing2))
print(still_missing2)
# so only 15-3 names still missing. These are likely to be obsolete IDs from KEGG
# this is the new compound dict
newDict = {}
for c in cpds2include:
name = model.dict_cpds_def.get(c, '') or names_recon2.get(c, '') or names_extra.get(c, '')
mw = massdict_cpds2include.get(c, 0)
f = dict_formula.get(c, '')
if mw:
newDict[c] = {'name': name, 'mw': mw, 'formula':f, 'adducts':compute_adducts(mw, f)}
else:
newDict[c] = {'name': name, 'mw': mw, 'formula':f, 'adducts':{}}
print(len(newDict))
print(newDict['C04878'])
print(newDict['CE4849'])
# Convert to JSON format
# In [8]: '_'.join(sorted(('C00999', 'C00231')))
# Out[8]: 'C00231_C00999'
metabolicModels = {}
metabolicModels['human_model_mfn'] = {}
metabolicModels['human_model_mfn']['version'] = 'MFN_1.10.4'
metabolicModels['human_model_mfn']['Compounds'] = newDict
metabolicModels['human_model_mfn']['cpd2pathways'] = model.cpd2pathways
metabolicModels['human_model_mfn']['cpd_edges'] = model.cpd_edges
metabolicModels['human_model_mfn']['dict_cpds_def'] = model.dict_cpds_def
metabolicModels['human_model_mfn']['dict_cpds_mass'] = model.dict_cpds_mass
metabolicModels['human_model_mfn']['metabolic_pathways'] = model.metabolic_pathways
metabolicModels['human_model_mfn']['metabolic_rxns'] = model.metabolic_rxns
#metabolicModels['human_model_mfn']['pathdotdict'] = model.pathdotdict
# edges can't be tuples in JSON format
new_e2rxn = {}
for k,v in model.edge2rxn.items():
new_e2rxn[','.join(sorted(k))] = v
metabolicModels['human_model_mfn']['edge2rxn'] = new_e2rxn
new_e2enzyme = {}
for k,v in model.edge2enzyme.items():
new_e2enzyme[','.join(sorted(k))] = v
metabolicModels['human_model_mfn']['edge2enzyme'] = new_e2enzyme
out = open('JSON_metabolicModels.py', 'w')
out.write( 'metabolicModels = ' + json.dumps(metabolicModels) )
out.close()
# test import
from JSON_metabolicModels import metabolicModels
metabolicModels['human_model_mfn'].keys()
Adding this new version of model to mummichog 2.0.2-beta