1
/* $Revision: 9050 $ $Author: egonw $ $Date: 2007-10-14 20:23:40 +0200 (Sun, 14 Oct 2007) $
3
* Copyright (C) 2002-2007 Christoph Steinbeck <steinbeck@users.sf.net>
5
* Contact: cdk-devel@lists.sourceforge.net
7
* This program is free software; you can redistribute it and/or
8
* modify it under the terms of the GNU Lesser General Public License
9
* as published by the Free Software Foundation; either version 2.1
10
* of the License, or (at your option) any later version.
11
* All I ask is that proper credit is given for my work, which includes
12
* - but is not limited to - adding the above copyright notice to the beginning
13
* of your source code files, and to any copyright notice that you may distribute
14
* with programs based on this work.
16
* This program is distributed in the hope that it will be useful,
17
* but WITHOUT ANY WARRANTY; without even the implied warranty of
18
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
* GNU Lesser General Public License for more details.
21
* You should have received a copy of the GNU Lesser General Public License
22
* along with this program; if not, write to the Free Software
23
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
25
package org.openscience.cdk.smiles;
27
import java.util.Enumeration;
28
import java.util.Stack;
29
import java.util.StringTokenizer;
31
import org.openscience.cdk.CDKConstants;
32
import org.openscience.cdk.DefaultChemObjectBuilder;
33
import org.openscience.cdk.aromaticity.HueckelAromaticityDetector;
34
import org.openscience.cdk.exception.CDKException;
35
import org.openscience.cdk.exception.InvalidSmilesException;
36
import org.openscience.cdk.graph.ConnectivityChecker;
37
import org.openscience.cdk.interfaces.IAtom;
38
import org.openscience.cdk.interfaces.IAtomContainer;
39
import org.openscience.cdk.interfaces.IBond;
40
import org.openscience.cdk.interfaces.IChemObjectBuilder;
41
import org.openscience.cdk.interfaces.IMolecule;
42
import org.openscience.cdk.interfaces.IMoleculeSet;
43
import org.openscience.cdk.interfaces.IReaction;
44
import org.openscience.cdk.tools.HydrogenAdder;
45
import org.openscience.cdk.tools.LoggingTool;
46
import org.openscience.cdk.tools.ValencyHybridChecker;
48
* Parses a SMILES {@cdk.cite SMILESTUT} string into an AtomContainer. The full
49
* SSMILES subset {@cdk.cite SSMILESTUT} and the '%' tag for more than 10 rings
50
* at a time are supported. An example:
53
* SmilesParser sp = new SmilesParser(DefaultChemObjectBuilder.getInstance());
54
* IMolecule m = sp.parseSmiles("c1ccccc1");
55
* } catch (InvalidSmilesException ise) {
59
* <p>This parser does not parse stereochemical information, but the following
60
* features are supported: reaction smiles, partitioned structures, charged
61
* atoms, implicit hydrogen count, '*' and isotope information.
63
* <p>See {@cdk.cite WEI88} for further information.
65
* @author Christoph Steinbeck
66
* @author Egon Willighagen
68
* @cdk.created 2002-04-29
69
* @cdk.keyword SMILES, parser
81
* @see org.openscience.cdk.smiles.InterruptableSmilesParser
83
public class SmilesParser {
85
// Logging facility; assigned in the constructor as new LoggingTool(this).
private LoggingTool logger;
86
// Hydrogen adder, constructed around valencyChecker; used by addImplicitHydrogens().
private HydrogenAdder hAdder;
87
// private SmilesValencyChecker valencyChecker;
88
// Valency checker used to saturate molecules; isInterrupted()/setInterrupted()
// simply forward to it.
private ValencyHybridChecker valencyChecker;
90
// State flag tested as "status == 2" by the '=' and '#' handlers of parseString().
// NOTE(review): no assignment other than this initializer is visible in this excerpt.
private int status = 0;
91
// Factory for every reaction, molecule, atom, pseudo atom and bond this parser creates.
protected IChemObjectBuilder builder;
95
* Constructor for the SmilesParser object.
97
* @deprecated Use SmilesParser(IChemObjectBuilder) instead
101
this(DefaultChemObjectBuilder.getInstance());
105
/**
 * Creates a SMILES parser that builds its chemistry objects with the given
 * builder.
 * <p>
 * The valency checker and the hydrogen adder are created here as well. If
 * either cannot be instantiated the problem is only logged, and the parser
 * object is still returned.
 *
 * @param builder IChemObjectBuilder used to create the IMolecules from
 *                the parsed SMILES strings
 */
public SmilesParser(IChemObjectBuilder builder) {
    this.logger = new LoggingTool(this);
    this.builder = builder;
    try {
        this.valencyChecker = new ValencyHybridChecker();
        this.hAdder = new HydrogenAdder(this.valencyChecker);
    } catch (Exception setupFailure) {
        // Report the failure but let construction complete.
        logger.error("Could not instantiate valencyChecker or hydrogenAdder: ",
                setupFailure.getMessage());
        logger.debug(setupFailure);
    }
}
126
// NOTE(review): not referenced anywhere else in the visible part of this file.
int nodeCounter = -1;
127
// NOTE(review): every visible method takes its own "smiles" parameter, which
// shadows this field; no visible code reads or writes the field itself.
String smiles = null;
128
// Order of the next chain bond; set by the '=' / '#' handlers and reset to
// CDKConstants.BONDORDER_SINGLE after a bond has been created in parseString().
double bondStatus = -1;
129
// Order requested for the next ring-closure bond; handleRing() resets it to 1.
double bondStatusForRingClosure = 1;
130
// Set by the ':' handler; bonds created while it is true get the ISAROMATIC flag.
boolean bondIsAromatic = false;
131
// Ring-closure table: atom that opened ring number i, or null.
// Allocated with 1024 slots at the start of parseString().
IAtom[] rings = null;
132
// Bond order remembered for each open ring number (parallel to rings).
double[] ringbonds = null;
134
// Molecule under construction; replaced by a fresh one in parseString().
IMolecule molecule = null;
135
// Text of the token most recently recognised (element symbol, ring number
// or bracket-atom contents), also used to advance the parse position.
String currentSymbol = null;
137
public IReaction parseReactionSmiles(String smiles) throws InvalidSmilesException
139
StringTokenizer tokenizer = new StringTokenizer(smiles, ">");
140
String reactantSmiles = tokenizer.nextToken();
141
String agentSmiles = "";
142
String productSmiles = tokenizer.nextToken();
143
if (tokenizer.hasMoreTokens())
145
agentSmiles = productSmiles;
146
productSmiles = tokenizer.nextToken();
149
IReaction reaction = builder.newReaction();
152
IMolecule reactantContainer = parseSmiles(reactantSmiles);
153
IMoleculeSet reactantSet = ConnectivityChecker.partitionIntoMolecules(reactantContainer);
154
for (int i = 0; i < reactantSet.getAtomContainerCount(); i++)
156
reaction.addReactant(reactantSet.getMolecule(i));
160
if (agentSmiles.length() > 0)
162
IMolecule agentContainer = parseSmiles(agentSmiles);
163
IMoleculeSet agentSet = ConnectivityChecker.partitionIntoMolecules(agentContainer);
164
for (int i = 0; i < agentSet.getAtomContainerCount(); i++)
166
reaction.addAgent(agentSet.getMolecule(i));
171
IMolecule productContainer = parseSmiles(productSmiles);
172
IMoleculeSet productSet = ConnectivityChecker.partitionIntoMolecules(productContainer);
173
for (int i = 0; i < productSet.getAtomContainerCount(); i++)
175
reaction.addProduct(productSet.getMolecule(i));
183
* Parses a SMILES string and returns a Molecule object.
185
*@param smiles A SMILES string
186
*@return A Molecule representing the constitution
187
* given in the SMILES string
188
*@exception InvalidSmilesException Exception thrown when the SMILES string
191
// Public entry point: turns a SMILES string into a molecule.
// The raw parse is delegated to parseString(); a clone of that result is then
// given implicit hydrogens, missing bond orders and aromaticity perception.
// If any atom of the clone reports hybridization value 2, DeduceBondSystemTool
// checks the clone and, when it is not OK, derives new aromatic bond orders
// from the raw parse result (m2).
// CDKExceptions raised on the way are rethrown as InvalidSmilesException.
public IMolecule parseSmiles(String smiles) throws InvalidSmilesException {
192
// start every parse with the interrupted flag cleared
setInterrupted(false);
194
DeduceBondSystemTool dbst=new DeduceBondSystemTool();
196
// m2 is the raw parse result; it is kept unmodified for the bond-order fix below
IMolecule m2=this.parseString(smiles);
201
// NOTE(review): the declaration of m and the opening of this try block are
// not part of the visible lines.
m=(IMolecule)m2.clone();
203
} catch (java.lang.CloneNotSupportedException exception) {
204
// the clone failure is only logged at debug level
logger.debug(exception);
207
// add implicit hydrogens
208
this.addImplicitHydrogens(m);
210
// setup missing bond orders
211
this.setupMissingBondOrders(m);
213
// perceive aromaticity
214
this.conceiveAromaticPerception(m);
216
boolean HaveSP2=false;
218
// scan all atoms for hybridization value 2
// NOTE(review): parseString() marks lower-case atoms with
// CDKConstants.HYBRIDIZATION_SP2; presumably that constant equals 2 - confirm.
for (int j=0;j<=m.getAtomCount()-1;j++) {
219
if (m.getAtom(j).getHybridization()==2) {
225
if (HaveSP2) { // have lower case (aromatic) element symbols that may need to be fixed
227
// hand the current interrupted state on to the bond system tool
dbst.setInterrupted(isInterrupted());
228
if (!(dbst.isOK(m))) {
231
// recompute bond orders starting from the raw parse result, not from m
m = (IMolecule) dbst.fixAromaticBondOrders(m2);
233
// NOTE(review): after the cast above this test can only be true when m is null.
if (!(m instanceof IMolecule)) {
234
throw new InvalidSmilesException("Could not deduce aromatic bond orders.");
237
// no need to fix aromatic bond orders
240
} catch (CDKException ex) {
241
// keep the original exception as the cause
throw new InvalidSmilesException(ex.getMessage(), ex);
249
* This routine parses the smiles string into a molecule but does not add hydrogens, saturate, or perceive aromaticity
252
* @throws InvalidSmilesException
254
// Character-by-character SMILES parser. Builds the molecule field from the
// given string without adding hydrogens, saturating or perceiving aromaticity.
// The main do/while loop dispatches on the current character (mychar):
//   letters      - organic-subset atom (lower case => SP2 hybridization)
//   '=' '#'      - double / triple bond for the next chain bond or ring closure
//   '(' ')'      - branch open / close (atom and bond order are stacked)
//   digit, '%'   - ring closure with one-digit or two-digit ring number
//   '['          - bracket atom, delegated to assembleAtom()
//   '.' '-' ':'  - disconnection, explicit single bond, aromatic bond
//   '/' '\' '@'  - stereo markers, ignored with a warning
// NOTE(review): several lines of this method (declarations of mychar, position,
// atom, bond; most position increments; the return statement) are not part of
// the visible text, so the comments below only describe what is shown.
private IMolecule parseString(String smiles) throws InvalidSmilesException
256
logger.debug("parseSmiles()...");
260
// reset the per-parse state kept in fields
bondIsAromatic = false;
261
boolean bondExists = true;
263
currentSymbol = null;
264
molecule = builder.newMolecule();
266
// we don't want more than 1024 rings
267
rings = new IAtom[1024];
268
ringbonds = new double[1024];
269
for (int f = 0; f < 1024; f++)
276
// one-character buffer, turned into a String for single-digit ring numbers below
char[] chars = new char[1];
277
// atom the next bond will be attached to
IAtom lastNode = null;
278
// branch stacks: atom to return to and bond order in effect at '('
// NOTE(review): raw Stack types; elements are cast back on pop.
Stack atomStack = new Stack();
279
Stack bondStack = new Stack();
285
mychar = smiles.charAt(position);
287
logger.debug("Processing: " + mychar);
288
if (lastNode != null)
290
logger.debug("Lastnode: ", lastNode.hashCode());
292
// ---- atom given without brackets (the rest of this condition is not visible)
if ((mychar >= 'A' && mychar <= 'Z') || (mychar >= 'a' && mychar <= 'z') ||
296
logger.debug("Found a must-be 'organic subset' element");
297
// only 'organic subset' elements allowed
302
atom = builder.newPseudoAtom("*");
305
currentSymbol = getSymbolForOrganicSubsetElement(smiles, position);
306
if (currentSymbol != null)
308
if (currentSymbol.length() == 1)
310
// a symbol that changes when upper-cased was written in lower case:
// create the atom under the upper-case symbol and mark it SP2
if (!(currentSymbol.toUpperCase()).equals(currentSymbol))
312
currentSymbol = currentSymbol.toUpperCase();
313
atom = builder.newAtom(currentSymbol);
314
atom.setHybridization(CDKConstants.HYBRIDIZATION_SP2);
317
atom = builder.newAtom(currentSymbol);
321
atom = builder.newAtom(currentSymbol);
323
logger.debug("Made atom: ", atom);
326
throw new InvalidSmilesException(
327
"Found element which is not a 'organic subset' element. You must " +
328
"use [" + mychar + "].");
332
molecule.addAtom(atom);
333
logger.debug("Adding atom ", atom.hashCode());
334
// connect the new atom to the previous one unless a '.' suppressed the bond
if ((lastNode != null) && bondExists)
336
logger.debug("Creating bond between ", atom.getSymbol(), " and ", lastNode.getSymbol());
337
bond = builder.newBond(atom, lastNode, bondStatus);
338
if (bondIsAromatic) {
339
bond.setFlag(CDKConstants.ISAROMATIC, true);
341
molecule.addBond(bond);
343
// the bond order applies to one bond only; fall back to single
bondStatus = CDKConstants.BONDORDER_SINGLE;
346
// skip the one or two characters of the symbol
position = position + currentSymbol.length();
348
bondIsAromatic = false;
349
// ---- double bond
} else if (mychar == '=')
352
// decide whether the order applies to a chain bond or to a ring closure,
// depending on whether the character at position is a digit or '%'
// NOTE(review): presumably position was advanced past '=' on a line not shown here.
if (status == 2 || !((smiles.charAt(position) >= '0' && smiles.charAt(position) <= '9') || smiles.charAt(position) == '%'))
354
bondStatus = CDKConstants.BONDORDER_DOUBLE;
357
bondStatusForRingClosure = CDKConstants.BONDORDER_DOUBLE;
359
// ---- triple bond, same scheme as '='
} else if (mychar == '#')
362
if (status == 2 || !((smiles.charAt(position) >= '0' && smiles.charAt(position) <= '9') || smiles.charAt(position) == '%'))
364
bondStatus = CDKConstants.BONDORDER_TRIPLE;
367
bondStatusForRingClosure = CDKConstants.BONDORDER_TRIPLE;
369
// ---- branch start: remember the attachment atom and the bond order
} else if (mychar == '(')
371
atomStack.push(lastNode);
372
logger.debug("Stack:");
373
Enumeration ses = atomStack.elements();
374
while (ses.hasMoreElements())
376
IAtom a = (IAtom) ses.nextElement();
377
logger.debug("", a.hashCode());
379
logger.debug("------");
380
bondStack.push(new Double(bondStatus));
382
// ---- branch end: continue from the atom and bond order saved at '('
} else if (mychar == ')')
384
lastNode = (IAtom) atomStack.pop();
385
logger.debug("Stack:");
386
Enumeration ses = atomStack.elements();
387
while (ses.hasMoreElements())
389
IAtom a = (IAtom) ses.nextElement();
390
logger.debug("", a.hashCode());
392
logger.debug("------");
393
bondStatus = ((Double) bondStack.pop()).doubleValue();
395
// ---- single-digit ring closure
} else if (mychar >= '0' && mychar <= '9')
399
// NOTE(review): presumably chars[0] was set to mychar on a line not shown here.
currentSymbol = new String(chars);
400
thisRing = (new Integer(currentSymbol)).intValue();
401
handleRing(lastNode);
403
// ---- two-digit ring closure introduced by '%'
} else if (mychar == '%')
405
currentSymbol = getRingNumber(smiles, position);
406
thisRing = (new Integer(currentSymbol)).intValue();
407
handleRing(lastNode);
// skip the digits plus the '%' itself
408
position += currentSymbol.length() + 1;
409
// ---- bracket atom
} else if (mychar == '[')
411
currentSymbol = getAtomString(smiles, position);
412
atom = assembleAtom(currentSymbol);
413
molecule.addAtom(atom);
414
logger.debug("Added atom: ", atom);
415
if (lastNode != null && bondExists)
417
bond = builder.newBond(atom, lastNode, bondStatus);
418
if (bondIsAromatic) {
419
bond.setFlag(CDKConstants.ISAROMATIC, true);
421
molecule.addBond(bond);
422
logger.debug("Added bond: ", bond);
424
bondStatus = CDKConstants.BONDORDER_SINGLE;
425
bondIsAromatic = false;
428
position = position + currentSymbol.length() + 2;
429
// plus two for [ and ]
431
// ---- disconnected structure (handler body not visible)
} else if (mychar == '.')
435
} else if (mychar == '-')
438
// a simple single bond
440
// ---- explicit aromatic bond: flag the next bond created
} else if (mychar == ':') {
442
bondIsAromatic = true;
444
// ---- double-bond stereo markers are skipped
} else if (mychar == '/' || mychar == '\\')
446
logger.warn("Ignoring stereo information for double bond");
448
// ---- atom stereo markers '@' / '@@' are skipped
} else if (mychar == '@')
450
if (position < smiles.length() - 1 && smiles.charAt(position + 1) == '@')
454
logger.warn("Ignoring stereo information for atom");
458
throw new InvalidSmilesException("Unexpected character found: " + mychar);
460
// parser-raised errors are logged here
// NOTE(review): presumably rethrown on a line not shown - confirm.
} catch (InvalidSmilesException exc)
462
logger.error("InvalidSmilesException while parsing char (in parseSmiles()): " + mychar);
465
// any other failure is wrapped, keeping the original exception as the cause
} catch (Exception exception)
467
logger.error("Error while parsing char: " + mychar);
468
logger.debug(exception);
469
throw new InvalidSmilesException("Error while parsing char: " + mychar, exception);
471
logger.debug("Parsing next char");
472
} while (position < smiles.length());
477
private String getAtomString(String smiles, int pos) throws InvalidSmilesException
479
logger.debug("getAtomString()");
480
StringBuffer atomString = new StringBuffer();
483
for (int f = pos + 1; f < smiles.length(); f++)
485
char character = smiles.charAt(f);
486
if (character == ']')
491
atomString.append(character);
494
} catch (Exception exception)
496
String message = "Problem parsing Atom specification given in brackets.\n";
497
message += "Invalid SMILES string was: " + smiles;
498
logger.error(message);
499
logger.debug(exception);
500
throw new InvalidSmilesException(message, exception);
502
return atomString.toString();
505
// Reads a formal charge from chargeString starting at position: a '+' or '-'
// sign, optionally followed by a run of digits that is multiplied into the charge.
// NOTE(review): the declaration of charge, the bodies of the sign branches, the
// position increments and the return statement are not part of the visible lines.
private int getCharge(String chargeString, int position)
507
logger.debug("getCharge(): Parsing charge from: ", chargeString.substring(position));
509
// sign of the charge
if (chargeString.charAt(position) == '+')
513
} else if (chargeString.charAt(position) == '-')
521
// collect the digits that follow the sign
StringBuffer multiplier = new StringBuffer();
522
while (position < chargeString.length() && Character.isDigit(chargeString.charAt(position)))
524
multiplier.append(chargeString.charAt(position));
527
if (multiplier.length() > 0)
529
logger.debug("Found multiplier: ", multiplier);
532
// scale the signed unit charge by the number that was read
charge = charge * Integer.parseInt(multiplier.toString());
533
// a multiplier that cannot be parsed is logged only
// (the message says "positive" whatever the sign was)
} catch (Exception exception)
535
logger.error("Could not parse positive atomic charge!");
536
logger.debug(exception);
539
logger.debug("Found charge: ", charge);
543
// Reads the hydrogen count of a bracket atom: expects 'H' at position,
// optionally followed by digits.
// NOTE(review): the declaration and initial value of count, the loop's position
// increment and the return statement are not part of the visible lines.
private int getImplicitHydrogenCount(String s, int position)
545
logger.debug("getImplicitHydrogenCount(): Parsing implicit hydrogens from: " + s);
547
if (s.charAt(position) == 'H')
549
StringBuffer multiplier = new StringBuffer();
550
// walk over the digits that follow the 'H'
while (position < (s.length() - 1) && Character.isDigit(s.charAt(position + 1)))
552
// NOTE(review): append(int) adds the decimal text of the INDEX position + 1,
// not the digit character s.charAt(position + 1). The parsed number therefore
// depends on where the 'H' sits in the string rather than on the digit written
// after it - looks like a bug; confirm against callers before changing.
multiplier.append(position + 1);
555
if (multiplier.length() > 0)
559
// the parsed number is ADDED to the current count, not assigned
count = count + Integer.parseInt(multiplier.toString());
560
// a number that cannot be parsed is logged only
} catch (Exception exception)
562
logger.error("Could not parse number of implicit hydrogens!");
563
logger.debug(exception);
570
private String getElementSymbol(String s, int pos)
572
logger.debug("getElementSymbol(): Parsing element symbol (pos=" + pos + ") from: " + s);
573
// try to match elements not in the organic subset.
574
// first, the two char elements
575
if (pos < s.length() - 1)
577
String possibleSymbol = s.substring(pos, pos + 2);
578
logger.debug("possibleSymbol: ", possibleSymbol);
579
if (("HeLiBeNeNaMgAlSiClArCaScTiCrMnFeCoNiCuZnGaGeAsSe".indexOf(possibleSymbol) >= 0) ||
580
("BrKrRbSrZrNbMoTcRuRhPdAgCdInSnSbTeXeCsBaLuHfTaRe".indexOf(possibleSymbol) >= 0) ||
581
("OsIrPtAuHgTlPbBiPoAtRnFrRaLrRfDbSgBhHsMtDs".indexOf(possibleSymbol) >= 0))
583
return possibleSymbol;
586
// if that fails, the one char elements
587
String possibleSymbol = s.substring(pos, pos + 1);
588
logger.debug("possibleSymbol: ", possibleSymbol);
589
if (("HKUVY".indexOf(possibleSymbol) >= 0))
591
return possibleSymbol;
593
// if that failed too, then possibly a organic subset element
594
return getSymbolForOrganicSubsetElement(s, pos);
599
* Gets the ElementSymbol for an element in the 'organic subset' for which
600
* brackets may be omited. <p>
602
* See: <a href="http://www.daylight.com/dayhtml/smiles/smiles-atoms.html">
603
* http://www.daylight.com/dayhtml/smiles/smiles-atoms.html</a> .
605
private String getSymbolForOrganicSubsetElement(String s, int pos)
607
logger.debug("getSymbolForOrganicSubsetElement(): Parsing organic subset element from: ", s);
608
if (pos < s.length() - 1)
610
String possibleSymbol = s.substring(pos, pos + 2);
611
if (("ClBr".indexOf(possibleSymbol) >= 0))
613
return possibleSymbol;
616
if ("BCcNnOoFPSsI".indexOf((s.charAt(pos))) >= 0)
618
return s.substring(pos, pos + 1);
620
if ("fpi".indexOf((s.charAt(pos))) >= 0)
622
logger.warn("Element ", s, " is normally not sp2 hybridisized!");
623
return s.substring(pos, pos + 1);
625
logger.warn("Subset element not found!");
631
* Gets the RingNumber attribute of the SmilesParser object
633
private String getRingNumber(String s, int pos) throws InvalidSmilesException {
634
logger.debug("getRingNumber()");
637
// Two digits impossible due to end of string
638
if (pos >= s.length() - 1)
639
throw new InvalidSmilesException("Percent sign ring closure numbers must be two-digit.");
641
String retString = s.substring(pos, pos + 2);
643
if (retString.charAt(0) < '0' || retString.charAt(0) > '9' ||
644
retString.charAt(1) < '0' || retString.charAt(1) > '9')
645
throw new InvalidSmilesException("Percent sign ring closure numbers must be two-digit.");
650
// Builds an atom from the text found between '[' and ']' (s).
// Pass 1 (do/while): reads an optional isotope number and the element symbol
//                    (or '*') and creates the atom.
// Pass 2 (while):    reads what follows the symbol - hydrogen count, charge,
//                    and '@' stereo markers, which are ignored.
// NOTE(review): the declarations of position, mychar, atom and charge, several
// position increments and the return statement are not part of the visible lines.
private IAtom assembleAtom(String s) throws InvalidSmilesException
652
logger.debug("assembleAtom(): Assembling atom from: ", s);
655
// local symbol; shadows the field of the same name
String currentSymbol = null;
656
// digits in front of the element symbol (the isotope mass number)
StringBuffer isotopicNumber = new StringBuffer();
658
logger.debug("Parse everythings before and including element symbol");
663
mychar = s.charAt(position);
664
logger.debug("Parsing char: " + mychar);
665
// ---- element symbol
if ((mychar >= 'A' && mychar <= 'Z') || (mychar >= 'a' && mychar <= 'z'))
667
currentSymbol = getElementSymbol(s, position);
668
if (currentSymbol == null)
670
throw new InvalidSmilesException(
671
"Expected element symbol, found null!"
675
logger.debug("Found element symbol: ", currentSymbol);
676
position = position + currentSymbol.length();
677
if (currentSymbol.length() == 1)
679
// a symbol that changes when upper-cased was written in lower case:
// create the atom under the upper-case symbol and mark it SP2
if (!(currentSymbol.toUpperCase()).equals(currentSymbol))
681
currentSymbol = currentSymbol.toUpperCase();
682
atom = builder.newAtom(currentSymbol);
683
atom.setHybridization(CDKConstants.HYBRIDIZATION_SP2);
684
// an SP2-marked atom gets one hydrogen less, if it has any
if (atom.getHydrogenCount() > 0)
686
atom.setHydrogenCount(atom.getHydrogenCount() - 1);
690
atom = builder.newAtom(currentSymbol);
694
atom = builder.newAtom(currentSymbol);
696
logger.debug("Made atom: ", atom);
699
// ---- isotope digits in front of the symbol
} else if (mychar >= '0' && mychar <= '9')
701
isotopicNumber.append(mychar);
703
// ---- wildcard atom
} else if (mychar == '*')
706
// NOTE(review): presumably currentSymbol was set to "*" on a line not shown here.
atom = builder.newPseudoAtom(currentSymbol);
707
logger.debug("Made atom: ", atom);
712
throw new InvalidSmilesException("Found unexpected char: " + mychar);
714
// parser-raised errors are logged here
// NOTE(review): presumably rethrown on a line not shown - confirm.
} catch (InvalidSmilesException exc)
716
logger.error("InvalidSmilesException while parsing atom string: " + s);
719
// any other failure is wrapped, keeping the original exception as the cause
} catch (Exception exception)
721
logger.error("Could not parse atom string: ", s);
722
logger.debug(exception);
723
throw new InvalidSmilesException("Could not parse atom string: " + s, exception);
725
} while (position < s.length());
726
// apply the isotope number, if one was read
if (isotopicNumber.toString().length() > 0)
730
atom.setMassNumber(Integer.parseInt(isotopicNumber.toString()));
731
// failure to set the mass number is logged only
// (the message speaks of "atom number")
} catch (Exception exception)
733
logger.error("Could not set atom's atom number.");
734
logger.debug(exception);
737
logger.debug("Parsing part after element symbol (like charge): ", s.substring(position));
739
int implicitHydrogens = 0;
740
// ---- second pass: everything after the element symbol
while (position < s.length())
744
mychar = s.charAt(position);
745
logger.debug("Parsing char: " + mychar);
748
// count implicit hydrogens
749
implicitHydrogens = getImplicitHydrogenCount(s, position);
751
// a count above 1 gets extra handling (body not visible)
// NOTE(review): presumably this skips the digit - confirm.
if (implicitHydrogens > 1)
755
atom.setHydrogenCount(implicitHydrogens);
756
// ---- formal charge
} else if (mychar == '+' || mychar == '-')
758
charge = getCharge(s, position);
760
// a magnitude above 1 gets extra handling (body not visible)
if (charge < -1 || charge > 1)
764
atom.setFormalCharge(charge);
765
// ---- stereo markers '@' / '@@' are skipped with a warning
} else if (mychar == '@')
767
if (position < s.length() - 1 && s.charAt(position + 1) == '@')
771
logger.warn("Ignoring stereo information for atom");
775
throw new InvalidSmilesException("Found unexpected char: " + mychar);
777
} catch (InvalidSmilesException exc)
779
logger.error("InvalidSmilesException while parsing atom string: ", s);
782
// any other failure is wrapped, keeping the original exception as the cause
} catch (Exception exception)
784
logger.error("Could not parse atom string: ", s);
785
logger.debug(exception);
786
throw new InvalidSmilesException("Could not parse atom string: " + s, exception);
794
* We call this method when a ring (depicted by a number) has been found.
796
// Handles a ring-closure number (field thisRing) met at the given atom.
// If an atom is already stored for that number, a closing bond is created and
// the table slot is cleared; otherwise the atom and the requested bond order
// are stored to open the ring. bondStatusForRingClosure is reset to 1 at the end.
// NOTE(review): braces, the else keyword and the assignment of partner are not
// part of the visible lines.
private void handleRing(IAtom atom)
798
logger.debug("handleRing():");
799
// use the higher of the order requested now and the order stored at ring opening
double bondStat = bondStatusForRingClosure;
800
if (ringbonds[thisRing] > bondStat)
801
bondStat = ringbonds[thisRing];
803
IAtom partner = null;
804
IAtom thisNode = rings[thisRing];
806
// ring number already open => close it
if (thisNode != null)
809
// NOTE(review): presumably partner was set to thisNode on a line not shown;
// as visible here it would still be null.
bond = builder.newBond(atom, partner, bondStat);
810
if (bondIsAromatic) {
812
bond.setFlag(CDKConstants.ISAROMATIC, true);
814
molecule.addBond(bond);
815
bondIsAromatic = false;
// free the slot so that the ring number can be used again
816
rings[thisRing] = null;
817
ringbonds[thisRing] = -1;
822
* First occurence of this ring:
823
* - add current atom to list
825
rings[thisRing] = atom;
826
ringbonds[thisRing] = bondStatusForRingClosure;
828
// the ring-closure bond order applies to one ring digit only
bondStatusForRingClosure = 1;
831
private void addImplicitHydrogens(IMolecule m) {
833
logger.debug("before H-adding: ", m);
834
hAdder.addImplicitHydrogensToSatisfyValency(m);
835
logger.debug("after H-adding: ", m);
836
} catch (Exception exception) {
837
logger.error("Error while calculation Hcount for SMILES atom: ", exception.getMessage());
841
private void setupMissingBondOrders(IMolecule m) {
843
valencyChecker.saturate(m);
844
logger.debug("after adding missing bond orders: ", m);
845
} catch (Exception exception) {
846
logger.error("Error while calculation Hcount for SMILES atom: ", exception.getMessage());
850
private void conceiveAromaticPerception(IMolecule m) {
851
IMoleculeSet moleculeSet = ConnectivityChecker.partitionIntoMolecules(m);
852
logger.debug("#mols ", moleculeSet.getAtomContainerCount());
853
for (int i = 0; i < moleculeSet.getAtomContainerCount(); i++) {
854
IAtomContainer molecule = moleculeSet.getAtomContainer(i);
855
logger.debug("mol: ", molecule);
857
valencyChecker.saturate(molecule);
858
logger.debug(" after saturation: ", molecule);
859
if (HueckelAromaticityDetector
860
.detectAromaticity(molecule)) {
861
logger.debug("Structure is aromatic...");
863
} catch (Exception exception) {
864
logger.error("Could not perceive aromaticity: ", exception
866
logger.debug(exception);
871
public boolean isInterrupted() {
872
return valencyChecker.isInterrupted();
875
public void setInterrupted(boolean interrupted) {
876
valencyChecker.setInterrupted(interrupted);