Skip to content

Commit

Permalink
Merge pull request #106 from denoptim-project/devel_smilesReader_fix
Browse files Browse the repository at this point in the history
Devel smiles reader fix
  • Loading branch information
marcellocostamagna authored Nov 11, 2022
2 parents 190c7f4 + 0ee985e commit 5188862
Show file tree
Hide file tree
Showing 15 changed files with 354 additions and 57 deletions.
38 changes: 21 additions & 17 deletions src/main/java/denoptim/fragmenter/FragmenterTools.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
import denoptim.graph.Vertex;
import denoptim.graph.Vertex.BBType;
import denoptim.io.DenoptimIO;
import denoptim.io.IteractingAtomContainerReader;
import denoptim.io.IteratingAtomContainerReader;
import denoptim.programs.fragmenter.CuttingRule;
import denoptim.programs.fragmenter.FragmenterParameters;
import denoptim.programs.fragmenter.MatchedBond;
Expand Down Expand Up @@ -239,8 +239,8 @@ public static boolean fragmentation(File input, FragmenterParameters settings,
File output, Logger logger) throws CDKException, IOException,
DENOPTIMException, IllegalArgumentException, UndetectedFileFormatException
{
IteractingAtomContainerReader iterator =
new IteractingAtomContainerReader(input);
IteratingAtomContainerReader iterator =
new IteratingAtomContainerReader(input);

int totalProd = 0;
int totalKept = 0;
Expand Down Expand Up @@ -927,23 +927,27 @@ public static boolean filterFragment(Fragment frag,
+ smb + "'");
return false;
}
// Incomplete fragmentation: an atom has the same coords of an AP.
for (AttachmentPoint ap : frag.getAttachmentPoints())
{
Point3d ap3d = ap.getDirectionVector();
if (ap3d!=null)

if (settings.isWorkingIn3D())
{
// Incomplete 3D fragmentation: an atom has the same coords as an AP.
for (AttachmentPoint ap : frag.getAttachmentPoints())
{
for (IAtom atm : frag.atoms())
Point3d ap3d = ap.getDirectionVector();
if (ap3d!=null)
{
Point3d atm3d = MoleculeUtils.getPoint3d(atm);
double dist = ap3d.distance(atm3d);
if (dist < 0.0002)
for (IAtom atm : frag.atoms())
{
logger.log(Level.FINE,"Removing fragment with AP"
+ frag.getIAtomContainer().indexOf(atm)
+ " and atom " + MoleculeUtils.getSymbolOrLabel(atm)
+ " coincide.");
return false;
Point3d atm3d = MoleculeUtils.getPoint3d(atm);
double dist = ap3d.distance(atm3d);
if (dist < 0.0002)
{
logger.log(Level.FINE,"Removing fragment with AP"
+ frag.getIAtomContainer().indexOf(atm)
+ " and atom " + MoleculeUtils.getSymbolOrLabel(atm)
+ " coincide.");
return false;
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
Expand All @@ -34,16 +33,15 @@
import java.util.stream.Collectors;

import org.apache.commons.io.FileUtils;
import org.openscience.cdk.DefaultChemObjectBuilder;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.interfaces.IBond;
import org.openscience.cdk.io.iterator.IteratingSDFReader;

import org.openscience.cdk.io.iterator.IteratingSMILESReader;
import denoptim.constants.DENOPTIMConstants;
import denoptim.exception.DENOPTIMException;
import denoptim.files.FileFormat;
import denoptim.io.DenoptimIO;
import denoptim.io.IteratingAtomContainerReader;
import denoptim.programs.RunTimeParameters.ParametersType;
import denoptim.programs.fragmenter.FragmenterParameters;
import denoptim.task.ParallelAsynchronousTaskExecutor;
Expand Down Expand Up @@ -86,6 +84,22 @@ public ParallelFragmentationAlgorithm(FragmenterParameters settings)

protected boolean doPreFlightOperations()
{
IteratingAtomContainerReader reader;
try
{
reader = new IteratingAtomContainerReader
(new File(settings.getStructuresFile()));

} catch (IOException | CDKException e1)
{
throw new Error("Error reading file '" + settings.getStructuresFile()
+ "'. " + e1.getMessage());
}
// Detect dimensionality of the molecules
if (reader.getIteratorType().equals(IteratingSMILESReader.class))
{
settings.setWorkingIn3D(false);
}
// Split data in batches for parallelization

// This is the collector of the mutating pathname to the file collecting
Expand All @@ -94,9 +108,7 @@ protected boolean doPreFlightOperations()
structures[0] = new File(settings.getStructuresFile());
if (settings.getNumTasks()>1 || settings.doCheckFormula())
{
settings.getLogger().log(Level.INFO, "Combining structures and "
+ "formulae...");
splitInputForThreads(settings);
splitInputForThreads(settings, reader);
for (int i=0; i<settings.getNumTasks(); i++)
{
structures[i] = new File(getStructureFileNameBatch(settings, i));
Expand Down Expand Up @@ -291,28 +303,20 @@ public int compare(File o1, File o2)
* @throws DENOPTIMException
* @throws FileNotFoundException
*/
static void splitInputForThreads(FragmenterParameters settings)
static void splitInputForThreads(FragmenterParameters settings,
IteratingAtomContainerReader reader)
{
int maxBuffersSize = 50000;
int numBatches = settings.getNumTasks();

IteratingSDFReader reader;
try
{
reader = new IteratingSDFReader(
new FileInputStream(settings.getStructuresFile()),
DefaultChemObjectBuilder.getInstance());
} catch (FileNotFoundException e1)
{
// Cannot happen: we ensured the file exist, but it might have been
// removed after the check
throw new Error("File '" + settings.getStructuresFile() + "' can "
+ "not be found anymore.");
}

//If available we record CSD formula in properties of atom container
LinkedHashMap<String,String> formulae = settings.getFormulae();

if (settings.doCheckFormula())
{
settings.getLogger().log(Level.INFO, "Combining structures and "
+ "formulae...");
}
int index = -1;
int batchId = 0;
int buffersSize = 0;
Expand All @@ -336,7 +340,8 @@ static void splitInputForThreads(FragmenterParameters settings)
// expected to be found (but CSD uses them...)
try
{
MoleculeUtils.setZeroImplicitHydrogensToAllAtoms(mol);
// MoleculeUtils.setZeroImplicitHydrogensToAllAtoms(mol);
MoleculeUtils.explicitHydrogens(mol);
MoleculeUtils.ensureNoUnsetBondOrders(mol);
} catch (CDKException e)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,14 @@
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.io.FormatFactory;
import org.openscience.cdk.io.formats.IChemFormat;
import org.openscience.cdk.io.formats.INChIPlainTextFormat;
import org.openscience.cdk.io.formats.MDLV2000Format;
import org.openscience.cdk.io.formats.MDLV3000Format;
import org.openscience.cdk.io.formats.SMILESFIXFormat;
import org.openscience.cdk.io.formats.SMILESFormat;
import org.openscience.cdk.io.iterator.DefaultIteratingChemObjectReader;
import org.openscience.cdk.io.iterator.IteratingSDFReader;
import org.openscience.cdk.io.iterator.IteratingSMILESReader;

/**
* An iterator that takes {@link IAtomContainer}s from a file, possibly using
Expand All @@ -30,7 +34,7 @@
*
* @author Marco Foscato
*/
public class IteractingAtomContainerReader implements Iterator<IAtomContainer>
public class IteratingAtomContainerReader implements Iterator<IAtomContainer>
{

/**
Expand Down Expand Up @@ -66,22 +70,33 @@ public class IteractingAtomContainerReader implements Iterator<IAtomContainer>
* @throws IOException
* @throws CDKException
*/
public IteractingAtomContainerReader(File input)
public IteratingAtomContainerReader(File input)
throws FileNotFoundException, IOException, CDKException
{
IChemFormat chemFormat = new FormatFactory().guessFormat(
new BufferedReader(new FileReader(input)));
FormatFactory factory = new FormatFactory();
factory.registerFormat(new SMILESListFormat());

BufferedReader headReader = new BufferedReader(new FileReader(input));
IChemFormat chemFormat = factory.guessFormat(headReader);
headReader.close();

if (chemFormat instanceof MDLV2000Format
|| chemFormat instanceof MDLV3000Format)
{
FileInputStream fis = new FileInputStream(input);
fileIterator = new IteratingSDFReader(fis,
DefaultChemObjectBuilder.getInstance());
usingIteratingReader = true;
} else {
} else if (chemFormat instanceof SMILESListFormat) {

FileInputStream fis = new FileInputStream(input);
fileIterator = new IteratingSMILESReader(fis,
DefaultChemObjectBuilder.getInstance());
usingIteratingReader = true;
} else {
results = DenoptimIO.readAllAtomContainers(input);
listIterator = results.iterator();
}
}
}

//------------------------------------------------------------------------------
Expand Down Expand Up @@ -121,4 +136,14 @@ public void close() throws IOException

//------------------------------------------------------------------------------

/**
 * Reports which kind of iterator is backing this reader.
 *
 * @return the runtime class of the file-based iterator when this reader
 * was set up to iterate directly over the input file; otherwise the
 * runtime class of the iterator over the containers read upfront.
 */
public Class<?> getIteratorType()
{
    return usingIteratingReader ? fileIterator.getClass()
            : listIterator.getClass();
}
}
80 changes: 80 additions & 0 deletions src/main/java/denoptim/io/SMILESListFormat.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package denoptim.io;

import java.util.List;
import org.openscience.cdk.io.formats.IChemFormatMatcher;
import org.openscience.cdk.io.formats.IChemFormatMatcher.MatchResult;
import org.openscience.cdk.tools.DataFeatures;

/**
 * Format matcher that recognizes files containing a list of SMILES, with one
 * SMILES string in each line. Since SMILES do not contain spaces, the absence
 * of spaces in the inspected lines is the condition identifying a list of
 * SMILES.
 *
 * @author marcellocostamagna
 */
public class SMILESListFormat implements IChemFormatMatcher
{
    /**
     * @return always <code>null</code>.
     */
    @Override
    public String getReaderClassName()
    {
        return null;
    }

    /**
     * @return always <code>null</code>.
     */
    @Override
    public String getWriterClassName()
    {
        return null;
    }

    /**
     * @return always {@link DataFeatures#NONE}.
     */
    @Override
    public int getSupportedDataFeatures()
    {
        return DataFeatures.NONE;
    }

    /**
     * @return always {@link DataFeatures#NONE}.
     */
    @Override
    public int getRequiredDataFeatures()
    {
        return DataFeatures.NONE;
    }

    /**
     * @return the human-readable name of this format.
     */
    @Override
    public String getFormatName()
    {
        return "SMILES List";
    }

    /**
     * @return always <code>null</code>.
     */
    @Override
    public String getPreferredNameExtension()
    {
        return null;
    }

    /**
     * @return an empty array.
     */
    @Override
    public String[] getNameExtensions()
    {
        return new String[] {};
    }

    /**
     * @return the MIME type string declared for this format.
     */
    @Override
    public String getMIMEType()
    {
        return "chemical/smiles";
    }

    /**
     * @return always <code>false</code>.
     */
    @Override
    public boolean isXMLBased()
    {
        return false;
    }

    /**
     * Decides whether the given lines look like a list of SMILES. At most the
     * first 100 lines are inspected: as soon as one of them contains a space
     * character the format is considered not matching.
     *
     * @param lines the lines of text to inspect.
     * @return <code>NO_MATCH</code> if any inspected line contains a space;
     * otherwise a positive {@link MatchResult} built with this format and
     * the total number of lines received.
     */
    @Override
    public final MatchResult matches(final List<String> lines)
    {
        final int numLinesToCheck = Math.min(lines.size(), 100);
        for (String line : lines.subList(0, numLinesToCheck))
        {
            if (line.contains(" "))
            {
                return NO_MATCH;
            }
        }
        return new MatchResult(true, this, lines.size());
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,11 @@ public class FragmenterParameters extends RunTimeParameters
*/
private boolean isStandaloneFragmentClustering = false;

/**
* Flag activating operations depending on 3D structure
*/
private boolean workingIn3D = true;


//------------------------------------------------------------------------------

Expand Down Expand Up @@ -1430,5 +1435,23 @@ public boolean isStandaloneFragmentClustering()
}

//------------------------------------------------------------------------------

/**
 * Reports the state of the flag that activates operations depending on 3D
 * structure.
 *
 * @return <code>true</code> if we are dealing with 3D structures.
 */
public boolean isWorkingIn3D()
{
    return this.workingIn3D;
}
//------------------------------------------------------------------------------

/**
* Sets the boolean variable <code>workingIn3D</code>, i.e., the flag returned
* by {@link #isWorkingIn3D()}.
* @param workingIn3D the new value of the flag: <code>true</code> to declare
* that we are dealing with 3D structures, <code>false</code> otherwise.
*/
public void setWorkingIn3D(boolean workingIn3D)
{
this.workingIn3D = workingIn3D;
}
}
11 changes: 11 additions & 0 deletions src/main/java/denoptim/utils/MoleculeUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -702,6 +702,17 @@ public static void setZeroImplicitHydrogensToAllAtoms(IAtomContainer iac)
}

//------------------------------------------------------------------------------

/**
* Converts all the implicit hydrogens to explicit. The work is delegated to
* {@code AtomContainerManipulator.convertImplicitToExplicitHydrogens}.
* @param mol the atom container handed over to the manipulator.
*/
public static void explicitHydrogens(IAtomContainer mol)
{
AtomContainerManipulator.convertImplicitToExplicitHydrogens(mol);

}

//------------------------------------------------------------------------------

/**
* Sets bond order = single to all otherwise unset bonds. In case of failed
Expand Down
Loading

0 comments on commit 5188862

Please sign in to comment.