#!/usr/bin/env groovy
// Usage: ./extractMetadata.groovy [batch directory]
//
// For every item directory inside the batch directory, looks for files whose
// name matches "nsc.*_original\.cml", evaluates a few XPath expressions against
// each match and writes the results as a dublin_core.xml file into the same
// item directory.

import groovy.xml.DOMBuilder
import java.io.File
import java.io.FileOutputStream
import javax.xml.parsers.DocumentBuilder
import javax.xml.parsers.DocumentBuilderFactory
import javax.xml.transform.sax.SAXSource
import net.sf.saxon.xpath.XPathEvaluator
import net.sf.saxon.xpath.XPathExpression
import org.apache.xml.serialize.OutputFormat
import org.apache.xml.serialize.XMLSerializer
import org.xml.sax.InputSource

// Without a batch directory there is nothing to do; print the usage line
// instead of failing on args[0] further down.
if (args.length < 1) {
    println("Usage: ./extractMetadata.groovy [batch directory]")
    System.exit(1)
}

// Set up the dom builder factory for later
docBuilderFactory = DocumentBuilderFactory.newInstance()
docBuilderFactory.setNamespaceAware(false)

// The main logic to handle an item directory.
// It's most convenient to implement this as a closure so it can be handed
// straight to eachFile() on the batch directory.
//
// itemDir: one entry of the batch directory. Entries that are not directories
//          are skipped, since they have no files to iterate over.
//
// NOTE(review): "{ | x | ... }" is the pre-1.0 Groovy closure parameter form;
// current Groovy releases expect "{ x -> ... }" -- confirm the target version.
mainAction = { | itemDir |
    if (itemDir.isDirectory()) {
        itemDir.eachFile({ | f |
            // This does a regexp match against the whole file name
            if(f.name ==~ "nsc.*_original\.cml") {
                // One dot per processed file
                print(".")

                xpe = new XPathEvaluator(new SAXSource(new InputSource(f.toURL().toString())))
                ichi = xpe.createExpression("//entry/identifier[@version='0.932Beta' and @tautomeric='0']/basic/text()")
                nsc = xpe.createExpression("//entry/molecule/@id")
                name = xpe.createExpression("//entry/name/text()")

                // The first molecule id is used both as identifier and as main title
                id = nsc.evaluate().get(0).getValue()

                // NOTE(review): docBuilder is not read anywhere in this script -- confirm it is still needed
                docBuilder = docBuilderFactory.newDocumentBuilder()
                builder = DOMBuilder.newInstance()

                // Inline method definitions on the builder 'dc' result in
                // xml elements and attributes being created.
                dc = builder.dublin_core() {
                    dcvalue(element:"identifier", qualifier:"none", id)
                    dcvalue(element:"identifier", qualifier:"ichi", ichi.evaluate().get(0).getValue())
                    dcvalue(element:"title", qualifier:"none", id)
                    dcvalue(element:"creator", qualifier:"none", "US National Cancer Institute")
                    dcvalue(element:"publisher", qualifier:"none", "Unilever Center for Molecular Informatics, Cambridge University")
                    dcvalue(element:"date", qualifier:"created", "2003-02-01")
                    dcvalue(element:"type", qualifier:"none", "Other")
                    // Every name found in the entry becomes an alternative title
                    for(nm in name.evaluate()) {
                        dcvalue(element:"title", qualifier:"alternative", nm.getValue())
                    }
                }

                fos = new FileOutputStream(itemDir.toString() +"/dublin_core.xml")
                try {
                    fileserializer = new XMLSerializer(fos, new OutputFormat())
                    fileserializer.serialize(dc)
                } finally {
                    // Release the file handle even when serialization fails
                    fos.close()
                }
            }
        })
    }
}

println("--- Extracting metadata for ${args[0]} ---")
new File("${args[0]}").eachFile(mainAction);
println("")