diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java
index a3147b68463..b3c65ceefcc 100644
--- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java
@@ -33,6 +33,7 @@
 import edu.harvard.iq.dataverse.engine.command.impl.CreateHarvestedDatasetCommand;
 import edu.harvard.iq.dataverse.engine.command.impl.CreateNewDatasetCommand;
 import edu.harvard.iq.dataverse.engine.command.impl.DestroyDatasetCommand;
+import edu.harvard.iq.dataverse.engine.command.impl.UpdateHarvestedDatasetCommand;
 import edu.harvard.iq.dataverse.harvest.client.HarvestingClient;
 import edu.harvard.iq.dataverse.search.IndexServiceBean;
 import edu.harvard.iq.dataverse.settings.SettingsServiceBean;
@@ -43,6 +44,7 @@
 import edu.harvard.iq.dataverse.license.LicenseServiceBean;
 import edu.harvard.iq.dataverse.pidproviders.PidUtil;
 import static edu.harvard.iq.dataverse.search.IndexServiceBean.solrDocIdentifierFile;
+import edu.harvard.iq.dataverse.util.DatasetFieldUtil;
 
 import java.io.File;
 import java.io.FileOutputStream;
@@ -366,25 +368,29 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
                     f.setSingleValue(DatasetField.NA_VALUE);
                 }
             }
-            
+
+            // @todo? - is this the right place to call tidyUpFields()?
+            // usually it is called within the body of the create/update commands.
+            DatasetFieldUtil.tidyUpFields(harvestedVersion.getDatasetFields(), true);
+
             // Check data against validation constraints
             // Similarly to how we handle missing required values (above), we
             // replace invalid values with NA when harvesting.
-            boolean sanitized = validateDatasetVersion(harvestedVersion, true, cleanupLog);
+            boolean sanitized = validateAndSanitizeVersionMetadata(harvestedVersion, cleanupLog);
 
             // Note: this sanitizing approach, of replacing invalid values with
             // "NA" does not work with certain fields. For example, using it to
             // populate a GeoBox coordinate value will result in an invalid
-            // field. So we will attempt to validate the santized version again,
-            // this time around, it will throw an exception if still invalid, so
-            // we'll stop before proceeding any further.
+            // field. So we will attempt to re-validate the sanitized version.
+            // This time around, it will throw an exception if still invalid, so
+            // that we'll stop before proceeding any further:
             if (sanitized) {
-                
+                validateVersionMetadata(harvestedVersion, cleanupLog);
             }
 
-            Set<ConstraintViolation> invalidViolations = harvestedVersion.validate();
+            /*Set<ConstraintViolation> invalidViolations = harvestedVersion.validate();
             if (!invalidViolations.isEmpty()) {
                 for (ConstraintViolation v : invalidViolations) {
                     DatasetFieldValue f = v.getRootBean();
@@ -397,7 +403,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
                     // using it to populate a GeoBox coordinate value is going
                     // to result in an invalid field. @todo? - see below
                 }
-            }
+            }*/
 
             // @todo? - re-validate the version before we do anything else?
             // something along the lines of
@@ -407,6 +413,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
 
         if (existingDataset != null) {
             // @todo
             // ... do the magic - parse the version json, do the switcheroo ...
+            /*
             DatasetVersion existingVersion = existingDataset.getVersions().get(0);
 
             Map<String, Integer> existingFilesIndex = new HashMap<>();
@@ -490,6 +497,9 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
             // UpdateHarvestedDatasetCommand() ? (later on)
 
             importedDataset = em.merge(existingDataset);
             //@todo reindex
+            */
+
+            importedDataset = engineSvc.submit(new UpdateHarvestedDatasetCommand(existingDataset, harvestedVersion, dataverseRequest));
 
         } else {
             importedDataset = engineSvc.submit(new CreateHarvestedDatasetCommand(harvestedDataset, dataverseRequest));
@@ -512,49 +522,6 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
         }
         return importedDataset;
     }
-    /**
-     * Shortcut method for validating AND attempting to sanitize a DatasetVersion
-     * @param version
-     * @param cleanupLog - any invalid values and their replacements are logged there
-     * @return true if any invalid values were encountered and sanitized
-     * @throws ImportException (although it should never happen in this mode)
-     */
-    private boolean validateAndSanitizeVersionMetadata(DatasetVersion version, PrintWriter cleanupLog) throws ImportException {
-        return validateVersionMetadata(version, true, cleanupLog);
-    }
-
-    private void validateVersionMetadata(DatasetVersion version, PrintWriter log) throws ImportException {
-        validateVersionMetadata(version, false, log);
-    }
-
-    private boolean validateVersionMetadata(DatasetVersion version, boolean sanitize, PrintWriter cleanupLog) throws ImportException {
-        boolean fixed = false;
-        Set<ConstraintViolation> invalidViolations = version.validate();
-        if (!invalidViolations.isEmpty()) {
-            for (ConstraintViolation v : invalidViolations) {
-                DatasetFieldValue f = v.getRootBean();
-
-                String msg = "Invalid metadata field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; "
-                        + "Invalid value: '" + f.getValue() + "'";
-                if (sanitize) {
-                    msg += ", replaced with '" + DatasetField.NA_VALUE + "'";
-                    f.setValue(DatasetField.NA_VALUE);
-                    fixed = true;
-                }
-                cleanupLog.println(msg);
-
-                // Note: "NA" does not work with certain fields. For example,
-                // using it to populate a GeoBox coordinate value is going
-                // to result in an invalid field. So we'll need to validate the
-                // version again after the first, sanitizing pass and see if it
-                // helped or not.
-            }
-            if (!sanitize) {
-                throw new ImportException("Version was still failing validation after the first attempt to sanitize the invalid values.");
-            }
-        }
-        return fixed;
-    }
 
     public JsonObject ddiToJson(String xmlToParse) throws ImportException, XMLStreamException {
         DatasetDTO dsDTO = importDDIService.doImport(ImportType.IMPORT, xmlToParse);
@@ -855,6 +822,67 @@ private String convertInvalidDateString(String inString){
         return null;
     }
 
+    /**
+     * A shortcut method for validating AND attempting to sanitize a DatasetVersion
+     * @param version
+     * @param cleanupLog - any invalid values and their replacements are logged there
+     * @return true if any invalid values were encountered and sanitized
+     * @throws ImportException (although it should never happen in this mode)
+     */
+    private boolean validateAndSanitizeVersionMetadata(DatasetVersion version, PrintWriter cleanupLog) throws ImportException {
+        return validateVersionMetadata(version, true, cleanupLog);
+    }
+
+    /**
+     * A shortcut method for validating a DatasetVersion; will throw an exception
+     * if invalid, without attempting to sanitize the invalid values.
+     * @param version
+     * @param log - any invalid fields encountered will be logged there
+     * @throws ImportException
+     */
+    private void validateVersionMetadata(DatasetVersion version, PrintWriter log) throws ImportException {
+        validateVersionMetadata(version, false, log);
+    }
+
+    /**
+     * Validates the metadata fields of a newly-created version and, depending
+     * on the "sanitize" flag supplied, may or may not attempt to sanitize
+     * invalid values by replacing them with "NA"s.
+     * @param version
+     * @param sanitize - boolean indicating whether to attempt to fix invalid values
+     * @param cleanupLog - any invalid values encountered will be logged there
+     * @return true if any invalid values have been replaced
+     * @throws ImportException
+     */
+    private boolean validateVersionMetadata(DatasetVersion version, boolean sanitize, PrintWriter cleanupLog) throws ImportException {
+        boolean fixed = false;
+        Set<ConstraintViolation> invalidViolations = version.validate();
+        if (!invalidViolations.isEmpty()) {
+            for (ConstraintViolation v : invalidViolations) {
+                DatasetFieldValue f = v.getRootBean();
+
+                String msg = "Invalid metadata field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; "
+                        + "Invalid value: '" + f.getValue() + "'";
+                if (sanitize) {
+                    msg += ", replaced with '" + DatasetField.NA_VALUE + "'";
+                    f.setValue(DatasetField.NA_VALUE);
+                    fixed = true;
+                }
+                cleanupLog.println(msg);
+
+                // Note: "NA" does not work with certain fields. For example,
+                // using it to populate a GeoBox coordinate value is going
+                // to result in an invalid field. So we'll need to validate the
+                // version again after the first, sanitizing pass and see if it
+                // helped or not.
+            }
+            if (!sanitize) {
+                throw new ImportException("Version was still failing validation after the first attempt to sanitize the invalid values.");
+            }
+        }
+        return fixed;
+    }
+
     private static class MyCustomFormatter extends Formatter {
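Note (illustrative, not part of the patch): taken together, the ImportServiceBean changes above replace the old inline validation loop with a two-pass flow built from the two new helpers. A minimal sketch, using the local variables visible in the hunks above:

    // Pass 1: replace any invalid metadata values with "NA", logging every
    // replacement to cleanupLog; returns true if anything was changed.
    boolean sanitized = validateAndSanitizeVersionMetadata(harvestedVersion, cleanupLog);

    // Pass 2: only needed if something was sanitized. "NA" is itself invalid
    // for some fields (e.g. GeoBox coordinates), so re-validate and throw an
    // ImportException if the version still fails validation.
    if (sanitized) {
        validateVersionMetadata(harvestedVersion, cleanupLog);
    }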
diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java
new file mode 100644
index 00000000000..d28950e4d9d
--- /dev/null
+++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java
@@ -0,0 +1,159 @@
+package edu.harvard.iq.dataverse.engine.command.impl;
+
+import edu.harvard.iq.dataverse.authorization.Permission;
+import edu.harvard.iq.dataverse.engine.command.CommandContext;
+import edu.harvard.iq.dataverse.engine.command.DataverseRequest;
+import edu.harvard.iq.dataverse.engine.command.RequiredPermissions;
+import edu.harvard.iq.dataverse.engine.command.exception.CommandException;
+import edu.harvard.iq.dataverse.engine.command.exception.IllegalCommandException;
+import edu.harvard.iq.dataverse.Dataset;
+import edu.harvard.iq.dataverse.DatasetVersion;
+import edu.harvard.iq.dataverse.DataFile;
+import edu.harvard.iq.dataverse.FileMetadata;
+import static edu.harvard.iq.dataverse.search.IndexServiceBean.solrDocIdentifierFile;
+import java.io.IOException;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.apache.solr.client.solrj.SolrServerException;
+
+/**
+ *
+ * @author landreev
+ *
+ * Much simplified version of UpdateDatasetVersionCommand,
+ * but with some extra twists.
+ */
+@RequiredPermissions(Permission.EditDataset)
+public class UpdateHarvestedDatasetCommand extends AbstractDatasetCommand<Dataset> {
+
+    private static final Logger logger = Logger.getLogger(UpdateHarvestedDatasetCommand.class.getCanonicalName());
+    private final DatasetVersion newHarvestedVersion;
+    final private boolean validateLenient = true;
+
+    public UpdateHarvestedDatasetCommand(Dataset theDataset, DatasetVersion newHarvestedVersion, DataverseRequest aRequest) {
+        super(aRequest, theDataset);
+        this.newHarvestedVersion = newHarvestedVersion;
+    }
+
+    public boolean isValidateLenient() {
+        return validateLenient;
+    }
+
+    @Override
+    public Dataset execute(CommandContext ctxt) throws CommandException {
+
+        // ... do the magic - parse the version json, do the switcheroo ...
+        Dataset existingDataset = getDataset();
+
+        if (existingDataset == null
+                || existingDataset.getId() == null
+                || !existingDataset.isHarvested()
+                || existingDataset.getVersions().size() != 1) {
+            throw new IllegalCommandException("The command can only be called on an existing harvested dataset with only 1 version", this);
+        }
+        DatasetVersion existingVersion = existingDataset.getVersions().get(0);
+
+        if (newHarvestedVersion == null || newHarvestedVersion.getId() != null) {
+            throw new IllegalCommandException("The command can only be called with a newly-harvested, not yet saved DatasetVersion supplied", this);
+        }
+
+        Map<String, Integer> existingFilesIndex = new HashMap<>();
+
+        for (int i = 0; i < existingDataset.getFiles().size(); i++) {
+            String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier();
+            if (storageIdentifier != null) {
+                existingFilesIndex.put(storageIdentifier, i);
+            }
+        }
+
+        for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) {
+            // is it safe to assume that each new FileMetadata will be
+            // pointing to a non-null DataFile here?
+            String location = newFileMetadata.getDataFile().getStorageIdentifier();
+            if (location != null && existingFilesIndex.containsKey(location)) {
+                newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>());
+
+                int fileIndex = existingFilesIndex.get(location);
+                newFileMetadata.setDataFile(existingDataset.getFiles().get(fileIndex));
+                existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata);
+                existingFilesIndex.remove(location);
+            }
+        }
+        // @todo check that the newly-harvested DataFiles have the same checksums
+        // and mime types etc. These values are supposed to be immutable, normally,
+        // but who knows - they may have fixed something invalid on the other end
+        // @todo check if there's anything special that needs to be done for things
+        // like file categories
+
+        List<String> solrIdsOfDocumentsToDelete = new ArrayList<>();
+
+        // Go through the existing files and delete the ones that are
+        // no longer present in the version that we have just harvested:
+        for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) {
+            DataFile oldDataFile = oldFileMetadata.getDataFile();
+            solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId());
+            existingDataset.getFiles().remove(oldDataFile);
+            // Files from harvested datasets are removed unceremoniously,
+            // directly in the database. No need to bother calling the
+            // DeleteFileCommand on them.
+            ctxt.em().remove(ctxt.em().merge(oldDataFile));
+            ctxt.em().remove(ctxt.em().merge(oldFileMetadata));
+            oldDataFile = null;
+            oldFileMetadata = null;
+        }
+
+        // purge all the SOLR documents associated with the files
+        // we have just deleted:
+        if (!solrIdsOfDocumentsToDelete.isEmpty()) {
+            ctxt.index().deleteHarvestedDocuments(solrIdsOfDocumentsToDelete);
+        }
+
+        // ... And now delete the existing version itself:
+        existingDataset.setVersions(new ArrayList<>());
+        ctxt.em().remove(ctxt.em().merge(existingVersion));
+
+        // Now attach the newly-harvested version to the dataset:
+        existingDataset.getVersions().add(newHarvestedVersion);
+        newHarvestedVersion.setDataset(existingDataset);
+
+        // ... There's one more thing to do - go through the new files,
+        // that are not in the database yet, and make sure they are
+        // attached to this existing dataset:
+        for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) {
+            if (newFileMetadata.getDataFile().getId() == null) {
+                existingDataset.getFiles().add(newFileMetadata.getDataFile());
+                newFileMetadata.getDataFile().setOwner(existingDataset);
+            }
+        }
+
+        ctxt.em().persist(newHarvestedVersion);
+
+        Dataset savedDataset = ctxt.em().merge(existingDataset);
+        ctxt.em().flush();
+
+        //@todo reindex
+
+        return savedDataset;
+    }
+
+    @Override
+    public boolean onSuccess(CommandContext ctxt, Object r) {
+        boolean retVal = true;
+        Dataset d = (Dataset) r;
+
+        try {
+            // Note that we index harvested datasets synchronously:
+            ctxt.index().indexDataset(d, true);
+        } catch (SolrServerException|IOException solrServerEx) {
+            logger.log(Level.WARNING, "Exception while trying to index the updated Harvested dataset " + d.getGlobalId().asString(), solrServerEx.getMessage());
+            retVal = false;
+        }
+
+        return retVal;
+    }
+}
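Note (illustrative, not part of the patch): a sketch of how the new command is meant to be invoked, mirroring the engineSvc.submit() call added to doImportHarvestedDataset() above; the variable names are the ones used there. The preconditions in the comments are the ones enforced at the top of execute():

    // existingDataset must be an already-persisted harvested dataset with
    // exactly one version; harvestedVersion must be a newly-harvested
    // DatasetVersion that has not been saved yet (null id). Otherwise
    // execute() throws IllegalCommandException.
    importedDataset = engineSvc.submit(
            new UpdateHarvestedDatasetCommand(existingDataset, harvestedVersion, dataverseRequest));

On success, the command re-indexes the updated dataset synchronously in onSuccess(), so the caller does not need to trigger a separate reindex.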