From bd08b698859db5c5662f8a742504832023a632d7 Mon Sep 17 00:00:00 2001
From: Leonid Andreev
Date: Wed, 11 Sep 2024 14:37:34 -0400
Subject: [PATCH] Largely finalized versions of the refactored harvesting classes. #10734

---
 .../api/imports/ImportServiceBean.java        | 142 ++++--------------
 .../impl/UpdateHarvestedDatasetCommand.java   |  82 +++++++---
 2 files changed, 90 insertions(+), 134 deletions(-)

diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java
index b3c65ceefcc..8c7ff5acf6d 100644
--- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java
@@ -300,7 +300,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
         String globalIdString = globalId.asString();
 
         if (StringUtils.isEmpty(globalIdString)) {
-            // @todo this check shouldn't be necessary, now that there's a null check above
+            // @todo this check may not be necessary, now that there's a null check above
             throw new ImportException("The harvested metadata record with the OAI server identifier " + harvestIdentifier + " does not contain a global identifier this Dataverse recognizes, skipping.");
         }
@@ -347,6 +347,11 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
             }
             harvestedVersion = parser.parseDatasetVersion(obj.getJsonObject("datasetVersion"));
+            Dataset tempDataset = createTemporaryHarvestedDataset(harvestedVersion);
+            // Temporarily attach the temporary dataset to the parent Collection:
+            // (this will be needed for the purposes of looking up the configured
+            // metadatablocks and such)
+            tempDataset.setOwner(owner);
         }
 
         // Either a full new import, or an update of an existing harvested
@@ -362,15 +367,16 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
 
             // Check data against required contraints
             List<ConstraintViolation<DatasetField>> violations = harvestedVersion.validateRequired();
             if (!violations.isEmpty()) {
-                // When harvesting, we add NA for missing required values:
+                // ... and fill the missing required values with "NA"s:
                 for (ConstraintViolation<DatasetField> v : violations) {
                     DatasetField f = v.getRootBean();
                     f.setSingleValue(DatasetField.NA_VALUE);
                 }
             }
 
-            // @todo? - is this the right place to call tidyUpFields()?
-            // usually it is called within the body of the create/update commands.
+            // is this the right place to call tidyUpFields()?
+            // usually it is called within the body of the create/update commands
+            // later on.
             DatasetFieldUtil.tidyUpFields(harvestedVersion.getDatasetFields(), true);
 
             // Check data against validation constraints
@@ -379,7 +385,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
 
             boolean sanitized = validateAndSanitizeVersionMetadata(harvestedVersion, cleanupLog);
 
-            // Note: this sanitizing approach, of replacing invalid values with
+            // Note: our sanitizing approach, of replacing invalid values with
             // "NA" does not work with certain fields. For example, using it to
             // populate a GeoBox coordinate value will result in an invalid
            // field. So we will attempt to re-validate the santized version.
@@ -390,117 +396,8 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
                 validateVersionMetadata(harvestedVersion, cleanupLog);
             }
 
-            /*Set<ConstraintViolation> invalidViolations = harvestedVersion.validate();
-            if (!invalidViolations.isEmpty()) {
-                for (ConstraintViolation v : invalidViolations) {
-                    DatasetFieldValue f = v.getRootBean();
-
-                    String msg = "Invalid metadata: " + metadataFile.getName() + "; Field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; "
-                            + "Invalid value: '" + f.getValue() + "'" + ", replaced with '" + DatasetField.NA_VALUE + "'";
-                    cleanupLog.println(msg);
-                    f.setValue(DatasetField.NA_VALUE);
-                    // Note: "NA" does not work with certain fields. For example,
-                    // using it to populate a GeoBox coordinate value is going
-                    // to result in an invalid field. @todo? - see below
-                }
-            }*/
-
-            // @todo? - re-validate the version before we do anything else?
-            // something along the lines of
-            // if (this.validate) {validateOrDie(newVersion, false);}
-            // DatasetFieldUtil.tidyUpFields(newVersion.getDatasetFields(), true);
-
             if (existingDataset != null) {
-                // @todo
-                // ... do the magic - parse the version json, do the switcheroo ...
-                /*
-                DatasetVersion existingVersion = existingDataset.getVersions().get(0);
-
-                Map<String, Integer> existingFilesIndex = new HashMap<>();
-
-                for (int i = 0; i < existingDataset.getFiles().size(); i++) {
-                    String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier();
-                    if (storageIdentifier != null) {
-                        existingFilesIndex.put(storageIdentifier, i);
-                    }
-                }
-
-                for (FileMetadata newFileMetadata : harvestedVersion.getFileMetadatas()) {
-                    // is it safe to assume that each new FileMetadata will be
-                    // pointing to a non-null DataFile
-                    String location = newFileMetadata.getDataFile().getStorageIdentifier();
-                    if (location != null && existingFilesIndex.containsKey(location)) {
-                        newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>());
-
-                        int fileIndex = existingFilesIndex.get(location);
-                        newFileMetadata.setDataFile(existingDataset.getFiles().get(fileIndex));
-                        existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata);
-                        existingFilesIndex.remove(location);
-                    }
-                }
-
-                List<String> solrIdsOfDocumentsToDelete = new ArrayList<>();
-
-                // Go through the existing files and delete the ones that are
-                // no longer present in the version that we have just harvesed:
-                for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) {
-                    DataFile oldDataFile = oldFileMetadata.getDataFile();
-                    solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId());
-                    existingDataset.getFiles().remove(oldDataFile);
-                    // Files from harvested datasets are removed unceremoniously,
-                    // directly in the database. No need to bother calling the
-                    // DeleteFileCommand on them.
-                    em.remove(em.merge(oldDataFile));
-                    em.remove(em.merge(oldFileMetadata));
-                    oldDataFile = null;
-                    oldFileMetadata = null;
-                }
-
-                // purge all the SOLR documents associated with the files
-                // we have just deleted:
-                if (!solrIdsOfDocumentsToDelete.isEmpty()) {
-                    indexService.deleteHarvestedDocuments(solrIdsOfDocumentsToDelete);
-                }
-
-                // ... And now delete the existing version itself:
-
-                existingDataset.setVersions(new ArrayList<>());
-                em.remove(em.merge(existingVersion));
-
-                // Now attach the newly-harvested version to the dataset:
-                existingDataset.getVersions().add(harvestedVersion);
-                harvestedVersion.setDataset(existingDataset);
-
-                // ... There's one more thing to do - go through the new files,
-                // that are not in the database yet, and make sure they are
-                // attached to this existing dataset:
-
-                for (FileMetadata newFileMetadata : harvestedVersion.getFileMetadatas()) {
-                    if (newFileMetadata.getDataFile().getId() == null) {
-                        existingDataset.getFiles().add(newFileMetadata.getDataFile());
-                        newFileMetadata.getDataFile().setOwner(existingDataset);
-                    }
-                }
-
-                em.persist(harvestedVersion);
-
-                ///@todo remove for (DataFile harvestedFile : existingDataset.getFiles()) {
-                ///@todo remove DataFile merged = em.merge(harvestedFile);
-                ///@todo remove em.remove(merged);
-                ///@todo remove harvestedFile = null;
-                ///@todo remove }
-                ///@todo remove existingDataset.setFiles(null);
-                ///@todo remove Dataset merged = em.merge(existingDataset);
-                // harvested datasets don't have physical files - so no need to worry about that.
-                ///@todo remove engineSvc.submit(new DestroyDatasetCommand(merged, dataverseRequest));
-                // @todo!
-                // UpdateHarvestedDatasetCommand() ? (later on)
-                importedDataset = em.merge(existingDataset);
-                //@todo reindex
-                */
                 importedDataset = engineSvc.submit(new UpdateHarvestedDatasetCommand(existingDataset, harvestedVersion, dataverseRequest));
-
             } else {
                 importedDataset = engineSvc.submit(new CreateHarvestedDatasetCommand(harvestedDataset, dataverseRequest));
             }
@@ -883,6 +780,23 @@ private boolean validateVersionMetadata(DatasetVersion version, boolean sanitize
         return fixed;
     }
 
+    /**
+     * Helper method that creates a throwaway Harvested Dataset to temporarily
+     * attach the newly-harvested version to. We need this when, instead of
+     * importing a brand-new harvested dataset from scratch, we are planning to
+     * attempt to update an already existing dataset harvested from the same
+     * archival location.
+     * @param harvestedVersion - a newly created Version imported from harvested metadata
+     * @return - a temporary dataset to which the new version has been attached
+     */
+    private Dataset createTemporaryHarvestedDataset(DatasetVersion harvestedVersion) {
+        Dataset tempDataset = new Dataset();
+        harvestedVersion.setDataset(tempDataset);
+        tempDataset.setVersions(new ArrayList<>(1));
+        tempDataset.getVersions().add(harvestedVersion);
+
+        return tempDataset;
+    }
 
     private static class MyCustomFormatter extends Formatter {
 
diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java
index d28950e4d9d..9d7461b9b30 100644
--- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java
+++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java
@@ -26,7 +26,10 @@
  * @author landreev
  *
  * Much simplified version of UpdateDatasetVersionCommand,
- * but with some extra twists.
+ * but with some extra twists. The goal is to avoid creating new Dataset and
+ * DataFile objects, and to instead preserve the database ids of the re-harvested
+ * datasets and files, whenever possible. This in turn allows us to avoid deleting
+ * and rebuilding from scratch the Solr documents for these objects.
 */
 @RequiredPermissions(Permission.EditDataset)
 public class UpdateHarvestedDatasetCommand extends AbstractDatasetCommand<Dataset> {
@@ -47,7 +50,6 @@ public boolean isValidateLenient() {
 
     @Override
     public Dataset execute(CommandContext ctxt) throws CommandException {
-        // ... do the magic - parse the version json, do the switcheroo ...
         Dataset existingDataset = getDataset();
 
         if (existingDataset == null
@@ -62,15 +64,30 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
             throw new IllegalCommandException("The command can only be called with a newly-harvested, not yet saved DatasetVersion supplied", this);
         }
 
+        newHarvestedVersion.setCreateTime(getTimestamp());
+        newHarvestedVersion.setLastUpdateTime(getTimestamp());
+
+        Map<String, Integer> existingFilesIndex = new HashMap<>();
+        /*
+         Create a map of the files that are currently part of this existing
+         harvested dataset. We assume that a harvested file can be uniquely
+         identified by its storageIdentifier, which, in the case of a datafile
+         harvested from another Dataverse, should be its data access API URL.
+        */
         for (int i = 0; i < existingDataset.getFiles().size(); i++) {
             String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier();
             if (storageIdentifier != null) {
                 existingFilesIndex.put(storageIdentifier, i);
             }
         }
-
+
+        /*
+         Go through the files in the newly-harvested version and check if any of
+         them are the files (potentially new/updated versions thereof) of the files
+         we already have, harvested previously from the same archive location.
+        */
         for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) {
             // is it safe to assume that each new FileMetadata will be
             // pointing to a non-null DataFile here?
@@ -79,14 +96,32 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
                 newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>());
 
                 int fileIndex = existingFilesIndex.get(location);
+
+                // Make sure to update the existing DataFiles that we are going
+                // to keep in case their newly-harvested versions have different
+                // checksums, mime types etc. These values are supposed to be
+                // immutable, normally - but who knows, errors happen, the source
+                // Dataverse may have had to fix these in their database to
+                // correct a data integrity issue (for example):
+                existingDataset.getFiles().get(fileIndex).setContentType(newFileMetadata.getDataFile().getContentType());
+                existingDataset.getFiles().get(fileIndex).setFilesize(newFileMetadata.getDataFile().getFilesize());
+                existingDataset.getFiles().get(fileIndex).setChecksumType(newFileMetadata.getDataFile().getChecksumType());
+                existingDataset.getFiles().get(fileIndex).setChecksumValue(newFileMetadata.getDataFile().getChecksumValue());
+
+                // Point the newly-harvested filemetadata to the existing datafile:
                 newFileMetadata.setDataFile(existingDataset.getFiles().get(fileIndex));
+
+                // Make sure this new FileMetadata is the only one attached to this existing file:
+                existingDataset.getFiles().get(fileIndex).setFileMetadatas(new ArrayList<>(1));
                 existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata);
+                // (we don't want any cascade relationships left between this existing
+                // dataset and its version, since we are going to attempt to delete it).
+
+                // Drop the file from the index map:
                 existingFilesIndex.remove(location);
             }
         }
-        // @todo check that the newly-harvested DataFiles have the same checksums
-        // and mime types etc. These values are supposed to be immutable, normally,
-        // but who knows - they may have fixed something invalid on the other end
+
         // @todo check if there's anything special that needs to be done for things
         // like file categories
 
@@ -96,15 +131,20 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
         // no longer present in the version that we have just harvesed:
         for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) {
             DataFile oldDataFile = oldFileMetadata.getDataFile();
-            solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId());
-            existingDataset.getFiles().remove(oldDataFile);
-            // Files from harvested datasets are removed unceremoniously,
-            // directly in the database. No need to bother calling the
-            // DeleteFileCommand on them.
-            ctxt.em().remove(ctxt.em().merge(oldDataFile));
-            ctxt.em().remove(ctxt.em().merge(oldFileMetadata));
-            oldDataFile = null;
-            oldFileMetadata = null;
+            String location = oldDataFile.getStorageIdentifier();
+            // Is it still in the existing files map? - that means it is no longer
+            // present in the newly-harvested version
+            if (location != null && existingFilesIndex.containsKey(location)) {
+                solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId());
+                existingDataset.getFiles().remove(oldDataFile);
+                // Files from harvested datasets are removed unceremoniously,
+                // directly in the database. No need to bother calling the
+                // DeleteFileCommand on them. We'll just need to remember to purge
+                // them from Solr as well (right below)
+                ctxt.em().remove(ctxt.em().merge(oldDataFile));
+                // (no need to explicitly remove the oldFileMetadata; it will be
+                // removed when the entire old version is deleted)
+            }
         }
 
         // purge all the SOLR documents associated with the files
@@ -115,7 +155,10 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
         // ... And now delete the existing version itself:
 
         existingDataset.setVersions(new ArrayList<>());
-        ctxt.em().remove(ctxt.em().merge(existingVersion));
+        existingVersion.setDataset(null);
+
+        existingVersion = ctxt.em().merge(existingVersion);
+        ctxt.em().remove(existingVersion);
 
         // Now attach the newly-harvested version to the dataset:
         existingDataset.getVersions().add(newHarvestedVersion);
@@ -123,7 +166,8 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
 
         // ... There's one more thing to do - go through the new files,
         // that are not in the database yet, and make sure they are
-        // attached to this existing dataset:
+        // attached to this existing dataset, instead of the dummy temp
+        // dataset into which they were originally imported:
         for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) {
             if (newFileMetadata.getDataFile().getId() == null) {
                 existingDataset.getFiles().add(newFileMetadata.getDataFile());
@@ -135,9 +179,7 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
 
         Dataset savedDataset = ctxt.em().merge(existingDataset);
         ctxt.em().flush();
-
-        //@todo reindex
-
+
         return savedDataset;
     }
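Reviewer note (not part of the patch): the core of UpdateHarvestedDatasetCommand is the storage-identifier matching described in the comments above - keep the database rows (and Solr documents) of files that reappear in the re-harvested version, insert genuinely new files, and purge the ones that disappeared. Below is a minimal, self-contained Java sketch of that strategy. The HarvestedFile record and the reconcile() helper are illustrative stand-ins, not the real DataFile/FileMetadata entities or any Dataverse API.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Simplified stand-in for a harvested file; names are illustrative only.
record HarvestedFile(Long databaseId, String storageIdentifier, String checksum) {}

public class ReharvestMatchSketch {

    // Returns the files to keep for the re-harvested version: files already in
    // the database keep their ids (and adopt the newly harvested checksum),
    // genuinely new files come through with a null id. Ids of files that
    // disappeared from the new version are collected into staleIds so the
    // caller can purge their search index documents.
    static List<HarvestedFile> reconcile(List<HarvestedFile> existing,
                                         List<HarvestedFile> reharvested,
                                         List<Long> staleIds) {
        // Index the existing files by storage identifier, the one attribute
        // assumed to be stable across harvests of the same record:
        Map<String, HarvestedFile> byStorageId = new HashMap<>();
        for (HarvestedFile f : existing) {
            if (f.storageIdentifier() != null) {
                byStorageId.put(f.storageIdentifier(), f);
            }
        }

        List<HarvestedFile> result = new ArrayList<>();
        for (HarvestedFile incoming : reharvested) {
            HarvestedFile match = byStorageId.remove(incoming.storageIdentifier());
            if (match != null) {
                // Same file as before: keep the database id, adopt the
                // newly harvested (possibly corrected) checksum.
                result.add(new HarvestedFile(match.databaseId(), incoming.storageIdentifier(), incoming.checksum()));
            } else {
                // A file we have not seen before; it will be inserted as new.
                result.add(incoming);
            }
        }

        // Whatever is left in the map is no longer part of the dataset:
        byStorageId.values().forEach(f -> staleIds.add(f.databaseId()));
        return result;
    }

    public static void main(String[] args) {
        List<HarvestedFile> existing = List.of(
                new HarvestedFile(101L, "https://remote.example/api/access/datafile/1", "aaa"),
                new HarvestedFile(102L, "https://remote.example/api/access/datafile/2", "bbb"));
        List<HarvestedFile> reharvested = List.of(
                new HarvestedFile(null, "https://remote.example/api/access/datafile/1", "aaa-fixed"),
                new HarvestedFile(null, "https://remote.example/api/access/datafile/3", "ccc"));

        List<Long> staleIds = new ArrayList<>();
        List<HarvestedFile> kept = reconcile(existing, reharvested, staleIds);
        System.out.println("kept: " + kept);
        System.out.println("stale ids to purge from the index: " + staleIds);
    }
}

The real command performs the same bookkeeping directly on the managed JPA entities, which is what lets it preserve the existing DataFile ids and avoid deleting and rebuilding their Solr documents, as the class comment in the patch explains.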
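Similarly, here is a compact sketch of the fill-with-NA-then-revalidate flow that ImportServiceBean applies to harvested metadata. MetadataField, NA_VALUE and the format predicates are hypothetical stand-ins, not Dataverse's DatasetField or its validation framework; the sketch only illustrates why a second validation pass is needed.

import java.util.List;
import java.util.function.Predicate;

// Hypothetical stand-in for a single-value metadata field with its own format rule.
class MetadataField {
    static final String NA_VALUE = "N/A";
    final String name;
    final Predicate<String> format; // format constraint beyond "required"
    String value;

    MetadataField(String name, String value, Predicate<String> format) {
        this.name = name;
        this.value = value;
        this.format = format;
    }

    boolean isMissing() { return value == null || value.isBlank(); }
    boolean isWellFormed() { return !isMissing() && format.test(value); }
}

public class SanitizeSketch {
    public static void main(String[] args) {
        Predicate<String> anyText = v -> true;
        Predicate<String> decimal = v -> v.matches("-?\\d+(\\.\\d+)?"); // GeoBox-like coordinate

        List<MetadataField> fields = List.of(
                new MetadataField("title", "My harvested dataset", anyText),
                new MetadataField("dsDescription", "", anyText),
                new MetadataField("westLongitude", "", decimal));

        // First pass: fill missing required values with the NA placeholder,
        // logging each replacement (ImportServiceBean writes these to a cleanup log).
        for (MetadataField f : fields) {
            if (f.isMissing()) {
                System.out.println("Missing required value for '" + f.name + "', replaced with " + MetadataField.NA_VALUE);
                f.value = MetadataField.NA_VALUE;
            }
        }

        // Second pass: re-validate, because the placeholder itself can violate
        // stricter, format-specific constraints (the GeoBox coordinate case
        // mentioned in the patch); such fields need different handling.
        for (MetadataField f : fields) {
            if (!f.isWellFormed()) {
                System.out.println("Field '" + f.name + "' is still invalid after sanitizing: '" + f.value + "'");
            }
        }
    }
}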