From bd08b698859db5c5662f8a742504832023a632d7 Mon Sep 17 00:00:00 2001
From: Leonid Andreev
Date: Wed, 11 Sep 2024 14:37:34 -0400
Subject: [PATCH] Largely finalized versions of the refactored harvesting classes. #10734

---
 .../api/imports/ImportServiceBean.java        | 142 ++++--------------
 .../impl/UpdateHarvestedDatasetCommand.java   |  82 +++++++---
 2 files changed, 90 insertions(+), 134 deletions(-)

diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java
index b3c65ceefcc..8c7ff5acf6d 100644
--- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java
@@ -300,7 +300,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
         String globalIdString = globalId.asString();
 
         if (StringUtils.isEmpty(globalIdString)) {
-            // @todo this check shouldn't be necessary, now that there's a null check above
+            // @todo this check may not be necessary, now that there's a null check above
             throw new ImportException("The harvested metadata record with the OAI server identifier " + harvestIdentifier + " does not contain a global identifier this Dataverse recognizes, skipping.");
         }
@@ -347,6 +347,11 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
             }
             harvestedVersion = parser.parseDatasetVersion(obj.getJsonObject("datasetVersion"));
+            Dataset tempDataset = createTemporaryHarvestedDataset(harvestedVersion);
+            // Temporarily attach the temporary dataset to the parent Collection:
+            // (this will be needed for the purposes of looking up the configured
+            // metadatablocks and such)
+            tempDataset.setOwner(owner);
         }
 
         // Either a full new import, or an update of an existing harvested
@@ -362,15 +367,16 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
 
             // Check data against required contraints
             List<ConstraintViolation<DatasetField>> violations = harvestedVersion.validateRequired();
             if (!violations.isEmpty()) {
-                // When harvesting, we add NA for missing required values:
+                // ... and fill the missing required values with "NA"s:
                 for (ConstraintViolation<DatasetField> v : violations) {
                     DatasetField f = v.getRootBean();
                     f.setSingleValue(DatasetField.NA_VALUE);
                 }
             }
 
-            // @todo? - is this the right place to call tidyUpFields()?
-            // usually it is called within the body of the create/update commands.
+            // is this the right place to call tidyUpFields()?
+            // usually it is called within the body of the create/update commands
+            // later on.
             DatasetFieldUtil.tidyUpFields(harvestedVersion.getDatasetFields(), true);
 
             // Check data against validation constraints
@@ -379,7 +385,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
 
             boolean sanitized = validateAndSanitizeVersionMetadata(harvestedVersion, cleanupLog);
 
-            // Note: this sanitizing approach, of replacing invalid values with
+            // Note: our sanitizing approach, of replacing invalid values with
             // "NA" does not work with certain fields. For example, using it to
             // populate a GeoBox coordinate value will result in an invalid
            // field. So we will attempt to re-validate the santized version.
@@ -390,117 +396,8 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
                 validateVersionMetadata(harvestedVersion, cleanupLog);
             }
 
-            /*Set<ConstraintViolation> invalidViolations = harvestedVersion.validate();
-            if (!invalidViolations.isEmpty()) {
-                for (ConstraintViolation v : invalidViolations) {
-                    DatasetFieldValue f = v.getRootBean();
-
-                    String msg = "Invalid metadata: " + metadataFile.getName() + "; Field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; "
-                            + "Invalid value: '" + f.getValue() + "'" + ", replaced with '" + DatasetField.NA_VALUE + "'";
-                    cleanupLog.println(msg);
-                    f.setValue(DatasetField.NA_VALUE);
-                    // Note: "NA" does not work with certain fields. For example,
-                    // using it to populate a GeoBox coordinate value is going
-                    // to result in an invalid field. @todo? - see below
-                }
-            }*/
-
-            // @todo? - re-validate the version before we do anything else?
-            // something along the lines of
-            // if (this.validate) {validateOrDie(newVersion, false);}
-            // DatasetFieldUtil.tidyUpFields(newVersion.getDatasetFields(), true);
-
             if (existingDataset != null) {
-                // @todo
-                // ... do the magic - parse the version json, do the switcheroo ...
-                /*
-                DatasetVersion existingVersion = existingDataset.getVersions().get(0);
-
-                Map<String, Integer> existingFilesIndex = new HashMap<>();
-
-                for (int i = 0; i < existingDataset.getFiles().size(); i++) {
-                    String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier();
-                    if (storageIdentifier != null) {
-                        existingFilesIndex.put(storageIdentifier, i);
-                    }
-                }
-
-                for (FileMetadata newFileMetadata : harvestedVersion.getFileMetadatas()) {
-                    // is it safe to assume that each new FileMetadata will be
-                    // pointing to a non-null DataFile
-                    String location = newFileMetadata.getDataFile().getStorageIdentifier();
-                    if (location != null && existingFilesIndex.containsKey(location)) {
-                        newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>());
-
-                        int fileIndex = existingFilesIndex.get(location);
-                        newFileMetadata.setDataFile(existingDataset.getFiles().get(fileIndex));
-                        existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata);
-                        existingFilesIndex.remove(location);
-                    }
-                }
-
-                List<String> solrIdsOfDocumentsToDelete = new ArrayList<>();
-
-                // Go through the existing files and delete the ones that are
-                // no longer present in the version that we have just harvesed:
-                for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) {
-                    DataFile oldDataFile = oldFileMetadata.getDataFile();
-                    solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId());
-                    existingDataset.getFiles().remove(oldDataFile);
-                    // Files from harvested datasets are removed unceremoniously,
-                    // directly in the database. No need to bother calling the
-                    // DeleteFileCommand on them.
-                    em.remove(em.merge(oldDataFile));
-                    em.remove(em.merge(oldFileMetadata));
-                    oldDataFile = null;
-                    oldFileMetadata = null;
-                }
-
-                // purge all the SOLR documents associated with the files
-                // we have just deleted:
-                if (!solrIdsOfDocumentsToDelete.isEmpty()) {
-                    indexService.deleteHarvestedDocuments(solrIdsOfDocumentsToDelete);
-                }
-
-                // ... And now delete the existing version itself:
-
-                existingDataset.setVersions(new ArrayList<>());
-                em.remove(em.merge(existingVersion));
-
-                // Now attach the newly-harvested version to the dataset:
-                existingDataset.getVersions().add(harvestedVersion);
-                harvestedVersion.setDataset(existingDataset);
-
-                // ... There's one more thing to do - go through the new files,
-                // that are not in the database yet, and make sure they are
-                // attached to this existing dataset:
-
-                for (FileMetadata newFileMetadata : harvestedVersion.getFileMetadatas()) {
-                    if (newFileMetadata.getDataFile().getId() == null) {
-                        existingDataset.getFiles().add(newFileMetadata.getDataFile());
-                        newFileMetadata.getDataFile().setOwner(existingDataset);
-                    }
-                }
-
-                em.persist(harvestedVersion);
-
-                ///@todo remove for (DataFile harvestedFile : existingDataset.getFiles()) {
-                ///@todo remove DataFile merged = em.merge(harvestedFile);
-                ///@todo remove em.remove(merged);
-                ///@todo remove harvestedFile = null;
-                ///@todo remove }
-                ///@todo remove existingDataset.setFiles(null);
-                ///@todo remove Dataset merged = em.merge(existingDataset);
-                // harvested datasets don't have physical files - so no need to worry about that.
-                ///@todo remove engineSvc.submit(new DestroyDatasetCommand(merged, dataverseRequest));
-                // @todo!
-                // UpdateHarvestedDatasetCommand() ? (later on)
-                importedDataset = em.merge(existingDataset);
-                //@todo reindex
-                */
                 importedDataset = engineSvc.submit(new UpdateHarvestedDatasetCommand(existingDataset, harvestedVersion, dataverseRequest));
-
             } else {
                 importedDataset = engineSvc.submit(new CreateHarvestedDatasetCommand(harvestedDataset, dataverseRequest));
             }
@@ -883,6 +780,23 @@ private boolean validateVersionMetadata(DatasetVersion version, boolean sanitize
         return fixed;
     }
 
+    /**
+     * Helper method that creates a throwaway Harvested Dataset to temporarily
+     * attach the newly-harvested version to. We need this when, instead of
+     * importing a brand-new harvested dataset from scratch, we are planning to
+     * attempt to update an already existing dataset harvested from the same
+     * archival location.
+     * @param harvestedVersion - a newly created Version imported from harvested metadata
+     * @return - a temporary dataset to which the new version has been attached
+     */
+    private Dataset createTemporaryHarvestedDataset(DatasetVersion harvestedVersion) {
+        Dataset tempDataset = new Dataset();
+        harvestedVersion.setDataset(tempDataset);
+        tempDataset.setVersions(new ArrayList<>(1));
+        tempDataset.getVersions().add(harvestedVersion);
+
+        return tempDataset;
+    }
 
     private static class MyCustomFormatter extends Formatter {
 
diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java
index d28950e4d9d..9d7461b9b30 100644
--- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java
+++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java
@@ -26,7 +26,10 @@
  * @author landreev
  *
  * Much simplified version of UpdateDatasetVersionCommand,
- * but with some extra twists.
+ * but with some extra twists. The goal is to avoid creating new Dataset and
+ * DataFile objects, and to instead preserve the database ids of the re-harvested
+ * datasets and files, whenever possible. This in turn allows us to avoid deleting
+ * and rebuilding from scratch the Solr documents for these objects.
 */
 @RequiredPermissions(Permission.EditDataset)
 public class UpdateHarvestedDatasetCommand extends AbstractDatasetCommand<Dataset> {
@@ -47,7 +50,6 @@ public boolean isValidateLenient() {
 
     @Override
     public Dataset execute(CommandContext ctxt) throws CommandException {
-        // ... do the magic - parse the version json, do the switcheroo ...
         Dataset existingDataset = getDataset();
 
         if (existingDataset == null
@@ -62,15 +64,30 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
             throw new IllegalCommandException("The command can only be called with a newly-harvested, not yet saved DatasetVersion supplied", this);
         }
 
+        newHarvestedVersion.setCreateTime(getTimestamp());
+        newHarvestedVersion.setLastUpdateTime(getTimestamp());
+
+        Map<String, Integer> existingFilesIndex = new HashMap<>();
+        /*
+         Create a map of the files that are currently part of this existing
+         harvested dataset. We assume that a harvested file can be uniquely
+         identified by its storageIdentifier, which, in the case of a datafile
+         harvested from another Dataverse, should be its data access API URL.
+        */
         for (int i = 0; i < existingDataset.getFiles().size(); i++) {
             String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier();
             if (storageIdentifier != null) {
                 existingFilesIndex.put(storageIdentifier, i);
             }
         }
-
+
+        /*
+         Go through the files in the newly-harvested version and check if any of
+         them are the files (potentially new/updated versions thereof) of the files
+         we already have, harvested previously from the same archive location.
+        */
         for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) {
             // is it safe to assume that each new FileMetadata will be
             // pointing to a non-null DataFile here?
@@ -79,14 +96,32 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
                 newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>());
 
                 int fileIndex = existingFilesIndex.get(location);
+
+                // Make sure to update the existing DataFiles that we are going
+                // to keep in case their newly-harvested versions have different
+                // checksums, mime types etc. These values are supposed to be
+                // immutable, normally - but who knows, errors happen, the source
+                // Dataverse may have had to fix these in their database to
+                // correct a data integrity issue (for example):
+                existingDataset.getFiles().get(fileIndex).setContentType(newFileMetadata.getDataFile().getContentType());
+                existingDataset.getFiles().get(fileIndex).setFilesize(newFileMetadata.getDataFile().getFilesize());
+                existingDataset.getFiles().get(fileIndex).setChecksumType(newFileMetadata.getDataFile().getChecksumType());
+                existingDataset.getFiles().get(fileIndex).setChecksumValue(newFileMetadata.getDataFile().getChecksumValue());
+
+                // Point the newly-harvested filemetadata to the existing datafile:
                 newFileMetadata.setDataFile(existingDataset.getFiles().get(fileIndex));
+
+                // Make sure this new FileMetadata is the only one attached to this existing file:
+                existingDataset.getFiles().get(fileIndex).setFileMetadatas(new ArrayList<>(1));
                 existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata);
+                // (we don't want any cascade relationships left between this existing
+                // dataset and its version, since we are going to attempt to delete it).
+
+                // Drop the file from the index map:
                 existingFilesIndex.remove(location);
             }
         }
-        // @todo check that the newly-harvested DataFiles have the same checksums
-        // and mime types etc. These values are supposed to be immutable, normally,
-        // but who knows - they may have fixed something invalid on the other end
+
         // @todo check if there's anything special that needs to be done for things
         // like file categories
 
@@ -96,15 +131,20 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
         // no longer present in the version that we have just harvesed:
         for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) {
             DataFile oldDataFile = oldFileMetadata.getDataFile();
-            solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId());
-            existingDataset.getFiles().remove(oldDataFile);
-            // Files from harvested datasets are removed unceremoniously,
-            // directly in the database. No need to bother calling the
-            // DeleteFileCommand on them.
-            ctxt.em().remove(ctxt.em().merge(oldDataFile));
-            ctxt.em().remove(ctxt.em().merge(oldFileMetadata));
-            oldDataFile = null;
-            oldFileMetadata = null;
+            String location = oldDataFile.getStorageIdentifier();
+            // Is it still in the existing files map? - that means it is no longer
+            // present in the newly-harvested version
+            if (location != null && existingFilesIndex.containsKey(location)) {
+                solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId());
+                existingDataset.getFiles().remove(oldDataFile);
+                // Files from harvested datasets are removed unceremoniously,
+                // directly in the database. No need to bother calling the
+                // DeleteFileCommand on them. We'll just need to remember to purge
+                // them from Solr as well (right below)
+                ctxt.em().remove(ctxt.em().merge(oldDataFile));
+                // (no need to explicitly remove the oldFileMetadata; it will be
+                // removed when the entire old version is deleted)
+            }
         }
 
         // purge all the SOLR documents associated with the files
@@ -115,7 +155,10 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
         // ... And now delete the existing version itself:
 
         existingDataset.setVersions(new ArrayList<>());
-        ctxt.em().remove(ctxt.em().merge(existingVersion));
+        existingVersion.setDataset(null);
+
+        existingVersion = ctxt.em().merge(existingVersion);
+        ctxt.em().remove(existingVersion);
 
         // Now attach the newly-harvested version to the dataset:
         existingDataset.getVersions().add(newHarvestedVersion);
@@ -123,7 +166,8 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
 
         // ... There's one more thing to do - go through the new files,
         // that are not in the database yet, and make sure they are
-        // attached to this existing dataset:
+        // attached to this existing dataset, instead of the dummy temp
+        // dataset into which they were originally imported:
         for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) {
             if (newFileMetadata.getDataFile().getId() == null) {
                 existingDataset.getFiles().add(newFileMetadata.getDataFile());
@@ -135,9 +179,7 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
 
         Dataset savedDataset = ctxt.em().merge(existingDataset);
         ctxt.em().flush();
-
-        //@todo reindex
-
+
         return savedDataset;
     }
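Reviewer note (not part of the patch): the core of UpdateHarvestedDatasetCommand is the storage-identifier matching described in the comments above - keep the database rows (and Solr documents) of files that reappear in the re-harvested version, insert genuinely new files, and purge the ones that disappeared. Below is a minimal, self-contained Java sketch of that strategy. The HarvestedFile record and the reconcile() helper are illustrative stand-ins, not the real DataFile/FileMetadata entities or any Dataverse API.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Simplified stand-in for a harvested file; names are illustrative only.
record HarvestedFile(Long databaseId, String storageIdentifier, String checksum) {}

public class ReharvestMatchSketch {

    // Returns the files to keep for the re-harvested version: files already in
    // the database keep their ids (and adopt the newly harvested checksum),
    // genuinely new files come through with a null id. Ids of files that
    // disappeared from the new version are collected into staleIds so the
    // caller can purge their search index documents.
    static List<HarvestedFile> reconcile(List<HarvestedFile> existing,
                                         List<HarvestedFile> reharvested,
                                         List<Long> staleIds) {
        // Index the existing files by storage identifier, the one attribute
        // assumed to be stable across harvests of the same record:
        Map<String, HarvestedFile> byStorageId = new HashMap<>();
        for (HarvestedFile f : existing) {
            if (f.storageIdentifier() != null) {
                byStorageId.put(f.storageIdentifier(), f);
            }
        }

        List<HarvestedFile> result = new ArrayList<>();
        for (HarvestedFile incoming : reharvested) {
            HarvestedFile match = byStorageId.remove(incoming.storageIdentifier());
            if (match != null) {
                // Same file as before: keep the database id, adopt the
                // newly harvested (possibly corrected) checksum.
                result.add(new HarvestedFile(match.databaseId(), incoming.storageIdentifier(), incoming.checksum()));
            } else {
                // A file we have not seen before; it will be inserted as new.
                result.add(incoming);
            }
        }

        // Whatever is left in the map is no longer part of the dataset:
        byStorageId.values().forEach(f -> staleIds.add(f.databaseId()));
        return result;
    }

    public static void main(String[] args) {
        List<HarvestedFile> existing = List.of(
                new HarvestedFile(101L, "https://remote.example/api/access/datafile/1", "aaa"),
                new HarvestedFile(102L, "https://remote.example/api/access/datafile/2", "bbb"));
        List<HarvestedFile> reharvested = List.of(
                new HarvestedFile(null, "https://remote.example/api/access/datafile/1", "aaa-fixed"),
                new HarvestedFile(null, "https://remote.example/api/access/datafile/3", "ccc"));

        List<Long> staleIds = new ArrayList<>();
        List<HarvestedFile> kept = reconcile(existing, reharvested, staleIds);
        System.out.println("kept: " + kept);
        System.out.println("stale ids to purge from the index: " + staleIds);
    }
}

The real command performs the same bookkeeping directly on the managed JPA entities, which is what lets it preserve the existing DataFile ids and avoid deleting and rebuilding their Solr documents, as the class comment in the patch explains.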
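Similarly, here is a compact sketch of the fill-with-NA-then-revalidate flow that ImportServiceBean applies to harvested metadata. MetadataField, NA_VALUE and the format predicates are hypothetical stand-ins, not Dataverse's DatasetField or its validation framework; the sketch only illustrates why a second validation pass is needed.

import java.util.List;
import java.util.function.Predicate;

// Hypothetical stand-in for a single-value metadata field with its own format rule.
class MetadataField {
    static final String NA_VALUE = "N/A";
    final String name;
    final Predicate<String> format; // format constraint beyond "required"
    String value;

    MetadataField(String name, String value, Predicate<String> format) {
        this.name = name;
        this.value = value;
        this.format = format;
    }

    boolean isMissing() { return value == null || value.isBlank(); }
    boolean isWellFormed() { return !isMissing() && format.test(value); }
}

public class SanitizeSketch {
    public static void main(String[] args) {
        Predicate<String> anyText = v -> true;
        Predicate<String> decimal = v -> v.matches("-?\\d+(\\.\\d+)?"); // GeoBox-like coordinate

        List<MetadataField> fields = List.of(
                new MetadataField("title", "My harvested dataset", anyText),
                new MetadataField("dsDescription", "", anyText),
                new MetadataField("westLongitude", "", decimal));

        // First pass: fill missing required values with the NA placeholder,
        // logging each replacement (ImportServiceBean writes these to a cleanup log).
        for (MetadataField f : fields) {
            if (f.isMissing()) {
                System.out.println("Missing required value for '" + f.name + "', replaced with " + MetadataField.NA_VALUE);
                f.value = MetadataField.NA_VALUE;
            }
        }

        // Second pass: re-validate, because the placeholder itself can violate
        // stricter, format-specific constraints (the GeoBox coordinate case
        // mentioned in the patch); such fields need different handling.
        for (MetadataField f : fields) {
            if (!f.isWellFormed()) {
                System.out.println("Field '" + f.name + "' is still invalid after sanitizing: '" + f.value + "'");
            }
        }
    }
}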