Largely finalized versions of the refactored harvesting classes. #10734
landreev committed Sep 11, 2024
1 parent 3fab4c8 commit bd08b69
Showing 2 changed files with 90 additions and 134 deletions.
@@ -300,7 +300,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
String globalIdString = globalId.asString();

if (StringUtils.isEmpty(globalIdString)) {
// @todo this check shouldn't be necessary, now that there's a null check above
// @todo this check may not be necessary, now that there's a null check above
throw new ImportException("The harvested metadata record with the OAI server identifier " + harvestIdentifier + " does not contain a global identifier this Dataverse recognizes, skipping.");
}

@@ -347,6 +347,11 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
}

harvestedVersion = parser.parseDatasetVersion(obj.getJsonObject("datasetVersion"));
Dataset tempDataset = createTemporaryHarvestedDataset(harvestedVersion);
// Temporarily attach the temporary dataset to the parent Collection:
// (this will be needed for looking up the configured
// metadata blocks and such)
tempDataset.setOwner(owner);
}

// Either a full new import, or an update of an existing harvested
@@ -362,15 +367,16 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
// Check data against required constraints
List<ConstraintViolation<DatasetField>> violations = harvestedVersion.validateRequired();
if (!violations.isEmpty()) {
// When harvesting, we add NA for missing required values:
// ... and fill the missing required values with "NA"s:
for (ConstraintViolation<DatasetField> v : violations) {
DatasetField f = v.getRootBean();
f.setSingleValue(DatasetField.NA_VALUE);
}
}

// @todo? - is this the right place to call tidyUpFields()?
// usually it is called within the body of the create/update commands.
// is this the right place to call tidyUpFields()?
// usually it is called within the body of the create/update commands
// later on.
DatasetFieldUtil.tidyUpFields(harvestedVersion.getDatasetFields(), true);

// Check data against validation constraints
@@ -379,7 +385,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve

boolean sanitized = validateAndSanitizeVersionMetadata(harvestedVersion, cleanupLog);

// Note: this sanitizing approach, of replacing invalid values with
// Note: our sanitizing approach, of replacing invalid values with
// "NA" does not work with certain fields. For example, using it to
// populate a GeoBox coordinate value will result in an invalid
// field. So we will attempt to re-validate the sanitized version.
@@ -390,117 +396,8 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
validateVersionMetadata(harvestedVersion, cleanupLog);
}

/*Set<ConstraintViolation> invalidViolations = harvestedVersion.validate();
if (!invalidViolations.isEmpty()) {
for (ConstraintViolation<DatasetFieldValue> v : invalidViolations) {
DatasetFieldValue f = v.getRootBean();
String msg = "Invalid metadata: " + metadataFile.getName() + "; Field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; "
+ "Invalid value: '" + f.getValue() + "'" + ", replaced with '" + DatasetField.NA_VALUE + "'";
cleanupLog.println(msg);
f.setValue(DatasetField.NA_VALUE);
// Note: "NA" does not work with certain fields. For example,
// using it to populate a GeoBox coordinate value is going
// to result in an invalid field. @todo? - see below
}
}*/

// @todo? - re-validate the version before we do anything else?
// something along the lines of
// if (this.validate) {validateOrDie(newVersion, false);}
// DatasetFieldUtil.tidyUpFields(newVersion.getDatasetFields(), true);

if (existingDataset != null) {
// @todo
// ... do the magic - parse the version json, do the switcheroo ...
/*
DatasetVersion existingVersion = existingDataset.getVersions().get(0);
Map<String, Integer> existingFilesIndex = new HashMap<>();
for (int i = 0; i < existingDataset.getFiles().size(); i++) {
String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier();
if (storageIdentifier != null) {
existingFilesIndex.put(storageIdentifier, i);
}
}
for (FileMetadata newFileMetadata : harvestedVersion.getFileMetadatas()) {
// is it safe to assume that each new FileMetadata will be
// pointing to a non-null DataFile
String location = newFileMetadata.getDataFile().getStorageIdentifier();
if (location != null && existingFilesIndex.containsKey(location)) {
newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>());
int fileIndex = existingFilesIndex.get(location);
newFileMetadata.setDataFile(existingDataset.getFiles().get(fileIndex));
existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata);
existingFilesIndex.remove(location);
}
}
List<String> solrIdsOfDocumentsToDelete = new ArrayList<>();
// Go through the existing files and delete the ones that are
// no longer present in the version that we have just harvesed:
for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) {
DataFile oldDataFile = oldFileMetadata.getDataFile();
solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId());
existingDataset.getFiles().remove(oldDataFile);
// Files from harvested datasets are removed unceremoniously,
// directly in the database. No need to bother calling the
// DeleteFileCommand on them.
em.remove(em.merge(oldDataFile));
em.remove(em.merge(oldFileMetadata));
oldDataFile = null;
oldFileMetadata = null;
}
// purge all the SOLR documents associated with the files
// we have just deleted:
if (!solrIdsOfDocumentsToDelete.isEmpty()) {
indexService.deleteHarvestedDocuments(solrIdsOfDocumentsToDelete);
}
// ... And now delete the existing version itself:
existingDataset.setVersions(new ArrayList<>());
em.remove(em.merge(existingVersion));
// Now attach the newly-harvested version to the dataset:
existingDataset.getVersions().add(harvestedVersion);
harvestedVersion.setDataset(existingDataset);
// ... There's one more thing to do - go through the new files,
// that are not in the database yet, and make sure they are
// attached to this existing dataset:
for (FileMetadata newFileMetadata : harvestedVersion.getFileMetadatas()) {
if (newFileMetadata.getDataFile().getId() == null) {
existingDataset.getFiles().add(newFileMetadata.getDataFile());
newFileMetadata.getDataFile().setOwner(existingDataset);
}
}
em.persist(harvestedVersion);
///@todo remove for (DataFile harvestedFile : existingDataset.getFiles()) {
///@todo remove DataFile merged = em.merge(harvestedFile);
///@todo remove em.remove(merged);
///@todo remove harvestedFile = null;
///@todo remove }
///@todo remove existingDataset.setFiles(null);
///@todo remove Dataset merged = em.merge(existingDataset);
// harvested datasets don't have physical files - so no need to worry about that.
///@todo remove engineSvc.submit(new DestroyDatasetCommand(merged, dataverseRequest));
// @todo!
// UpdateHarvestedDatasetCommand() ? (later on)
importedDataset = em.merge(existingDataset);
//@todo reindex
*/

importedDataset = engineSvc.submit(new UpdateHarvestedDatasetCommand(existingDataset, harvestedVersion, dataverseRequest));

} else {
importedDataset = engineSvc.submit(new CreateHarvestedDatasetCommand(harvestedDataset, dataverseRequest));
}
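// (Recap of the refactored flow in this hunk: when a dataset previously
// harvested from the same archival location already exists, the freshly
// parsed harvestedVersion, parked on the throwaway dataset produced by
// createTemporaryHarvestedDataset() below, is handed off to
// UpdateHarvestedDatasetCommand; otherwise a brand-new harvested dataset
// is created via CreateHarvestedDatasetCommand.)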
@@ -883,6 +780,23 @@ private boolean validateVersionMetadata(DatasetVersion version, boolean sanitize
return fixed;
}

/**
* Helper method that creates a throwaway Harvested Dataset to temporarily
* attach the newly-harvested version to. We need this when, instead of
* importing a brand-new harvested dataset from scratch, we are planning to
* attempt to update an already existing dataset harvested from the same
* archival location.
* @param harvestedVersion - a newly created Version imported from harvested metadata
* @return - a temporary dataset to which the new version has been attached
*/
private Dataset createTemporaryHarvestedDataset(DatasetVersion harvestedVersion) {
Dataset tempDataset = new Dataset();
harvestedVersion.setDataset(tempDataset);
tempDataset.setVersions(new ArrayList<>(1));
tempDataset.getVersions().add(harvestedVersion);

return tempDataset;
}

private static class MyCustomFormatter extends Formatter {

@@ -26,7 +26,10 @@
* @author landreev
*
* Much simplified version of UpdateDatasetVersionCommand,
* but with some extra twists.
* but with some extra twists. The goal is to avoid creating new Dataset and
* DataFile objects, and to instead preserve the database ids of the re-harvested
* datasets and files, whenever possible. This in turn allows us to avoid deleting
* and rebuilding from scratch the Solr documents for these objects.
*/
@RequiredPermissions(Permission.EditDataset)
public class UpdateHarvestedDatasetCommand extends AbstractDatasetCommand<Dataset> {
@@ -47,7 +50,6 @@ public boolean isValidateLenient() {
@Override
public Dataset execute(CommandContext ctxt) throws CommandException {

// ... do the magic - parse the version json, do the switcheroo ...
Dataset existingDataset = getDataset();

if (existingDataset == null
@@ -62,15 +64,30 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
throw new IllegalCommandException("The command can only be called with a newly-harvested, not yet saved DatasetVersion supplied", this);
}

newHarvestedVersion.setCreateTime(getTimestamp());
newHarvestedVersion.setLastUpdateTime(getTimestamp());


Map<String, Integer> existingFilesIndex = new HashMap<>();

/*
Create a map of the files that are currently part of this existing
harvested dataset. We assume that a harvested file can be uniquely
identified by its storageIdentifier which, in the case of a datafile
harvested from another Dataverse, should be its data access API url.
*/
for (int i = 0; i < existingDataset.getFiles().size(); i++) {
String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier();
if (storageIdentifier != null) {
existingFilesIndex.put(storageIdentifier, i);
}
}
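// (For illustration only, with a made-up value: a file harvested from another
// Dataverse would carry a storageIdentifier along the lines of
// https://demo.dataverse.example/api/access/datafile/12345, i.e. the remote
// data access API url mentioned in the comment above.)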


/*
Go through the files in the newly-harvested version and check if any of
them are new or updated versions of files we already have, harvested
previously from the same archival location.
*/
for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) {
// is it safe to assume that each new FileMetadata will be
// pointing to a non-null DataFile here?
@@ -79,14 +96,32 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>());

int fileIndex = existingFilesIndex.get(location);

// Make sure to update the existing DataFiles that we are going
// to keep in case their newly-harvested versions have different
// checksums, mime types etc. These values are normally supposed
// to be immutable, but errors happen; the source Dataverse may
// have had to fix them in its database to correct a data
// integrity issue, for example:
existingDataset.getFiles().get(fileIndex).setContentType(newFileMetadata.getDataFile().getContentType());
existingDataset.getFiles().get(fileIndex).setFilesize(newFileMetadata.getDataFile().getFilesize());
existingDataset.getFiles().get(fileIndex).setChecksumType(newFileMetadata.getDataFile().getChecksumType());
existingDataset.getFiles().get(fileIndex).setChecksumValue(newFileMetadata.getDataFile().getChecksumValue());
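// (Note: copying these attributes onto the existing DataFile, rather than
// swapping in the newly-harvested object, is what lets the file keep its
// database id and, per the class javadoc above, spares us from deleting
// and rebuilding its Solr document.)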

// Point the newly-harvested filemetadata to the existing datafile:
newFileMetadata.setDataFile(existingDataset.getFiles().get(fileIndex));

// Make sure this new FileMetadata is the only one attached to this existing file:
existingDataset.getFiles().get(fileIndex).setFileMetadatas(new ArrayList<>(1));
existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata);
// (we don't want any cascade relationships left between this existing
// dataset and its version, since we are going to attempt to delete it).

// Drop the file from the index map:
existingFilesIndex.remove(location);
}
}
// @todo check that the newly-harvested DataFiles have the same checksums
// and mime types etc. These values are supposed to be immutable, normally,
// but who knows - they may have fixed something invalid on the other end

// @todo check if there's anything special that needs to be done for things
// like file categories

@@ -96,15 +131,20 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
// no longer present in the version that we have just harvested:
for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) {
DataFile oldDataFile = oldFileMetadata.getDataFile();
solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId());
existingDataset.getFiles().remove(oldDataFile);
// Files from harvested datasets are removed unceremoniously,
// directly in the database. No need to bother calling the
// DeleteFileCommand on them.
ctxt.em().remove(ctxt.em().merge(oldDataFile));
ctxt.em().remove(ctxt.em().merge(oldFileMetadata));
oldDataFile = null;
oldFileMetadata = null;
String location = oldDataFile.getStorageIdentifier();
// Is its location still in the existing files map? If so, it was not matched
// above, which means the file is no longer present in the newly-harvested version.
if (location != null && existingFilesIndex.containsKey(location)) {
solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId());
existingDataset.getFiles().remove(oldDataFile);
// Files from harvested datasets are removed unceremoniously,
// directly in the database. No need to bother calling the
// DeleteFileCommand on them. We'll just need to remember to purge
// them from Solr as well (right below)
ctxt.em().remove(ctxt.em().merge(oldDataFile));
// (no need to explicitly remove the oldFileMetadata; it will be
// removed when the entire old version is deleted)
}
}

// purge all the SOLR documents associated with the files
@@ -115,15 +155,19 @@ public Dataset execute(CommandContext ctxt) throws CommandException {

// ... And now delete the existing version itself:
existingDataset.setVersions(new ArrayList<>());
ctxt.em().remove(ctxt.em().merge(existingVersion));
existingVersion.setDataset(null);

existingVersion = ctxt.em().merge(existingVersion);
ctxt.em().remove(existingVersion);
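// (General JPA note rather than anything specific to this commit: em.remove()
// only works on a managed instance, hence the merge() right above it; detaching
// the version from the dataset first is presumably there so that the later
// merge of existingDataset does not cascade the deleted version back in.)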

// Now attach the newly-harvested version to the dataset:
existingDataset.getVersions().add(newHarvestedVersion);
newHarvestedVersion.setDataset(existingDataset);

// ... There's one more thing to do: go through the new files
// that are not in the database yet, and make sure they are
// attached to this existing dataset:
// attached to this existing dataset, instead of the dummy temp
// dataset into which they were originally imported:
for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) {
if (newFileMetadata.getDataFile().getId() == null) {
existingDataset.getFiles().add(newFileMetadata.getDataFile());
@@ -135,9 +179,7 @@ public Dataset execute(CommandContext ctxt) throws CommandException {

Dataset savedDataset = ctxt.em().merge(existingDataset);
ctxt.em().flush();

//@todo reindex


return savedDataset;
}
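To make the file-matching strategy above easier to follow outside the diff, here is a small self-contained sketch of the same idea, written against plain Java collections and a made-up stand-in type. It is an illustration of the technique only, not Dataverse code, and every name in it is invented: existing files are indexed by storage identifier, re-harvested files that match keep the existing record (and therefore its database id), and unmatched existing files are slated for deletion.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class HarvestedFileMergeSketch {

    // Minimal stand-in for a harvested file record; all fields invented for the demo.
    static class FileRecord {
        Long id;                  // database id we want to preserve across re-harvests
        String storageIdentifier; // for remotely harvested files, the data access URL
        String checksum;

        FileRecord(Long id, String storageIdentifier, String checksum) {
            this.id = id;
            this.storageIdentifier = storageIdentifier;
            this.checksum = checksum;
        }
    }

    public static void main(String[] args) {
        List<FileRecord> existing = List.of(
                new FileRecord(101L, "https://remote.example/api/access/datafile/1", "aaa"),
                new FileRecord(102L, "https://remote.example/api/access/datafile/2", "bbb"));
        List<FileRecord> reharvested = List.of(
                new FileRecord(null, "https://remote.example/api/access/datafile/2", "bbb-corrected"),
                new FileRecord(null, "https://remote.example/api/access/datafile/3", "ccc"));

        // Index the existing files by storage identifier:
        Map<String, FileRecord> existingIndex = new HashMap<>();
        for (FileRecord f : existing) {
            if (f.storageIdentifier != null) {
                existingIndex.put(f.storageIdentifier, f);
            }
        }

        List<FileRecord> keep = new ArrayList<>();
        List<FileRecord> create = new ArrayList<>();
        for (FileRecord incoming : reharvested) {
            FileRecord match = existingIndex.remove(incoming.storageIdentifier);
            if (match != null) {
                // Reuse the existing record (preserving its id), but refresh any
                // attributes that may have been corrected at the source:
                match.checksum = incoming.checksum;
                keep.add(match);
            } else {
                create.add(incoming); // a genuinely new file
            }
        }
        // Whatever is still in the index was not re-harvested and gets deleted:
        List<FileRecord> delete = new ArrayList<>(existingIndex.values());

        System.out.println("kept:    " + keep.size());   // 1 (id 102 preserved)
        System.out.println("created: " + create.size()); // 1
        System.out.println("deleted: " + delete.size()); // 1 (id 101)
    }
}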

