Skip to content

Commit

Permalink
Preemptively fail unparsable/unusable Europeana URLs (#3845)
Browse files Browse the repository at this point in the history
  • Loading branch information
AetherUnbound authored Mar 6, 2024
1 parent 433d92c commit 1934688
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 1 deletion.
10 changes: 9 additions & 1 deletion catalog/dags/providers/provider_api_scripts/europeana.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,15 @@ def get_record_data(self, data: dict) -> dict | None:
@raise_if_empty
def _get_image_url(self, data: dict) -> str | None:
    """
    Return the image URL for a record, or ``None`` if it cannot be used.

    The URL is taken from the first entry of the record's ``edmIsShownBy``
    list.

    :param data: a single record as a dictionary
    :return: the first ``edmIsShownBy`` value, or ``None`` when the field is
        missing or empty, or when the URL references Dropbox
    """
    group = data.get("edmIsShownBy")
    if not group:
        return None
    url = group[0]
    # Some Europeana URLs may have prefixes, or reference Dropbox (which we
    # can't include in our catalog because we cannot access them directly
    # ourselves). A substring check is used rather than parsing the URL so
    # that prefixed values are caught as well.
    # E.g.: L-APC248-https://www.dropbox.com/s/i1pqizm1joof8y1/Belgium_Diptyque%20_MAR-SGP-CO1.jpg?raw=1
    if "dropbox.com" in url:
        return None
    return url

@raise_if_empty
def _get_foreign_identifier(self, data: dict) -> str | None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from common.loader import provider_details as prov
from common.storage.image import ImageStore
from providers.provider_api_scripts.europeana import (
EmptyRequiredFieldException,
EuropeanaDataIngester,
EuropeanaRecordBuilder,
)
Expand Down Expand Up @@ -254,6 +255,19 @@ def test_get_foreign_landing_url_without_edmIsShownAt(record_builder):
)


@pytest.mark.parametrize(
    "data",
    [
        {},
        {"edmIsShownBy": None},
        {"edmIsShownBy": ["dropbox.com/value"]},
    ],
)
def test_get_image_url_empty(data, record_builder):
    """Missing, empty, or Dropbox image URLs must raise rather than return."""
    # No `assert` around the call: the only expected outcome is the exception,
    # and asserting on a return value would mask a "did not raise" failure
    # behind an unrelated AssertionError.
    with pytest.raises(EmptyRequiredFieldException):
        record_builder._get_image_url(data)


@pytest.mark.parametrize(
"item_data, expected",
[
Expand Down

0 comments on commit 1934688

Please sign in to comment.