Skip to content

Commit

Permalink
feat/add include_slide_notes parameter (#455)
Browse files Browse the repository at this point in the history
## Description
* Added an `include_slide_notes` parameter (default: `True`). It applies to files with the
`.ppt` and `.pptx` extensions.
* Added two new files in `sample-docs`: `sample-docs/notes.ppt` and
`sample-docs/notes.pptx`, both of which include notes on their slides. They make
the functionality easy to test, as there are no existing PowerPoint files
that include slide notes.

## Testing
```
# using the default value (True) returns an additional NarrativeText element that contains the notes
curl -X 'POST'   'http://localhost:8000/general/v0/general'   -H 'accept: application/json'   -H 'Content-Type: multipart/form-data'   -F 'files=@sample-docs/notes.pptx'   -F 'output_format="text/csv"' 

# explicit include_slide_notes=True returns an additional NarrativeText element that contains the notes
curl -X 'POST'   'http://localhost:8000/general/v0/general'   -H 'accept: application/json'   -H 'Content-Type: multipart/form-data'   -F 'files=@sample-docs/notes.pptx'   -F 'output_format="text/csv"' -F 'include_slide_notes=True'

# explicit include_slide_notes=False omits the NarrativeText element that contains the notes
curl -X 'POST'   'http://localhost:8000/general/v0/general'   -H 'accept: application/json'   -H 'Content-Type: multipart/form-data'   -F 'files=@sample-docs/notes.pptx'   -F 'output_format="text/csv"' -F 'include_slide_notes=False'
```

The same applies to the file `sample-docs/notes.ppt`.
  • Loading branch information
mackurzawa authored Sep 9, 2024
1 parent 843d68a commit 3c3b75a
Show file tree
Hide file tree
Showing 8 changed files with 63 additions and 4 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.0.80-dev0

* Add `include_slide_notes` parameter, indicating whether slide notes in `ppt` and `pptx` files should be partitioned. Default is `True`. Now, when slide notes are present in the file, they will be included alongside other elements, which may shift the index numbers of non-note elements.

## 0.0.79

* Bump to `unstructured` 0.15.7
Expand Down
2 changes: 1 addition & 1 deletion prepline_general/api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
app = FastAPI(
title="Unstructured Pipeline API",
summary="Partition documents with the Unstructured library",
version="0.0.79",
version="0.0.80",
docs_url="/general/docs",
openapi_url="/general/openapi.json",
servers=[
Expand Down
9 changes: 7 additions & 2 deletions prepline_general/api/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ def pipeline_api(
extract_image_block_types: Optional[List[str]] = None,
unique_element_ids: Optional[bool] = False,
starting_page_number: Optional[int] = None,
include_slide_notes: Optional[bool] = True,
) -> List[Dict[str, Any]] | str:
if filename.endswith(".msg"):
# Note(yuming): convert file type for msg files
Expand Down Expand Up @@ -316,6 +317,7 @@ def pipeline_api(
"overlap": overlap,
"overlap_all": overlap_all,
"starting_page_number": starting_page_number,
"include_slide_notes": include_slide_notes,
},
default=str,
)
Expand Down Expand Up @@ -373,6 +375,7 @@ def pipeline_api(
"extract_image_block_types": extract_image_block_types,
"extract_image_block_to_payload": extract_image_block_to_payload,
"unique_element_ids": unique_element_ids,
"include_slide_notes": include_slide_notes,
},
default=str,
)
Expand Down Expand Up @@ -403,6 +406,7 @@ def pipeline_api(
"extract_image_block_to_payload": extract_image_block_to_payload,
"unique_element_ids": unique_element_ids,
"starting_page_number": starting_page_number,
"include_slide_notes": include_slide_notes,
}

if file_content_type == "application/pdf" and pdf_parallel_mode_enabled:
Expand Down Expand Up @@ -649,7 +653,7 @@ def return_content_type(filename: str):


@router.get("/general/v0/general", include_in_schema=False)
@router.get("/general/v0.0.79/general", include_in_schema=False)
@router.get("/general/v0.0.80/general", include_in_schema=False)
async def handle_invalid_get_request():
raise HTTPException(
status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported."
Expand All @@ -664,7 +668,7 @@ async def handle_invalid_get_request():
description="Description",
operation_id="partition_parameters",
)
@router.post("/general/v0.0.79/general", include_in_schema=False)
@router.post("/general/v0.0.80/general", include_in_schema=False)
def general_partition(
request: Request,
# cannot use annotated type here because of a bug described here:
Expand Down Expand Up @@ -747,6 +751,7 @@ def response_generator(is_multipart: bool):
overlap=form_params.overlap,
overlap_all=form_params.overlap_all,
starting_page_number=form_params.starting_page_number,
include_slide_notes=form_params.include_slide_notes,
)

yield (
Expand Down
13 changes: 13 additions & 0 deletions prepline_general/api/models/form_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class GeneralFormParams(BaseModel):
overlap: int
overlap_all: bool
starting_page_number: Optional[int] = None
include_slide_notes: bool

@classmethod
def as_form(
Expand Down Expand Up @@ -246,6 +247,17 @@ def as_form(
example=3,
),
] = None,
include_slide_notes: Annotated[
bool,
Form(
title="include_slide_notes",
description=(
"When `True`, slide notes from .ppt and .pptx files"
" will be included in the response. Default: `True`"
),
example=False,
),
] = True,
) -> "GeneralFormParams":
return cls(
xml_keep_tags=xml_keep_tags,
Expand Down Expand Up @@ -273,4 +285,5 @@ def as_form(
overlap_all=overlap_all,
unique_element_ids=unique_element_ids,
starting_page_number=starting_page_number,
include_slide_notes=include_slide_notes,
)
2 changes: 1 addition & 1 deletion preprocessing-pipeline-family.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
name: general
version: 0.0.79
version: 0.0.80
Binary file added sample-docs/notes.ppt
Binary file not shown.
Binary file added sample-docs/notes.pptx
Binary file not shown.
37 changes: 37 additions & 0 deletions test_general/api/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -701,6 +701,7 @@ def test_parallel_mode_passes_params(monkeypatch):
"new_after_n_chars": "1501",
"overlap": "25",
"overlap_all": "true",
"include_slide_notes": "true",
},
)

Expand Down Expand Up @@ -733,6 +734,7 @@ def test_parallel_mode_passes_params(monkeypatch):
new_after_n_chars=1501,
overlap=25,
overlap_all=True,
include_slide_notes=True,
)


Expand Down Expand Up @@ -1147,3 +1149,38 @@ def test__set_pdf_infer_table_structure(
)
is expected
)


@pytest.mark.parametrize(
    ("test_default", "include_slide_notes", "test_file"),
    [
        (True, None, Path("sample-docs") / "notes.ppt"),
        (True, None, Path("sample-docs") / "notes.pptx"),
        (False, True, Path("sample-docs") / "notes.ppt"),
        (False, True, Path("sample-docs") / "notes.pptx"),
        (False, False, Path("sample-docs") / "notes.ppt"),
        (False, False, Path("sample-docs") / "notes.pptx"),
    ],
)
def test_include_slide_notes(monkeypatch, test_default, include_slide_notes, test_file):
    """
    Verify that the include_slide_notes form parameter controls the slide-notes element.

    When the parameter is omitted (test_default is True) or explicitly set to True, the
    first row of the CSV response must carry the slide-notes text. When it is explicitly
    set to False, the first row must not be the slide-notes text.
    """
    client = TestClient(app)

    # Only send include_slide_notes when the case is not exercising the default value.
    data = {"output_format": "text/csv"}
    if not test_default:
        data["include_slide_notes"] = str(include_slide_notes)

    # Use a context manager so the sample file handle is closed after the request
    # instead of being leaked once per parametrized case.
    with open(test_file, "rb") as sample_file:
        response = client.post(
            MAIN_API_ROUTE,
            files=[("files", (str(test_file), sample_file))],
            data=data,
        )

    # Fail with a clear status mismatch rather than an opaque CSV parsing error.
    assert response.status_code == 200
    df = pd.read_csv(io.StringIO(response.text))

    if include_slide_notes or test_default:
        assert "Here are important notes" == df["text"][0]
    else:
        assert "Here are important notes" != df["text"][0]

0 comments on commit 3c3b75a

Please sign in to comment.