Skip to content

Commit

Permalink
Use meta
Browse files: browse the repository at this point in the history
  • Loading branch information
pseudotensor committed Oct 30, 2024
1 parent 616db96 commit 35df517
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 12 deletions.
6 changes: 1 addition & 5 deletions docs/Dockerfile.delta2
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,12 @@ COPY openai_server/autogen_multi_agent_backend.py /workspace/openai_server/autog
COPY openai_server/autogen_utils.py /workspace/openai_server/autogen_utils.py
COPY openai_server/server.py /workspace/openai_server/server.py
COPY openai_server/agent_tools/download_web_video.py /workspace/openai_server/agent_tools/download_web_video.py
# Copy the document-to-text agent tool into the image; must be a COPY instruction (`cp` is a shell command, not a Dockerfile instruction).
COPY openai_server/agent_tools/convert_document_to_text.py /workspace/openai_server/agent_tools/convert_document_to_text.py

RUN chmod a+rwx /workspace/src/*.py
RUN chmod a+rwx /workspace/openai_server/*.py
RUN chmod a+rwx /workspace/openai_server/agent_tools/*.py

RUN wget https://fastdl.mongodb.org/linux/mongodb-linux-x86_64-ubuntu2204-7.0.4.tgz && \
tar xvzf mongodb-linux-x86_64-ubuntu2204-7.0.4.tgz && \
cp -r mongodb-linux-x86_64-ubuntu2204-7.0.4/bin /usr/lib/python3.10/site-packages/fiftyone/db/ && \
chmod -R a+rwx /usr/lib/python3.10/site-packages/fiftyone/db

RUN chmod a+rwx /workspace/.cache

USER h2ogpt
Expand Down
15 changes: 9 additions & 6 deletions openai_server/agent_tools/convert_document_to_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def convert_to_csv(file):


def sources_to_text(sources1):
each_content1 = []
all_content1 = ''
for source in sources1:
meta_str = ''
Expand All @@ -66,8 +67,10 @@ def sources_to_text(sources1):
meta_str += f"Title: {meta['title']}\n"
if 'page' in meta:
meta_str += f"Page: {meta['page']}\n"
all_content1 += f"""\n<document>\n{meta_str}\n<text>\n{source.page_content}\n</text>\n</document>\n"""
return all_content1
content1 = f"""\n<document>\n{meta_str}\n<text>\n{source.page_content}\n</text>\n</document>\n"""
each_content1.append(content1)
all_content1 += content1
return all_content1, each_content1


def process_files(files, urls):
Expand Down Expand Up @@ -149,7 +152,7 @@ def process_files(files, urls):
chunk=False,
enable_transcriptions=enable_transcriptions,
)
all_content1 = sources_to_text(sources1)
all_content1, each_content1 = sources_to_text(sources1)

if filename.lower().endswith('.pdf') and enable_pdf_doctr == 'off':
if use_pymupdf == 'on':
Expand All @@ -173,17 +176,17 @@ def process_files(files, urls):
enable_transcriptions=False,
)

all_content2 = sources_to_text(sources2)
all_content2, each_content2 = sources_to_text(sources2)
# choose one with more content in case pymupdf fails to find info
if len(all_content2) > len(all_content1):
sources1 = sources2
each_content1 = each_content2

if not sources1:
succeeded.append(False)
print(f"Unable to handle file type for {filename}")
else:
succeeded.append(True)
text_context_list.extend([x.page_content for x in sources1])
text_context_list.extend(each_content1)

return text_context_list, any(succeeded)

Expand Down
2 changes: 1 addition & 1 deletion src/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "e1caa2e0ed0cee558c7122872a7caf1be54a0226"
__version__ = "616db9603cf7d7f547736bed116bd1bc56a082ad"

0 comments on commit 35df517

Please sign in to comment.