Use meta

h2oai · Oct 30, 2024 · 35df517 · 35df517
1 parent 616db96
commit 35df517
Show file tree

Hide file tree

Showing 3 changed files with 11 additions and 12 deletions.
diff --git a/docs/Dockerfile.delta2 b/docs/Dockerfile.delta2
@@ -17,16 +17,12 @@ COPY openai_server/autogen_multi_agent_backend.py /workspace/openai_server/autog
 COPY openai_server/autogen_utils.py /workspace/openai_server/autogen_utils.py
 COPY openai_server/server.py /workspace/openai_server/server.py
 COPY openai_server/agent_tools/download_web_video.py /workspace/openai_server/agent_tools/download_web_video.py
+COPY openai_server/agent_tools/convert_document_to_text.py /workspace/openai_server/agent_tools/convert_document_to_text.py
 
 RUN chmod a+rwx /workspace/src/*.py
 RUN chmod a+rwx /workspace/openai_server/*.py
 RUN chmod a+rwx /workspace/openai_server/agent_tools/*.py
 
-RUN wget https://fastdl.mongodb.org/linux/mongodb-linux-x86_64-ubuntu2204-7.0.4.tgz && \
-    tar xvzf mongodb-linux-x86_64-ubuntu2204-7.0.4.tgz && \
-    cp -r mongodb-linux-x86_64-ubuntu2204-7.0.4/bin /usr/lib/python3.10/site-packages/fiftyone/db/ && \
-    chmod -R a+rwx /usr/lib/python3.10/site-packages/fiftyone/db
-
 RUN chmod a+rwx /workspace/.cache
 
 USER h2ogpt

diff --git a/openai_server/agent_tools/convert_document_to_text.py b/openai_server/agent_tools/convert_document_to_text.py
@@ -54,6 +54,7 @@ def convert_to_csv(file):
 
 
 def sources_to_text(sources1):
+    each_content1 = []
     all_content1 = ''
     for source in sources1:
         meta_str = ''
@@ -66,8 +67,10 @@ def sources_to_text(sources1):
             meta_str += f"Title: {meta['title']}\n"
         if 'page' in meta:
             meta_str += f"Page: {meta['page']}\n"
-        all_content1 += f"""\n<document>\n{meta_str}\n<text>\n{source.page_content}\n</text>\n</document>\n"""
-    return all_content1
+        content1 = f"""\n<document>\n{meta_str}\n<text>\n{source.page_content}\n</text>\n</document>\n"""
+        each_content1.append(content1)
+        all_content1 += content1
+    return all_content1, each_content1
 
 
 def process_files(files, urls):
@@ -149,7 +152,7 @@ def process_files(files, urls):
                                                chunk=False,
                                                enable_transcriptions=enable_transcriptions,
                                                )
-        all_content1 = sources_to_text(sources1)
+        all_content1, each_content1 = sources_to_text(sources1)
 
         if filename.lower().endswith('.pdf') and enable_pdf_doctr == 'off':
             if use_pymupdf == 'on':
@@ -173,17 +176,17 @@ def process_files(files, urls):
                                                    enable_transcriptions=False,
                                                    )
 
-            all_content2 = sources_to_text(sources2)
+            all_content2, each_content2 = sources_to_text(sources2)
             # choose one with more content in case pymupdf fails to find info
             if len(all_content2) > len(all_content1):
-                sources1 = sources2
+                sources1, each_content1 = sources2, each_content2
 
         if not sources1:
             succeeded.append(False)
             print(f"Unable to handle file type for {filename}")
         else:
             succeeded.append(True)
-            text_context_list.extend([x.page_content for x in sources1])
+            text_context_list.extend(each_content1)
 
     return text_context_list, any(succeeded)
 

diff --git a/src/version.py b/src/version.py
@@ -1 +1 @@
-__version__ = "e1caa2e0ed0cee558c7122872a7caf1be54a0226"
+__version__ = "616db9603cf7d7f547736bed116bd1bc56a082ad"