This topic was automatically generated from Slack. You can find the original thread here.
Hi!
I’m using langchain to read documents with the python module, but I have an error and don’t know how to solve it.
Here is my code:
import langchain
import unstructured
import nltk
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders.image import UnstructuredImageLoader
def handler(pd: "pipedream"):
# Access the data store under the pd.inputs
path = pd.steps["Download_to_tmp"]["$return_value"]["tmpPath"]
# Image file formats
loader = UnstructuredImageLoader(path)
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=10)
texts = text_splitter.split_documents(docs)
page_contents = [text.page_content for text in texts]
return page_contents
As I understand, I need to load the model “punkt” via nltk.download(‘punkt’) but I can’t since the temp folder in the pipedream side is in a read-only mode.
Here is the error:
Traceback (most recent call last):
File "/tmp/__pdg__/dist/python/unstructured/nlp/tokenize.py", line 21, in _download_nltk_package_if_not_present
nltk.find(f"{package_category}/{package_name}")
File "/tmp/__pdg__/dist/python/nltk/data.py", line 583, in find
raise LookupError(resource_not_found)
LookupError:
********************************************************************************************************************************************
Resource [93mpunkt[0m not found.
Please use the NLTK Downloader to obtain the resource:
[31m>>> import nltk
>>> nltk.download('punkt')
[0m
For more information see: https://www.nltk.org/data.html
Attempted to load [93mtokenizers/punkt[0m
Searched in:
- '/home/sbx_user1051/nltk_data'
- '/var/lang/nltk_data'
- '/var/lang/share/nltk_data'
- '/var/lang/lib/nltk_data'
- '/usr/share/nltk_data'
- '/usr/local/share/nltk_data'
- '/usr/lib/nltk_data'
- '/usr/local/lib/nltk_data'
********************************************************************************************************************************************
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/nano-py/pipedream/worker.py", line 118, in execute
user_retval = handler(pd)
File "/tmp/__pdg__/dist/code/7999065291c2b703f7c7dd35804cadf422cc96dd1e5c5d63046755cb9b52d2cc/code.py", line 27, in handler
docs = loader.load()
File "/tmp/__pdg__/dist/python/langchain/document_loaders/unstructured.py", line 61, in load
elements = self._get_elements()
File "/tmp/__pdg__/dist/python/langchain/document_loaders/image.py", line 11, in _get_elements
from unstructured.partition.image import partition_image
File "/tmp/__pdg__/dist/python/unstructured/partition/image.py", line 4, in <module>
from unstructured.partition.pdf import partition_pdf_or_image
File "/tmp/__pdg__/dist/python/unstructured/partition/pdf.py", line 13, in <module>
from unstructured.partition.text import partition_text
File "/tmp/__pdg__/dist/python/unstructured/partition/text.py", line 16, in <module>
from unstructured.partition.text_type import (
File "/tmp/__pdg__/dist/python/unstructured/partition/text_type.py", line 20, in <module>
from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
File "/tmp/__pdg__/dist/python/unstructured/nlp/tokenize.py", line 32, in <module>
_download_nltk_package_if_not_present(package_name, package_category)
File "/tmp/__pdg__/dist/python/unstructured/nlp/tokenize.py", line 23, in _download_nltk_package_if_not_present
nltk.download(package_name)
File "/tmp/__pdg__/dist/python/nltk/downloader.py", line 777, in download
for msg in self.incr_download(info_or_id, download_dir, force):
File "/tmp/__pdg__/dist/python/nltk/downloader.py", line 642, in incr_download
yield from self._download_package(info, download_dir, force)
File "/tmp/__pdg__/dist/python/nltk/downloader.py", line 699, in _download_package
os.makedirs(download_dir)
File "/var/lang/lib/python3.9/os.py", line 215, in makedirs
makedirs(head, exist_ok=exist_ok)
File "/var/lang/lib/python3.9/os.py", line 225, in makedirs
mkdir(name, mode)
OSError: [Errno 30] Read-only file system: '/home/sbx_user1051'
Can you help me with that issue? Thanks!