■ BaseBlobParser 클래스를 사용해 커스텀 BLOB 파서를 만드는 방법을 보여준다.
▶ main.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
from typing import Iterator

from langchain_core.document_loaders import BaseBlobParser
from langchain_core.document_loaders import Blob
from langchain_core.documents import Document


class CustomBlobParser(BaseBlobParser):
    """A simple parser that creates a document from each line."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Parse a blob into documents, one per line.

        Args:
            blob: The blob to read; it is consumed as a binary stream.

        Yields:
            One Document per line. Its page_content is the line with
            leading/trailing whitespace stripped, and its metadata holds the
            1-based "line_number" and the blob's "source".
        """
        with blob.as_bytes_io() as bufferedReader:
            # Iterating the binary stream yields bytes, so decode each line
            # explicitly with the blob's declared encoding rather than handing
            # raw bytes to Document.
            for lineNumber, line in enumerate(bufferedReader, start=1):
                yield Document(
                    page_content=line.strip().decode(blob.encoding),
                    metadata={"line_number": lineNumber, "source": blob.source},
                )


# Parse a blob that points at a file on disk.
blob1 = Blob.from_path("meow.txt")

customBlobParser = CustomBlobParser()

generator1 = customBlobParser.lazy_parse(blob1)

documentList1 = list(generator1)

for document in documentList1:
    print(document)

print("-" * 50)

# Parse a blob built directly from in-memory bytes.
blob2 = Blob(data=b"some data from memory\nmeow")

generator2 = customBlobParser.lazy_parse(blob2)

documentList2 = list(generator2)

for document in documentList2:
    print(document)

# Sample output from a run of this script.
# NOTE(review): the output shows source './meow.txt' although the code above
# passes "meow.txt" -- presumably captured with a './'-prefixed path; confirm.
"""
page_content='meow meow1' metadata={'line_number': 1, 'source': './meow.txt'}
page_content='meow meow2' metadata={'line_number': 2, 'source': './meow.txt'}
page_content='meow meow3' metadata={'line_number': 3, 'source': './meow.txt'}
--------------------------------------------------
page_content='some data from memory' metadata={'line_number': 1, 'source': None}
page_content='meow' metadata={'line_number': 2, 'source': None}
"""
▶ requirements.txt
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
aiohappyeyeballs==2.4.3
aiohttp==3.11.7
aiosignal==1.3.1
annotated-types==0.7.0
anyio==4.6.2.post1
attrs==24.2.0
certifi==2024.8.30
charset-normalizer==3.4.0
frozenlist==1.5.0
greenlet==3.1.1
h11==0.14.0
httpcore==1.0.7
httpx==0.27.2
idna==3.10
jsonpatch==1.33
jsonpointer==3.0.0
langchain==0.3.8
langchain-core==0.3.21
langchain-text-splitters==0.3.2
langsmith==0.1.145
multidict==6.1.0
numpy==1.26.4
orjson==3.10.12
packaging==24.2
propcache==0.2.0
pydantic==2.10.1
pydantic_core==2.27.1
PyYAML==6.0.2
requests==2.32.3
requests-toolbelt==1.0.0
sniffio==1.3.1
SQLAlchemy==2.0.36
tenacity==9.0.0
typing_extensions==4.12.2
urllib3==2.2.3
yarl==1.18.0
※ pip install langchain 명령을 실행했다.