■ Improving retrieval with the MultiVectorRetriever class by generating hypothetical questions and linking them back to their source documents
※ The OPENAI_API_KEY environment variable is defined in the .env file.
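▶ .env (a minimal sketch; the value shown is a placeholder, not a real key)
OPENAI_API_KEY=sk-your-api-key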
▶ main.py
import uuid
from typing import List

from dotenv import load_dotenv
from pydantic import BaseModel, Field
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.storage import InMemoryByteStore
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_core.documents import Document

load_dotenv()

# Structured-output schema: the model must return a list of questions.
class HypotheticalQuestion(BaseModel):
    """Generate hypothetical questions."""
    questionList: List[str] = Field(..., description="List of questions")

# Load the two source texts.
textLoaderList = [
    TextLoader("paul_graham_essay.txt", encoding="utf-8"),
    TextLoader("state_of_the_union.txt", encoding="utf-8")
]
documentList = []
for textLoader in textLoaderList:
    documentList.extend(textLoader.load())

# Split the sources into large chunks and assign each chunk a UUID.
recursiveCharacterTextSplitter = RecursiveCharacterTextSplitter(chunk_size=10000)
splitDocumentList = recursiveCharacterTextSplitter.split_documents(documentList)
splitDocumentIDList = [str(uuid.uuid4()) for _ in splitDocumentList]

# Chain: chunk text -> prompt -> structured LLM output -> plain question list.
chatPromptTemplate = ChatPromptTemplate.from_template(
    "Generate a list of exactly 3 hypothetical questions that the below document could be used to answer:\n\n{doc}"
)
chatOpenAI = ChatOpenAI(max_retries=0, model="gpt-4o-mini")
structuredOutputRunnableSequence = chatOpenAI.with_structured_output(HypotheticalQuestion)
runnableSequence = (
    {"doc": lambda document: document.page_content}
    | chatPromptTemplate
    | structuredOutputRunnableSequence
    | (lambda x: x.questionList)
)

# Generate 3 hypothetical questions per chunk, up to 5 chunks in parallel.
hypotheticalQuestionListList = runnableSequence.batch(splitDocumentList, {"max_concurrency": 5})

# Wrap each question in a Document whose metadata points back to its chunk's UUID.
hypotheticalQuestionDocumentList = []
idKey = "doc_id"
for i, hypotheticalQuestionList in enumerate(hypotheticalQuestionListList):
    hypotheticalQuestionDocumentList.extend(
        [
            Document(page_content=hypotheticalQuestion, metadata={idKey: splitDocumentIDList[i]})
            for hypotheticalQuestion in hypotheticalQuestionList
        ]
    )

# The vector store indexes the question embeddings; the byte store holds the parent chunks.
openAIEmbeddings = OpenAIEmbeddings()
chroma = Chroma(collection_name="hypo-questions", embedding_function=openAIEmbeddings)
inMemoryByteStore = InMemoryByteStore()
multiVectorRetriever = MultiVectorRetriever(
    vectorstore=chroma,
    byte_store=inMemoryByteStore,
    id_key=idKey
)
multiVectorRetriever.vectorstore.add_documents(hypotheticalQuestionDocumentList)
multiVectorRetriever.docstore.mset(list(zip(splitDocumentIDList, splitDocumentList)))

# Searching the vector store directly returns the hypothetical-question documents.
resultHypotheticalQuestionDocumentList = multiVectorRetriever.vectorstore.similarity_search("justice breyer")
print(len(resultHypotheticalQuestionDocumentList))
print(resultHypotheticalQuestionDocumentList[0])

# Invoking the retriever returns the parent chunks linked by doc_id instead.
resultDocument = multiVectorRetriever.invoke("justice breyer")
print(resultDocument[0].metadata)
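The two lookups at the end behave differently: similarity_search on the vector store returns the hypothetical-question documents themselves, while invoke resolves each hit's doc_id metadata back to the parent chunk held in the docstore. A minimal sketch of doing that resolution by hand, using only objects already defined in main.py (hit and parent are illustrative names, not part of the original script):

# Sketch: manually resolve a question hit back to its parent chunk.
hit = resultHypotheticalQuestionDocumentList[0]  # a hypothetical-question Document
parent = multiVectorRetriever.docstore.mget([hit.metadata[idKey]])[0]  # parent chunk keyed by doc_id
print(parent.page_content[:200])  # preview the linked source chunk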
▶ requirements.txt
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
annotated-types==0.7.0
anyio==4.4.0
asgiref==3.8.1
attrs==24.2.0
backoff==2.2.1
bcrypt==4.2.0
build==1.2.2
cachetools==5.5.0
certifi==2024.8.30
charset-normalizer==3.3.2
chroma-hnswlib==0.7.3
chromadb==0.5.3
click==8.1.7
colorama==0.4.6
coloredlogs==15.0.1
dataclasses-json==0.6.7
Deprecated==1.2.14
distro==1.9.0
fastapi==0.114.2
filelock==3.16.0
flatbuffers==24.3.25
frozenlist==1.4.1
fsspec==2024.9.0
google-auth==2.34.0
googleapis-common-protos==1.65.0
greenlet==3.1.0
grpcio==1.66.1
h11==0.14.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.2
huggingface-hub==0.24.7
humanfriendly==10.0
idna==3.9
importlib_metadata==8.4.0
importlib_resources==6.4.5
jiter==0.5.0
jsonpatch==1.33
jsonpointer==3.0.0
kubernetes==30.1.0
langchain==0.3.0
langchain-chroma==0.1.4
langchain-community==0.3.0
langchain-core==0.3.0
langchain-openai==0.2.0
langchain-text-splitters==0.3.0
langsmith==0.1.120
markdown-it-py==3.0.0
marshmallow==3.22.0
mdurl==0.1.2
mmh3==4.1.0
monotonic==1.6
mpmath==1.3.0
multidict==6.1.0
mypy-extensions==1.0.0
numpy==1.26.4
oauthlib==3.2.2
onnxruntime==1.19.2
openai==1.45.0
opentelemetry-api==1.27.0
opentelemetry-exporter-otlp-proto-common==1.27.0
opentelemetry-exporter-otlp-proto-grpc==1.27.0
opentelemetry-instrumentation==0.48b0
opentelemetry-instrumentation-asgi==0.48b0
opentelemetry-instrumentation-fastapi==0.48b0
opentelemetry-proto==1.27.0
opentelemetry-sdk==1.27.0
opentelemetry-semantic-conventions==0.48b0
opentelemetry-util-http==0.48b0
orjson==3.10.7
overrides==7.7.0
packaging==24.1
posthog==3.6.5
protobuf==4.25.4
pyasn1==0.6.1
pyasn1_modules==0.4.1
pydantic==2.9.1
pydantic-settings==2.5.2
pydantic_core==2.23.3
Pygments==2.18.0
PyPika==0.48.9
pyproject_hooks==1.1.0
pyreadline3==3.5.0
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
PyYAML==6.0.2
regex==2024.9.11
requests==2.32.3
requests-oauthlib==2.0.0
rich==13.8.1
rsa==4.9
setuptools==74.1.2
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
SQLAlchemy==2.0.34
starlette==0.38.5
sympy==1.13.2
tenacity==8.5.0
tiktoken==0.7.0
tokenizers==0.20.0
tqdm==4.66.5
typer==0.12.5
typing-inspect==0.9.0
typing_extensions==4.12.2
urllib3==2.2.3
uvicorn==0.30.6
watchfiles==0.24.0
websocket-client==1.8.0
websockets==13.0.1
wrapt==1.16.0
yarl==1.11.1
zipp==3.20.2
※ The pip install python-dotenv langchain langchain-community langchain-chroma langchain-openai command was run.
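※ A pinned list like the requirements.txt above can be captured after installation with pip freeze > requirements.txt (an assumption about how this list was produced, not stated in the original).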