■ Improving retrieval with the MultiVectorRetriever class by generating hypothetical questions and linking them back to their source documents
※ The OPENAI_API_KEY environment variable is defined in the .env file.
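▶ .env (a minimal sketch; the value shown is a placeholder, not a real key)
OPENAI_API_KEY=sk-your-api-key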
▶ main.py
import uuid
from typing import List

from dotenv import load_dotenv
from pydantic import BaseModel, Field
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.storage import InMemoryByteStore
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_core.documents import Document

load_dotenv()

# Structured-output schema: the model must return a list of questions.
class HypotheticalQuestion(BaseModel):
    """Generate hypothetical questions."""
    questionList: List[str] = Field(..., description="List of questions")

# Load the two source texts.
textLoaderList = [
    TextLoader("paul_graham_essay.txt", encoding="utf-8"),
    TextLoader("state_of_the_union.txt", encoding="utf-8")
]
documentList = []
for textLoader in textLoaderList:
    documentList.extend(textLoader.load())

# Split the sources into large chunks and assign each chunk a UUID.
recursiveCharacterTextSplitter = RecursiveCharacterTextSplitter(chunk_size=10000)
splitDocumentList = recursiveCharacterTextSplitter.split_documents(documentList)
splitDocumentIDList = [str(uuid.uuid4()) for _ in splitDocumentList]

# Chain: chunk text -> prompt -> structured LLM output -> plain question list.
chatPromptTemplate = ChatPromptTemplate.from_template(
    "Generate a list of exactly 3 hypothetical questions that the below document could be used to answer:\n\n{doc}"
)
chatOpenAI = ChatOpenAI(max_retries=0, model="gpt-4o-mini")
structuredOutputRunnableSequence = chatOpenAI.with_structured_output(HypotheticalQuestion)
runnableSequence = (
    {"doc": lambda document: document.page_content}
    | chatPromptTemplate
    | structuredOutputRunnableSequence
    | (lambda x: x.questionList)
)

# Generate 3 hypothetical questions per chunk, up to 5 chunks in parallel.
hypotheticalQuestionListList = runnableSequence.batch(splitDocumentList, {"max_concurrency": 5})

# Wrap each question in a Document whose metadata points back to its chunk's UUID.
hypotheticalQuestionDocumentList = []
idKey = "doc_id"
for i, hypotheticalQuestionList in enumerate(hypotheticalQuestionListList):
    hypotheticalQuestionDocumentList.extend(
        [
            Document(page_content=hypotheticalQuestion, metadata={idKey: splitDocumentIDList[i]})
            for hypotheticalQuestion in hypotheticalQuestionList
        ]
    )

# The vector store indexes the question embeddings; the byte store holds the parent chunks.
openAIEmbeddings = OpenAIEmbeddings()
chroma = Chroma(collection_name="hypo-questions", embedding_function=openAIEmbeddings)
inMemoryByteStore = InMemoryByteStore()
multiVectorRetriever = MultiVectorRetriever(
    vectorstore=chroma,
    byte_store=inMemoryByteStore,
    id_key=idKey
)
multiVectorRetriever.vectorstore.add_documents(hypotheticalQuestionDocumentList)
multiVectorRetriever.docstore.mset(list(zip(splitDocumentIDList, splitDocumentList)))

# Searching the vector store directly returns the hypothetical-question documents.
resultHypotheticalQuestionDocumentList = multiVectorRetriever.vectorstore.similarity_search("justice breyer")
print(len(resultHypotheticalQuestionDocumentList))
print(resultHypotheticalQuestionDocumentList[0])

# Invoking the retriever returns the parent chunks linked by doc_id instead.
resultDocument = multiVectorRetriever.invoke("justice breyer")
print(resultDocument[0].metadata)
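The two lookups at the end behave differently: similarity_search on the vector store returns the hypothetical-question documents themselves, while invoke resolves each hit's doc_id metadata back to the parent chunk held in the docstore. A minimal sketch of doing that resolution by hand, using only objects already defined in main.py (hit and parent are illustrative names, not part of the original script):

# Sketch: manually resolve a question hit back to its parent chunk.
hit = resultHypotheticalQuestionDocumentList[0]  # a hypothetical-question Document
parent = multiVectorRetriever.docstore.mget([hit.metadata[idKey]])[0]  # parent chunk keyed by doc_id
print(parent.page_content[:200])  # preview the linked source chunk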
▶ requirements.txt
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
annotated-types==0.7.0
anyio==4.4.0
asgiref==3.8.1
attrs==24.2.0
backoff==2.2.1
bcrypt==4.2.0
build==1.2.2
cachetools==5.5.0
certifi==2024.8.30
charset-normalizer==3.3.2
chroma-hnswlib==0.7.3
chromadb==0.5.3
click==8.1.7
colorama==0.4.6
coloredlogs==15.0.1
dataclasses-json==0.6.7
Deprecated==1.2.14
distro==1.9.0
fastapi==0.114.2
filelock==3.16.0
flatbuffers==24.3.25
frozenlist==1.4.1
fsspec==2024.9.0
google-auth==2.34.0
googleapis-common-protos==1.65.0
greenlet==3.1.0
grpcio==1.66.1
h11==0.14.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.2
huggingface-hub==0.24.7
humanfriendly==10.0
idna==3.9
importlib_metadata==8.4.0
importlib_resources==6.4.5
jiter==0.5.0
jsonpatch==1.33
jsonpointer==3.0.0
kubernetes==30.1.0
langchain==0.3.0
langchain-chroma==0.1.4
langchain-community==0.3.0
langchain-core==0.3.0
langchain-openai==0.2.0
langchain-text-splitters==0.3.0
langsmith==0.1.120
markdown-it-py==3.0.0
marshmallow==3.22.0
mdurl==0.1.2
mmh3==4.1.0
monotonic==1.6
mpmath==1.3.0
multidict==6.1.0
mypy-extensions==1.0.0
numpy==1.26.4
oauthlib==3.2.2
onnxruntime==1.19.2
openai==1.45.0
opentelemetry-api==1.27.0
opentelemetry-exporter-otlp-proto-common==1.27.0
opentelemetry-exporter-otlp-proto-grpc==1.27.0
opentelemetry-instrumentation==0.48b0
opentelemetry-instrumentation-asgi==0.48b0
opentelemetry-instrumentation-fastapi==0.48b0
opentelemetry-proto==1.27.0
opentelemetry-sdk==1.27.0
opentelemetry-semantic-conventions==0.48b0
opentelemetry-util-http==0.48b0
orjson==3.10.7
overrides==7.7.0
packaging==24.1
posthog==3.6.5
protobuf==4.25.4
pyasn1==0.6.1
pyasn1_modules==0.4.1
pydantic==2.9.1
pydantic-settings==2.5.2
pydantic_core==2.23.3
Pygments==2.18.0
PyPika==0.48.9
pyproject_hooks==1.1.0
pyreadline3==3.5.0
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
PyYAML==6.0.2
regex==2024.9.11
requests==2.32.3
requests-oauthlib==2.0.0
rich==13.8.1
rsa==4.9
setuptools==74.1.2
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
SQLAlchemy==2.0.34
starlette==0.38.5
sympy==1.13.2
tenacity==8.5.0
tiktoken==0.7.0
tokenizers==0.20.0
tqdm==4.66.5
typer==0.12.5
typing-inspect==0.9.0
typing_extensions==4.12.2
urllib3==2.2.3
uvicorn==0.30.6
watchfiles==0.24.0
websocket-client==1.8.0
websockets==13.0.1
wrapt==1.16.0
yarl==1.11.1
zipp==3.20.2
※ The pip install python-dotenv langchain langchain-community langchain-chroma langchain-openai command was run.
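※ A pinned list like the requirements.txt above can be captured after installation with pip freeze > requirements.txt (an assumption about how this list was produced, not stated in the original).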