[PYTHON/LANGCHAIN] 정형화된 데이터 추출시 참조 예제 사용하기

■ 정형화된 데이터 추출 시 참조 예제를 사용하는 방법을 보여준다.

※ OPENAI_API_KEY 환경 변수 값은 .env 파일에 정의한다.

▶ main.py


import uuid

from typing                  import List
from typing                  import Optional
from typing                  import TypedDict

from dotenv                  import load_dotenv
from langchain_core.messages import AIMessage
from langchain_core.messages import BaseMessage
from langchain_core.messages import HumanMessage
from langchain_core.messages import ToolMessage
from langchain_core.prompts  import ChatPromptTemplate
from langchain_core.prompts  import MessagesPlaceholder
from langchain_openai        import ChatOpenAI
from pydantic                import BaseModel
from pydantic                import Field

class Person(BaseModel):
    """Information about a person."""

    # NOTE: The class docstring above is sent to the LLM as the description of the Person schema,
    # so it is part of the prompt -- reword it only deliberately.
    #
    # Field conventions:
    # 1. Every field is Optional AND defaults to None, so the model can decline to extract a value
    #    (either by returning null or by omitting the attribute altogether).
    # 2. Every field carries a `description`, which is also sent to the LLM;
    #    a good description can help improve extraction results.
    name             : Optional[str] = Field(default = None, description = "The name of the person"                 )
    hair_color       : Optional[str] = Field(default = None, description = "The color of the person's hair if known")
    height_in_meters : Optional[str] = Field(default = None, description = "Height in METERs"                       )

class Data(BaseModel):
    """Extracted data about people."""

    # Wrapper schema: holding a list lets one extraction return any number of Person entities,
    # including none at all (the first reference example below uses an empty list).
    # This class is also the schema handed to with_structured_output() further down.
    people : List[Person]

class Example(TypedDict):
    """One reference (few-shot) example: a source text plus the tool calls expected for it.

    The expected tool calls are given as pydantic model instances holding the data
    that should be extracted from the text.
    """

    # Raw example text that is shown to the model.
    input: str
    # Pydantic model instances that should be extracted from `input`.
    tool_call_list: List[BaseModel]

def getMessageList(example : Example) -> List[BaseMessage]:
    """Turn one reference example into the chat messages that replay it.

    The returned list always has this shape:

    1) HumanMessage : the text from which information should be extracted.
    2) AIMessage    : empty content plus one tool call per expected pydantic instance.
    3) ToolMessage  : one per tool call, confirming to the model that the call was correct.

    The ToolMessage entries are needed because some chat models are tuned for agent-style
    tool use rather than plain extraction.

    Args:
        example: Mapping with "input" (the text) and "tool_call_list" (pydantic instances).
            An extra "tool_outputs" key, which Example does not declare, is honoured when
            present and non-empty: its items replace the default confirmation text.

    Returns:
        The messages for this example, in the order described above.
        The list is also printed to stdout before it is returned.
    """
    humanMessage = HumanMessage(content = example["input"])

    # Each tool call gets a fresh random id so the matching ToolMessage can refer back to it.
    # The tool name is simply the pydantic class name.
    expectedCallList = [
        {
            "id"   : str(uuid.uuid4()),
            "args" : expectedModel.model_dump(),
            "name" : expectedModel.__class__.__name__
        }
        for expectedModel in example["tool_call_list"]
    ]

    resultList : List[BaseMessage] = [humanMessage, AIMessage(content = "", tool_calls = expectedCallList)]

    # Fall back to one generic confirmation per tool call when no outputs were supplied.
    confirmationList = example.get("tool_outputs") or ["You have correctly called this tool."] * len(expectedCallList)

    resultList.extend(
        ToolMessage(content = confirmation, tool_call_id = expectedCall["id"])
        for confirmation, expectedCall in zip(confirmationList, expectedCallList)
    )

    print(resultList)
    return resultList

# Read the .env file so that OPENAI_API_KEY is available in the environment
# before the ChatOpenAI client is created below.
load_dotenv()

# Reference examples: (input text, expected extraction result).
# The first one shows that a text without any person must yield an empty list.
exampleTupleList = [
    (
        "The ocean is vast and blue. It's more than 20,000 feet deep. There are many fish in it.",
        Data(people = []),
    ),
    (
        "Fiona traveled far from France to Spain.",
        Data(people = [Person(name = "Fiona", height_in_meters = None, hair_color = None)])
    )
]

# Flatten every example into one message list (human -> ai tool call -> tool confirmation).
messageList = []

for text, data in exampleTupleList:
    messageList.extend(getMessageList({"input" : text, "tool_call_list" : [data]}))

# Prompt layout: system instruction, then the reference examples, then the text to process.
chatPromptTemplate = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert extraction algorithm. Only extract relevant information from the text. If you do not know the value of an attribute asked to extract, return null for the attribute's value."),
        MessagesPlaceholder("examples"),
        ("human", "{text}")
    ]
)

chatOpenAI = ChatOpenAI(model = "gpt-4o-mini", temperature = 0)

# Bind the Data schema as a function call; include_raw = False means only the parsed result is returned.
runnableSequence1 = chatOpenAI.with_structured_output(schema = Data, method = "function_calling", include_raw = False)

runnableSequence2 = chatPromptTemplate | runnableSequence1

responseData = runnableSequence2.invoke(
    {
        "text"     : "My name is Harrison. My hair is black.",
        "examples" : messageList,
    }
)

print(responseData)

"""
people=[Person(name='Harrison', hair_color='black', height_in_meters=None)]
"""

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

import uuid

from pydantic import BaseModel

from typing import Optional

from pydantic import Field

from typing import List

from typing import TypedDict

from langchain_core.messages import BaseMessage

from langchain_core.messages import HumanMessage

from langchain_core.messages import AIMessage

from langchain_core.messages import ToolMessage

from langchain_core.prompts import ChatPromptTemplate

from langchain_core.prompts import MessagesPlaceholder

from langchain_openai import ChatOpenAI

class Person(BaseModel):
    """Information about a person."""

    # NOTE: The class docstring above is sent to the LLM as the description of the Person schema,
    # so it is part of the prompt -- reword it only deliberately.
    #
    # Field conventions:
    # 1. Every field is Optional AND defaults to None, so the model can decline to extract a value
    #    (either by returning null or by omitting the attribute altogether).
    # 2. Every field carries a `description`, which is also sent to the LLM;
    #    a good description can help improve extraction results.
    name             : Optional[str] = Field(default = None, description = "The name of the person"                 )
    hair_color       : Optional[str] = Field(default = None, description = "The color of the person's hair if known")
    height_in_meters : Optional[str] = Field(default = None, description = "Height in METERs"                       )

class Data(BaseModel):
    """Extracted data about people."""

    # Wrapper schema: holding a list lets one extraction return any number of Person entities,
    # including none at all.
    people : List[Person]

class Example(TypedDict):
    """A representation of an example consisting of text input and expected tool calls.

    For extraction, the tool calls are represented as instances of pydantic model.
    """

    input          : str             # This is the example text
    tool_call_list : List[BaseModel] # Instances of pydantic model that should be extracted

def getMessageList(example : Example) -> List[BaseMessage]:
    """Convert an example into a list of messages that can be fed into an LLM.

    This is an adapter that turns one reference example into chat messages:

    1) HumanMessage : contains the text from which content should be extracted.
    2) AIMessage    : contains the expected extraction, expressed as tool calls.
    3) ToolMessage  : one per tool call, confirming to the model that it requested the tool correctly.

    The ToolMessage is required because some of the chat models are hyper-optimized for agents
    rather than for an extraction use case.

    Args:
        example: Mapping with "input" (the text) and "tool_call_list" (pydantic instances).
            A non-empty "tool_outputs" entry, if present, replaces the default confirmation text.

    Returns:
        The messages for this example; the list is also printed to stdout.
    """
    messageList : List[BaseMessage] = [HumanMessage(content = example["input"])]
    toolCallList = []
    for toolCall in example["tool_call_list"]:
        toolCallList.append(
            {
                # Random id that ties the tool call to its ToolMessage below.
                "id"   : str(uuid.uuid4()),
                "args" : toolCall.model_dump(),
                # The name of the function right now corresponds to the name of the pydantic model.
                # This is implicit in the API right now, and will be improved over time.
                "name" : toolCall.__class__.__name__
            }
        )
    messageList.append(AIMessage(content = "", tool_calls = toolCallList))
    # Fall back to one generic confirmation per tool call when no outputs were supplied.
    toolOutputList = example.get("tool_outputs") or ["You have correctly called this tool."] * len(toolCallList)
    for toolOutput, toolCall in zip(toolOutputList, toolCallList):
        toolMessage = ToolMessage(content = toolOutput, tool_call_id = toolCall["id"])
        messageList.append(toolMessage)
    print(messageList)
    return messageList

# Read the .env file so that OPENAI_API_KEY is available in the environment
# before the ChatOpenAI client is created below.
load_dotenv()

# Reference examples: (input text, expected extraction result).
exampleTupleList = [
    (
        "The ocean is vast and blue. It's more than 20,000 feet deep. There are many fish in it.",
        Data(people = []),
    ),
    (
        "Fiona traveled far from France to Spain.",
        Data(people = [Person(name = "Fiona", height_in_meters = None, hair_color = None)])
    )
]

# Flatten every example into one message list.
messageList = []

for text, data in exampleTupleList:
    messageList.extend(getMessageList({"input" : text, "tool_call_list" : [data]}))

# Prompt layout: system instruction, then the reference examples, then the text to process.
chatPromptTemplate = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert extraction algorithm. Only extract relevant information from the text. If you do not know the value of an attribute asked to extract, return null for the attribute's value."),
        MessagesPlaceholder("examples"),
        ("human", "{text}")
    ]
)

chatOpenAI = ChatOpenAI(model = "gpt-4o-mini", temperature = 0)

# Bind the Data schema as a function call; include_raw = False means only the parsed result is returned.
runnableSequence1 = chatOpenAI.with_structured_output(schema = Data, method = "function_calling", include_raw = False)

runnableSequence2 = chatPromptTemplate | runnableSequence1

responseData = runnableSequence2.invoke(
    {
        "text"     : "My name is Harrison. My hair is black.",
        "examples" : messageList,
    }
)

print(responseData)

"""
people=[Person(name='Harrison', hair_color='black', height_in_meters=None)]
"""

▶ requirements.txt


annotated-types==0.7.0
anyio==4.6.2.post1
certifi==2024.8.30
charset-normalizer==3.4.0
colorama==0.4.6
distro==1.9.0
h11==0.14.0
httpcore==1.0.7
httpx==0.28.0
idna==3.10
jiter==0.8.0
jsonpatch==1.33
jsonpointer==3.0.0
langchain-core==0.3.21
langchain-openai==0.2.11
langsmith==0.1.147
openai==1.56.2
orjson==3.10.12
packaging==24.2
pydantic==2.10.3
pydantic_core==2.27.1
python-dotenv==1.0.1
PyYAML==6.0.2
regex==2024.11.6
requests==2.32.3
requests-toolbelt==1.0.0
sniffio==1.3.1
tenacity==9.0.0
tiktoken==0.8.0
tqdm==4.67.1
typing_extensions==4.12.2
urllib3==2.2.3

annotated-types==0.7.0

anyio==4.6.2.post1

certifi==2024.8.30

charset-normalizer==3.4.0

colorama==0.4.6

distro==1.9.0

h11==0.14.0

httpcore==1.0.7

httpx==0.28.0

idna==3.10

jiter==0.8.0

jsonpatch==1.33

jsonpointer==3.0.0

langchain-core==0.3.21

langchain-openai==0.2.11

langsmith==0.1.147

openai==1.56.2

orjson==3.10.12

packaging==24.2

pydantic==2.10.3

pydantic_core==2.27.1

python-dotenv==1.0.1

PyYAML==6.0.2

regex==2024.11.6

requests==2.32.3

requests-toolbelt==1.0.0

sniffio==1.3.1

tenacity==9.0.0

tiktoken==0.8.0

tqdm==4.67.1

typing_extensions==4.12.2

urllib3==2.2.3

※ pip install python-dotenv langchain-openai 명령을 실행했다.