■ 우분투 운영 체제에서 RAG 기반 접근 방식으로 긴 텍스트로부터 정형화된 데이터를 추출하는 방법을 보여준다.
※ OPENAI_API_KEY 환경 변수 값은 .env 파일에 정의한다.
▶ main.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
"""Extract structured data from a long text using a retrieval (RAG) based approach.

The script downloads the Wikipedia article on cars, splits its text into
chunks, indexes the chunks in a FAISS vector store, retrieves the single
best-matching chunk for a query, and asks an OpenAI chat model to return the
key historic developments found in that chunk as structured data.

The OPENAI_API_KEY environment variable is expected to be defined in a .env file.
"""

import re
from typing import List

import requests
from dotenv import load_dotenv
from langchain_community.document_loaders import BSHTMLLoader
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from pydantic import BaseModel
from pydantic import Field

# Load environment variables (e.g. OPENAI_API_KEY) from the .env file.
load_dotenv()

# Download the article. A timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() stops an HTTP error page from
# being saved and indexed as if it were the article.
response = requests.get("https://en.wikipedia.org/wiki/Car", timeout = 30)
response.raise_for_status()

with open("car.html", "w", encoding = "utf-8") as textIOWrapper:
    textIOWrapper.write(response.text)

# Read the file back with the same encoding it was written with, instead of
# relying on the platform default encoding.
bsHTMLLoader = BSHTMLLoader("car.html", open_encoding = "utf-8")

documentList = bsHTMLLoader.load()

document = documentList[0]

# Collapse every run of two or more newlines into a single newline.
document.page_content = re.sub("\n\n+", "\n", document.page_content)

# NOTE(review): CharacterTextSplitter splits on "\n\n" by default, and the
# substitution above removes every such run, so this may produce one oversized
# chunk rather than ~2000-character chunks - confirm, and consider passing
# separator = "\n" if real chunking is intended.
characterTextSplitter = CharacterTextSplitter(chunk_size = 2000, chunk_overlap = 20)

textList = characterTextSplitter.split_text(document.page_content)

openAIEmbeddings = OpenAIEmbeddings()

faiss = FAISS.from_texts(textList, embedding = openAIEmbeddings)

# Extract only from the first (top-ranked) retrieved document.
vectorStoreRetriever = faiss.as_retriever(search_kwargs = {"k" : 1})


# Schema for a single extracted development. The docstring and the Field
# descriptions are part of the schema handed to the model, so they are kept
# exactly as written.
class KeyDevelopment(BaseModel):
    """Information about a development in the history of cars."""

    year        : int = Field(..., description = "The year when there was an important historic development.")
    description : str = Field(..., description = "What happened in this year? What was the development?")
    evidence    : str = Field(..., description = "Repeat in verbatim the sentence(s) from which the year and description information were extracted")


# Top-level extraction result: the list of developments found in the text.
class ExtractionData(BaseModel):
    """Extracted information about key developments in the history of cars."""

    key_development_list : List[KeyDevelopment]


# Define a custom prompt that provides instructions and additional context.
# 1) Examples can be added to the prompt template to improve extraction quality.
# 2) Additional parameters can be introduced to take context into account
#    (e.g. metadata about the document the text was extracted from).
chatPromptTemplate = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert at identifying key historic development in text. Only extract important historic developments. Extract nothing if no important information can be found in the text."),
        ("human" , "{text}")
    ]
)

chatOpenAI = ChatOpenAI(model = "gpt-4o-mini", temperature = 0)

# Model wrapper that returns an ExtractionData instance instead of raw text.
runnableSequence1 = chatOpenAI.with_structured_output(schema = ExtractionData, include_raw = False)

# Prompt -> structured-output model.
runnableSequence2 = chatPromptTemplate | runnableSequence1

# Query -> retriever -> text of the first retrieved document.
runnableSequence3 = vectorStoreRetriever | (lambda retrievedDocumentList : retrievedDocumentList[0].page_content)

# Feed the retrieved text into the prompt's {text} placeholder.
runnableSequence4 = {"text" : runnableSequence3} | runnableSequence2

extractionData = runnableSequence4.invoke("Key developments associated with cars")

print(extractionData)

# Sample output recorded with the original listing (kept as a no-op string
# expression; actual results depend on the live article and the model).
"""
key_development_list = [
    KeyDevelopment(
        year = 1769,
        description = 'Nicolas-Joseph Cugnot built the first full-scale, self-propelled mechanical vehicle, a steam-powered tricycle.',
        evidence = 'Nicolas-Joseph Cugnot is widely credited with building the first full-scale, self-propelled mechanical vehicle in about 1769; he created a steam-powered tricycle.'
    ),
    KeyDevelopment(
        year = 1808,
        description = 'François Isaac de Rivaz designed and constructed the first internal combustion-powered automobile.',
        evidence = 'the Swiss inventor François Isaac de Rivaz designed and constructed the first internal combustion-powered automobile in 1808.'
    ),
    KeyDevelopment(
        year = 1886,
        description = 'Carl Benz patented his Benz Patent-Motorwagen, marking the birth year of the modern car.',
        evidence = 'the year 1886 is regarded as the birth year of the modern car—a practical, marketable automobile for everyday use—when the German Carl Benz patented his Benz Patent-Motorwagen.'
    ),
    KeyDevelopment(
        year = 1901,
        description = 'Ransom Olds started large-scale, production-line manufacturing of affordable cars at his Oldsmobile factory.',
        evidence = 'Large-scale, production-line manufacturing of affordable cars was started by Ransom Olds in 1901 at his Oldsmobile factory in Lansing, Michigan.'
    ),
    KeyDevelopment(
        year = 1913,
        description = "Henry Ford introduced the world's first moving assembly line for cars at the Highland Park Ford Plant.",
        evidence = "the world's first moving assembly line for cars at the Highland Park Ford Plant."
    ),
    KeyDevelopment(
        year = 1888,
        description = "Bertha Benz undertook the first road trip by car to prove the road-worthiness of her husband's invention.",
        evidence = "In August 1888, Bertha Benz, the wife and business partner of Carl Benz, undertook the first road trip by car, to prove the road-worthiness of her husband's invention."
    ),
    KeyDevelopment(
        year = 1895,
        description = 'George Selden was granted a US patent for a two-stroke car engine, which hindered the development of cars in the United States.',
        evidence = 'on 5 November 1895, Selden was granted a US patent (U.S. patent 549,160) for a two-stroke car engine, which hindered, more than encouraged, development of cars in the United States.'
    ),
    KeyDevelopment(
        year = 1893,
        description = 'The first running, petrol-driven American car was built and road-tested by the Duryea brothers.',
        evidence = 'In 1893, the first running, petrol-driven American car was built and road-tested by the Duryea brothers of Springfield, Massachusetts.'
    ),
    KeyDevelopment(
        year = 1896,
        description = 'Benz designed and patented the first internal-combustion flat engine, called boxermotor.',
        evidence = 'In 1896, Benz designed and patented the first internal-combustion flat engine, called boxermotor.'
    ),
    KeyDevelopment(
        year = 1890,
        description = 'Daimler and Maybach founded Daimler Motoren Gesellschaft (DMG) and sold their first car in 1892.',
        evidence = 'Daimler and Maybach founded Daimler Motoren Gesellschaft (DMG) in Cannstatt in 1890, and sold their first car in 1892 under the brand name Daimler.'
    )
]
"""
▶ requirements.txt
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
aiohappyeyeballs==2.4.4 aiohttp==3.11.9 aiosignal==1.3.1 annotated-types==0.7.0 anyio==4.6.2.post1 async-timeout==4.0.3 attrs==24.2.0 beautifulsoup4==4.12.3 bs4==0.0.2 certifi==2024.8.30 charset-normalizer==3.4.0 dataclasses-json==0.6.7 distro==1.9.0 exceptiongroup==1.2.2 faiss-gpu==1.7.2 frozenlist==1.5.0 greenlet==3.1.1 h11==0.14.0 httpcore==1.0.7 httpx==0.28.0 httpx-sse==0.4.0 idna==3.10 jiter==0.8.0 jsonpatch==1.33 jsonpointer==3.0.0 langchain==0.3.9 langchain-community==0.3.9 langchain-core==0.3.21 langchain-openai==0.2.11 langchain-text-splitters==0.3.2 langsmith==0.1.147 lxml==5.3.0 marshmallow==3.23.1 multidict==6.1.0 mypy-extensions==1.0.0 numpy==1.26.4 openai==1.56.2 orjson==3.10.12 packaging==24.2 propcache==0.2.1 pydantic==2.10.3 pydantic-settings==2.6.1 pydantic_core==2.27.1 python-dotenv==1.0.1 PyYAML==6.0.2 regex==2024.11.6 requests==2.32.3 requests-toolbelt==1.0.0 sniffio==1.3.1 soupsieve==2.6 SQLAlchemy==2.0.36 tenacity==9.0.0 tiktoken==0.8.0 tqdm==4.67.1 typing-inspect==0.9.0 typing_extensions==4.12.2 urllib3==2.2.3 yarl==1.18.3 |
※ pip install python-dotenv langchain-community langchain-openai bs4 lxml faiss-gpu 명령을 실행했다.