■ 윈도우즈 운영 체제 환경에서, 긴 텍스트로부터 정형화된 데이터를 추출하는 방법을 보여준다. (무차별 대입 방식)
※ OPENAI_API_KEY 환경 변수 값은 .env 파일에 정의한다.
▶ main.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
"""Extract structured data from a long text with an LLM (brute-force approach).

The script downloads the Wikipedia "Car" article, strips the HTML down to
plain text, splits the text into token-sized chunks, and asks an OpenAI chat
model to extract key historic developments from each chunk as structured
(pydantic) objects.

The OPENAI_API_KEY environment variable is expected to be defined in a .env
file next to the script.
"""

import re
from typing import List

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain.schema import Document
from langchain_community.document_loaders import BSHTMLLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_text_splitters import TokenTextSplitter
from pydantic import BaseModel
from pydantic import Field

load_dotenv()

# Download the article. A descriptive User-Agent is sent because Wikimedia's
# User-Agent policy asks clients to identify themselves, a timeout keeps the
# script from hanging forever, and raise_for_status() stops an HTTP error page
# from being saved and later fed to the model as if it were the article.
response = requests.get(
    "https://en.wikipedia.org/wiki/Car",
    headers = {"User-Agent" : "long-text-extraction-example/1.0 (python-requests)"},
    timeout = 30
)
response.raise_for_status()

with open("car.html", "w", encoding = "utf-8") as textIOWrapper:
    textIOWrapper.write(response.text)

class CustomBSHTMLLoader(BSHTMLLoader):
    """HTML loader that reads the file with an explicit encoding.

    The file is read as UTF-8 first; if that fails with a decoding error, it
    is read again as CP949.
    NOTE(review): presumably this override exists because the stock loader
    would otherwise use the platform default encoding on Windows - confirm.
    """

    def _read_text(self, encoding : str) -> str:
        """Parse the file at self.file_path with html.parser and return its plain text."""
        with open(self.file_path, "r", encoding = encoding) as htmlFile:
            return BeautifulSoup(htmlFile, "html.parser").get_text()

    def load(self) -> list[Document]:
        """Load the HTML file and return its text as a single-document list.

        Returns:
            A list with one Document whose page_content is the text extracted
            from the HTML and whose metadata holds the source file path.

        Raises:
            UnicodeDecodeError: if the file decodes as neither UTF-8 nor CP949.
        """
        try:
            text = self._read_text("utf-8")
        except UnicodeDecodeError:
            print("UTF-8 디코딩 중 오류가 발생했습니다.")
            text = self._read_text("cp949")

        metadata = {"source" : self.file_path}

        return [Document(page_content = text, metadata = metadata)]

customBSHTMLLoader = CustomBSHTMLLoader("car.html")

documentList = customBSHTMLLoader.load()

document = documentList[0]

# Collapse runs of blank lines into a single newline.
document.page_content = re.sub(r"\n\n+", "\n", document.page_content)

tokenTextSplitter = TokenTextSplitter(chunk_size = 2000, chunk_overlap = 20)

textList = tokenTextSplitter.split_text(document.page_content)

firstFewTextList = textList[:3] # Only the first 3 chunks are used so the example runs quickly.

# NOTE: the docstrings and Field descriptions of the two models below are part
# of the schema handed to the model, so they are runtime text, not comments.
class KeyDevelopment(BaseModel):
    """Information about a development in the history of cars."""

    year        : int = Field(..., description = "The year when there was an important historic development.")
    description : str = Field(..., description = "What happened in this year? What was the development?")
    evidence    : str = Field(..., description = "Repeat in verbatim the sentence(s) from which the year and description information were extracted")

class ExtractionData(BaseModel):
    """Extracted information about key developments in the history of cars."""

    key_development_list : List[KeyDevelopment]

# Define a custom prompt that provides instructions and additional context.
# 1) Examples can be added to the prompt template to improve extraction quality.
# 2) Additional parameters can be introduced to take context into account
#    (e.g. metadata about the document the text was extracted from).
chatPromptTemplate = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert at identifying key historic development in text. Only extract important historic developments. Extract nothing if no important information can be found in the text."),
        ("human" , "{text}")
    ]
)

chatOpenAI = ChatOpenAI(model = "gpt-4o-mini", temperature = 0)

runnableSequence1 = chatOpenAI.with_structured_output(schema = ExtractionData, include_raw = False)

runnableSequence2 = chatPromptTemplate | runnableSequence1

# Run the extraction chain over every chunk, at most 5 requests at a time.
extractionDataList = runnableSequence2.batch([{"text" : text} for text in firstFewTextList], {"max_concurrency" : 5})

keyDevelopmentList = []

for extractionData in extractionDataList:
    # Defensive guard so one empty result does not abort the whole run.
    # NOTE(review): the structured-output parser may yield None when the
    # model returns no tool call - confirm against the LangChain docs.
    if extractionData is None:
        continue
    keyDevelopmentList.extend(extractionData.key_development_list)

print(keyDevelopmentList[:10])

# Sample output of the print() call above, kept as a no-op string literal.
"""
[
    KeyDevelopment(
        year = 1769,
        description = 'The French inventor Nicolas-Joseph Cugnot built the first steam-powered road vehicle.',
        evidence = 'The French inventor Nicolas-Joseph Cugnot built the first steam-powered road vehicle in 1769.'
    ),
    KeyDevelopment(
        year = 1808,
        description = 'The Swiss inventor François Isaac de Rivaz designed and constructed the first internal combustion-powered automobile.',
        evidence = 'the Swiss inventor François Isaac de Rivaz designed and constructed the first internal combustion-powered automobile in 1808.'
    ),
    KeyDevelopment(
        year = 1886,
        description = 'The modern car was invented when the German inventor Carl Benz patented his Benz Patent-Motorwagen.',
        evidence = 'the modern car—a practical, marketable automobile for everyday use—was invented in 1886, when the German inventor Carl Benz patented his Benz Patent-Motorwagen.'
    ),
    KeyDevelopment(
        year = 1901,
        description = 'The 1901 Oldsmobile Curved Dash is widely considered the first mass-produced car.',
        evidence = 'The 1901 Oldsmobile Curved Dash and the 1908 Ford Model T, both American cars, are widely considered the first mass-produced and mass-affordable cars, respectively.'
    ),
    KeyDevelopment(
        year = 1908,
        description = 'The 1908 Ford Model T is widely considered the first mass-affordable car.',
        evidence = 'The 1901 Oldsmobile Curved Dash and the 1908 Ford Model T, both American cars, are widely considered the first mass-produced and mass-affordable cars, respectively.'
    ),
    KeyDevelopment(
        year = 1881,
        description = 'French inventor Gustave Trouvé demonstrated a three-wheeled car powered by electricity.',
        evidence = 'In November 1881, French inventor Gustave Trouvé demonstrated a three-wheeled car powered by electricity at the International Exposition of Electricity.'
    ),
    KeyDevelopment(
        year = 1879,
        description = 'Benz was granted a patent for his first engine, which had been designed in 1878.',
        evidence = 'In 1879, Benz was granted a patent for his first engine, which had been designed in 1878.'
    ),
    KeyDevelopment(
        year = 1885,
        description = 'Benz built his first Motorwagen in Mannheim, Germany.',
        evidence = 'His first Motorwagen was built in 1885 in Mannheim, Germany.'
    ),
    KeyDevelopment(
        year = 1888,
        description = 'Benz began promotion of the vehicle and about 25 Benz vehicles were sold between 1888 and 1893.',
        evidence = 'Benz began promotion of the vehicle on 3 July 1886, and about 25 Benz vehicles were sold between 1888 and 1893.'
    ),
    KeyDevelopment(
        year = 1893,
        description = 'Benz introduced his first four-wheeler along with a cheaper model.',
        evidence = 'when his first four-wheeler was introduced along with a cheaper model.'
    )
]
"""
▶ requirements.txt
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
aiohappyeyeballs==2.4.4
aiohttp==3.11.9
aiosignal==1.3.1
annotated-types==0.7.0
anyio==4.6.2.post1
attrs==24.2.0
beautifulsoup4==4.12.3
bs4==0.0.2
certifi==2024.8.30
charset-normalizer==3.4.0
colorama==0.4.6
dataclasses-json==0.6.7
distro==1.9.0
frozenlist==1.5.0
greenlet==3.1.1
h11==0.14.0
httpcore==1.0.7
httpx==0.28.0
httpx-sse==0.4.0
idna==3.10
jiter==0.8.0
jsonpatch==1.33
jsonpointer==3.0.0
langchain==0.3.9
langchain-community==0.3.9
langchain-core==0.3.21
langchain-openai==0.2.11
langchain-text-splitters==0.3.2
langsmith==0.1.147
lxml==5.3.0
marshmallow==3.23.1
multidict==6.1.0
mypy-extensions==1.0.0
numpy==2.1.3
openai==1.56.2
orjson==3.10.12
packaging==24.2
propcache==0.2.1
pydantic==2.10.3
pydantic-settings==2.6.1
pydantic_core==2.27.1
python-dotenv==1.0.1
PyYAML==6.0.2
regex==2024.11.6
requests==2.32.3
requests-toolbelt==1.0.0
sniffio==1.3.1
soupsieve==2.6
SQLAlchemy==2.0.36
tenacity==9.0.0
tiktoken==0.8.0
tqdm==4.67.1
typing-inspect==0.9.0
typing_extensions==4.12.2
urllib3==2.2.3
yarl==1.18.3
※ pip install python-dotenv langchain-community langchain-openai bs4 lxml 명령을 실행했다.