■ 우분투 운영 체제에서 긴 텍스트로부터 정형화된 데이터를 추출하는 방법을 보여준다. (무차별 대입 방식)
※ OPENAI_API_KEY 환경 변수 값은 .env 파일에 정의한다.
▶ main.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
"""Extract structured data from a long text with a brute-force approach.

The script downloads the Wikipedia "Car" article, converts the HTML to plain
text, splits the text into token-bounded chunks, runs a structured-output
extraction on the first few chunks in parallel, and merges the per-chunk
results into a single list.

The OPENAI_API_KEY environment variable is expected to come from a .env file.
"""

import re

import requests
from dotenv import load_dotenv
from langchain_community.document_loaders import BSHTMLLoader
from pydantic import BaseModel
from pydantic import Field
from typing import List
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_text_splitters import TokenTextSplitter

load_dotenv()

# Download the article. A timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() stops an HTTP error page from
# being saved and later fed to the extractor as if it were the article.
response = requests.get("https://en.wikipedia.org/wiki/Car", timeout=30)
response.raise_for_status()

with open("car.html", "w", encoding="utf-8") as textIOWrapper:
    textIOWrapper.write(response.text)

# Load the saved HTML as a document and keep the first (only) one.
bsHTMLLoader = BSHTMLLoader("car.html")

documentList = bsHTMLLoader.load()

document = documentList[0]

# Collapse runs of blank lines into a single newline.
document.page_content = re.sub(r"\n\n+", "\n", document.page_content)

# Split the text into chunks of 2000 tokens with a 20-token overlap.
tokenTextSplitter = TokenTextSplitter(chunk_size=2000, chunk_overlap=20)

textList = tokenTextSplitter.split_text(document.page_content)

# Only the first three chunks are processed in this example.
firstFewTextList = textList[:3]


class KeyDevelopment(BaseModel):
    """Information about a development in the history of cars."""

    year: int = Field(
        ...,
        description="The year when there was an important historic development.",
    )
    description: str = Field(
        ...,
        description="What happened in this year? What was the development?",
    )
    evidence: str = Field(
        ...,
        description="Repeat in verbatim the sentence(s) from which the year and description information were extracted",
    )


class ExtractionData(BaseModel):
    """Extracted information about key developments in the history of cars."""

    key_development_list: List[KeyDevelopment]


# Define a custom prompt that provides instructions and additional context.
# 1) Examples can be added to the prompt template to improve extraction quality.
# 2) Additional parameters can be introduced to take context into account
#    (e.g. metadata about the document the text was extracted from).
chatPromptTemplate = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert at identifying key historic development in text. "
            "Only extract important historic developments. "
            "Extract nothing if no important information can be found in the text.",
        ),
        ("human", "{text}"),
    ]
)

chatOpenAI = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Bind the output schema to the model, then chain the prompt in front of it.
runnableSequence1 = chatOpenAI.with_structured_output(schema=ExtractionData, include_raw=False)

runnableSequence2 = chatPromptTemplate | runnableSequence1

# Run the extraction over every selected chunk, at most 5 requests at a time.
extractionDataList = runnableSequence2.batch(
    [{"text": text} for text in firstFewTextList],
    {"max_concurrency": 5},
)

# Merge the per-chunk results into one flat list.
keyDevelopmentList = []

for extractionData in extractionDataList:
    # Defensive guard: skip a chunk for which no parsed object came back
    # instead of failing on the attribute access below.
    if extractionData is None:
        continue
    keyDevelopmentList.extend(extractionData.key_development_list)

print(keyDevelopmentList[:10])

# Sample output from one run, kept for reference.
"""
[
    KeyDevelopment(
        year = 1769,
        description = 'Nicolas-Joseph Cugnot built the first steam-powered road vehicle.',
        evidence = 'The French inventor Nicolas-Joseph Cugnot built the first steam-powered road vehicle in 1769.'
    ),
    KeyDevelopment(
        year = 1808,
        description = 'François Isaac de Rivaz designed and constructed the first internal combustion-powered automobile.',
        evidence = 'the Swiss inventor François Isaac de Rivaz designed and constructed the first internal combustion-powered automobile in 1808.'
    ),
    KeyDevelopment(
        year = 1886,
        description = 'Carl Benz patented his Benz Patent-Motorwagen, marking the invention of the modern car.',
        evidence = 'the modern car—a practical, marketable automobile for everyday use—was invented in 1886, when the German inventor Carl Benz patented his Benz Patent-Motorwagen.'
    ),
    KeyDevelopment(
        year = 1901,
        description = 'The Oldsmobile Curved Dash became widely considered the first mass-produced car.',
        evidence = 'The 1901 Oldsmobile Curved Dash and the 1908 Ford Model T, both American cars, are widely considered the first mass-produced and mass-affordable cars, respectively.'
    ),
    KeyDevelopment(
        year = 1908,
        description = 'The Ford Model T became widely considered the first mass-affordable car.',
        evidence = 'The 1901 Oldsmobile Curved Dash and the 1908 Ford Model T, both American cars, are widely considered the first mass-produced and mass-affordable cars, respectively.'
    ),
    KeyDevelopment(
        year = 1881,
        description = 'Gustave Trouvé demonstrated a three-wheeled car powered by electricity.',
        evidence = 'In November 1881, French inventor Gustave Trouvé demonstrated a three-wheeled car powered by electricity at the International Exposition of Electricity.'
    ),
    KeyDevelopment(
        year = 1879,
        description = 'Benz was granted a patent for his first engine, which made the use of the internal combustion engine feasible for powering a vehicle.',
        evidence = 'In 1879, Benz was granted a patent for his first engine, which had been designed in 1878.'
    ),
    KeyDevelopment(
        year = 1885,
        description = 'Benz built his first Motorwagen in Mannheim, Germany.',
        evidence = 'His first Motorwagen was built in 1885 in Mannheim, Germany.'
    ),
    KeyDevelopment(
        year = 1888,
        description = 'Benz began promotion of his vehicle and sold about 25 Benz vehicles between 1888 and 1893.',
        evidence = 'Benz began promotion of the vehicle on 3 July 1886, and about 25 Benz vehicles were sold between 1888 and 1893.'
    ),
    KeyDevelopment(
        year = 1893,
        description = 'Benz introduced his first four-wheeler along with a cheaper model.',
        evidence = 'when his first four-wheeler was introduced along with a cheaper model.'
    )
]
"""
▶ requirements.txt
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
aiohappyeyeballs==2.4.4 aiohttp==3.11.9 aiosignal==1.3.1 annotated-types==0.7.0 anyio==4.6.2.post1 async-timeout==4.0.3 attrs==24.2.0 beautifulsoup4==4.12.3 bs4==0.0.2 certifi==2024.8.30 charset-normalizer==3.4.0 dataclasses-json==0.6.7 distro==1.9.0 exceptiongroup==1.2.2 frozenlist==1.5.0 greenlet==3.1.1 h11==0.14.0 httpcore==1.0.7 httpx==0.28.0 httpx-sse==0.4.0 idna==3.10 jiter==0.8.0 jsonpatch==1.33 jsonpointer==3.0.0 langchain==0.3.9 langchain-community==0.3.9 langchain-core==0.3.21 langchain-openai==0.2.11 langchain-text-splitters==0.3.2 langsmith==0.1.147 lxml==5.3.0 marshmallow==3.23.1 multidict==6.1.0 mypy-extensions==1.0.0 numpy==1.26.4 openai==1.56.2 orjson==3.10.12 packaging==24.2 propcache==0.2.1 pydantic==2.10.3 pydantic-settings==2.6.1 pydantic_core==2.27.1 python-dotenv==1.0.1 PyYAML==6.0.2 regex==2024.11.6 requests==2.32.3 requests-toolbelt==1.0.0 sniffio==1.3.1 soupsieve==2.6 SQLAlchemy==2.0.36 tenacity==9.0.0 tiktoken==0.8.0 tqdm==4.67.1 typing-inspect==0.9.0 typing_extensions==4.12.2 urllib3==2.2.3 yarl==1.18.3 |
※ pip install python-dotenv langchain-community langchain-openai bs4 lxml 명령을 실행했다.