■ BeautifulSoupWebReader 클래스의 load_data 메소드를 사용해 웹 사이트에서 텍스트를 가져오는 데이터 커넥터를 설정하는 방법을 보여준다.
▶ main.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
# Loads the text of one web page, indexes it, and asks a question about it.
import os

from llama_index.core import download_loader, GPTVectorStoreIndex

# Placeholder value: replace with a real OpenAI API key before running.
os.environ["OPENAI_API_KEY"] = "<OPENAI_API_KEY>"

# Look up the reader class by name.
# NOTE(review): the pip "Requirement already satisfied" lines in the captured
# output appear to be emitted by this call - confirm against the installed
# llama-index version.
BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
web_reader = BeautifulSoupWebReader()

# Fetch the page(s) as documents.
page_urls = ["https://openai.com/blog/planning-for-agi-and-beyond"]
documents = web_reader.load_data(urls=page_urls)

# Build the vector index over the documents and expose it as a query engine.
query_engine = GPTVectorStoreIndex.from_documents(documents).as_query_engine()

# The question is in Korean and asks for a Korean answer; it is sent verbatim.
answer = query_engine.query("이 웹페이지에서 전하고 싶은 말은 무엇인가요? 한국어로 대답해 주세요.")
print(answer)
▶ 실행 결과
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
Requirement already satisfied: llama-index-readers-web in ./env/lib/python3.10/site-packages (0.1.18) Requirement already satisfied: beautifulsoup4<5.0.0,>=4.12.3 in ./env/lib/python3.10/site-packages (from llama-index-readers-web) (4.12.3) Requirement already satisfied: spider-client<0.0.12,>=0.0.11 in ./env/lib/python3.10/site-packages (from llama-index-readers-web) (0.0.11) Requirement already satisfied: aiohttp<4.0.0,>=3.9.1 in ./env/lib/python3.10/site-packages (from llama-index-readers-web) (3.9.5) Requirement already satisfied: urllib3>=1.1.0 in ./env/lib/python3.10/site-packages (from llama-index-readers-web) (2.2.1) Requirement already satisfied: html2text<2021.0.0,>=2020.1.16 in ./env/lib/python3.10/site-packages (from llama-index-readers-web) (2020.1.16) Requirement already satisfied: chromedriver-autoinstaller<0.7.0,>=0.6.3 in ./env/lib/python3.10/site-packages (from llama-index-readers-web) (0.6.4) Requirement already satisfied: playwright<2.0,>=1.30 in ./env/lib/python3.10/site-packages (from llama-index-readers-web) (1.44.0) Requirement already satisfied: requests<3.0.0,>=2.31.0 in ./env/lib/python3.10/site-packages (from llama-index-readers-web) (2.32.3) Requirement already satisfied: llama-index-core<0.11.0,>=0.10.1 in ./env/lib/python3.10/site-packages (from llama-index-readers-web) (0.10.43) Requirement already satisfied: newspaper3k<0.3.0,>=0.2.8 in ./env/lib/python3.10/site-packages (from llama-index-readers-web) (0.2.8) Requirement already satisfied: selenium<5.0.0,>=4.17.2 in ./env/lib/python3.10/site-packages (from llama-index-readers-web) (4.21.0) Requirement already satisfied: aiosignal>=1.1.2 in ./env/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.9.1->llama-index-readers-web) (1.3.1) Requirement already satisfied: attrs>=17.3.0 in ./env/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.9.1->llama-index-readers-web) (23.2.0) Requirement already satisfied: yarl<2.0,>=1.0 in ./env/lib/python3.10/site-packages (from 
aiohttp<4.0.0,>=3.9.1->llama-index-readers-web) (1.9.4) Requirement already satisfied: async-timeout<5.0,>=4.0 in ./env/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.9.1->llama-index-readers-web) (4.0.3) Requirement already satisfied: multidict<7.0,>=4.5 in ./env/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.9.1->llama-index-readers-web) (6.0.5) Requirement already satisfied: frozenlist>=1.1.1 in ./env/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.9.1->llama-index-readers-web) (1.4.1) Requirement already satisfied: soupsieve>1.2 in ./env/lib/python3.10/site-packages (from beautifulsoup4<5.0.0,>=4.12.3->llama-index-readers-web) (2.5) Requirement already satisfied: packaging>=23.1 in ./env/lib/python3.10/site-packages (from chromedriver-autoinstaller<0.7.0,>=0.6.3->llama-index-readers-web) (23.2) Requirement already satisfied: PyYAML>=6.0.1 in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (6.0.1) Requirement already satisfied: typing-inspect>=0.8.0 in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (0.9.0) Requirement already satisfied: openai>=1.1.0 in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (1.33.0) Requirement already satisfied: typing-extensions>=4.5.0 in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (4.12.2) Requirement already satisfied: tenacity<9.0.0,>=8.2.0 in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (8.3.0) Requirement already satisfied: tiktoken>=0.3.3 in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (0.7.0) Requirement already satisfied: nltk<4.0.0,>=3.8.1 in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (3.8.1) Requirement already satisfied: 
nest-asyncio<2.0.0,>=1.5.8 in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (1.6.0) Requirement already satisfied: pillow>=9.0.0 in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (10.3.0) Requirement already satisfied: dataclasses-json in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (0.6.6) Requirement already satisfied: numpy in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (1.26.4) Requirement already satisfied: deprecated>=1.2.9.3 in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (1.2.14) Requirement already satisfied: pandas in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (2.2.2) Requirement already satisfied: tqdm<5.0.0,>=4.66.1 in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (4.66.4) Requirement already satisfied: llamaindex-py-client<0.2.0,>=0.1.18 in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (0.1.19) Requirement already satisfied: SQLAlchemy[asyncio]>=1.4.49 in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (2.0.30) Requirement already satisfied: wrapt in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (1.16.0) Requirement already satisfied: dirtyjson<2.0.0,>=1.0.8 in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (1.0.8) Requirement already satisfied: httpx in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (0.27.0) Requirement already satisfied: networkx>=3.0 in ./env/lib/python3.10/site-packages (from 
llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (3.3) Requirement already satisfied: fsspec>=2023.5.0 in ./env/lib/python3.10/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (2024.6.0) Requirement already satisfied: tldextract>=2.0.1 in ./env/lib/python3.10/site-packages (from newspaper3k<0.3.0,>=0.2.8->llama-index-readers-web) (5.1.2) Requirement already satisfied: feedparser>=5.2.1 in ./env/lib/python3.10/site-packages (from newspaper3k<0.3.0,>=0.2.8->llama-index-readers-web) (6.0.11) Requirement already satisfied: feedfinder2>=0.0.4 in ./env/lib/python3.10/site-packages (from newspaper3k<0.3.0,>=0.2.8->llama-index-readers-web) (0.0.4) Requirement already satisfied: jieba3k>=0.35.1 in ./env/lib/python3.10/site-packages (from newspaper3k<0.3.0,>=0.2.8->llama-index-readers-web) (0.35.1) Requirement already satisfied: lxml>=3.6.0 in ./env/lib/python3.10/site-packages (from newspaper3k<0.3.0,>=0.2.8->llama-index-readers-web) (5.2.2) Requirement already satisfied: tinysegmenter==0.3 in ./env/lib/python3.10/site-packages (from newspaper3k<0.3.0,>=0.2.8->llama-index-readers-web) (0.3) Requirement already satisfied: cssselect>=0.9.2 in ./env/lib/python3.10/site-packages (from newspaper3k<0.3.0,>=0.2.8->llama-index-readers-web) (1.2.0) Requirement already satisfied: python-dateutil>=2.5.3 in ./env/lib/python3.10/site-packages (from newspaper3k<0.3.0,>=0.2.8->llama-index-readers-web) (2.9.0.post0) Requirement already satisfied: greenlet==3.0.3 in ./env/lib/python3.10/site-packages (from playwright<2.0,>=1.30->llama-index-readers-web) (3.0.3) Requirement already satisfied: pyee==11.1.0 in ./env/lib/python3.10/site-packages (from playwright<2.0,>=1.30->llama-index-readers-web) (11.1.0) Requirement already satisfied: charset-normalizer<4,>=2 in ./env/lib/python3.10/site-packages (from requests<3.0.0,>=2.31.0->llama-index-readers-web) (3.3.2) Requirement already satisfied: certifi>=2017.4.17 in ./env/lib/python3.10/site-packages (from 
requests<3.0.0,>=2.31.0->llama-index-readers-web) (2024.6.2) Requirement already satisfied: idna<4,>=2.5 in ./env/lib/python3.10/site-packages (from requests<3.0.0,>=2.31.0->llama-index-readers-web) (3.7) Requirement already satisfied: trio-websocket~=0.9 in ./env/lib/python3.10/site-packages (from selenium<5.0.0,>=4.17.2->llama-index-readers-web) (0.11.1) Requirement already satisfied: trio~=0.17 in ./env/lib/python3.10/site-packages (from selenium<5.0.0,>=4.17.2->llama-index-readers-web) (0.25.1) Requirement already satisfied: six in ./env/lib/python3.10/site-packages (from feedfinder2>=0.0.4->newspaper3k<0.3.0,>=0.2.8->llama-index-readers-web) (1.16.0) Requirement already satisfied: sgmllib3k in ./env/lib/python3.10/site-packages (from feedparser>=5.2.1->newspaper3k<0.3.0,>=0.2.8->llama-index-readers-web) (1.0.0) Requirement already satisfied: pydantic>=1.10 in ./env/lib/python3.10/site-packages (from llamaindex-py-client<0.2.0,>=0.1.18->llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (2.7.3) Requirement already satisfied: sniffio in ./env/lib/python3.10/site-packages (from httpx->llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (1.3.1) Requirement already satisfied: httpcore==1.* in ./env/lib/python3.10/site-packages (from httpx->llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (1.0.5) Requirement already satisfied: anyio in ./env/lib/python3.10/site-packages (from httpx->llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (4.4.0) Requirement already satisfied: h11<0.15,>=0.13 in ./env/lib/python3.10/site-packages (from httpcore==1.*->httpx->llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (0.14.0) Requirement already satisfied: click in ./env/lib/python3.10/site-packages (from nltk<4.0.0,>=3.8.1->llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (8.1.7) Requirement already satisfied: joblib in ./env/lib/python3.10/site-packages (from 
nltk<4.0.0,>=3.8.1->llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (1.4.2) Requirement already satisfied: regex>=2021.8.3 in ./env/lib/python3.10/site-packages (from nltk<4.0.0,>=3.8.1->llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (2024.5.15) Requirement already satisfied: distro<2,>=1.7.0 in ./env/lib/python3.10/site-packages (from openai>=1.1.0->llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (1.9.0) Requirement already satisfied: requests-file>=1.4 in ./env/lib/python3.10/site-packages (from tldextract>=2.0.1->newspaper3k<0.3.0,>=0.2.8->llama-index-readers-web) (2.1.0) Requirement already satisfied: filelock>=3.0.8 in ./env/lib/python3.10/site-packages (from tldextract>=2.0.1->newspaper3k<0.3.0,>=0.2.8->llama-index-readers-web) (3.14.0) Requirement already satisfied: exceptiongroup in ./env/lib/python3.10/site-packages (from trio~=0.17->selenium<5.0.0,>=4.17.2->llama-index-readers-web) (1.2.1) Requirement already satisfied: sortedcontainers in ./env/lib/python3.10/site-packages (from trio~=0.17->selenium<5.0.0,>=4.17.2->llama-index-readers-web) (2.4.0) Requirement already satisfied: outcome in ./env/lib/python3.10/site-packages (from trio~=0.17->selenium<5.0.0,>=4.17.2->llama-index-readers-web) (1.3.0.post0) Requirement already satisfied: wsproto>=0.14 in ./env/lib/python3.10/site-packages (from trio-websocket~=0.9->selenium<5.0.0,>=4.17.2->llama-index-readers-web) (1.2.0) Requirement already satisfied: mypy-extensions>=0.3.0 in ./env/lib/python3.10/site-packages (from typing-inspect>=0.8.0->llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (1.0.0) Requirement already satisfied: pysocks!=1.5.7,<2.0,>=1.5.6 in ./env/lib/python3.10/site-packages (from urllib3>=1.1.0->llama-index-readers-web) (1.7.1) Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in ./env/lib/python3.10/site-packages (from dataclasses-json->llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (3.21.3) Requirement already satisfied: 
pytz>=2020.1 in ./env/lib/python3.10/site-packages (from pandas->llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in ./env/lib/python3.10/site-packages (from pandas->llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (2024.1)
Requirement already satisfied: annotated-types>=0.4.0 in ./env/lib/python3.10/site-packages (from pydantic>=1.10->llamaindex-py-client<0.2.0,>=0.1.18->llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (0.7.0)
Requirement already satisfied: pydantic-core==2.18.4 in ./env/lib/python3.10/site-packages (from pydantic>=1.10->llamaindex-py-client<0.2.0,>=0.1.18->llama-index-core<0.11.0,>=0.10.1->llama-index-readers-web) (2.18.4)

이 웹페이지에서는 인공 일반 지능(AGI)에 대한 계획 및 그 이상에 대한 논의가 이루어지고 있습니다.
▶ requirements.txt
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
aiohttp==3.9.5
aiosignal==1.3.1
annotated-types==0.7.0
anyio==4.4.0
async-timeout==4.0.3
attrs==23.2.0
beautifulsoup4==4.12.3
certifi==2024.6.2
charset-normalizer==3.3.2
chromedriver-autoinstaller==0.6.4
click==8.1.7
cssselect==1.2.0
dataclasses-json==0.6.6
Deprecated==1.2.14
dirtyjson==1.0.8
distro==1.9.0
exceptiongroup==1.2.1
feedfinder2==0.0.4
feedparser==6.0.11
filelock==3.14.0
frozenlist==1.4.1
fsspec==2024.6.0
greenlet==3.0.3
h11==0.14.0
html2text==2020.1.16
httpcore==1.0.5
httpx==0.27.0
idna==3.7
jieba3k==0.35.1
joblib==1.4.2
jsonpatch==1.33
jsonpointer==2.4
langchain==0.2.3
langchain-core==0.2.5
langchain-text-splitters==0.2.1
langsmith==0.1.75
llama-index==0.10.43
llama-index-agent-openai==0.2.7
llama-index-cli==0.1.12
llama-index-core==0.10.43
llama-index-embeddings-openai==0.1.10
llama-index-indices-managed-llama-cloud==0.1.6
llama-index-legacy==0.9.48
llama-index-llms-openai==0.1.22
llama-index-multi-modal-llms-openai==0.1.6
llama-index-program-openai==0.1.6
llama-index-question-gen-openai==0.1.3
llama-index-readers-file==0.1.23
llama-index-readers-llama-parse==0.1.4
llama-index-readers-web==0.1.18
llama-parse==0.4.4
llamaindex-py-client==0.1.19
lxml==5.2.2
marshmallow==3.21.3
multidict==6.0.5
mypy-extensions==1.0.0
nest-asyncio==1.6.0
networkx==3.3
newspaper3k==0.2.8
nltk==3.8.1
numpy==1.26.4
openai==1.33.0
orjson==3.10.3
outcome==1.3.0.post0
packaging==23.2
pandas==2.2.2
pillow==10.3.0
playwright==1.44.0
pydantic==2.7.3
pydantic_core==2.18.4
pyee==11.1.0
pypdf==4.2.0
PySocks==1.7.1
python-dateutil==2.9.0.post0
pytz==2024.1
PyYAML==6.0.1
regex==2024.5.15
requests==2.32.3
requests-file==2.1.0
selenium==4.21.0
sgmllib3k==1.0.0
six==1.16.0
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.5
spider-client==0.0.11
SQLAlchemy==2.0.30
striprtf==0.0.26
tenacity==8.3.0
tiktoken==0.7.0
tinysegmenter==0.3
tldextract==5.1.2
tqdm==4.66.4
trio==0.25.1
trio-websocket==0.11.1
typing-inspect==0.9.0
typing_extensions==4.12.2
tzdata==2024.1
urllib3==2.2.1
wrapt==1.16.0
wsproto==1.2.0
yarl==1.9.4
※ pip install openai langchain llama-index llama-index-readers-web 명령을 실행했다.