import re
from bs4 import BeautifulSoup
from langchain_community.document_loaders import PDFMinerPDFasHTMLLoader
from langchain_core.documents import Document
# Load "nke-10k-2023.pdf" through the PDFMiner-as-HTML loader and keep the
# first Document it returns; the font-size analysis further down reads the
# inline CSS found in that document's page_content.
pdfMinerPDFasHTMLLoader = PDFMinerPDFasHTMLLoader(file_path="nke-10k-2023.pdf")
documentList = pdfMinerPDFasHTMLLoader.load()
document = documentList[0]

# Parse the page content as HTML with the standard-library-backed parser.
beautifulSoup = BeautifulSoup(markup=document.page_content, features="html.parser")
# Collapse consecutive <div> elements that share a font size into
# (text, fontSize) snippets.
#
# The font size of a div is read from the inline style of its first <span>;
# divs without a span, without a style attribute, or without a
# "font-size:<n>px" declaration are skipped entirely.

# Raw string so that "\d" reaches the regex engine instead of being treated as
# an (invalid) string escape; compiled once because it is reused for every div.
fontSizePattern = re.compile(r"font-size:(\d+)px")

resultSet = beautifulSoup.find_all("div")
currentFontSize = None  # font size of the run being accumulated; None until the first sized div
currentText = ""
snippetList = []  # list of (text, fontSize) tuples

for result in resultSet:
    span = result.find("span")
    if span is None:
        continue
    style = span.get("style")
    if not style:
        continue
    fontSizeMatch = fontSizePattern.search(style)
    if fontSizeMatch is None:
        continue
    fontSize = int(fontSizeMatch.group(1))

    # Compare against None explicitly so that a legitimate size of 0 is not
    # mistaken for "no run started yet".
    if currentFontSize is None:
        currentFontSize = fontSize

    if fontSize == currentFontSize:
        # Same font as the running snippet: keep accumulating text.
        currentText += result.text
    else:
        # Font changed: close the running snippet and start a new one.
        snippetList.append((currentText, currentFontSize))
        currentFontSize = fontSize
        currentText = result.text

# Flush the last run, but only if at least one sized div was seen; otherwise
# there is nothing to emit and a ("", None) placeholder would be misleading.
if currentFontSize is not None:
    snippetList.append((currentText, currentFontSize))
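# Debug aid: uncomment the two lines below to print the first three
# (text, fontSize) snippets; the string literal that follows appears to
# record the resulting output.
# for snippet in snippetList[:3]:
#     print(snippet)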
"""
('FORM 10-K\nFORM 10-K\n', 29)
('UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\n', 7)
('FORM 10-K \n', 10)
"""
# Group the (text, fontSize) snippets into section Documents.
#
# Each Document stores the section body in page_content and carries metadata
# with the keys "heading", "heading_font" and "content_font" (0 until the
# section receives its first content), merged with the source document's
# own metadata.
#
# Assumption: a heading is set in a larger font than the content it introduces.
currentIndex = -1  # index of the section being filled; -1 while no section exists
semanticSnippetDocumentList = []

for snippetText, snippetFont in snippetList:
    belongsToCurrentSection = False

    if semanticSnippetDocumentList:
        currentSection = semanticSnippetDocumentList[currentIndex]
        # A font larger than the current section's heading always starts a
        # new heading, so only fonts up to the heading size can be content.
        if snippetFont <= currentSection.metadata["heading_font"]:
            contentFont = currentSection.metadata["content_font"]
            # Font no larger than the section's content font (or the section
            # has no content yet): the text belongs to the same section.
            # (A tree-like structure for sub-sections could be built if
            # needed, but that takes more thought and may be data-specific.)
            belongsToCurrentSection = not contentFont or snippetFont <= contentFont

    if belongsToCurrentSection:
        currentSection.page_content += snippetText
        currentSection.metadata["content_font"] = max(snippetFont, contentFont)
        continue

    # Either there is no section yet, the font is larger than the current
    # heading, or it is larger than the current content but not larger than
    # the current heading. In every one of these cases the snippet becomes the
    # heading of a new section (e.g. the PDF title has the largest font, but
    # it should not swallow all of the sections that follow it).
    headingMetadata = {
        "heading": snippetText,
        "content_font": 0,
        "heading_font": snippetFont,
        **document.metadata,
    }
    semanticSnippetDocumentList.append(Document(page_content="", metadata=headingMetadata))
    currentIndex += 1
# Preview the body text of the first three sections, each followed by a
# blank line.
for sectionDocument in semanticSnippetDocumentList[:3]:
    print(sectionDocument.page_content, end="\n\n")
"""
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
(Mark One)
☑ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE FISCAL YEAR ENDED MAY 31, 2023
OR
☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE TRANSITION PERIOD FROM
TO
.
Commission File No. 1-10635
(Exact name of Registrant as specified in its charter)
Oregon
(State or other jurisdiction of incorporation)
93-0584541
(IRS Employer Identification No.)
One Bowerman Drive, Beaverton, Oregon 97005-6453
(Address of principal executive offices and zip code)
(503) 671-6453
(Registrant's telephone number, including area code)
SECURITIES REGISTERED PURSUANT TO SECTION 12(B) OF THE ACT:
Class B Common Stock
(Title of each class)
NKE
New York Stock Exchange
(Trading symbol)
(Name of each exchange on which registered)
SECURITIES REGISTERED PURSUANT TO SECTION 12(G) OF THE ACT:
NONE
Indicate by check mark:
• if the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.
• if the registrant is not required to file reports pursuant to Section 13 or Section 15(d) of the Act.
• whether the registrant (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities
Exchange Act of 1934 during the preceding 12 months (or for such shorter period that the registrant was required
to file such reports), and (2) has been subject to such filing requirements for the past 90 days.
"""