# PDF proximity word search (pdfminer) - ChatGPT
import io
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
import re
import nltk
import numpy as np
# Convert every page of the PDF to plain text and keep it in `pdf_text`.
rsrcmgr = PDFResourceManager()
codec = 'utf-8'
laparams = LAParams()
# The context managers close the input file and the in-memory text buffer
# even if pdfminer raises while a page is being processed.
with open('example.pdf', 'rb') as pdf_file, io.StringIO() as retstr:
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(pdf_file):
            interpreter.process_page(page)
        # Read the buffer before it is closed on leaving the `with` block.
        pdf_text = retstr.getvalue()
    finally:
        device.close()
# Split the extracted text into sentences.
sentences = nltk.sent_tokenize(pdf_text)
# Tokenize each sentence into its own list of words (one list per sentence).
words = list(map(nltk.word_tokenize, sentences))
# Report every sentence in which word2 occurs within `window` tokens of word1.
word1 = 'apple'
word2 = 'juice'
window = 5  # maximum token distance allowed on either side of word1
for sent_words in words:
    # any() stops at the first qualifying occurrence of word1, so a sentence
    # is printed at most once even if word1 appears near word2 several times.
    # Slicing clamps the upper bound to the sentence length on its own; only
    # the lower bound needs max() to avoid wrapping to a negative index.
    # The exact comparison with word2 already rules out punctuation tokens
    # for an ordinary search word, so no separate regex filter is needed.
    found = any(
        token == word1
        and word2 in sent_words[max(0, i - window):i + window + 1]
        for i, token in enumerate(sent_words)
    )
    if found:
        print(f"{word1} and {word2} are found in the same sentence: {' '.join(sent_words)}")