Bom dia Icaro,
Segue a árvore do projeto.
import PyPDF2
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import os
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Args:
        pdf_path: Filesystem path to a PDF file.

    Returns:
        A single string with the extracted text of all pages, in order.

    Note:
        Fixed to use the current PyPDF2 API (``PdfReader`` / ``.pages`` /
        ``extract_text()``); the legacy ``PdfFileReader`` / ``numPages`` /
        ``getPage`` / ``extractText`` names were deprecated and removed in
        PyPDF2 3.x, so the original code raises on modern installs.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # extract_text() may return None for pages with no text layer;
        # substitute '' so the join never fails.
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
def clean_and_tokenize(text):
    """Tokenize *text*, keeping only lowercased alphanumeric non-stopwords.

    Args:
        text: Raw text to tokenize.

    Returns:
        List of lowercase tokens with English stopwords and punctuation
        removed.
    """
    # Tokenization and stopword removal.
    stop_words = set(stopwords.words('english'))  # May need to adjust the language.
    cleaned = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # Keep only purely alphanumeric tokens that are not stopwords.
        if token.isalnum() and lowered not in stop_words:
            cleaned.append(lowered)
    return cleaned
def analyze_swot(tokens):
    """Rank token frequencies for a simple SWOT-style word analysis.

    Args:
        tokens: Iterable of word tokens.

    Returns:
        Tuple ``(strengths, weaknesses)`` of ``(word, count)`` pairs:
        the 5 most common words and the 5 least common words
        (least common listed in ascending frequency).
    """
    ranked = Counter(tokens).most_common()
    strengths = ranked[:5]          # 5 most common words as strengths
    weaknesses = ranked[:-6:-1]     # 5 least common words as weaknesses
    return strengths, weaknesses
def main(pdf_path):
    """Run the full pipeline: extract, tokenize, analyze, and print results.

    Args:
        pdf_path: Path to the PDF report to analyze.
    """
    raw_text = extract_text_from_pdf(pdf_path)
    words = clean_and_tokenize(raw_text)
    strengths, weaknesses = analyze_swot(words)
    print("Forças:")
    for word, count in strengths:
        print(f"{word}: {count}")
    print("\nFraquezas:")
    for word, count in weaknesses:
        print(f"{word}: {count}")
# Script entry point.
# Bug fixes vs. the original:
#   - `if name == "main"` lacked the dunder underscores (NameError at
#     import time); must be `__name__ == "__main__"`.
#   - The path assignment and the `if os.path.exists(...)` test were fused
#     onto a single line (SyntaxError), and `pdf_path` was effectively read
#     before being assigned.
if __name__ == "__main__":
    pdf_path = r"C:/Users/oleperei/Downloads/FLS_report-9m-2023.pdf"
    if os.path.exists(pdf_path):
        main(pdf_path)
    else:
        print(f"Erro: O arquivo não existe - {pdf_path}")