This script demonstrates a robust workflow for PDF processing, text extraction, sentence tokenization, and topic modeling with visualizations, tailored for efficient and insightful analysis.
import os
import re

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import pdftotext
import seaborn as sns
from cleantext import clean
from nltk.tokenize import sent_tokenize
from top2vec import Top2Vec
from wordcloud import WordCloud
Next, make sure the punkt tokenizer has been downloaded:
nltk.download('punkt')
def normalize_text(text):
    """Normalize text by removing special characters and extra spaces,
    and applying various other cleaning options."""
    # Apply the clean function with the specified parameters
    cleaned_text = clean(
        text,
        fix_unicode=True,          # fix various unicode errors
        to_ascii=True,             # transliterate to closest ASCII representation
        lower=True,                # lowercase text
        no_line_breaks=False,      # keep line breaks (only normalize them)
        no_urls=True,              # replace all URLs with a special token
        no_emails=True,            # replace all email addresses with a special token
        no_phone_numbers=True,     # replace all phone numbers with a special token
        no_numbers=True,           # replace all numbers with a special token
        no_digits=True,            # replace all digits with a special token
        no_currency_symbols=True,  # replace all currency symbols with a special token
        no_punct=False,            # keep punctuation
        lang="en",                 # set to 'de' for German special handling
    )
    # Remove any remaining special characters except word characters,
    # whitespace, and periods/commas
    cleaned_text = re.sub(r"[^\w\s.,]", "", cleaned_text)
    # Collapse runs of whitespace into a single space and strip leading/trailing spaces
    cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()
    return cleaned_text
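To see what the normalizer does end to end, here is a quick sanity check. The sample string is invented, and the exact placeholder tokens depend on the installed clean-text version, so treat the expected output as a rough shape rather than a guarantee:

sample = "Visit https://example.com or email [email protected], costs $5!"
print(normalize_text(sample))
# Roughly: the URL, email address, currency symbol, and number are replaced
# with placeholder tokens, everything is lowercased, and the trailing "!"
# is stripped by the final regex pass.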
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, "rb") as f:
        pdf = pdftotext.PDF(f)
    all_text = "\n\n".join(pdf)
    return normalize_text(all_text)
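Note that the pdftotext bindings require the Poppler C++ library to build. If they are unavailable in your environment, a drop-in alternative using pypdf could look like the sketch below. This is an assumption on my part, not part of the original script, and Poppler's layout fidelity is generally better:

# Fallback extractor using pypdf instead of pdftotext (a sketch, not the
# original pipeline); pages with no extractable text yield empty strings.
from pypdf import PdfReader

def extract_text_from_pdf_pypdf(pdf_path):
    reader = PdfReader(pdf_path)
    all_text = "\n\n".join(page.extract_text() or "" for page in reader.pages)
    return normalize_text(all_text)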
def split_into_sentences(text):
    return sent_tokenize(text)
def process_files(file_paths):
    authors, titles, all_sentences = [], [], []
    for file_path in file_paths:
        file_name = os.path.basename(file_path)
        parts = file_name.split(" - ", 2)
        if len(parts) != 3 or not file_name.endswith(".pdf"):
            print(f"Skipping file with incorrect format: {file_name}")
            continue
        year, author, title = parts
        author, title = author.strip(), title.replace(".pdf", "").strip()
        try:
            text = extract_text_from_pdf(file_path)
        except Exception as e:
            print(f"Error extracting text from {file_name}: {e}")
            continue
        sentences = split_into_sentences(text)
        authors.append(author)
        titles.append(title)
        all_sentences.extend(sentences)
        print(f"Number of sentences for {file_name}: {len(sentences)}")
    return authors, titles, all_sentences
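For reference, process_files expects file names of the form "<year> - <author> - <title>.pdf". A quick check of the split logic, using a made-up file name:

# Hypothetical file name illustrating the expected naming convention
name = "2005 - Hubert Dreyfus - Overcoming the Myth of the Mental.pdf"
year, author, title = name.split(" - ", 2)
print(year)                       # "2005"
print(author)                     # "Hubert Dreyfus"
print(title.replace(".pdf", ""))  # "Overcoming the Myth of the Mental"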
def save_data_to_csv(authors, titles, file_paths, output_file):
    # Note: this re-extracts every PDF, and it assumes process_files skipped
    # no files; otherwise authors/titles and texts will differ in length and
    # the DataFrame constructor will raise.
    texts = []
    for fp in file_paths:
        try:
            text = extract_text_from_pdf(fp)
            sentences = split_into_sentences(text)
            texts.append(" ".join(sentences))
        except Exception as e:
            print(f"Error processing file {fp}: {e}")
            texts.append("")
    data = pd.DataFrame({
        "Author": authors,
        "Title": titles,
        "Text": texts,
    })
    data.to_csv(output_file, index=False, quoting=1, encoding="utf-8")  # quoting=1 is csv.QUOTE_ALL
    print(f"Data has been written to {output_file}")
def load_stopwords(filepath):
    with open(filepath, "r") as f:
        stopwords = f.read().splitlines()
    additional_stopwords = [
        "able", "according", "act", "actually", "after", "again", "age", "agree", "al", "all",
        "already", "also", "am", "among", "an", "and", "another", "any", "appropriate", "are",
        "argue", "as", "at", "avoid", "based", "basic", "basis", "be", "been", "begin",
        "best", "book", "both", "build", "but", "by", "call", "can", "cant", "case",
        "cases", "claim", "claims", "class", "clear", "clearly", "cope", "could", "course", "data",
        "de", "deal", "dec", "did", "do", "doesnt", "done", "dont", "each", "early",
        "ed", "either", "end", "etc", "even", "ever", "every", "far", "feel", "few",
        "field", "find", "first", "follow", "follows", "for", "found", "free", "fri", "fully",
        "get", "had", "hand", "has", "have", "he", "help", "her", "here", "him",
        "his", "how", "however", "httpsabout", "ibid", "if", "im", "in", "is", "it",
        "its", "jstor", "june", "large", "lead", "least", "less", "like", "long", "look",
        "man", "many", "may", "me", "money", "more", "most", "move", "moves", "my",
        "neither", "net", "never", "new", "no", "nor", "not", "notes", "notion", "now",
        "of", "on", "once", "one", "ones", "only", "open", "or", "order", "orgterms",
        "other", "our", "out", "own", "paper", "past", "place", "plan", "play", "point",
        "pp", "precisely", "press", "put", "rather", "real", "require", "right", "risk", "role",
        "said", "same", "says", "search", "second", "see", "seem", "seems", "seen", "sees",
        "set", "shall", "she", "should", "show", "shows", "since", "so", "step", "strange",
        "style", "such", "suggests", "talk", "tell", "tells", "term", "terms", "than", "that",
        "the", "their", "them", "then", "there", "therefore", "these", "they", "this", "those",
        "three", "thus", "to", "todes", "together", "too", "tradition", "trans", "true", "try",
        "trying", "turn", "turns", "two", "up", "us", "use", "used", "uses", "using",
        "very", "view", "vol", "was", "way", "ways", "we", "web", "well", "were",
        "what", "when", "whether", "which", "who", "why", "with", "within", "works", "would",
        "years", "york", "you", "your", "without",
    ]
    stopwords.extend(additional_stopwords)
    return set(stopwords)
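load_stopwords assumes the file holds one word per line, since it splits on line breaks. A minimal stopwords.txt (these entries are illustrative, not from the original project) would look like:

ibid
jstor
press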
def filter_stopwords_from_topics(topic_words, stopwords):
    filtered_topics = []
    for words in topic_words:
        filtered_topics.append([word for word in words if word.lower() not in stopwords])
    return filtered_topics
def generate_wordcloud(topic_words, topic_num, palette="inferno"):
    colors = sns.color_palette(palette, n_colors=256).as_hex()

    # WordCloud passes its own random.Random instance as random_state,
    # so each word is drawn in a random color from the palette.
    def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
        return colors[random_state.randint(0, len(colors) - 1)]

    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color="black",
        color_func=color_func,
    ).generate(" ".join(topic_words))

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Topic {topic_num} Word Cloud")
    plt.show()
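As a quick smoke test of the plotting helper before running the full pipeline, you can pass it any word list; the one below is invented, whereas the real calls later pass Top2Vec's topic words:

generate_wordcloud(["phenomenology", "embodiment", "skill", "coping"], topic_num=0)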
file_paths = [f"/home/roomal/Desktop/Dreyfus-Project/Dreyfus/{fname}" for fname in os.listdir("/home/roomal/Desktop/Dreyfus-Project/Dreyfus/") if fname.endswith(".pdf")] authors, titles, all_sentences = process_files(file_paths) output_file = "/home/roomal/Desktop/Dreyfus-Project/Dreyfus_Papers.csv" save_data_to_csv(authors, titles, file_paths, output_file) stopwords_filepath = "/home/roomal/Documents/Lists/stopwords.txt" stopwords = load_stopwords(stopwords_filepath) try: topic_model = Top2Vec( all_sentences, embedding_model="distiluse-base-multilingual-cased", speed="deep-learn", workers=6 ) print("Top2Vec model created successfully.") except ValueError as e: print(f"Error initializing Top2Vec: {e}") except Exception as e: print(f"Unexpected error: {e}") num_topics = topic_model.get_num_topics() topic_words, word_scores, topic_nums = topic_model.get_topics(num_topics) filtered_topic_words = filter_stopwords_from_topics(topic_words, stopwords) for i, words in enumerate(filtered_topic_words): print(f"Topic {i}: {', '.join(words)}") keywords = ["heidegger"] topic_words, word_scores, topic_scores, topic_nums = topic_model.search_topics(keywords=keywords, num_topics=num_topics) filtered _search_topic_words = filter_stopwords_from_topics(topic_words, stopwords) for i, words in enumerate(filtered_search_topic_words): generate_wordcloud(words, topic_nums[i]) for i in range(reduced_num_topics): topic_words = topic_model.topic_words_reduced[i] filtered_words = [word for word in topic_words if word.lower() not in stopwords] print(f"Reduced Topic {i}: {', '.join(filtered_words)}") generate_wordcloud(filtered_words, i)
reduced_num_topics = 5
topic_mapping = topic_model.hierarchical_topic_reduction(num_topics=reduced_num_topics)

# Print reduced topics and generate word clouds
for i in range(reduced_num_topics):
    topic_words = topic_model.topic_words_reduced[i]
    filtered_words = [word for word in topic_words if word.lower() not in stopwords]
    print(f"Reduced Topic {i}: {', '.join(filtered_words)}")
    generate_wordcloud(filtered_words, i)