Этот скрипт демонстрирует мощный рабочий процесс для обработки PDF-файлов, извлечения текста, токенизации предложений и выполнения тематического моделирования с визуализацией, предназначенный для эффективного и глубокого анализа.
import csv
import os
import re

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import pdftotext
import seaborn as sns
from cleantext import clean
from nltk.tokenize import sent_tokenize
from top2vec import Top2Vec
from wordcloud import WordCloud
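Далее убедитесь, что токенизатор punkt загружен (например, выполнив `nltk.download("punkt")`): без него `sent_tokenize` завершится ошибкой `LookupError`.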
def normalize_text(text):
    """Normalize raw text before sentence splitting and topic modelling.

    The text goes through three steps:

    1. ``cleantext.clean`` with the options listed below.
    2. Removal of every character that is not a word character, whitespace,
       a period or a comma.
    3. Collapsing of every run of whitespace (including line breaks) into a
       single space, followed by stripping leading/trailing whitespace.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text as a single string.
    """
    cleaned_text = clean(
        text,
        fix_unicode=True,  # fix various unicode errors
        to_ascii=True,  # transliterate to closest ASCII representation
        lower=True,  # lowercase text
        no_line_breaks=False,  # fully strip line breaks as opposed to only normalizing them
        no_urls=True,  # replace all URLs with a special token
        no_emails=True,  # replace all email addresses with a special token
        no_phone_numbers=True,  # replace all phone numbers with a special token
        no_numbers=True,  # replace all numbers with a special token
        no_digits=True,  # replace all digits with a special token
        no_currency_symbols=True,  # replace all currency symbols with a special token
        no_punct=False,  # remove punctuations
        lang="en",  # set to 'de' for German special handling
    )

    # Drop any remaining special characters; keep word characters,
    # whitespace, periods and commas so sentence boundaries survive.
    cleaned_text = re.sub(r"[^\w\s.,]", "", cleaned_text)

    # The quantifier matters: "\s+" collapses any run of whitespace
    # (spaces, tabs, newlines) into one space. The previous pattern "\s "
    # only matched a whitespace character followed by a literal space, so
    # newlines and longer runs were left in place.
    return re.sub(r"\s+", " ", cleaned_text).strip()
def extract_text_from_pdf(pdf_path):
    """Return the normalized text of the PDF stored at *pdf_path*.

    The file is opened in binary mode and handed to ``pdftotext.PDF``; the
    items it yields are joined with a blank line between them and the result
    is passed through ``normalize_text``.
    """
    with open(pdf_path, "rb") as pdf_file:
        document = pdftotext.PDF(pdf_file)
        raw_text = "\n\n".join(document)
    normalized = normalize_text(raw_text)
    return normalized
def split_into_sentences(text):
    """Split *text* into sentences with NLTK's ``sent_tokenize``."""
    sentences = sent_tokenize(text)
    return sentences
def _parse_pdf_file_name(file_name):
    """Return ``(author, title)`` from ``"<year> - <author> - <title>.pdf"``, or ``None``."""
    suffix = ".pdf"
    parts = file_name.split(" - ", 2)
    if len(parts) != 3 or not file_name.endswith(suffix):
        return None
    _year, author, title = parts
    # The title is the tail of the file name, so it ends with the suffix.
    # Slice off only that trailing extension; str.replace would also delete
    # any ".pdf" occurring inside the title itself.
    title = title[: -len(suffix)]
    return author.strip(), title.strip()


def process_files(file_paths):
    """Extract authors, titles and sentences from a collection of PDF files.

    File names must follow ``"<year> - <author> - <title>.pdf"``. Files whose
    name does not match, or whose text cannot be extracted, are reported on
    stdout and skipped.

    Args:
        file_paths: Iterable of paths to PDF files.

    Returns:
        A tuple ``(authors, titles, all_sentences)``. ``authors`` and
        ``titles`` hold one entry per successfully processed file, in input
        order; ``all_sentences`` is the flat list of sentences of all
        processed files.
    """
    authors, titles, all_sentences = [], [], []

    for file_path in file_paths:
        file_name = os.path.basename(file_path)

        parsed = _parse_pdf_file_name(file_name)
        if parsed is None:
            print(f"Skipping file with incorrect format: {file_name}")
            continue
        author, title = parsed

        # Best effort: one unreadable PDF must not abort the whole batch.
        try:
            text = extract_text_from_pdf(file_path)
        except Exception as e:
            print(f"Error extracting text from {file_name}: {e}")
            continue

        sentences = split_into_sentences(text)
        authors.append(author)
        titles.append(title)
        all_sentences.extend(sentences)
        print(f"Number of sentences for {file_name}: {len(sentences)}")

    return authors, titles, all_sentences
def save_data_to_csv(authors, titles, file_paths, output_file):
    """Write one CSV row (author, title, full text) per file.

    The text of every file in *file_paths* is extracted again, split into
    sentences and re-joined with single spaces. A file that cannot be
    processed is reported on stdout and gets an empty ``Text`` value, so the
    rows stay aligned with *authors* and *titles*.

    Args:
        authors: Author names, one per entry of *file_paths*.
        titles: Titles, one per entry of *file_paths*.
        file_paths: Paths of the PDF files, in the same order.
        output_file: Destination path of the CSV file.

    Raises:
        ValueError: If *authors*, *titles* and *file_paths* differ in length.
    """
    # Fail before the expensive extraction pass; otherwise the mismatch only
    # surfaces later as a generic pandas error.
    if not (len(authors) == len(titles) == len(file_paths)):
        raise ValueError(
            "authors, titles and file_paths must have the same length "
            f"(got {len(authors)}, {len(titles)} and {len(file_paths)})"
        )

    texts = []
    for fp in file_paths:
        try:
            text = extract_text_from_pdf(fp)
            sentences = split_into_sentences(text)
            texts.append(" ".join(sentences))
        except Exception as e:
            print(f"Error processing file {fp}: {e}")
            texts.append("")

    data = pd.DataFrame({"Author": authors, "Title": titles, "Text": texts})
    # Quote every field: the text column contains commas.
    data.to_csv(output_file, index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
    print(f"Data has been written to {output_file}")
# Corpus-specific stopwords added on top of the ones read from file.
_ADDITIONAL_STOPWORDS = (
    "able", "according", "act", "actually", "after", "again", "age", "agree",
    "al", "all", "already", "also", "am", "among", "an", "and", "another",
    "any", "appropriate", "are", "argue", "as", "at", "avoid", "based",
    "basic", "basis", "be", "been", "begin", "best", "book", "both", "build",
    "but", "by", "call", "can", "cant", "case", "cases", "claim", "claims",
    "class", "clear", "clearly", "cope", "could", "course", "data", "de",
    "deal", "dec", "did", "do", "doesnt", "done", "dont", "each", "early",
    "ed", "either", "end", "etc", "even", "ever", "every", "far", "feel",
    "few", "field", "find", "first", "follow", "follows", "for", "found",
    "free", "fri", "fully", "get", "had", "hand", "has", "have", "he", "help",
    "her", "here", "him", "his", "how", "however", "httpsabout", "ibid", "if",
    "im", "in", "is", "it", "its", "jstor", "june", "large", "lead", "least",
    "less", "like", "long", "look", "man", "many", "may", "me", "money",
    "more", "most", "move", "moves", "my", "neither", "net", "never", "new",
    "no", "nor", "not", "notes", "notion", "now", "of", "on", "once", "one",
    "ones", "only", "open", "or", "order", "orgterms", "other", "our", "out",
    "own", "paper", "past", "place", "plan", "play", "point", "pp",
    "precisely", "press", "put", "rather", "real", "require", "right", "risk",
    "role", "said", "same", "says", "search", "second", "see", "seem",
    "seems", "seen", "sees", "set", "shall", "she", "should", "show", "shows",
    "since", "so", "step", "strange", "style", "such", "suggests", "talk",
    "tell", "tells", "term", "terms", "than", "that", "the", "their", "them",
    "then", "there", "therefore", "these", "they", "this", "those", "three",
    "thus", "to", "todes", "together", "too", "tradition", "trans", "true",
    "try", "trying", "turn", "turns", "two", "up", "us", "use", "used",
    "uses", "using", "very", "view", "vol", "was", "way", "ways", "we", "web",
    "well", "were", "what", "when", "whether", "which", "who", "why", "with",
    "within", "works", "would", "years", "york", "you", "your", "without",
)


def load_stopwords(filepath):
    """Load stopwords from a text file and merge in the built-in extras.

    The file is read as UTF-8 with one stopword per line. Entries are
    stripped of surrounding whitespace and lowercased, because
    ``filter_stopwords_from_topics`` compares against ``word.lower()``;
    an entry with capitals or trailing spaces would otherwise never match.
    Blank lines are ignored.

    Args:
        filepath: Path of the stopword file.

    Returns:
        A set containing the file's stopwords plus ``_ADDITIONAL_STOPWORDS``.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        stopwords = {line.strip().lower() for line in f}
    stopwords.discard("")  # produced by blank or whitespace-only lines
    stopwords.update(_ADDITIONAL_STOPWORDS)
    return stopwords
def filter_stopwords_from_topics(topic_words, stopwords):
    """Return a copy of each topic's word list without stopwords.

    A word is dropped when its lowercase form is contained in *stopwords*.
    The result is a list with one list of kept words per topic, preserving
    the order of both topics and words.
    """
    def is_kept(candidate):
        return candidate.lower() not in stopwords

    return [list(filter(is_kept, topic)) for topic in topic_words]
def generate_wordcloud(topic_words, topic_num, palette='inferno'):
    """Render and show a word cloud for one topic.

    Args:
        topic_words: Iterable of words belonging to the topic.
        topic_num: Topic number, used in the plot title.
        palette: Name of the seaborn palette the word colours are drawn from.

    Returns:
        None. When there is no text to plot (for example because every word
        was removed as a stopword) a message is printed and nothing is drawn.
    """
    text = ' '.join(topic_words)
    # Skip empty input up front instead of letting WordCloud.generate fail
    # on a topic that has no words left.
    if not text.strip():
        print(f"Topic {topic_num}: no words to plot, skipping word cloud.")
        return

    colors = sns.color_palette(palette, n_colors=256).as_hex()

    def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
        # Pick a random colour from the palette for every word.
        return colors[random_state.randint(0, len(colors) - 1)]

    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='black',
        color_func=color_func,
    ).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Topic {topic_num} Word Cloud')
    plt.show()
# ---------------------------------------------------------------------------
# Script: extract the corpus, save it to CSV, fit Top2Vec and plot the topics.
# ---------------------------------------------------------------------------

# sent_tokenize needs the punkt models; fetch them only when they are missing.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

pdf_dir = "/home/roomal/Desktop/Dreyfus-Project/Dreyfus/"
file_paths = [
    os.path.join(pdf_dir, fname)
    for fname in sorted(os.listdir(pdf_dir))
    if fname.endswith(".pdf")
]

# Process the files one by one so we know which ones were actually accepted.
# process_files() silently skips badly named or unreadable PDFs, so passing
# the full file_paths list to save_data_to_csv() would misalign the
# Author/Title columns with the Text column.
authors, titles, all_sentences, processed_paths = [], [], [], []
for file_path in file_paths:
    file_authors, file_titles, file_sentences = process_files([file_path])
    if not file_authors:
        continue
    authors.extend(file_authors)
    titles.extend(file_titles)
    all_sentences.extend(file_sentences)
    processed_paths.append(file_path)

output_file = "/home/roomal/Desktop/Dreyfus-Project/Dreyfus_Papers.csv"
save_data_to_csv(authors, titles, processed_paths, output_file)

stopwords_filepath = "/home/roomal/Documents/Lists/stopwords.txt"
stopwords = load_stopwords(stopwords_filepath)

# Without a model nothing below can run, so report the failure and re-raise
# instead of continuing into a NameError on topic_model.
try:
    topic_model = Top2Vec(
        all_sentences,
        embedding_model="distiluse-base-multilingual-cased",
        speed="deep-learn",
        workers=6,
    )
except ValueError as e:
    print(f"Error initializing Top2Vec: {e}")
    raise
except Exception as e:
    print(f"Unexpected error: {e}")
    raise
else:
    print("Top2Vec model created successfully.")

# Print all topics with stopwords removed.
num_topics = topic_model.get_num_topics()
topic_words, word_scores, topic_nums = topic_model.get_topics(num_topics)
filtered_topic_words = filter_stopwords_from_topics(topic_words, stopwords)
for i, words in enumerate(filtered_topic_words):
    print(f"Topic {i}: {', '.join(words)}")

# Search topics by keyword and draw one word cloud per matching topic.
keywords = ["heidegger"]
topic_words, word_scores, topic_scores, topic_nums = topic_model.search_topics(
    keywords=keywords, num_topics=num_topics
)
filtered_search_topic_words = filter_stopwords_from_topics(topic_words, stopwords)
for i, words in enumerate(filtered_search_topic_words):
    generate_wordcloud(words, topic_nums[i])

# Hierarchical reduction must happen before topic_words_reduced is read, and
# it only makes sense when the model has more topics than the target.
reduced_num_topics = 5
topic_mapping = None
if num_topics > reduced_num_topics:
    topic_mapping = topic_model.hierarchical_topic_reduction(num_topics=reduced_num_topics)

    # Print reduced topics and generate word clouds
    for i in range(reduced_num_topics):
        reduced_words = topic_model.topic_words_reduced[i]
        filtered_words = [word for word in reduced_words if word.lower() not in stopwords]
        print(f"Reduced Topic {i}: {', '.join(filtered_words)}")
        generate_wordcloud(filtered_words, i)
else:
    print(
        f"Skipping topic reduction: model has {num_topics} topics, "
        f"need more than {reduced_num_topics}."
    )
