| from sklearn.feature_extraction.text import TfidfVectorizer |
| import pandas as pd |
| import numpy as np |
| from itertools import islice |
| from romanize import uroman |
|
|
|
|
# 1-based line numbers in the corpus file at which each verse/document
# begins. These are hard-coded segmentation boundaries consumed by
# segment_corpus() and analyze_verse_in_corpus() below.
verses = [
1,
1534,
2747,
3606,
4895,
5854,
6512,
7130,
7215,
8026,
8721,
9538,
10257,
11200,
12022,
12302,
12707,
12874,
13944,
16471,
17608,
17725,
19016,
20380,
20534,
21807,
22164,
22361,
22434,
22580,
22601,
22649,
22754,
22857,
22910,
22948,
23159,
23214,
24285,
24963,
26114,
26993,
27999,
28432,
28869,
29125,
29274,
29429,
29533,
29628,
29717,
29764,
29877,
29960,
30006,
30031,
30334,
30442,
30547,
30608,
30713,
30726,
30741,
30766,
31171
]



# Shift every boundary to a 0-based line index so the values can be
# compared directly against 0-based enumerate() positions when the
# corpus file is read line by line.
verses = [x-1 for x in verses]
|
|
| |
def extract_interested_verse(file_path, line_number, romanize=False):
    """Return the stripped text of one line of a file.

    Args:
        file_path: path of the UTF-8 text file to read.
        line_number: 0-based index of the line to extract.
        romanize: when True, pass the stripped line through uroman().

    Returns:
        The stripped (and optionally romanized) line, or None when the
        file has fewer than line_number + 1 lines.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # islice skips straight to the requested line without building a list.
        selected = next(islice(handle, line_number, line_number + 1), None)
    if selected is None:
        return None
    text = selected.strip()
    return uroman(text) if romanize else text
|
|
|
|
| |
def segment_corpus(file_path, romanize=False, boundaries=None):
    """Split a corpus file into documents at given 0-based line indices.

    Args:
        file_path: path of the UTF-8 text file to segment.
        romanize: when True, pass each joined document through uroman().
        boundaries: iterable of 0-based line indices at which a new
            document starts; defaults to the module-level `verses` list.

    Returns:
        List of documents, each the space-joined stripped lines of one
        segment.
    """
    if boundaries is None:
        boundaries = verses
    # Membership is tested once per line; a set makes that O(1) instead of
    # an O(n) list scan.
    boundary_set = set(boundaries)

    documents = []
    current_document = []

    def _flush():
        # Join the buffered lines into one document, romanizing if asked.
        joined = " ".join(current_document)
        documents.append(uroman(joined) if romanize else joined)

    with open(file_path, 'r', encoding='utf-8') as file:
        # Enumerate from 0: `verses` has already been shifted to 0-based
        # indices at module level, and extract_interested_verse() also
        # counts lines from 0. The original `start=1` made every segment
        # begin one line too early.
        for i, line in enumerate(file):
            if i in boundary_set and current_document:
                _flush()
                current_document = []
            current_document.append(line.strip())

    if current_document:
        _flush()
    return documents
|
|
| |
def analyze_verse_in_corpus(file_path, interested_line, romanize=False):
    """Score one verse's n-grams against its document's TF-IDF weights.

    The corpus is segmented into documents, a TF-IDF model (2- to 4-grams)
    is fit over all documents, and the n-grams of the verse at
    `interested_line` are looked up in the containing document's score row.

    Args:
        file_path: path of the UTF-8 corpus file.
        interested_line: line number of the verse to analyze.
        romanize: when True, romanize documents and the verse via uroman().

    Returns:
        Dict mapping each of the verse's n-grams to its TF-IDF score in
        the containing document, sorted by score descending, or the
        string "Verse not found." when the line does not exist.
    """
    documents = segment_corpus(file_path, romanize=romanize)
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 4))
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Containing document = (first boundary strictly past the line) - 1.
    # The default of len(verses) maps a line at or beyond the final
    # boundary to the last document instead of raising StopIteration.
    document_index = next(
        (i for i, v in enumerate(verses) if v > interested_line),
        len(verses),
    ) - 1

    scores = np.array(tfidf_matrix[document_index].todense()).flatten()
    scores_dict = dict(zip(feature_names, scores))

    # NOTE(review): `interested_line - 1` implies callers pass a 1-based
    # line number, while `verses` above has been shifted to 0-based —
    # confirm the boundary comparison matches the callers' convention.
    interested_verse = extract_interested_verse(
        file_path, interested_line - 1, romanize=romanize
    )

    if not interested_verse:
        return "Verse not found."

    # Fit a throwaway vectorizer on the single verse purely to enumerate
    # its n-grams; the scores themselves come from the corpus-wide model.
    tfidf_vectorizer_verse = TfidfVectorizer(ngram_range=(2, 4))
    tfidf_vectorizer_verse.fit([interested_verse])
    verse_ngrams = tfidf_vectorizer_verse.get_feature_names_out()
    verse_scores = {ngram: scores_dict.get(ngram, 0) for ngram in verse_ngrams}

    return dict(sorted(verse_scores.items(), key=lambda item: item[1], reverse=True))
|
|
|
|
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
| |