User:Cramulator/Summaries source code
popularTechnical.txt
# popularTechnical.txt copied from:
# https://pageviews.wmcloud.org/massviews/?platform=all-access&agent=user&source=category&range=latest-20&subjectpage=0&subcategories=1&sort=views&direction=1&view=list&target=https://en.wikipedia.org/wiki/Category:Wikipedia%20articles%20that%20are%20too%20technical
# Category:Wikipedia articles that are too technical
1 Malaysia 176,115 8,386 / day
2 DeepSeek 159,276 7,585 / day
3 Edward Snowden 121,710 5,796 / day
4 Wayback Machine 96,298 4,586 / day
5 List of TCP and UDP port numbers 79,959 3,808 / day
6 Occam's razor 73,617 3,506 / day
7 Dionysus 69,671 3,318 / day
8 Multiple myeloma 62,123 2,958 / day
9 Existentialism 50,531 2,406 / day
10 The Jackson 5 48,319 2,301 / day
11 Psychosis 47,105 2,243 / day
12 5.56×45mm NATO 43,267 2,060 / day
13 World Trade Organization 40,079 1,909 / day
14 DisplayPort 36,291 1,728 / day
15 Eigenvalues and eigenvectors 34,156 1,626 / day
16 Avicenna 32,268 1,537 / day
17 Rorschach test 31,424 1,496 / day
18 Android 16 31,202 1,486 / day
19 BlackBerry Limited 27,678 1,318 / day
20 Astigmatism 27,440 1,307 / day
21 CAN bus 27,233 1,297 / day
22 Madonna–whore complex 26,262 1,251 / day
23 Coulomb's law 26,151 1,245 / day
24 Ø 25,367 1,208 / day
25 Wi-Fi 6 25,007 1,191 / day
26 Sports betting 24,857 1,184 / day
27 Pituitary gland 24,482 1,166 / day
28 Confidence interval 24,239 1,154 / day
29 Waste management 23,647 1,126 / day
30 Cotton 23,530 1,120 / day
31 Synthetic-aperture radar 23,519 1,120 / day
32 Diesel engine 23,182 1,104 / day
33 Elasticsearch 22,714 1,082 / day
34 Theropoda 21,941 1,045 / day
35 Brownian motion 21,885 1,042 / day
36 Carpal tunnel syndrome 19,926 949 / day
37 Psychoanalysis 18,922 901 / day
38 Tuned mass damper 18,539 883 / day
39 Nominal Pipe Size 18,484 880 / day
40 Finite-state machine 18,421 877 / day
41 Fatal insomnia 18,015 858 / day
42 Jacques Derrida 17,849 850 / day
43 Miami-Dade County, Florida 17,548 836 / day
44 Time crystal 17,515 834 / day
45 Microphone 17,396 828 / day
46 Carnian pluvial episode 17,367 827 / day
47 Long short-term memory 17,293 823 / day
48 1. FSV Mainz 05 17,128 816 / day
49 OLED 17,067 813 / day
50 Fire extinguisher 16,807 800 / day
51 Fossil 16,774 799 / day
52 Spinal cord 16,656 793 / day
53 Hypovolemic shock 16,534 787 / day
54 Magnet URI scheme 16,466 784 / day
55 Code-switching 16,283 775 / day
56 Minimum wage 16,080 766 / day
57 USB4 15,910 758 / day
58 Invertible matrix 15,557 741 / day
59 Liquid breathing 15,168 722 / day
60 MIDI 14,917 710 / day
61 Thermal conductivity and resistivity 14,792 704 / day
62 OBD-II PIDs 14,377 685 / day
63 Theory of mind 14,317 682 / day
64 Enmeshment 14,275 680 / day
65 Greensleeves 14,100 671 / day
66 Null hypothesis 13,864 660 / day
67 Spectral density 13,676 651 / day
68 Ku Klux Klan titles and vocabulary 13,619 649 / day
69 List of Internet phenomena 13,595 647 / day
70 Effect size 13,570 646 / day
71 Gram-negative bacteria 13,488 642 / day
72 Consumer price index 13,420 639 / day
73 Glycine 13,361 636 / day
74 Water polo 12,715 605 / day
75 Filioque 12,426 592 / day
76 Swift (programming language) 12,404 591 / day
77 ISO metric screw thread 12,348 588 / day
78 Gravastar 11,967 570 / day
79 Berberine 11,908 567 / day
80 FFmpeg 11,877 566 / day
81 Type I and type II errors 11,783 561 / day
82 Busy beaver 11,749 559 / day
83 Rainbow table 11,670 556 / day
84 Latex 11,627 554 / day
85 AV1 11,615 553 / day
86 Problem solving 11,608 553 / day
87 Paralegal 11,565 551 / day
88 Strategy 11,510 548 / day
89 Bipolar junction transistor 11,260 536 / day
90 Is-a 11,246 536 / day
91 Vancomycin 11,176 532 / day
92 Eigendecomposition of a matrix 11,153 531 / day
93 Carl Rogers 11,130 530 / day
94 Younger Dryas impact hypothesis 10,984 523 / day
parse_articles.py
import re
import json
import requests
from typing import List, Dict, Optional, Tuple
def parse_technical_template(wikitext: str) -> Tuple[bool, bool, Optional[int]]:
"""
Parse the technical template from the wikitext.
Returns a tuple of (needs_summarization, is_section, template_position)
where:
- needs_summarization: Whether the article needs summarization
- is_section: True if the template is for a section, False if it's for the entire article
- template_position: Position of the template in the wikitext (for section extraction)
"""
# Find all Technical templates (using case-insensitive flag)
template_matches = list(re.finditer(r'{{Technical(\|.*?)?}}', wikitext, re.IGNORECASE))
for match in template_matches:
template_pos = match.start()
template_text = match.group(0)
# Check if this is a section template by looking for the 'section' parameter
# anywhere in the template parameters (case-insensitive)
is_section = bool(re.search(r'\|section\b', template_text, re.IGNORECASE))
if is_section:
return True, True, template_pos
else:
# It's a general technical template
return True, False, None
# No technical template found
return False, False, None
def get_article_wikitext(title: str) -> str:
"""Fetch the wikitext of an article using the MediaWiki API"""
params = {
'action': 'parse',
'page': title,
'prop': 'wikitext',
'format': 'json'
}
api_url = 'https://en.wikipedia.org/w/api.php'
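    # Example of the request this builds (title taken from the list above):
    # https://en.wikipedia.org/w/api.php?action=parse&page=Dionysus&prop=wikitext&format=json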
response = requests.get(api_url, params=params)
data = response.json()
if 'error' in data:
print(f"Error fetching {title}: {data['error']['info']}")
return ""
return data['parse']['wikitext']['*']
def extract_section_wikitext(wikitext: str, template_position: int) -> Tuple[str, str]:
"""
Extract the wikitext for the section containing the template.
This function finds the section heading (==) preceding the template
and includes all content until the next heading of same or lower level.
Returns:
Tuple[str, str]: A tuple of (section_wikitext, section_name)
"""
if template_position is None:
return "", ""
# Find the section heading preceding the template
section_text = wikitext[:template_position]
heading_matches = list(re.finditer(r'^(=+)\s*([^=].*?)\s*\1\s*$', section_text, re.MULTILINE))
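    # e.g. the line "== History ==" matches with group(1) "==" (heading level 2) and group(2) "History"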
if not heading_matches:
return "", "" # No section found
# Get the last heading before the template
last_heading = heading_matches[-1]
heading_level = len(last_heading.group(1)) # Number of = signs
section_name = last_heading.group(2).strip() # Extract the actual section name
section_start = last_heading.start()
# Find the next heading of same or lower level
rest_of_text = wikitext[template_position:] # Start search from the template position, not section start
next_heading_pattern = r'^(=+)\s*([^=].*?)\s*\1\s*$'
next_headings = list(re.finditer(next_heading_pattern, rest_of_text, re.MULTILINE))
if next_headings:
for heading in next_headings:
next_level = len(heading.group(1))
if next_level <= heading_level:
section_end = template_position + heading.start()
return wikitext[section_start:section_end].strip(), section_name
# If we don't find a next heading, include everything until the end
return wikitext[section_start:].strip(), section_name
def read_article_list(file_path: str) -> List[str]:
"""Read the list of articles from the file"""
articles = []
with open(file_path, 'r', encoding='utf-8') as file:
lines = file.readlines()
        # Skip the three header/comment lines at the top of the file
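        # Each remaining row is expected to be tab-separated: rank, article title, total views, views per day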
for line in lines[3:]:
parts = line.strip().split('\t')
if len(parts) >= 2:
# The article title is the second column
articles.append(parts[1])
return articles
def analyze_articles(article_titles: List[str]) -> List[Dict]:
"""Analyze articles and determine if they need summarization"""
results = []
for title in article_titles:
print(f"Processing article: {title}")
wikitext = get_article_wikitext(title)
if not wikitext:
continue
needs_summarization, is_section, template_position = parse_technical_template(wikitext)
if needs_summarization:
result = {
"title": title,
"needs_summarization": True
}
if is_section:
# Only summarize the specific section
section_text, section_name = extract_section_wikitext(wikitext, template_position)
result["section_name"] = section_name
result["wikitext"] = section_text
else:
# Summarize the entire article
result["section_name"] = "entire article"
result["wikitext"] = wikitext
results.append(result)
else:
# Article doesn't need summarization
results.append({
"title": title,
"needs_summarization": False
})
return results
def main():
file_path = "popularTechnical.txt"
output_file = "technical_articles.json"
# Read the list of articles
article_titles = read_article_list(file_path)
print(f"Found {len(article_titles)} articles to analyze")
# Analyze the articles
results = analyze_articles(article_titles)
# Save the results to a JSON file
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(results, f, indent=2, ensure_ascii=False)
print(f"Results saved to {output_file}")
print(f"Articles needing summarization: {sum(1 for r in results if r.get('needs_summarization', False))}")
if __name__ == "__main__":
main()
obtain_summaries.py
import json
from os import environ
from google import genai
from google.genai import types
def load_existing_summaries():
"""Load existing summaries from file if it exists"""
try:
with open("summaries9.json", "r", encoding="utf-8") as f:
return json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
return {}
def save_summaries(summaries):
"""Save summaries to file"""
with open("summaries9.json", "w", encoding="utf-8") as f:
json.dump(summaries, f, indent=2, ensure_ascii=False)
def get_summary(title, section_name, wikitext):
"""Get summary from Google Gemini API"""
client = genai.Client(api_key=environ["GEMINI_API_KEY"])
# Build the prompt based on whether it's a section or entire article
if section_name == "entire article":
prompt = f"""Please provide a one paragraph plain text summary at grade 9 reading level for the following wikitext from the English Wikipedia article "{title}":
{wikitext}"""
else:
prompt = f"""Please provide a one paragraph plain text summary at grade 9 reading level for the following wikitext from the {section_name} section of the English Wikipedia article "{title}":
{wikitext}"""
response = client.models.generate_content(
model="gemini-2.5-pro-exp-03-25",
config=types.GenerateContentConfig(temperature=0),
contents=prompt)
return response.text
def main():
# Check if API key is set
if "GEMINI_API_KEY" not in environ:
print("Error: GEMINI_API_KEY environment variable not set")
print("Please set it with: export GEMINI_API_KEY=your_api_key")
return
# Load existing summaries
summaries = load_existing_summaries()
# Load articles that need summarization
try:
with open("technical_articles.json", "r", encoding="utf-8") as f:
articles = json.load(f)
except FileNotFoundError:
print("Error: technical_articles.json not found")
print("Please run parse_articles.py first")
return
# Count articles that need summarization
to_summarize = [a for a in articles if a.get("needs_summarization", False) and a["title"] not in summaries]
total = len(to_summarize)
print(f"Found {total} articles that need summarization")
# Generate summaries for articles that need it
for i, article in enumerate(to_summarize, 1):
title = article["title"]
section_name = article.get("section_name", "entire article")
wikitext = article["wikitext"]
print(f"Generating summary {i}/{total}: {title} [{section_name}]")
try:
summary = get_summary(title, section_name, wikitext)
summaries[title] = {
"title": title,
"section_name": section_name,
"summary": summary
}
# Save after each successful summary to avoid losing work
save_summaries(summaries)
print(f"✓ Summary saved for {title}")
except Exception as e:
print(f"× Error generating summary for {title}: {str(e)}")
print(f"Completed. {len(summaries)} total summaries in summaries9.json")
if __name__ == "__main__":
main()
prepare_summaries.py
import json
import re
import html
def load_summaries():
"""Load summaries from file"""
try:
with open("summaries.json", "r", encoding="utf-8") as f:
return json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
print("Error: summaries.json not found or invalid")
return {}
def markdown_to_wikitext(text):
"""Convert markdown formatting to wikitext"""
    # Convert single-asterisk markdown emphasis (*text*) to wikitext italics (''text'')
text = re.sub(r'\*([^*]+)\*', r"''\1''", text)
# Convert [text](link) to [link text]
text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'[\2 \1]', text)
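    # For example, "*plain text*" becomes "''plain text''" and
    # "[Massviews](https://pageviews.wmcloud.org)" becomes "[https://pageviews.wmcloud.org Massviews]"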
return text
def create_talk_page_message(title, section_name, summary):
"""Create a talk page message for the article"""
# Clean up the summary and convert markdown to wikitext
summary = markdown_to_wikitext(summary.strip())
# Create the talk page URL
    talk_url = f"https://en.wikipedia.org/w/index.php?title=Talk:{title}&action=edit&section=new&preloadtitle=Proposed%20summary%20for%20technical%20prose"
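    # action=edit&section=new opens the "new section" editor on the talk page;
    # preloadtitle pre-fills the heading of that new section.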
# Create the message based on whether it's a section or the entire article
if section_name == "entire article":
message = f"""I've been using Google's Gemini 2.5 Pro Experimental [[large language model]] to create summaries for the most popular articles with {{{{tl|Technical}}}} templates. This article, {title}, has such a template above the entire article. Here is the paragraph summary at grade 5 reading level which Gemini 2.5 Pro suggested:
:{summary}
While I have read and may have made some modifications to that summary, I am not going to add it to the article because I want other editors to review, revise if appropriate, and add it instead. This is an experiment with a few dozen articles initially to see how these suggestions are received, and after a week or two, I will decide how to proceed. Thank you for your consideration. ~~~~"""
else:
message = f"""I've been using Google's Gemini 2.5 Pro Experimental [[large language model]] to create summaries for the most popular articles with {{{{tl|Technical}}}} templates. This article, {title}, has such a template in the "[[{title}#{section_name}|{section_name}]]" section. Here is the paragraph summary at grade 5 reading level which Gemini 2.5 Pro suggested for that section:
:{summary}
While I have read and may have made some modifications to that summary, I am not going to add it to the section because I want other editors to review, revise if appropriate, and add it instead. This is an experiment with a few dozen articles initially to see how these suggestions are received, and after a week or two, I will decide how to proceed. Thank you for your consideration. ~~~~"""
# Escape HTML special characters in the message for the text area
escaped_message = html.escape(message)
# Create HTML with a copy button
html_output = f"""<div class="talk-page-message">
<h2>{title} - {section_name}</h2>
<p><a href="{talk_url}" target="_blank">Add a new talk page section</a></p>
<div class="copy-container">
<textarea class="message-text" rows="15" cols="100" readonly>{escaped_message}</textarea>
<button class="copy-button" onclick="copyText(this)">Copy</button>
</div>
</div>"""
return html_output
def generate_html_page(summaries):
"""Generate a complete HTML page with all talk page messages"""
html_head = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Wikipedia Talk Page Messages</title>
<style>
body {
font-family: Arial, sans-serif;
line-height: 1.6;
margin: 0;
padding: 20px;
background-color: #f5f5f5;
}
h1 {
color: #333;
text-align: center;
}
.talk-page-message {
background-color: white;
border-radius: 8px;
padding: 15px;
margin-bottom: 20px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
h2 {
margin-top: 0;
color: #0366d6;
}
.copy-container {
position: relative;
margin-top: 10px;
}
.message-text {
width: 100%;
border: 1px solid #ddd;
border-radius: 4px;
padding: 10px;
font-family: monospace;
resize: vertical;
}
.copy-button {
position: absolute;
top: 5px;
right: 5px;
background-color: #0366d6;
color: white;
border: none;
border-radius: 4px;
padding: 5px 10px;
cursor: pointer;
}
.copy-button:hover {
background-color: #0353a8;
}
.success {
background-color: #28a745;
}
</style>
</head>
<body>
<h1>Wikipedia Talk Page Messages</h1>
"""
html_messages = ""
for title, data in summaries.items():
section_name = data.get("section_name", "entire article")
summary = data.get("summary", "")
html_messages += create_talk_page_message(title, section_name, summary) + "\n"
html_foot = """<script>
function copyText(button) {
const container = button.parentElement;
const textarea = container.querySelector('.message-text');
textarea.select();
document.execCommand('copy');
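    // document.execCommand('copy') is deprecated but still widely supported;
    // navigator.clipboard.writeText(textarea.value) is the modern alternative.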
// Show success state
button.textContent = 'Copied!';
button.classList.add('success');
// Reset after 2 seconds
setTimeout(() => {
button.textContent = 'Copy';
button.classList.remove('success');
}, 2000);
}
</script>
</body>
</html>"""
return html_head + html_messages + html_foot
def main():
# Load summaries
summaries = load_summaries()
if not summaries:
print("No summaries found.")
return
# Generate HTML
html_content = generate_html_page(summaries)
# Write to file
with open("talk_page_messages.html", "w", encoding="utf-8") as f:
f.write(html_content)
print(f"Generated talk page messages for {len(summaries)} articles.")
print("Output saved to talk_page_messages.html")
if __name__ == "__main__":
main()