
User:Cramulator/Summaries source code


I will be posting a postmortem report on this project here in mid-April. Here are the Village Pump discussion, the original fifth-grade reading level summaries, the subsequently recommended ninth-grade level summaries, and the seventh-grade summaries with sources. Cramulator (talk) 21:58, 4 April 2025 (UTC)

popularTechnical.txt
# popularTechnical.txt copied from:
# https://pageviews.wmcloud.org/massviews/?platform=all-access&agent=user&source=category&range=latest-20&subjectpage=0&subcategories=1&sort=views&direction=1&view=list&target=https://en.wikipedia.org/wiki/Category:Wikipedia%20articles%20that%20are%20too%20technical
# Category:Wikipedia articles that are too technical
1	Malaysia	176,115	8,386 / day
2	DeepSeek	159,276	7,585 / day
3	Edward Snowden	121,710	5,796 / day
4	Wayback Machine	96,298	4,586 / day
5	List of TCP and UDP port numbers	79,959	3,808 / day
6	Occam's razor	73,617	3,506 / day
7	Dionysus	69,671	3,318 / day
8	Multiple myeloma	62,123	2,958 / day
9	Existentialism	50,531	2,406 / day
10	The Jackson 5	48,319	2,301 / day
11	Psychosis	47,105	2,243 / day
12	5.56×45mm NATO	43,267	2,060 / day
13	World Trade Organization	40,079	1,909 / day
14	DisplayPort	36,291	1,728 / day
15	Eigenvalues and eigenvectors	34,156	1,626 / day
16	Avicenna	32,268	1,537 / day
17	Rorschach test	31,424	1,496 / day
18	Android 16	31,202	1,486 / day
19	BlackBerry Limited	27,678	1,318 / day
20	Astigmatism	27,440	1,307 / day
21	CAN bus	27,233	1,297 / day
22	Madonna–whore complex	26,262	1,251 / day
23	Coulomb's law	26,151	1,245 / day
24	Ø	25,367	1,208 / day
25	Wi-Fi 6	25,007	1,191 / day
26	Sports betting	24,857	1,184 / day
27	Pituitary gland	24,482	1,166 / day
28	Confidence interval	24,239	1,154 / day
29	Waste management	23,647	1,126 / day
30	Cotton	23,530	1,120 / day
31	Synthetic-aperture radar	23,519	1,120 / day
32	Diesel engine	23,182	1,104 / day
33	Elasticsearch	22,714	1,082 / day
34	Theropoda	21,941	1,045 / day
35	Brownian motion	21,885	1,042 / day
36	Carpal tunnel syndrome	19,926	949 / day
37	Psychoanalysis	18,922	901 / day
38	Tuned mass damper	18,539	883 / day
39	Nominal Pipe Size	18,484	880 / day
40	Finite-state machine	18,421	877 / day
41	Fatal insomnia	18,015	858 / day
42	Jacques Derrida	17,849	850 / day
43	Miami-Dade County, Florida	17,548	836 / day
44	Time crystal	17,515	834 / day
45	Microphone	17,396	828 / day
46	Carnian pluvial episode	17,367	827 / day
47	Long short-term memory	17,293	823 / day
48	1. FSV Mainz 05	17,128	816 / day
49	OLED	17,067	813 / day
50	Fire extinguisher	16,807	800 / day
51	Fossil	16,774	799 / day
52	Spinal cord	16,656	793 / day
53	Hypovolemic shock	16,534	787 / day
54	Magnet URI scheme	16,466	784 / day
55	Code-switching	16,283	775 / day
56	Minimum wage	16,080	766 / day
57	USB4	15,910	758 / day
58	Invertible matrix	15,557	741 / day
59	Liquid breathing	15,168	722 / day
60	MIDI	14,917	710 / day
61	Thermal conductivity and resistivity	14,792	704 / day
62	OBD-II PIDs	14,377	685 / day
63	Theory of mind	14,317	682 / day
64	Enmeshment	14,275	680 / day
65	Greensleeves	14,100	671 / day
66	Null hypothesis	13,864	660 / day
67	Spectral density	13,676	651 / day
68	Ku Klux Klan titles and vocabulary	13,619	649 / day
69	List of Internet phenomena	13,595	647 / day
70	Effect size	13,570	646 / day
71	Gram-negative bacteria	13,488	642 / day
72	Consumer price index	13,420	639 / day
73	Glycine	13,361	636 / day
74	Water polo	12,715	605 / day
75	Filioque	12,426	592 / day
76	Swift (programming language)	12,404	591 / day
77	ISO metric screw thread	12,348	588 / day
78	Gravastar	11,967	570 / day
79	Berberine	11,908	567 / day
80	FFmpeg	11,877	566 / day
81	Type I and type II errors	11,783	561 / day
82	Busy beaver	11,749	559 / day
83	Rainbow table	11,670	556 / day
84	Latex	11,627	554 / day
85	AV1	11,615	553 / day
86	Problem solving	11,608	553 / day
87	Paralegal	11,565	551 / day
88	Strategy	11,510	548 / day
89	Bipolar junction transistor	11,260	536 / day
90	Is-a	11,246	536 / day
91	Vancomycin	11,176	532 / day
92	Eigendecomposition of a matrix	11,153	531 / day
93	Carl Rogers	11,130	530 / day
94	Younger Dryas impact hypothesis	10,984	523 / day
parse_articles.py
import re
import json
import requests
from typing import List, Dict, Optional, Tuple

def parse_technical_template(wikitext: str) -> Tuple[bool, bool, Optional[int]]:
    """
    Parse the technical template from the wikitext.
    Returns a tuple of (needs_summarization, is_section, template_position)
    where:
    - needs_summarization: Whether the article needs summarization
    - is_section: True if the template is for a section, False if it's for the entire article
    - template_position: Position of the template in the wikitext (for section extraction)
    """
    # Find all Technical templates (case-insensitive; matches single-line templates only, since '.' does not span newlines)
    template_matches = list(re.finditer(r'{{Technical(\|.*?)?}}', wikitext, re.IGNORECASE))

    for match in template_matches:
        template_pos = match.start()
        template_text = match.group(0)

        # Check if this is a section template by looking for the 'section' parameter
        # anywhere in the template parameters (case-insensitive)
        is_section = bool(re.search(r'\|section\b', template_text, re.IGNORECASE))

        if is_section:
            return True, True, template_pos
        else:
            # It's a general technical template
            return True, False, None

    # No technical template found
    return False, False, None

def get_article_wikitext(title: str) -> str:
    """Fetch the wikitext of an article using the MediaWiki API"""
    params = {
        'action': 'parse',
        'page': title,
        'prop': 'wikitext',
        'format': 'json'
    }
    api_url = 'https://en.wikipedia.org/w/api.php'
    response = requests.get(api_url, params=params)
    data = response.json()

    if 'error' in data:
        print(f"Error fetching {title}: {data['error']['info']}")
        return ""

    return data['parse']['wikitext']['*']

def extract_section_wikitext(wikitext: str, template_position: int) -> Tuple[str, str]:
    """
    Extract the wikitext for the section containing the template.
    This function finds the section heading preceding the template and includes
    all content up to the next heading at the same or a higher level (i.e. one
    with the same number of '=' signs or fewer).

    Returns:
        Tuple[str, str]: A tuple of (section_wikitext, section_name)
    """
    if template_position is None:
        return "", ""

    # Find the section heading preceding the template
    section_text = wikitext[:template_position]
    heading_matches = list(re.finditer(r'^(=+)\s*([^=].*?)\s*\1\s*$', section_text, re.MULTILINE))

    if not heading_matches:
        return "", ""  # No section found

    # Get the last heading before the template
    last_heading = heading_matches[-1]
    heading_level = len(last_heading.group(1))  # Number of = signs
    section_name = last_heading.group(2).strip()  # Extract the actual section name
    section_start = last_heading.start()

    # Find the next heading at the same or a higher level (same or fewer '=' signs)
    rest_of_text = wikitext[template_position:]  # Start search from the template position, not section start
    next_heading_pattern = r'^(=+)\s*([^=].*?)\s*\1\s*$'
    next_headings = list(re.finditer(next_heading_pattern, rest_of_text, re.MULTILINE))

    if next_headings:
        for heading in next_headings:
            next_level = len(heading.group(1))
            if next_level <= heading_level:
                section_end = template_position + heading.start()
                return wikitext[section_start:section_end].strip(), section_name

    # If we don't find a next heading, include everything until the end
    return wikitext[section_start:].strip(), section_name

def read_article_list(file_path: str) -> List[str]:
    """Read the list of articles from the file"""
    articles = []
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
        # Skip the three comment/header lines at the top of the file
        for line in lines[3:]:
            parts = line.strip().split('\t')
            if len(parts) >= 2:
                # The article title is the second column
                articles.append(parts[1])
    return articles

def analyze_articles(article_titles: List[str]) -> List[Dict]:
    """Analyze articles and determine if they need summarization"""
    results = []

    for title in article_titles:
        print(f"Processing article: {title}")
        wikitext = get_article_wikitext(title)

        if not wikitext:
            continue

        needs_summarization, is_section, template_position = parse_technical_template(wikitext)

        if needs_summarization:
            result = {
                "title": title,
                "needs_summarization": True
            }

            if is_section:
                # Only summarize the specific section
                section_text, section_name = extract_section_wikitext(wikitext, template_position)
                result["section_name"] = section_name
                result["wikitext"] = section_text
            else:
                # Summarize the entire article
                result["section_name"] = "entire article"
                result["wikitext"] = wikitext

            results.append(result)
        else:
            # Article doesn't need summarization
            results.append({
                "title": title,
                "needs_summarization": False
            })

    return results

def main():
    file_path = "popularTechnical.txt"
    output_file = "technical_articles.json"

    # Read the list of articles
    article_titles = read_article_list(file_path)
    print(f"Found {len(article_titles)} articles to analyze")

    # Analyze the articles
    results = analyze_articles(article_titles)

    # Save the results to a JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"Results saved to {output_file}")
    print(f"Articles needing summarization: {sum(1 for r in results if r.get('needs_summarization', False))}")

if __name__ == "__main__":
    main()
obtain_summaries.py
import json
from os import environ
from google import genai
from google.genai import types

# Global constants for file paths
INPUT_FILE = "technical_articles.json"
OUTPUT_FILE = "summaries9.json"

# Template for summary prompts
SUMMARY_PROMPT_TEMPLATE = """Please provide a one paragraph summary at grade {grade_level} reading level for the following wikitext {section_text}English Wikipedia article "{title}". Format your response as raw wikitext (not in a code block) with the following requirements:

1. Use an encyclopedic tone that avoids first and second person pronouns (no "you", "we", "I")
2. Maintain a neutral point of view using precise language
3. Present facts without embellishment in third person, present tense where appropriate
4. Include citations using <ref> tags for key facts (e.g., <ref>Source details</ref>)
5. Use wiki formatting for emphasis (''italics'', '''bold''') instead of markdown
6. If linking to other Wikipedia articles, use [[article name]] format

Your response should be directly usable in Wikipedia without further formatting changes.

{wikitext}"""

def load_existing_summaries():
    """Load existing summaries from file if it exists"""
    try:
        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return {}

def save_summaries(summaries):
    """Save summaries to file"""
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(summaries, f, indent=2, ensure_ascii=False)

def create_summary_prompt(title, section_name, wikitext, grade_level=9):
    """Create a summary prompt for the API"""
    # Base prompt template
    section_text = f"from the {section_name} section of the " if section_name != "entire article" else "from the "
    
    prompt = SUMMARY_PROMPT_TEMPLATE.format(
        grade_level=grade_level,
        section_text=section_text,
        title=title,
        wikitext=wikitext
    )
    
    return prompt

def get_summary(title, section_name, wikitext, grade_level=9):
    """Get summary from Google Gemini API"""
    client = genai.Client(api_key=environ["GEMINI_API_KEY"])
    
    # Build the prompt
    prompt = create_summary_prompt(title, section_name, wikitext, grade_level)
    
    response = client.models.generate_content(
        model="gemini-2.5-pro-exp-03-25",
        config=types.GenerateContentConfig(temperature=0),
        contents=prompt)
    
    return response.text

def main():
    # Check if API key is set
    if "GEMINI_API_KEY" not in environ:
        print("Error: GEMINI_API_KEY environment variable not set")
        print("Get one for free at: https://aistudio.google.com/app/apikey")
        print("Please set it with: export GEMINI_API_KEY=your_api_key")
        return

    # Load existing summaries
    summaries = load_existing_summaries()
    
    # Load articles that need summarization
    try:
        with open(INPUT_FILE, "r", encoding="utf-8") as f:
            articles = json.load(f)
    except FileNotFoundError:
        print(f"Error: {INPUT_FILE} not found")
        print("Please run parse_articles.py first")
        return
    
    # Reading level to use (can be easily changed in one place)
    grade_level = 9
    
    # Collect articles that still need summarization, skipping any already summarized
    to_summarize = [a for a in articles if a.get("needs_summarization", False) and a["title"] not in summaries]
    total = len(to_summarize)
    
    print(f"Found {total} articles that need summarization")
    print(f"Using grade {grade_level} reading level for summaries")
    
    # Generate summaries for articles that need it
    for i, article in enumerate(to_summarize, 1):
        title = article["title"]
        section_name = article.get("section_name", "entire article")
        wikitext = article["wikitext"]
        
        print(f"Generating summary {i}/{total}: {title} [{section_name}]")
        
        try:
            summary = get_summary(title, section_name, wikitext, grade_level)
            summaries[title] = {
                "title": title,
                "section_name": section_name,
                "summary": summary
            }
            
            # Save after each successful summary to avoid losing work
            save_summaries(summaries)
            print(f"✓ Summary saved for {title}")
            
        except Exception as e:
            print(f"× Error generating summary for {title}: {str(e)}")
    
    print(f"Completed. {len(summaries)} total summaries in {OUTPUT_FILE}")

if __name__ == "__main__":
    main()
prepare_summaries.py
import json
import re
import html

def load_summaries():
    """Load summaries from file"""
    try:
        with open("summaries.json", "r", encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        print("Error: summaries.json not found or invalid")
        return {}

def markdown_to_wikitext(text):
    """Convert markdown formatting to wikitext"""
    # Convert Markdown *emphasis* (single asterisks) to wikitext ''italics''
    text = re.sub(r'\*([^*]+)\*', r"''\1''", text)
    
    # Convert Markdown links [text](url) to wikitext external links [url text]
    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'[\2 \1]', text)
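    # Example (illustrative): "*emphasis* and [the docs](https://example.org)"
    # becomes "''emphasis'' and [https://example.org the docs]"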
    
    return text

def create_talk_page_message(title, section_name, summary):
    """Create a talk page message for the article"""
    # Clean up the summary and convert markdown to wikitext
    summary = markdown_to_wikitext(summary.strip())
    
    # Build a URL that opens a new talk page section with the heading pre-filled via preloadtitle
    talk_url = f"https://en.wikipedia.org/w/index.php?title=Talk:{title}&action=edit&section=new&preloadtitle=Proposed%20summary%20for%20technical%20prose"
    
    # Create the message based on whether it's a section or the entire article
    if section_name == "entire article":
        message = f"""I've been using Google's Gemini 2.5 Pro Experimental [[large language model]] to create summaries for the most popular articles with {{{{tl|Technical}}}} templates. This article, {title}, has such a template above the entire article. Here is the paragraph summary at grade 5 reading level which Gemini 2.5 Pro suggested:

:{summary}

While I have read and may have made some modifications to that summary, I am not going to add it to the article because I want other editors to review, revise if appropriate, and add it instead. This is an experiment with a few dozen articles initially to see how these suggestions are received, and after a week or two, I will decide how to proceed. Thank you for your consideration. ~~~~"""
    else:
        message = f"""I've been using Google's Gemini 2.5 Pro Experimental [[large language model]] to create summaries for the most popular articles with {{{{tl|Technical}}}} templates. This article, {title}, has such a template in the "[[{title}#{section_name}|{section_name}]]" section. Here is the paragraph summary at grade 5 reading level which Gemini 2.5 Pro suggested for that section:

:{summary}

While I have read and may have made some modifications to that summary, I am not going to add it to the section because I want other editors to review, revise if appropriate, and add it instead. This is an experiment with a few dozen articles initially to see how these suggestions are received, and after a week or two, I will decide how to proceed. Thank you for your consideration. ~~~~"""
    
    # Escape HTML special characters in the message for the text area
    escaped_message = html.escape(message)
    
    # Create HTML with a copy button
    html_output = f"""<div class="talk-page-message">
    <h2>{title} - {section_name}</h2>
    <p><a href="{talk_url}" target="_blank">Add a new talk page section</a></p>
    <div class="copy-container">
        <textarea class="message-text" rows="15" cols="100" readonly>{escaped_message}</textarea>
        <button class="copy-button" onclick="copyText(this)">Copy</button>
    </div>
</div>"""
    
    return html_output

def generate_html_page(summaries):
    """Generate a complete HTML page with all talk page messages"""
    html_head = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Wikipedia Talk Page Messages</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            line-height: 1.6;
            margin: 0;
            padding: 20px;
            background-color: #f5f5f5;
        }
        h1 {
            color: #333;
            text-align: center;
        }
        .talk-page-message {
            background-color: white;
            border-radius: 8px;
            padding: 15px;
            margin-bottom: 20px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        h2 {
            margin-top: 0;
            color: #0366d6;
        }
        .copy-container {
            position: relative;
            margin-top: 10px;
        }
        .message-text {
            width: 100%;
            border: 1px solid #ddd;
            border-radius: 4px;
            padding: 10px;
            font-family: monospace;
            resize: vertical;
        }
        .copy-button {
            position: absolute;
            top: 5px;
            right: 5px;
            background-color: #0366d6;
            color: white;
            border: none;
            border-radius: 4px;
            padding: 5px 10px;
            cursor: pointer;
        }
        .copy-button:hover {
            background-color: #0353a8;
        }
        .success {
            background-color: #28a745;
        }
    </style>
</head>
<body>
    <h1>Wikipedia Talk Page Messages</h1>
"""
    
    html_messages = ""
    for title, data in summaries.items():
        section_name = data.get("section_name", "entire article")
        summary = data.get("summary", "")
        html_messages += create_talk_page_message(title, section_name, summary) + "\n"
    
    html_foot = """<script>
    function copyText(button) {
        const container = button.parentElement;
        const textarea = container.querySelector('.message-text');
        
        textarea.select();
        // execCommand('copy') is deprecated but still widely supported;
        // navigator.clipboard.writeText() is the modern alternative.
        document.execCommand('copy');
        
        // Show success state
        button.textContent = 'Copied!';
        button.classList.add('success');
        
        // Reset after 2 seconds
        setTimeout(() => {
            button.textContent = 'Copy';
            button.classList.remove('success');
        }, 2000);
    }
</script>
</body>
</html>"""
    
    return html_head + html_messages + html_foot

def main():
    # Load summaries
    summaries = load_summaries()
    
    if not summaries:
        print("No summaries found.")
        return
    
    # Generate HTML
    html_content = generate_html_page(summaries)
    
    # Write to file
    with open("talk_page_messages.html", "w", encoding="utf-8") as f:
        f.write(html_content)
    
    print(f"Generated talk page messages for {len(summaries)} articles.")
    print("Output saved to talk_page_messages.html")

if __name__ == "__main__":
    main()