User:IngenuityBot/fixpages.py

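# Scans a list of Wikipedia pages for bare-URL references to recognised news
# sites and generates filled-in {{cite web}} templates for them. Page titles
# are read from pages.txt; suggested replacements are written to results.txt.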
import requests, re
from getpagemetadata import getpagemetadata
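# getpagemetadata is a local helper module (not shown on this page); it is
# assumed to return a dict of citation fields (title, url, date, ...) scraped
# from the target page.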

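# News-site domains the bot recognises, mapped to wikilinked publication names.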
urls = {
    "cbc.ca": "[[CBC News]]",
    "ctvnews.ca": "[[CTV News]]",
    "globalnews.ca": "[[Global News]]",
    "thestar.com": "[[Toronto Star]]",
    "washingtonpost.com": "[[The Washington Post]]",
    "nytimes.com": "[[The New York Times]]",
    "theglobeandmail.com": "[[The Globe and Mail]]",
    "nationalpost.com": "[[National Post]]",
    "apnews.com": "[[Associated Press]]",
    "reuters.com": "[[Reuters]]",
    "bbc.com": "[[BBC News]]",
    "theguardian.com": "[[The Guardian]]",
    "aljazeera.com": "[[Al Jazeera]]",
    "npr.org": "[[NPR]]",
    "nbcnews.com": "[[NBC News]]",
    "usatoday.com": "[[USA Today]]",
    "latimes.com": "[[Los Angeles Times]]",
    "wsj.com": "[[The Wall Street Journal]]",
    "politico.com": "[[Politico]]",
    "bloomberg.com": "[[Bloomberg News]]",
    "axios.com": "[[Axios (website)|Axios]]",
    "businessinsider.com": "[[Business Insider]]",
    "thehill.com": "[[The Hill (newspaper)|The Hill]]",
    "nypost.com": "[[New York Post]]",
    "chicagotribune.com": "[[Chicago Tribune]]",
    "vox.com": "[[Vox (website)|Vox]]",
    "slate.com": "[[Slate (magazine)|Slate]]",
    "theatlantic.com": "[[The Atlantic]]",
    "newyorker.com": "[[The New Yorker]]",
    "time.com": "[[Time (magazine)|Time]]",
    "smh.com.au": "[[The Sydney Morning Herald]]",
    "space.com": "[[Space.com]]",
    "rollingstone.com": "[[Rolling Stone]]",
    "nzherald.co.nz": "[[The New Zealand Herald]]",
    "news.com.au": "[[News.com.au]]",
    "nasa.gov": "[[NASA]]",
    "msnbc.com": "[[MSNBC]]",
    "thejc.com": "[[The Jewish Chronicle]]",
    "theconversation.com": "[[The Conversation (website)|The Conversation]]",
    "hollywoodreporter.com": "[[The Hollywood Reporter]]",
    "gizmodo.com": "[[Gizmodo]]",
    "thediplomat.com": "[[The Diplomat]]",
    "deadline.com": "[[Deadline Hollywood]]",
    "abcnews.go.com": "[[ABC News]]",
    "cnn.com": "[[CNN]]",
    "theverge.com": "[[The Verge]]",
    "theage.com.au": "[[The Age]]",
    "afp.com": "[[Agence France-Presse]]",
    "arstechica.com": "[[Ars Technica]]",
    "theaustralian.com.au": "[[The Australian]]",
    "avclub.com": "[[The A.V. Club]]",
    "buzzfeednews.com": "[[BuzzFeed News]]",
    "csmonitor.com": "[[The Christian Science Monitor]]",
    "cnet.com": "[[CNET]]",
    "telegraph.co.uk": "[[The Daily Telegraph]]",
    "ew.com": "[[Entertainment Weekly]]",
    "forbes.com": "[[Forbes]]",
    "foxnews.com": "[[Fox News]]",
    "ign.com": "[[IGN]]",
    "qz.com": "[[Quartz (publication)|Quartz]]",
    "scientificamerican.com": "[[Scientific American]]",
    "scmp.com": "[[South China Morning Post]]",
    "variety.com": "[[Variety (magazine)|Variety]]",
    "vogue.com": "[[Vogue (magazine)|Vogue]]",
    "vox.com": "[[Vox (website)|Vox]]",
    "wired.com": "[[Wired (magazine)|Wired]]"
}


def get_wikipedia_content(titles):
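    """Fetch the current wikitext of each title from the English Wikipedia API,
    batching 50 titles per request and recursing for the remainder."""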
    endpoint = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "format": "json",
        "titles": "|".join(titles[:50])
    }
    response = requests.get(endpoint, params=params)
    data = response.json()
    pages = data["query"]["pages"]
    content_dict = {}
    for page_data in pages.values():
        try:
            content_dict[page_data["title"]] = page_data["revisions"][0]["*"]
        except (KeyError, IndexError):
            # Pages that are missing or have no accessible revision are skipped
            pass
    if len(titles) > 50:
        content_dict.update(get_wikipedia_content(titles[50:]))
    return content_dict


def get_wikipedia_pages():
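    """Read the list of page titles to process, one per line, from pages.txt."""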
    with open("pages.txt", "r") as f:
        return f.read().split("\n")


def parse_time(timestamp):
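    """Reduce an ISO 8601 timestamp to its YYYY-MM-DD date portion."""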
    return timestamp.split("T")[0]


def metadata_to_wikitext(metadata):
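    """Build a {{cite web}} template from the page metadata, or return None
    when the title or URL is missing."""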
    if not metadata["title"] or not metadata["url"]:
        return None

    metadata["title"] = metadata["title"].replace('|', '{{!}}')

    args = []

    for key, value in metadata.items():
        if value:
            args.append(f"|{key}={value}")

    argtext = " ".join(args)
    return "{{cite web " + argtext + "}}"


def main():
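    """Scan each listed page for bare-URL references to recognised news sites
    and log the suggested {{cite web}} replacements to results.txt."""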
    pages = get_wikipedia_pages()
    print(f"Fetching content of {len(pages)} pages...")
    content = get_wikipedia_content(pages)
    print("Done.")

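    # Matches <ref> tags (optionally named) whose content is just a bare URL,
    # possibly wrapped in brackets or followed by a {{bare ...}} maintenance template.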
    regex = re.compile(r"<ref(?:\s+name=\"?[^>]+\"?)?>\[?(http[^ <\]]+)]?(?: ?{{bare[^}]+?}})?<\/ref>")

    for page in pages:
        if page not in content:
            continue

        matches = regex.findall(content[page])

        if not matches:
            continue

        to_replace = []

        for item in matches:
            for url in urls:
                # Match the domain only after a dot or slash to avoid partial matches
                if re.search(r"[./]" + re.escape(url), item):
                    to_replace.append(item)
                    break
        
        if not to_replace:
            continue

        for item in to_replace:
            metadata = getpagemetadata(item)
            metadata["date"] = parse_time(metadata["date"]) if "date" in metadata else None
            wikitext = metadata_to_wikitext(metadata)
            if not wikitext:
                continue
            print(f"Original URL: {item}")
            print(f"\tReplaced with: {wikitext}\n")

            with open("results.txt", "a") as f:
                f.write(f"{item}\n{wikitext}\n\n")


if __name__ == "__main__":
    main()