User:InfoboxBot/wikipedia edit pages clean.py
Appearance
The following code was developed for Python 3.6 and may not work on earlier versions of Python. It is designed to work with a pre-existing MySQL database of infobox data scraped from enwiki. That database is populated by several other, entirely separate scripts, which are not currently open source or available for public viewing (although I may eventually make the whole project fully open source).
import urllib.request
import urllib
import json
import requests
import time
import glob, os
import mwparserfromhell
import re
import pymysql.cursors
import sys
# Set up session object for all requests
# (a single requests.Session is reused for every API call below, so the
# login cookies acquired later apply to all subsequent edits)
s = requests.Session()
# NOTE: All cookies received will be stored in the session object
# Custom User-Agent sent with every request, per Wikipedia bot policy.
headers = {
'User-Agent': 'enwiki InfoboxBot power/dam infobox editor by Garzfoth, v1.0a' # UPDATE WITH CHANGES
}
######################################################################################
# ==> REMEMBER TO UPDATE THE DATABASE STRING IF CHANGING THE TEMPLATE SET <==
######################################################################################
# MySQL connection settings for the local database of scraped infobox data.
database = "infobox_power_station"
database_host = "localhost"
database_user = "" # CONFIDENTIAL (REMOVED)
database_password = "" # CONFIDENTIAL (REMOVED)
database_charset = "utf8mb4"
# Wikipedia bot-password credentials.
login_username = "" # CONFIDENTIAL (REMOVED)
login_botpassword = "" # CONFIDENTIAL (REMOVED)
######################################################################################
# ==> REMEMBER TO SET THE KEY / PREVALUE / VALUE / LIMIT <==
######################################################################################
# The transformation to apply: wherever infobox parameter ib_key currently
# holds ib_prevalue, replace it with ib_value.
ib_key = "ps_units_manu_model"
ib_prevalue = "Vestas"
ib_value = "[[Vestas]]"
# Optional cap on how many rows the SELECT below may return (0 = no limit).
sql_limit = 0
sql_post = " LIMIT " + str(sql_limit) if sql_limit > 0 else ""
# Connect to the scraped-infobox database and fetch every page whose
# infobox currently has `ib_key` set to the raw pre-transformation value.
connection = pymysql.connect(host=database_host,
                             user=database_user,
                             password=database_password,
                             db=database,
                             charset=database_charset,
                             cursorclass=pymysql.cursors.DictCursor)  # rows come back as dicts
try:
    with connection.cursor() as cursor:
        # Execute SQL
        ######################################################################################
        # ==> REMEMBER TO UPDATE THE SQL QUERY IF NECESSARY <==
        ######################################################################################
        # ib_key/ib_prevalue are passed as bind parameters rather than being
        # concatenated into the SQL string — avoids SQL injection and quoting
        # bugs should the configured values ever contain quotes.
        sql = "SELECT pageid, title FROM `data` WHERE `key` = %s AND `value` = %s" + sql_post
        cursor.execute(sql, (ib_key, ib_prevalue))
        result = cursor.fetchall()
finally:
    connection.close()
######################################################################################
# ==> REMEMBER TO UPDATE THE EDIT SUMMARY IF NECESSARY <==
######################################################################################
# Summary attached to every automated edit made below.
editsummary = 'Automated edit: fixing infobox parameter "' + ib_key + '"'
# Show the operator exactly what is about to happen, then give them a
# 10-second window to abort with Ctrl+C before any edits are made.
print("----------")
print(f"SQL: {sql}")
print(f"ib_key: {ib_key}")
print(f"ib_value: {ib_value}")
print(f"Transformation: {ib_prevalue} => {ib_value}")
print("----------")
print("If the above values are incorrect, please press Ctrl+C within 10 seconds...")
total_ticks = 10 * 2  # ten seconds, counted in half-second increments
for tick in range(total_ticks):
    frac = (tick + 1) / total_ticks
    bar = "=" * int(20 * frac)
    sys.stdout.write(f"\r[{bar:<20}] {int(100 * frac)}%")
    sys.stdout.flush()
    time.sleep(0.5)
sys.stdout.write("\n")
sys.stdout.flush()
print("Main program starting...")
print("----------")
# Start login process (https://www.mediawiki.org/wiki/API:Login)
# Step 1: acquire a login token.
query = {
    "action": "query",
    "format": "json",
    "meta": "tokens",
    "type": "login"
}
encodedquery = urllib.parse.urlencode(query)
baseurl = "https://en.wikipedia.org/w/api.php?"
login1 = s.get(baseurl + encodedquery, headers=headers)
# Parse the response once and reuse the token (the original re-parsed the
# JSON body for each access, and built an extra credentials dict that was
# urlencoded but never sent — both removed here).
logintoken = login1.json()["query"]["tokens"]["logintoken"]
print("Login #1 (login token): " + logintoken)
# Step 2: POST the credentials together with the login token.
querypost = {
    "action": "login",
    "format": "json",
    "lgname": login_username,
    "lgpassword": login_botpassword,
    "lgtoken": logintoken
}
login2 = s.post("https://en.wikipedia.org/w/api.php", data=querypost, headers=headers)
#print("Login #2: " + login2.json()["login"]["result"])
print(login2.json())
print("----------")
# Main edit loop: for each matching page, fetch the current wikitext,
# update the infobox parameter, save the page via the API, and record the
# change in the local DB so the row is not selected again on a future run.
i = 0
result_count = len(result)
for item in result:
    i = i + 1
    print(item["title"] + " - " + str(item["pageid"]))
    # Acquire content/timestamp for page to edit.
    # curtimestamp + revision timestamp are required for edit-conflict
    # detection (basetimestamp/starttimestamp in the edit request below).
    query = {
        "action": "query",
        "format": "json",
        "curtimestamp": 1,
        "prop": "revisions",
        "pageids": item["pageid"],
        "rvprop": "content|timestamp"
    }
    encodedquery = urllib.parse.urlencode(query)
    response = s.get(baseurl + encodedquery, headers=headers)
    # Parse the HTTP body once and reuse it — the original called
    # response.json() three separate times on the same response.
    response_json = response.json()
    revision = response_json["query"]["pages"][str(item["pageid"])]["revisions"][0]
    wikicode = mwparserfromhell.parse(revision["*"])
    templates = wikicode.filter_templates()
    ######################################################################################
    # ==> REMEMBER TO SET THE TEMPLATE SET BEING USED <==
    ######################################################################################
    # Raises StopIteration if the page has none of the expected templates.
    tpl = next(x for x in templates if x.name.matches(["Infobox power station", "Infobox power plant", "Infobox wind farm", "Infobox nuclear power station"]))
    #tpl = next(x for x in templates if x.name.matches(["Infobox dam", "Infobox hydroelectric power station"]))
    # Set (add or overwrite) the target parameter in place.
    tpl.add(ib_key, ib_value)
    # Obtain a CSRF token for the edit.
    query = {
        "action": "query",
        "format": "json",
        "meta": "tokens",
        "type": "csrf"
    }
    encodedquery = urllib.parse.urlencode(query)
    csrf = s.get(baseurl + encodedquery, headers=headers)
    # Make the edit (https://www.mediawiki.org/wiki/API:Edit)
    # assert=user  ==> request fails if we are not logged in (API:Assert)
    # minor=1      ==> mark the edit as minor
    # nocreate=1   ==> never create the page if it somehow no longer exists
    querypost = {
        "action": "edit",
        "assert": "user",
        "format": "json",
        "pageid": item["pageid"],
        "text": str(wikicode),
        "summary": editsummary,
        "minor": 1,
        "basetimestamp": revision["timestamp"],
        "starttimestamp": response_json["curtimestamp"],
        "nocreate": 1,
        "watchlist": "nochange",
        "token": csrf.json()["query"]["tokens"]["csrftoken"]
    }
    finalresult = s.post("https://en.wikipedia.org/w/api.php", data=querypost, headers=headers)
    print(finalresult)
    # Connect to the database and update the record to the new value.
    connection = pymysql.connect(host=database_host,
                                 user=database_user,
                                 password=database_password,
                                 db=database,
                                 charset=database_charset,
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            sql = "UPDATE `data` SET `value` = %s WHERE `pageid` = %s AND `title` = %s AND `key` = %s"
            cursor.execute(sql, (ib_value, item["pageid"], item["title"], ib_key))
            # connection is not autocommit by default, so commit explicitly.
            connection.commit()
    finally:
        connection.close()
    print("Updated in DB.")
    if i != result_count:
        time.sleep(10)  # Let's not kill the Wikipedia API
    print("----------")
print("Done!")