-- Module:Webarchive
--
-- Permanently protected module
-- From Wikipedia, the free encyclopedia

--[[ ----------------------------------

Lua module implementing the {{webarchive}} template. 

A merger of the functionality of three templates: {{wayback}}, {{webcite}} and {{cite archives}}
	
]]


--[[--------------------------< D E P E N D E N C I E S >------------------------------------------------------
]]

require('strict');
local getArgs = require ('Module:Arguments').getArgs;


--[[--------------------------< F O R W A R D   D E C L A R A T I O N S >--------------------------------------

File-scope tables, declared empty here so that the functions below can refer to them.  Nothing in this section
fills them.  NOTE(review): presumably they are populated from a configuration/data module elsewhere in the
file -- confirm against the code that loads that data.

]]

local categories = {};															-- category names; functions below read .error, .warning, .other, .unknown
local config = {};																-- global configuration settings; functions below read .tname and .maxurls
local digits = {};																-- for i18n; table that translates local-wiki digits to western digits
local err_warn_msgs = {};														-- error and warning messages
local excepted_pages = {};														-- keyed by full page title; pages listed here skip the no-categorize tests in createTracking()
local month_num = {};															-- for i18n; table that translates local-wiki month names to western digits
local prefixes = {};															-- service provider tail string prefixes; serviceName() reads .at and .atthe
local services = {};															-- archive service provider data, keyed by host name; see serviceName()
local s_text = {};																-- table of static text strings used to build final rendering
local uncategorized_namespaces = {};											-- namespaces that we should not categorize; keyed by namespace name
local uncategorized_subpages = {};												-- sequence of page-name patterns for subpages that should not be categorized


--[[--------------------------< P A G E   S C O P E   I D E N T I F I E R S >----------------------------------

State shared by the functions in this module while one template invocation is processed.

]]

local non_western_digits;														-- boolean flag set true when data.digits.enable is true
local this_page = mw.title.getCurrentTitle();									-- title object of the current page; createTracking() reads .fullText, .nsText, .text

local track = {};																-- Associative array to hold tracking categories; key is the category name, value is 1
local ulx = {};																	-- Associative array to hold template data; entries are ulx.url1, ulx.url2, ...


--[[--------------------------< S U B S T I T U T E >----------------------------------------------------------

Fills the numbered placeholders of <msg> from the <args> table.

When <args> is nil or false, <msg> is returned untouched; otherwise the message is run through
mw.message.newRawMessage() and its plain-text form is returned.

]]

local function substitute (msg, args)
	if not args then
		return msg;																-- nothing to substitute
	end

	return mw.message.newRawMessage (msg, args):plain() or msg;
end


--[[--------------------------< tableLength >-----------------------

Counts every key in table T (sequence part and hash part alike) and returns that count.

]]

local function tableLength(T)
	local entries = 0;

	for _, _ in pairs (T) do													-- keys and values are irrelevant; only the number of visits matters
		entries = entries + 1;
	end

	return entries;
end


--[=[-------------------------< M A K E _ W I K I L I N K >----------------------------------------------------

Builds wikilink markup, or plain text when linking is suppressed.

When <no_link> is nil:
	link and display both set	-> [[L|D]]
	only link set				-> [[L]]
	link missing or empty		-> the display text as given, or an empty string when display is nil

When <no_link> is anything other than nil (including false):
	display set					-> the display text
	otherwise					-> the link text as given, or an empty string when link is nil

]=]

local function make_wikilink (link, display, no_link)
	local has_display = display and ('' ~= display);

	if nil ~= no_link then														-- linking suppressed: plain text only
		if has_display then
			return display;
		end
		return link or '';
	end

	if not (link and ('' ~= link)) then											-- nothing to link to
		return display or '';
	end

	if has_display then
		return '[[' .. link .. '|' .. display .. ']]';							-- piped link
	end

	return '[[' .. link .. ']]';												-- bare link
end


--[[--------------------------< createTracking >-----------------------

Renders the category names collected in track[] as a run of wikilinks.

Returns an empty string when the current page must not be categorized: its namespace is listed in
uncategorized_namespaces or its name matches one of the uncategorized_subpages patterns.  Pages listed in
excepted_pages skip both of those tests.  Also returns an empty string when track[] is empty.

]]

local function createTracking()
	local exempt = excepted_pages[this_page.fullText];							-- excepted pages are always allowed to be categorized

	if not exempt then
		if uncategorized_namespaces[this_page.nsText] then
			return '';															-- namespace is not to be categorized
		end

		for _, pattern in ipairs (uncategorized_subpages) do
			if this_page.text:match (pattern) then
				return '';														-- subpage type is not to be categorized
			end
		end
	end

	local links = {};
	for category in pairs (track) do											-- the keys of track[] are the category names
		links[#links + 1] = make_wikilink (category);
	end

	return table.concat (links);												-- empty string when nothing was tracked
end


--[[--------------------------< inlineError >-----------------------

Critical error: the whole rendering is replaced by an error message span, followed by the tracking categories.

<msg> is the message text and <args> the optional table of values for its numbered placeholders (see
substitute()).  The error category is recorded in track[] before the categories are rendered.

Intended to be the last thing called before this module is abandoned.

]]

local function inlineError (msg, args)
	track[categories.error] = 1;												-- must happen before createTracking() so that the category is emitted

	local message = substitute (msg, args);										-- the formatted error message
	local tracking = createTracking();											-- category links; may be an empty string

	return '<span style="font-size:100%" class="error citation-comment">Error in '
		.. config.tname															-- the local language template name
		.. ' template: '
		.. message
		.. '.</span>'
		.. tracking;
end


--[[--------------------------< inlineRed >-----------------------

Wraps <msg> in an error-styled span so that it can be embedded in the final output.

<trackmsg> selects the tracking category recorded in track[]:
	"warning"	-> categories.warning
	"error"		-> categories.error
	any other value (including nil) records nothing

 ]]

local function inlineRed(msg, trackmsg)
	if "error" == trackmsg then
		track[categories.error] = 1;
	elseif "warning" == trackmsg then
		track[categories.warning] = 1;
	end

	local open_span = '<span style="font-size:100%" class="error citation-comment">';
	return open_span .. msg .. '</span>';
end


--[[--------------------------< base62 >-----------------------

Convert a base-62 string to a base-10 number.
Credit: https://de.wikipedia.org/wiki/Modul:Expr

Digit values: '0'-'9' are 0-9, 'A'-'Z' are 10-35, 'a'-'z' are 36-61.

Returns nothing (nil) when <value> holds any character that is not alphanumeric.  An empty string yields 0.

]]

local function base62( value )
	if value:find ('%W') then													-- anything outside of [0-9a-zA-Z] makes the value malformed
		return;
	end

	local total = 0;
	local weight = 1;															-- 62^0 for the least significant digit

	for pos = #value, 1, -1 do													-- walk from least to most significant digit
		local digit = value:byte (pos);

		if digit >= 48 and digit <= 57 then										-- ascii 0-9
			digit = digit - 48;
		elseif digit >= 65 and digit <= 90 then									-- ascii A-Z
			digit = digit - 55;
		else																	-- must be ascii a-z
			digit = digit - 61;
		end

		total = total + digit * weight;
		weight = weight * 62;													-- next power of 62
	end

	return total;
end


--[[--------------------------< D E C O D E _ D A T E >--------------------------------------------------------

Converts a date string to iso format (yyyy-mm-dd) and reports which format the input used.

Returns two values:
	the iso-format date, or nil when the string matches none of the recognized formats or its month name is
		not a key in month_num
	the format name: 'iso', 'dmy', 'mdy', or 'ymd'; always 'iso' when the first value is nil

Apart from requiring a recognized layout and a month name known to month_num (so capitalization matters), no
validation of the date is done here.

]]

local function decode_date (date_str)
	local known_formats = {														-- [format name] = {pattern, meaning of capture 1, of capture 2, of capture 3}
		['dmy'] = {'^(%d%d?) +([^%s%d]+) +(%d%d%d%d)$', 'd', 'm', 'y'},			-- %a does not recognize unicode combining characters used by some languages
		['mdy'] = {'^([^%s%d]+) (%d%d?), +(%d%d%d%d)$', 'm', 'd', 'y'},
		['ymd'] = {'^(%d%d%d%d) +([^%s%d]+) (%d%d?)$', 'y', 'm', 'd'},			-- not mos compliant at en.wiki but may be acceptable at other wikis
		};

	if non_western_digits then													-- this wiki uses non-western digits?
		date_str = mw.ustring.gsub (date_str, '%d', digits);					-- translate the digits through the digits table
	end

	if date_str:match ('^%d%d%d%d%-%d%d%-%d%d$') then							-- already iso; nothing to decode
		return date_str, 'iso';
	end

	for format_name, spec in pairs (known_formats) do
		local c1, c2, c3 = mw.ustring.match (date_str, spec[1]);

		if c1 then																-- this format matched
			local parts = {};													-- captures filed under 'y', 'm', 'd'
			parts[spec[2]] = c1;
			parts[spec[3]] = c2;
			parts[spec[4]] = c3;

			local month = month_num[parts.m];									-- month name -> month number
			if not month then
				return nil, 'iso';												-- not a valid date form because month not valid
			end

			return mw.ustring.format ('%.4d-%.2d-%.2d', parts.y, month, parts.d), format_name;
		end
	end

	return nil, 'iso';															-- date could not be decoded
end

	
--[[--------------------------< makeDate >-----------------------

Formats a date given as year, month, and day numbers (zero-padded or not) according to <df>, which is one of:
	mdy, dmy, iso, ymd

The parts are joined with hyphens and handed to the content language's formatDate().  They are presumed to be
correct for the date that they represent; no validation is done here.

makeDate() is sometimes called with a complete iso-format date in <year> and nil for <month> and <day>:
	makeDate ('2018-09-20', nil, nil, df)
this works because table.concat() then sees only one table member.

]]

local function makeDate (year, month, day, df)
	local output_formats = {													-- formatDate() format strings, keyed by df
		['dmy'] = 'j F Y',
		['mdy'] = 'F j, Y',
		['ymd'] = 'Y F j',
		['iso'] = 'Y-m-d',
		};

	local numeric_date = table.concat ({year, month, day}, '-');				-- year-initial numeric date; zero padding not required here

	if non_western_digits then													-- this wiki uses non-western digits?
		numeric_date = mw.ustring.gsub (numeric_date, '%d', digits);			-- translate the digits through the digits table
	end

	local content_language = mw.getContentLanguage();
	return content_language:formatDate (output_formats[df], numeric_date);
end


--[[--------------------------< I S _ V A L I D _ D A T E >----------------------------------------------------

Returns true if date is after 31 December 1899 (why is 1900 the min year? shouldn't the internet's date-of-birth
be min year?), not after today's date, and represents a valid date (29 February 2017 is not a valid date).  Applies
Gregorian leapyear rules.

All arguments are required; each may be a number or a string of digits (zero-padded or not).  Returns false,
rather than raising a script error, when any argument is missing, empty, not numeric, or not a whole number.

]]

local function is_valid_date (year, month, day)
	local days_in_month = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
	local month_length;
	local today = os.date ('*t');												-- fetch a table of current date parts

	if not year or '' == year or not month or '' == month or not day or '' == day then
		return false;															-- something missing
	end

	local y = tonumber (year);
	local m = tonumber (month);
	local d = tonumber (day);

	if not (y and m and d) then
		return false;															-- something is not a number; comparing nil below would raise an error
	end

	if 0 ~= (y % 1) or 0 ~= (m % 1) or 0 ~= (d % 1) then
		return false;															-- fractional parts; a fractional month would index days_in_month to nil
	end

	if 1900 > y or today.year < y or 1 > m or 12 < m then						-- year and month are within bounds	TODO: 1900?
		return false;
	end

	if 2 == m then																-- if February
		month_length = 28;														-- then 28 days unless
		if (0 == (y % 4) and (0 ~= (y % 100) or 0 == (y % 400))) then			-- is a leap year?
			month_length = 29;													-- if leap year then 29 days in February
		end
	else
		month_length = days_in_month[m];
	end

	if 1 > d or month_length < d then											-- day is within bounds
		return false;
	end
																				-- here when date parts represent a valid date
	return os.time ({['year']=y, ['month']=m, ['day']=d, ['hour']=0}) <= os.time();	-- date at midnight must be less than or equal to current date/time
end


--[[--------------------------< decodeWebciteDate >-----------------------

Given a URI-path to Webcite (eg. /67xHmVFWP) decode the date that is encoded in the snapshot id.  A webcite id
is a unix timestamp encoded as base62; only the first ten digits of the decoded number are used.

<df> is accepted for symmetry with the other decoders but is not used: the date is always built in iso format
because date comparisons are all done in iso format.

returns one of:
	the date string as produced by makeDate() for 'iso' format
	the string 'query' when the path is one of the url forms that do not carry a base62 id
	nil when the path has no segment after its first '/' or that segment holds characters that are not
		base62 digits

]]

local function decodeWebciteDate(path, df)

	-- valid URL formats that are not base62

	-- http://www.webcitation.org/query?id=1138911916587475
	-- http://www.webcitation.org/query?url=http..&date=2012-06-01+21:40:03
	-- http://www.webcitation.org/1138911916587475
	-- http://www.webcitation.org/cache/73e53dd1f16cf8c5da298418d2a6e452870cf50e
	-- http://www.webcitation.org/getfile.php?fileid=1c46e791d68e89e12d0c2532cc3cf629b8bc8c8e

	local id = path:match ('^[^/]*/([^/]*)');									-- text between the first and second '/'; same as the second member of path split at '/'
	if not id then
		return nil;																-- path has no '/' so there is nothing to decode
	end

	if id:find ('query', 1, true) or
		id:find ('cache', 1, true) or
		id:find ('getfile', 1, true) or
		tonumber (id) then
			return 'query';
	end

	local decoded = base62 (id);												-- base62 string -> number
	if not decoded then
		return nil;																-- nil return when id contains characters not in %w
	end

	local dt = os.date ('*t', string.format ("%d", decoded):sub (1, 10));		-- number -> text -> first 10 characters (a unix timestamp) -> a table of date parts

	local date = makeDate (dt.year, dt.month, dt.day, 'iso');					-- date comparisons are all done in iso format
	if non_western_digits then													-- this wiki uses non-western digits?
		date = mw.ustring.gsub (date, '%d', digits);							-- translate the digits through the digits table
	end

	return date;
end


--[[--------------------------< decodeWaybackDate >-----------------------

Given a URI-path to Wayback (eg. /web/20160901010101/http://example.com )
or Library of Congress Web Archives (eg. /all/20160901010101/http://example.com)
or UK Government Web Archive (eg. /ukgwa/20160901010101/http://example.com or /tna/20160901010101/http://example.com)

extract the snapshot date from the timestamp and return it in iso format (eg. 2016-09-01); date comparisons
are all done in iso format.  <df> is accepted but not used.
Handle non-digits in snapshot ID such as "re_" and "-" and "*"

returns two values:
	first value is one of these:
		iso-format date string - timestamp holds a valid date
		the string 'index' - timestamp is '*' (eg. /web/*/http..); no second value is returned in this case
		empty string - timestamp is malformed (less than 8 digits, not a valid date)
		nil - path has no timestamp at all, is '/save/', or timestamp is otherwise not a number

	second return value is an appropriate message: formatted warning/error text (possibly an empty string) or,
	when the first value is nil, the message selector 'ts_nan'

]]

local function decodeWaybackDate(path, df)

	local msg = '';
	local snapdate = path:gsub ('^/web/', ''):gsub ('^/all/', ''):gsub ('^/ukgwa/', ''):gsub ('^/tna/', ''):gsub ('^/', '');	-- remove leading /web/, /all/, /ukgwa/, /tna/, or /

	snapdate = snapdate:match ('^[^/]+');										-- get timestamp
	if not snapdate then														-- path was empty, '/', '/web/', etc.
		return nil, 'ts_nan';													-- nothing to decode; same result as any other not-a-number timestamp
	end

	if snapdate == "*" then														-- eg. /web/*/http.., etc.
		return 'index';															-- return indicator that this url has an index date
	end

	snapdate = snapdate:gsub ('%a%a_%d?$', ''):gsub ('%-', '');					-- from date, remove any trailing "re_", dashes

	if snapdate:match ('%*$') then												-- a trailing '*' causes calendar display at archive .org
		snapdate = snapdate:gsub ('%*$', '');									-- remove so not part of length calc later
		msg = inlineRed (err_warn_msgs.ts_cal, 'warning');						-- make a message
	end

	if not tonumber (snapdate) then
		return nil, 'ts_nan';													-- return nil (fatal error flag) and message selector
	end

	local dlen = snapdate:len();
	if dlen < 8 then															-- we need 8 digits TODO: but shouldn't this be testing for 14 digits?
		return '', inlineRed (err_warn_msgs.ts_short, 'error');					-- return empty string and error message
	end

	local year, month, day = snapdate:match ('(%d%d%d%d)(%d%d)(%d%d)');			-- only the date part of the timestamp is needed

	if not is_valid_date (year, month, day) then
		return '', inlineRed (err_warn_msgs.ts_date, 'error');					-- return empty string and error message
	end

	snapdate = table.concat ({year, month, day}, '-');							-- date comparisons are all done in iso format
	if 14 == dlen then
		return snapdate, msg;													-- return date with message if any
	else
		return snapdate, msg .. inlineRed (err_warn_msgs.ts_len, 'warning');	-- return date with warning message(s)
	end
end


--[[--------------------------< decodeArchiveisDate >-----------------------

Given an Archive.is "long link" URI-path (e.g. /2016.08.28-144552/http://example.com)
extract the snapshot date and return it in iso format (e.g. 2016-08-28); date comparisons are all done in iso
format.  <df> is accepted but not used.
Handles "." and "-" in snapshot date, so 2016.08.28-144552 is same as 20160828144552

returns two values:
	first value is one of these:
		iso-format date string - archive.is date is valid
		the string 'short link' - url is the short form (e.g. http://archive.is/hD1qz)
		empty string - archive.is date is missing or malformed (less than 8 characters, not a valid date)

	second return value is an appropriate message: formatted warning/error text, or nothing when there is no
	message

]]

local function decodeArchiveisDate(path, df)
	if path:match ('^/%w+$') then												-- short form url path is '/' followed by some number of base 62 digits and nothing else
		return "short link"														-- e.g. http://archive.is/hD1qz
	end

	local timestamp = path:match ('^[^/]*/([^/]*)') or '';						-- text between the first and second '/'; empty string when path has no '/' so that it is reported as too short
	local snapdate = timestamp:gsub ('[%.%-]', '');								-- snapshot date, e.g. 2016.08.28-144552, with periods and hyphens removed

	local dlen = string.len (snapdate)
	if dlen < 8 then															-- we need 8 digits TODO: but shouldn't this be testing for 14 digits?
		return '', inlineRed (err_warn_msgs.ts_short, 'error');					-- return empty string and error message
	end

	local year, month, day = snapdate:match ('(%d%d%d%d)(%d%d)(%d%d)');			-- only the date part of the timestamp is needed; all nil when there are not 8 consecutive digits

	if not is_valid_date (year, month, day) then
		return '', inlineRed (err_warn_msgs.ts_date, 'error');					-- return empty string and error message
	end

	snapdate = table.concat ({year, month, day}, '-');							-- date comparisons are all done in iso format
	if 14 == dlen then
		return snapdate;														-- return date
	else
		return snapdate, inlineRed (err_warn_msgs.ts_len, 'warning');			-- return date with warning message
	end
end


--[[--------------------------< serviceName >-----------------------

Given a domain extracted by mw.uri.new() (eg. web.archive.org) set tail string and service ID

<host> is lowercased and stripped of a leading 'web.' or 'www.' before it is looked up in services[]: first as
an exact key, then by searching <host> for each key where the key begins at the start of a run of letters.

Each services[] entry is a sequence:
	[1] prefix selector: false -> prefixes.at, true -> prefixes.atthe, anything else is used as the prefix
	[2], [3] wikilink target and display text for make_wikilink()
	[4] service ID (default 'other')
	[5] tracking category (default categories.other)
	[6] optional tail postfix

Sets ulx.url1.service and ulx.url1.tail and records the tracking category in track[].  <no_link> is passed
through to make_wikilink().  Returns nothing.

]]

local function serviceName(host, no_link)
	local tracking;
	local index;

	host = host:lower():gsub ('^web%.(.+)', '%1'):gsub ('^www%.(.+)', '%1');	-- lowercase, remove web. and www. subdomains

	if services[host] then
		index = host;
	else
		for k, _ in pairs (services) do
			if host:find ('%f[%a]'..k:gsub ('([%.%-])', '%%%1')) then			-- escape the dots and hyphens in the key so that they match literally
				index = k;
				break;
			end
		end
	end

	if index then
		local service = services[index];
		local out = {''};														-- empty string in [1] so that concatenated result has leading single space
		ulx.url1.service = service[4] or 'other';
		tracking = service[5] or categories.other;
																				-- build tail string
		if false == service[1] then												-- select prefix
			table.insert (out, prefixes.at);
		elseif true == service[1] then
			table.insert (out, prefixes.atthe);
		else
			table.insert (out, service[1]);
		end

		table.insert (out, make_wikilink (service[2], service[3], no_link));	-- add article wikilink
		if service[6] then														-- add tail postfix if it exists
			table.insert (out, service[6]);
		end

		ulx.url1.tail = table.concat (out, ' ');								-- put it all together; result has leading space character

	else																		-- here when unknown archive
		ulx.url1.service = 'other';
		tracking = categories.unknown;
		-- The message is rendered without a warning/error tracking category; categories.unknown (set below) is
		-- what tracks this case.  An earlier revision passed the builtin function error as the second argument,
		-- which inlineRed() ignored, so omitting it keeps the categorization exactly as it was.
		-- NOTE(review): if the error category is wanted here as well, pass the string 'error'.
		ulx.url1.tail = table.concat ({'', prefixes.at, host, inlineRed (err_warn_msgs.unknown_url)}, ' ');
	end

	track[tracking] = 1
end


--[[--------------------------< parseExtraArgs >-----------------------

Parse numbered arguments starting at 2, such as url2..url10, date2..date10, title2..title10
	For example: {{webarchive |url=.. |url4=.. |url7=..}}
		Three url arguments not in numeric sequence (1..4..7).
			Function only processes arguments numbered 2 or greater (in this case 4 and 7)
				It creates numeric sequenced table entries like:
				ulx.url2.url = <argument value for url4>
				ulx.url3.url = <argument value for url7>
			Returns the number of URL arguments found numbered 2 or greater (in this case returns "2")

Argument numbers run from 2 to config.maxurls.  For each url found, the matching date argument is stored or,
when it is missing, a 'date missing' warning message takes its place; the matching title argument is stored
when present.

 ]]

local function parseExtraArgs(args)
	local found = 0;															-- number of urlN arguments seen so far

	for n = 2, config.maxurls do
		local url = args['url' .. n];

		if url then
			found = found + 1;

			local entry = {};
			ulx['url' .. (found + 1)] = entry;									-- entries are numbered sequentially from url2 regardless of n

			entry['url'] = url;
			entry['date'] = args['date' .. n] or inlineRed (err_warn_msgs.date_miss, 'warning');
			entry['title'] = args['title' .. n] or nil;							-- title is optional
		end
	end

	return found;
end


--[[--------------------------< comma >-----------------------

Returns "," when <date> contains an MDY-style date (letters, one or two digits, a comma, four digits);
returns an empty string for any other date and when <date> is nil or false.

]]

local function comma(date)
	if not date then
		return '';
	end

	return date:match ('%a+ +%d%d?(,) +%d%d%d%d') or '';						-- the capture is the comma itself
end


--[[--------------------------< createRendering >-----------------------

Return a rendering of the data in ulx[][]

]]

local function createRendering()

	local displayfield
	local out = {};
	
	local index_date, msg = ulx.url1.date:match ('(index)(.*)');				-- when ulx.url1.date extract 'index' text and message text (if there is a message)
	ulx.url1.date = ulx.url1.date:gsub ('index.*', 'index');					-- remove message

	if 'none' == ulx.url1.format then											-- For {{wayback}}, {{webcite}}
		table.insert (out, '[');												-- open extlink markup
		table.insert (out, ulx.url1.url);										-- add url

		if ulx.url1.title then
			table.insert (out, ' ')												-- the required space
			table.insert (out, ulx.url1.title)									-- the title
			table.insert (out, ']');											-- close extlink markup
			table.insert (out, ulx.url1.tail);									-- tail text
			if ulx.url1.date then
				table.insert (out, '&#32;(');									-- open date text; TODO: why the html entity? replace with regular space?
				table.insert (out, 'index' == ulx.url1.date and s_text.archive or s_text.archived);	-- add text
				table.insert (out, ' ');