Module:Replace

From Roat Pkz
Jump to navigation Jump to search

Documentation for this module may be created at Module:Replace/doc

-- <pre>
-- A module to allow substitution of regex replace functions
-- (the search pattern and replacement are rewritten from a regex-like
-- notation into Lua pattern notation before the replacement is performed)

-- module table that collects the functions defined below
local p = {}

-- Entry point taking a frame object: hands the argument table of the
-- parent frame (frame:getParent().args) to p._main and returns its result.
function p.main(frame)
	return p._main(frame:getParent().args)
end

-- Performs a regex-flavoured search and replace.
-- args[1] (a) : string to operate on
-- args[2] (b) : search pattern
-- args[3] (c) : replacement
-- every argument defaults to '' when absent
-- returns the resulting string, trimmed and with runs of spaces condensed
function p._main(args)
	-- default all three arguments so that a missing one never reaches
	-- mw.text.decode as nil
	local a = mw.text.decode(args[1] or '')
	local b = mw.text.decode(args[2] or '')
	local c = mw.text.decode(args[3] or '')

	-- let us use real regex stuff
	-- '\' becomes the Lua escape '%', the lazy quantifier '*?' becomes '-',
	-- and the broken bar '¦' is turned into '|'
	b = mw.ustring.gsub(b,'\\','%')
				:gsub('%*%?','-')
				:gsub('¦','|')

	-- regex style back-references ($1) become Lua style ones (%1)
	c = mw.ustring.gsub(c,'$(%d)','%%%1')

	local ret = ''
	-- test for alteration, and find an appropriate set if necessary
	-- alteration currently only works for groups
	-- it also only works for 1 level of alteration
	-- e.g. (foo|ba(z|r)) will fail
	-- other operations inside alteration groups should work fine
	-- e.g. (foo|ba[rz]) will be fine

	-- looks for unescaped | within parentheses, where the last parenthesis isn't escaped
	-- characters inside cannot be unescaped parentheses, as it finds the first unescaped one to close the capture
	-- as such, that will cause unwanted results for nested captures
	-- may need to be refined
	if string.find(b,'%(.-%f[%%|][^)]-[^%%]%)') then
		ret = p.alterationGroups(a,b,c)

	-- perform basic replacements
	else
		-- only the first return value (the string) is kept; the match count is dropped
		ret = mw.ustring.gsub(a,b,c)
	end

	-- trim whitespace
	ret = mw.text.trim(ret)

	-- condense whitespace
	ret = mw.ustring.gsub(ret,'  +',' ')

	-- fix problems with the pipe trick
	-- may need to look at this later if we decide to use SMW more
	-- the pipe trick operates weirdly with SMW properties
	-- [[property::value| ]] is actually intentional, so that it produces no text
	ret = mw.ustring.gsub(ret,'%| %]%]','|]]')

	return ret
end

-- trying to emulate regex's alteration in pattern matches
-- only handles unnested groups; any number of alterations per group works
--
-- tst  : the string being matched against
-- reg  : the search pattern, containing groups such as (foo|bar)
-- repl : the replacement handed to mw.ustring.gsub
-- returns tst with the replacement applied to every region matched by one
-- of the expanded patterns
--
-- every combination of alternatives is expanded into its own pattern and
-- tried against tst in turn
-- if we have '(a|b)(c|d|e)(f|g)', tests are performed in this order:
-- acf, acg, adf, adg, aef, aeg, bcf, bcg, bdf, bdg, bef, beg
--
-- NOTE(review): matched regions are marked with characters whose codepoint
-- is 4000 + 1000 * (index of the combination). Text that already contains
-- codepoints of 5000 or above will be mistaken for such markers; this
-- limitation is left as it was.
function p.alterationGroups(tst,reg,repl)
	-- create 2 sets of captures to use
	-- one for storage, the other for manipulation
	local captures,_captures = {},{}

	-- string to hold pattern match
	local s = reg
	mw.log(s)
	mw.log('---')

	-- convert escaped parentheses into full width for temp parsing,
	-- so that they are not mistaken for the start or end of a group
	s = mw.ustring.gsub(s,'%%%(','（')
	s = mw.ustring.gsub(s,'%%%)','）')

	-- string to use for string.format in tests
	-- convert %s to %$ for temp parsing
	local _sform = mw.ustring.gsub(s,'%%s','%%$')

	-- matches any set of parentheses that isn't started with a %
	-- each one becomes a (%s) placeholder for string.format
	_sform = string.gsub(_sform,'%f[(%%](%b())','(%%s)')

	-- convert full width back to escaped
	_sform = _sform:gsub('（','%%('):gsub('）','%%)')

	-- double up every % that is not a %s placeholder,
	-- otherwise string.format would read it as a format directive
	_sform = _sform:gsub('%%([^s])','%%%%%1')

	-- turn $ back to s
	_sform = _sform:gsub('%%%%%$','%%%%s')

	-- finds all parenthetical groups that aren't begun with a %
	-- add to table of manipulate-able capture
	for v in string.gmatch(s,'%f[(%%](%b())') do
		-- match to remove parentheses
		table.insert(captures,string.match(v,'^%((.+)%)$'))
	end

	-- convert each capture into a table
	-- split by alteration character
	for _, v in ipairs(captures) do
		table.insert(_captures,mw.text.split(v,'|'))
	end

	-- table of all possible combinations used for the formatting
	local groupstouse = {}

	-- recursive function
	-- x    : index of the group currently being expanded
	-- stor : alternatives already chosen for groups 1 .. x-1
	local function addtogroups(x,stor)
		-- for all in the set
		for _, v in ipairs(_captures[x]) do
			-- temporary storage
			local _stor = {}

			-- copy the choices made so far, so that sibling branches
			-- do not end up sharing (and mutating) the same table
			for _, u in ipairs(stor or {}) do
				table.insert(_stor,u)
			end

			-- add current pattern to storage
			table.insert(_stor,v)

			-- if there's a next group, run func on those
			if _captures[x+1] then
				addtogroups(x+1,_stor)

			-- otherwise just add this to the master table
			else
				-- wrap every alternative in parentheses again
				for i, u in ipairs(_stor) do
					_stor[i] = string.format('(%s)',u)
				end

				table.insert(groupstouse,_stor)
			end
		end
	end

	-- run recursive func
	addtogroups(1)

	local formatted
	-- replaced stuff
	local _tst = tst

	-- holds original string as a table of codepoints
	-- (-1 as the end index means "up to the last character")
	local tst2 = {mw.ustring.codepoint(tst,1,-1)}

	-- check each possible group
	for i, v in ipairs(groupstouse) do

		-- format current groups to check
		formatted = mw.ustring.format(_sform,unpack(v))

		-- if a match is found, use those groups
		-- replace formatted
		-- use higher numbered characters to avoid parsing of replaced value
		local mstart,mend = mw.ustring.find(_tst,formatted)
		while mstart do
			-- change all characters in the range of the match to unicode characters
			-- uses a unicode character that holds the index of pattern in groupstouse
			-- located at 4000 + i * 1000
			_tst = mw.text.split(_tst,'')
			for x=mstart,mend do
				_tst[x] = mw.ustring.char(4000+1000*i)
			end

			_tst = table.concat(_tst)
			mstart,mend = mw.ustring.find(_tst,formatted,mend+1)
		end
		-- logging results
		mw.log(table.concat(v,' -- ')..' : '..formatted)
	end

	-- turn test string into table of unicode indices
	_tst = {mw.ustring.codepoint(_tst,1,-1)}

	-- combines table.concat and mw.ustring.char
	-- turn only a subset of the table into characters
	local function byteconcat(tbl,start,_end)
		local ret = {}
		start = start or 1
		_end = _end or #tbl
		for x = start,_end do
			table.insert(ret,mw.ustring.char(tbl[x]))
		end
		return table.concat(ret)
	end

	-- start and end indices of the marked region being processed
	local s,e = 1

	-- while a start point exists
	while s do
		s = nil
		local ch = 0

		-- find first index of a unicode character 5000 or higher
		-- these indicate an area to replace
		for i, v in ipairs(_tst) do
			if v > 4999 then
				s = i
				ch = v
				break
			end
		end

		-- no start point means we're done
		if not s then
			break
		end

		-- index of the group set (inverse of 4000 + 1000 * i)
		local groupmatch = ch / 1000 - 4

		-- find the end point: the run of identical marker characters
		e = s
		while ch == _tst[e+1] do
			e = e + 1
		end

		-- strings to reform the new return string
		local _tst1,_tst2,_tst3

		-- form strings from the found indices
		-- the middle part is read from the original text, not from the markers
		_tst1 = byteconcat(_tst,1,s-1)
		_tst2 = byteconcat(tst2,s,e)
		_tst3 = byteconcat(_tst,e+1)
		local tstsize = mw.ustring.len(_tst2)

		-- do replacement
		_tst2 = mw.ustring.gsub(_tst2,table.concat(groupstouse[groupmatch]),repl)

		-- difference in length between the replaced text and the match
		tstsize = mw.ustring.len(_tst2) - tstsize

		-- keep tst2 aligned with _tst, so that later marked regions still
		-- line up with the original characters they stand for
		if tstsize > 0 then
			-- replacement is longer: fill tst2 with dummy characters
			for x=s+1,tstsize+s do
				table.insert(tst2,x,99)
			end
		elseif tstsize < 0 then
			-- replacement is shorter: drop the surplus characters
			for _ = 1,-tstsize do
				table.remove(tst2,s)
			end
		end

		-- re-concatenate the strings
		_tst = table.concat{_tst1,_tst2,_tst3}

		-- turn newest string into table
		_tst = {mw.ustring.codepoint(_tst,1,-1)}
	end

	-- return finished string
	return byteconcat(_tst)
end

-- hand the module table back to the caller
return p