// Wikimore

/*
 * OCR cleanup script
 *
 * Mostly a bunch of regexes and prayer
 */

/* eslint-disable camelcase, no-restricted-syntax */

( function ( $, mw ) {
	'use strict';

	// Script version and identifying signature.
	// NOTE(review): neither is referenced in the visible part of the file -
	// confirm how they are consumed.
	const version = '0.1';
	const signature = 'wsCleanup';

	// Log severity levels, in increasing order of severity. log() compares
	// these numerically against Cleanup.logLevel.
	const DEBUG = 0;
	const INFO = 1;
	const ERROR = 2;

	// Script-wide configuration object.
	// NOTE(review): most options are read by code outside this part of the
	// file. Only the options documented below are described from their visible
	// use; the meaning of the others should be confirmed against their readers.
	const Cleanup = {
		// log() drops every message whose level is below this value
		logLevel: ERROR,
		enable: true,
		testFunctions: [],
		// true when the current page title (wgTitle) ends with 'cleanup-test'
		enableTesting: mw.config.get( 'wgTitle' ).endsWith( 'cleanup-test' ),
		portletCategory: 'page',
		activeNamespaces: [ 'page' ],
		actionTitle: 'WsCleanup',
		additionalOcrReplacements: [],
		// Entries are [ regex, replacement? ]. PartialWordRegexProcessor skips
		// any replacement whose regex source equals an entry's regex source;
		// when the entry also gives a replacement, that must match as well.
		disabledReplacements: [],
		cleanupFunctions: [],
		italicWords: [],
		doLongSReplacements: false,
		doTemplateCleanup: true,
		remove_running_header: true,
		replaceSmartQuotes: true,
		collapseSuspiciousParagraphs: true,
		shortLineThreshold: 45,
		// Languages the page may contain. pageMayHaveLangs() checks this list
		// when a replacement carries notLangs/onlyLangs options.
		possibleLanguages: [ 'en' ], // 'fr', 'es', 'de', 'zh-pinyin' ],
		italiciseForeign: true,
		smallAbbreviations: [],
		// Line patterns treated as running headers.
		// NOTE(review): presumably handed to RunningHeaderProcessor - confirm.
		runningHeaderPatterns: [
			// number (roman-ish or arabic) followed by an all-caps title
			/^([ivxlcIVLXC.,]+|[iI0-9.,]+)\s+([A-Z[\]\s^*\-–—.,]*)\s*$/,
			// all-caps title followed by a number (roman-ish or arabic)
			/^([A-Z\s[\]^*\-–—.,]*)\s+([ivxlcIVLXC.,]+|[iI0-9.,]+)\s*$/,
			// a line holding only digits, or only capitals, brackets and spaces
			/^\s*(\d+|[A-Z[\] ]+)\s*$/
		],
		smallAbbrTemplate: 'smaller',
		editSummary: '/* Proofread */',
		markProofread: true,
		cleanupAccesskey: 'c'
	};

	/**
	 * Write a message to the console, prefixed with 'Cleanup: '.
	 *
	 * Messages whose level is not at least Cleanup.logLevel are dropped.
	 * Messages at ERROR level or above go to console.error, all others to
	 * console.log.
	 *
	 * @param {number} level severity of the message (DEBUG, INFO or ERROR)
	 * @param {*} s the message to emit
	 */
	function log( level, s ) {
		if ( !( level >= Cleanup.logLevel ) ) {
			return;
		}

		// eslint-disable-next-line no-console
		const emit = level >= ERROR ? console.error : console.log;
		emit( 'Cleanup: ', s );
	}

	/**
	 * Abstract base class for text processors.
	 *
	 * Subclasses must override both name() and process(); the versions here
	 * only throw.
	 */
	class CleanupProcessor {
		/**
		 * Human-readable name of the processor.
		 *
		 * @throws {Error} always, unless overridden
		 */
		name() {
			throw new Error( 'Processors must implement name()' );
		}

		/**
		 * Transform the given text and return the result.
		 *
		 * @throws {Error} always, unless overridden
		 */
		process( /* text */ ) {
			throw new Error( 'Processors must implement process()' );
		}
	}

	/**
	 * Run one processor over the editor's text and write the result back.
	 *
	 * @param {Object} editor object exposing get() and set( text )
	 * @param {CleanupProcessor} processor processor to apply
	 */
	function process_editor( editor, processor ) {
		const original = editor.get();
		log( INFO, `Processing editor with ${processor.name()}` );
		const cleaned = processor.process( original );
		editor.set( cleaned );
	}

	/**
	 * Applies a list of [ bad, good ] replacements, where each "bad" is a
	 * regex source string that is only matched as a whole word (it is wrapped
	 * in \b...\b and applied globally).
	 */
	class WholeWordRegexProcessor extends CleanupProcessor {
		/**
		 * @param {Array} reps list of [ bad, good ] pairs
		 */
		constructor( reps ) {
			super();
			this.reps = reps;
		}

		name() {
			return 'Generic whole word regexes';
		}

		/**
		 * @param {string} text text to clean
		 * @return {string} text with every replacement applied in list order
		 */
		process( text ) {
			log( DEBUG, `Making ${this.reps.length} replacements` );

			let result = text;
			for ( const [ bad, good ] of this.reps ) {
				const wholeWord = new RegExp( `\\b${bad}\\b`, 'g' );
				result = result.replace( wholeWord, good );
			}
			return result;
		}
	}

	/**
	 * Check whether any of the given languages is one the page may contain.
	 *
	 * @param {string[]} deniedLangs language codes to look for
	 * @return {boolean} true if at least one entry of Cleanup.possibleLanguages
	 *  appears in deniedLangs
	 */
	function pageMayHaveLangs( deniedLangs ) {
		return Cleanup.possibleLanguages.some(
			( lang ) => deniedLangs.includes( lang )
		);
	}

	/**
	 * Applies a list of [ regex, replacement, options? ] entries globally, in
	 * list order.
	 *
	 * An entry is skipped when:
	 *  - it is listed in Cleanup.disabledReplacements (same regex source and,
	 *    if the disabled entry names a replacement, the same replacement), or
	 *  - options.notLangs names a language the page may contain, or
	 *  - options.onlyLangs is given and names no language the page may contain.
	 */
	class PartialWordRegexProcessor extends CleanupProcessor {
		/**
		 * @param {Array} reps list of [ regex, replacement, options? ] entries
		 */
		constructor( reps ) {
			super();
			this.reps = reps;
		}

		name() {
			return 'Generic partial word regexes';
		}

		/**
		 * @param {string} text text to clean
		 * @return {string} cleaned text
		 * @throws rethrows any error raised while applying a replacement,
		 *  after logging which entry failed
		 */
		process( text ) {
			log( DEBUG, `Making ${this.reps.length} replacements` );

			let result = text;

			this.reps.forEach( ( rep, index ) => {
				const [ pattern, replacement, options ] = rep;

				// a disabled entry without a replacement disables every
				// replacement for that regex; otherwise both must match
				const isDisabled = Cleanup.disabledReplacements.some(
					( [ disabledPattern, disabledReplacement ] ) =>
						disabledPattern.source === pattern.source &&
						( !disabledReplacement || disabledReplacement === replacement )
				);
				if ( isDisabled ) {
					log( DEBUG, `Skipped disabled replacement: ${pattern.source} -> ${replacement}` );
					return;
				}

				const deniedLangs = options && options.notLangs;
				if ( deniedLangs && pageMayHaveLangs( deniedLangs ) ) {
					log( DEBUG, `Skipped replacement with denied language: ${pattern.source} (due to ${deniedLangs})` );
					return;
				}

				const allowedLangs = options && options.onlyLangs;
				if ( allowedLangs && !pageMayHaveLangs( allowedLangs ) ) {
					log( DEBUG, `Skipped replacement as no allowed language: ${pattern.source} (due to ${allowedLangs})` );
					return;
				}

				try {
					// always replace globally, keeping the entry's other flags
					const flags = 'g' + pattern.flags.replace( 'g', '' );

					// The regex source is used verbatim. A rewrite of a leading
					// \b into a lookbehind (because \b does not handle
					// non-ASCII letters well) exists only as a disabled idea.
					result = result.replace( new RegExp( pattern.source, flags ), replacement );
				} catch ( error ) {
					log( ERROR, `Error in ${index}th replacement: ${rep}` );
					throw error;
				}
			} );

			return result;
		}
	}

	/**
	 * Splits off matches that can never be the tail of a longer word: when a
	 * match directly follows word characters, a space has gone missing
	 * _before_ it, so one is inserted.
	 */
	class BannedSuffixProcessor extends CleanupProcessor {
		/**
		 * @param {RegExp[]} suffix_list patterns that may not end a word
		 */
		constructor( suffix_list ) {
			super();
			this.suffix_list = suffix_list;
		}

		name() {
			return 'Banned suffixes';
		}

		/**
		 * @param {string} text text to clean
		 * @return {string} text with a space inserted before each banned suffix
		 */
		process( text ) {
			return this.suffix_list.reduce( ( result, suffix ) => {
				// force global matching, keep the pattern's other flags
				const flags = 'g' + suffix.flags.replace( 'g', '' );
				const glued = new RegExp( `(\\w+)(${suffix.source})`, flags );
				return result.replace( glued, '$1 $2' );
			}, text );
		}
	}

	/**
	 * Splits off matches that can never be the head of a longer word: when a
	 * match is directly followed by word characters, a space has gone missing
	 * _after_ it, so one is inserted.
	 */
	class BannedPrefixProcessor extends CleanupProcessor {
		/**
		 * @param {RegExp[]} prefix_list patterns that may not start a word
		 */
		constructor( prefix_list ) {
			super();
			this.prefix_list = prefix_list;
		}

		name() {
			return 'Banned prefixes';
		}

		/**
		 * @param {string} text text to clean
		 * @return {string} text with a space inserted after each banned prefix
		 */
		process( text ) {
			return this.prefix_list.reduce( ( result, prefix ) => {
				// force global matching, keep the pattern's other flags
				const flags = 'g' + prefix.flags.replace( 'g', '' );
				const glued = new RegExp( `(${prefix.source})(\\w+)`, flags );
				return result.replace( glued, '$1 $2' );
			}, text );
		}
	}

	/**
	 * Re-attaches fragments that cannot stand alone and are most likely the
	 * end of the preceding word (i.e. a space or hyphen was wrongly inserted
	 * _before_ the match). The separating whitespace/hyphen is removed.
	 */
	class OrphanSuffixProcessor extends CleanupProcessor {
		/**
		 * @param {RegExp[]} reps patterns for the orphaned suffixes
		 */
		constructor( reps ) {
			super();
			this.reps = reps;
		}

		name() {
			return 'Orphan suffixes';
		}

		/**
		 * @param {string} text text to clean
		 * @return {string} text with orphaned suffixes joined to the previous word
		 */
		process( text ) {
			let result = text;
			for ( const suffix of this.reps ) {
				// force global matching, keep the pattern's other flags
				const flags = 'g' + suffix.flags.replace( 'g', '' );
				// the suffix must end at a word boundary
				const orphan = new RegExp( `[\\s\\-](${suffix.source}\\b)`, flags );
				result = result.replace( orphan, '$1' );
			}
			return result;
		}
	}

	/**
	 * Make replacements for words that cannot stand alone, but would most likely be
	 * prefixes of following words (i.e. a space has been inserted _after_ the match).
	 * The whitespace or hyphen following the prefix is removed.
	 *
	 * Matching is always global and case-insensitive, whatever flags the
	 * supplied patterns carry.
	 */
	class OrphanPrefixProcessor extends CleanupProcessor {
		/**
		 * @param {RegExp[]} reps patterns for the orphaned prefixes
		 */
		constructor( reps ) {
			super();
			this.reps = reps;
		}

		/**
		 * @param {string} text text to clean
		 * @return {string} text with orphaned prefixes joined to the next word
		 */
		process( text ) {
			for ( const v of this.reps ) {
				// Strip EVERY existing g/i flag before prepending 'gi'. A
				// non-global replace would only strip the first one, so a
				// pattern with both flags would give 'gii' and make the
				// RegExp constructor throw.
				const newflags = 'gi' + v.flags.replace( /[gi]/g, '' );
				// the prefix must start at a word boundary
				text = text.replace( new RegExp( '(\\b' + v.source + ')[\\s\\-]', newflags ), '$1' );
			}
			return text;
		}

		name() {
			return 'Orphan prefixes';
		}
	}

	/**
	 * Wraps each match in wiki italics (''...''), unless the match is already
	 * directly preceded by ''.
	 *
	 * NOTE(review): the flag handling strips only the first g or i it finds
	 * before prepending g, so an i flag survives when the pattern also has g
	 * but is dropped when it does not. Confirm which behaviour is intended.
	 */
	class ItaliciseProcessor extends CleanupProcessor {
		/**
		 * @param {RegExp[]} reps patterns whose matches are italicised
		 */
		constructor( reps ) {
			super();
			this.reps = reps;
		}

		name() {
			return 'Italics';
		}

		/**
		 * @param {string} text text to clean
		 * @return {string} text with matches wrapped in ''...''
		 */
		process( text ) {
			let result = text;
			for ( const pattern of this.reps ) {
				const flags = 'g' + pattern.flags.replace( /[gi]/, '' );
				const unwrapped = new RegExp( `(?<!'')(${pattern.source})`, flags );
				result = result.replace( unwrapped, "''$1''" );
			}
			return result;
		}
	}

	/*
	 * These functions need the original line breaks
	 */
	/**
	 * Apply the replacements that depend on the text's line breaks: trailing
	 * spaces, hyphen look-alikes and words hyphenated across a line break.
	 *
	 * @param {Object} editor editor handed on to process_editor()
	 */
	const do_pre_collapse_cleanup = function ( editor ) {
		const lineBreakReps = [
			// strip spaces that trail at the end of a line
			[ / +\n/, '\n' ],

			// these symbols are treated as hyphens
			[ /[⌐¬]/, '-' ],

			// Re-join words hyphenated across a line break; a preceding "|"
			// is excluded so "|-" table syntax is left alone.

			// before a capital the hyphen stays, e.g. non-European
			[ /([^|])-\n(?=[ÁÀA-ZÉÈÖ])/, '$1-' ],
			// before any other word character the hyphen is dropped
			[ /([^|])-\n(?=[\w])/, '$1' ]
		];

		const processor = new PartialWordRegexProcessor( lineBreakReps );
		process_editor( editor, processor );
	};

	/**
	 * Drops leading lines that look like a running header.
	 *
	 * Starting from the top, blank lines and lines matching any of the given
	 * patterns are removed, up to (not including) the first line that is
	 * neither. The remaining lines are re-joined with '\n'.
	 */
	class RunningHeaderProcessor extends CleanupProcessor {

		/**
		 * @param {RegExp[]} rh_patterns patterns identifying header lines
		 */
		constructor( rh_patterns ) {
			super();
			this.rh_patterns = rh_patterns;
		}

		/**
		 * @param {string} text page text
		 * @return {string} text without its leading header/blank lines; empty
		 *  if every line is blank or a header
		 */
		process( text ) {
			const lines = text.split( /\r?\n/ );

			const looksLikeHeader = ( line ) => this.rh_patterns.some(
				( pattern ) => pattern.test( line )
			);

			// first line that is neither blank nor a header
			const firstBodyLine = lines.findIndex(
				( line ) => line.trim().length !== 0 && !looksLikeHeader( line )
			);
			const start = firstBodyLine === -1 ? lines.length : firstBodyLine;

			return lines.slice( start ).join( '\n' );
		}

		name() {
			return 'Trim running header patterns';
		}
	}

	/**
	 * Apply the general-purpose cleanup replacements (scanner watermarks,
	 * whitespace, punctuation spacing, dashes, a few common abbreviations and
	 * ligatures) to the editor text.
	 *
	 * The replacements run in list order through PartialWordRegexProcessor, so
	 * later entries see the output of earlier ones.
	 *
	 * @param {Object} editor editor handed on to process_editor()
	 */
	const do_generic_cleanup = function ( editor ) {

		// various cleanup
		const reps = [
			// Digitized by Google (kill), including common OCR misreadings
			[ /\s?D[ijl]g[ijl]t[ijl][sz][eco]d\s+by[^\n]*\s+([6G][Oo0Q]{2}g[lIf][eco])?/, '' ],
			[ /\bG[oO0]{2}gle\b/, '' ],

			// Remove highly suspicious chars
			[ /[■•]/, '' ],

			// remove trailing whitespace preceding a hard line break
			[ / +<br *\/?>/, '<br />' ],

			// remove trailing whitespace at the end of page text
			[ /\s+$/, '' ],

			// remove trailing spaces at the end of refs
			[ / +<\/ref>/, '</ref>' ],

			// remove trailing spaces at the end of template calls
			[ / +}}/, '}}' ],

			// lines containing only a single punctuation mark are likely junk
			[ /^[.,^]$/m, '' ],

			// convert double-hyphen to mdash (avoiding breaking HTML comment syntax)
			[ /([^!])--([^>])/, '$1—$2' ],

			// Remove spaces around hyphens between words
			// Eg. pack -house -> pack-house
			[ /(\w) ?- ?(\w)/, '$1-$2' ],

			// remove unwanted spaces before punctuation marks
			[ / ([);:?!,.])/, '$1' ],

			// ensure spaces after punctuation marks
			[ /([);:?!,.])([^ 0-9\n}|"'’”])/, '$1 $2' ],

			// ...but double punctuation doesn't get any spaces
			[ /([);:?!,.]) +([\n);:?!,.\]]|$)/, '$1$2' ],

			// Double full-stop is probably just a single one (3 or 4 is OK - ellipsis)
			[ /(\w)\.\. (?=\w)/, '$1. ' ],

			// no spaces for inter-numeric punctuation
			[ /([0-9][,]) +([0-9]{3}(?![0-9]))/, '$1$2' ],

			// quotes at start of line can't be a close
			[ /^(['"]) (?=[A-Za-z])/m, '$1' ],

			// quotes at end of line can't be an open
			[ / (['"])$/m, '$1' ],

			// no space in "'s"
			[ / ?' ?s([\n ])/, '\'s$1' ],

			// no spaces just inside parentheses or around an mdash
			[ /\( +/, '(' ],
			[ / +\)/, ')' ],
			[ / *— */, '—' ],

			// Date ranges: hyphen between numbers becomes an ndash
			[ /([0-9]{3,4})-([0-9]{2,4})/, '$1–$2' ],

			// figures: ",ooo" (letter o) is a misread ",000"
			[ / ?, ?ooo/, ',000' ],

			// q.v. to q. v.
			[ /q\.v\./, 'q. v.' ],

			// i.e. (italicised; skipped when followed by an apostrophe)
			[ /\bi\.? ?e\.(?!')/, "''i.e.''" ],

			// & c. to &c.
			[ / ?& ?[coe][.,]([,]?)/, ' &c.$1' ],

			// this is an old pound notation: "/." after a number (optionally
			// after a space) becomes an italic "l."
			[ /([0-9]) ?[/]\.(?=\s)/, "$1''l.''" ],

			// No spaces between num and st/nd/rd
			[ /([0-9]) (st|nd|rd)\b/, '$1$2' ],

			// hyphenate run-together compound numbers, e.g. twentyone -> twenty-one
			[ /ty(one|two|three|four|five|six|seven|eight|nine|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth)/, 'ty-$1' ],

			// ﬁ ligature to fi
			[ /ﬁ/, 'fi' ],

			// long s (ſ) is replaced with 'f'
			// NOTE(review): presumably deliberate so that later f/s handling
			// applies - confirm that 's' is not what was intended.
			[ /ſ/, 'f' ],

			// underscores become spaces
			[ /_/, ' ' ]
		];

		process_editor( editor, new PartialWordRegexProcessor( reps ) );
	};

	const do_ocr_fixes = function ( editor ) {

		const reps = [

			// some apostrophes probably bogus at word start
			[ /\b([vw])'([a-z])/, '$1$2' ],

			// some mis-read full-stops
			[ /\b(?<=Mr|Mrs|Mssrs|Ms)'/, '.' ],

			// ^ -> '' : delete spurious carets
			[ /(?<=w)\^/, '' ],

			// ! -> l
			[ /ua!(?=\s)/, 'ual' ],

			// / -> f
			[ /\/ellow/, 'fellow' ],

			// / -> t
			[ /(\s)\/he\b/, '$1the' ],

			// £ -> f
			[ /£f\b/, 'ff' ],

			// « -> s
			[ /(?<=\w)«(?=\s)/, 's' ],

			// $ -> s
			[ /(?<=[a-z])\$/, 's' ],

			// }' -> y
			[ /r}'/, 'ry' ],

			// ' -> y
			[ /(?<=\b[Vv]er)'/, 'ery' ],

			[ />(?=['"])/, '?' ],

			// } -> ?
			[ /(?<=[a-z]) }/, '?' ],

			[ /\('(?=yc)/, 'C' ],

			// 'I' -> T
			[ /(?<=\W)'[IJ]'(?=\w)/, 'T' ],

			// 0 -> O
			[ /\b0[*']([BNR])/, "O'$1" ], // Irish names

			// 1 -> i
			[ /(?<=\. )1(?=n|s|t)/, 'I' ],
			[ /1(?=n|s|t)/, 'i' ], // hard to tell In or in
			// avoid units, dates, and "1 of", "1 to" and "1 in"
			[ / 1 (?![0-9A-Z]|(or|to|in|of)\b|inch|mi\b|mile|ft|foot|cm|cent(i|\b)|dollar|pound|yard|metr|mm|km|kilo|acre|hect[ao])/, ' I ' ],

			// 4 -> d
			[ /4oor/, 'door' ],
			[ /e4\b/, 'ed' ],

			// 6 -> o
			[ /\b6(?=[a-z])/, 'o' ], // 6n, 6f, etc

			// 8 -> S
			[ /\b8(?=\w|\b)/, 'S' ], // 8o, etc, but not 8o00

			// 8i -> th
			[ /\b8i/, 'th' ],

			// a -> e
			[ /(?<=[Jj]udg)a/, 'e' ],

			// a -> f
			[ /\baf\b/, 'of' ],

			// a -> n
			[ /\baad/, 'and' ],
			[ /upoa/, 'upon' ],
			[ /\bia\b/, 'in' ],
			[ /(?<=[Rr])emaia/, 'emain' ],

			// a -> s
			[ /riaon/, 'rison' ],
			[ /wera\b/, 'wers' ],
			[ /\beap/, 'esp' ],

			// AA -> w
			[ /\b(AA|AV)(?=[a-z]{2})/, 'w$1' ],
			[ /\bnat\b/, 'not' ],

			// ae -> nc
			[ /aaee(|s|d)\b/, 'ance$1' ],

			// Av -> w
			[ /Av(ill|ith|ere\b|est|here|hat|as\b|ould|ho|or|hich|hen|ell|eigh|ise|eak|rit|ron)/,
				'w$1' ],
			[ /AV(ill|ith|ere\b|est|here|hat|as\b|ould|ho|or|hich|hen|ell|eigh|ise|eak|rit|ron)/,
				'W$1' ],
			[ /(?<=[a-z])AV\b/, 'w' ],

			// Avli -> wh
			[ /\bAvli(ich|om?(ever)?|en|ere|ether|y)\b/, 'wh$1' ],
			[ /\bAVli(ich|om?(ever)?|en|ere|ether|y)\b/, 'Wh$1' ],

			// b -> e
			[ /\b([Tt])hb/, '$1he' ],

			// b -> h
			[ /\bbow(so|ever|itz|beit)/, 'how$1' ], // watch for bowl...
			[ /\b(?<=[Tt])be(?=y\b|a\b|se\b|ir\b)/, 'he$1' ],
			[ /\b(?<=[Ww])b(?=i|e)/, 'h' ], // which, when
			[ /\bbas(|n't|ten)\b/, 'has$1' ],
			[ /\bber(|self|eto)\b/, 'her$1' ],
			[ /\bbim(|self)\b/, 'him$1' ],
			[ /([Ww])hicb/, '$1hich' ],
			[ /\b([Ss])bow/, '$1how' ],

			// b -> o
			[ /(?<=\b[Ss])b/, 'o' ],

			// b -> r
			[ /mbeb\b/, 'mber' ],
			[ /dmibal/, 'dmiral' ],
			[ /xtba/, 'xtra' ],
			[ /Victobia/, 'Victoria' ],

			// B -> E
			[ /\b(?<=TH|THR)B/, 'E' ],

			// B -> R
			[ /Bailw/, 'Railw' ],
			[ /Boyal/, 'Royal' ],
			[ /\bFBO/, 'FRO' ],

			// c -> e
			[ /cx(?![ivxcdm]+\b)/, '$1ex' ], // mind roman numerals
			[ /becn/, 'been' ],
			[ /\bbcen/, 'been' ],
			[ /(C|c)lcar/, '$1lear' ],
			[ /(a|u|o|p)pces\b/, '$1pees' ], // rupees,...
			[ /(C|c)asc(\b|(?=\w)[^a])/, '$1ase$2' ],
			[ /\bwc\b/, 'we' ],
			[ /(?<=[Ss]t|\b[Tt])cam/, 'eam' ],
			[ /(S|s)evc/, '$1eve' ], // several/severe
			[ /([Gg])rcat/, '$1reat' ],
			[ /([fvh])crence/, '$1erence' ],
			[ /\b(?<=[Hh])c\b/, 'e' ], // hc -> he
			[ /\bcn(?!i)/, 'en' ],
			[ /\bmcn\b/, 'men' ],
			[ /((?=\w)[^ao]|\b)rcs/, '$1res' ], // avoid arcs/orcs
			[ /\Borcs\b/, 'ores' ], // but it can be a suffix of ores
			[ /\bpcople(|s)\b/, 'people$1' ],
			[ /\b&e\.(?=\s|$)/, '&c.' ],
			[ /catc(|d)\b/, 'cate$1' ],
			[ /\bcight/, 'eight' ],
			[ /nccessar/, 'necessar' ],
			[ /\b([Ww])cr/, '$1er' ],
			[ /([^Aaeou])rcat/, '$1reat' ],
			[ /\b([Oo])nc(|s)\b/, '$1ne$2' ],
			[ /(?<=\b[Ss])[ec][ec](?=m|ing)/, 'ee' ], // seem, seeing
			[ /(?<=g)mics\b/, 'mies' ],
			[ /(?<=\b[\Ss])tr[ce][ce]ct/, 'treet' ], // street
			[ /ocict/, 'ociet' ], // society
			[ /cither/, 'either' ], // cither exists, but...
			[ /(?<=\b[Ss])[ce][ce](?=d|\b|ing|m)/, 'ee' ], // see, seed, seeing
			[ /(?<=\b[Ss])c(?=er|ct)/, 'e' ], // seer... (not secretary)
			[ /(?<![ln])icf/, 'ief' ], // grief
			[ /c(?=ver|lectr)/, 'e' ], // ever, every, electric
			[ /(?<=[Pp])copl[ce]/, 'eople' ], // people
			[ /(?<=[Gg]rac|[Rr]os)c/, 'e' ], // grace, rose
			[ /(?<=[Cc]ru|[Yy]i)cl/, 'el' ], // cruel, yield, etc
			[ /cl(?=\b|l|f)/, 'el' ], // inc. scfl -> self
			[ /ncral(?=(?:s|ly|ity|ities)\b)/, 'neral' ], // general-
			[ /cth(?!ood|eroy|roat|yma|esis|etic|lip|idro|i\b)/, 'eth' ], // maketh, etc
			[ /tcd\b/, 'ted' ],

			[ /\b(t|tsz|sz)c\b/, '$1e' ], // chinese

			// ce -> œ
			[ /(?<=[Mm]an)ce(?=u)/, 'œ' ],

			// ci -> d
			[ /(P|p)rociu/, '$1rodu' ],
			[ /\bacidition(s|)\b/, 'addition$1' ],

			// ci -> ici
			[ /offci/, 'offici' ],

			// cnce: ence
			[ /cnce\b/, 'ence' ],

			[ /clves\b/, 'elves' ],

			// bom to born
			[ /\bbom\b/, 'born' ],

			// c -> d
			[ /aciva/, 'adva' ], // advantag...

			// c -> g
			[ /(\B[^\bzlp])inc\b/, '$1ing' ],

			// c -> o
			[ /\bcwn/, 'own' ],
			[ /cc(?=ln|ld|mp|lum|n|resp|s)/, 'co' ], // Lincoln, cold, company, ...
			[ /\bcc(?=urt)/, 'co' ], // court, not accurtation
			[ /\bcught/, 'ought' ],

			// c -> s
			[ /\b([dD])icre/, '$1isre' ], // disregard

			// ci -> d
			[ /eci\b/, 'ed' ],

			// d -> i
			[ /\bwdth/, 'with' ],

			// d -> o
			[ /d(?=mp|wn)/, 'o' ], //  eg. compose, town
			[ /fdr/, 'for' ],

			// dl -> 31
			[ /\b[Sd3]lst\b/, '31st' ],

			// e -> a
			[ /\bscele(|s|d)\b/, 'scale$1' ],

			// e -> c
			[ /\be(ome)\b/, 'c$1' ],
			[ /rcet/, 'rect' ], // direct...
			[ /struet/, 'struct' ],
			[ /enee\b/, 'ence' ],
			[ /expeet/, 'expect' ],
			[ /((?=\B)[^n]|[oi]n)speet/, 'spect' ], // avoid speet and Nunspeet
			[ /taeh/, 'tach' ], // detach
			[ /\bwhieh(|ever)\b/, 'which$1' ],
			[ /\bfec\b/, 'fee' ],
			[ /execpt/, 'except' ],
			[ /([^q])uet(ing|ed)\b/, '$1ucted' ], // conducted
			[ /&e\./, '&c.' ],
			[ /(?<=[Uu]n)ele(?=s?\b)/, 'cle' ],

			// é -> è
			[ /ére\b/, 'ère' ], // No words end with acute-e ére

			// E -> F
			[ /E(rom )/, 'F$1' ],

			// e -> o
			[ /\bef\b/, 'of' ],
			[ /\bfrem\b/, 'from' ],
			[ /\bse\b/, 'so', { notLangs: [ 'es', 'fr', 'zh-pinyin' ] } ],

			// e -> r
			[ /rthee(?!ls)/, 'rther' ], // further, northern
			[ /outhee(?!ls|l\b)/, 'outher' ], // southern/ly
			[ /([^r])eoad/, '$1road' ], // broad

			// e -> s
			[ /\beo(|uth)\b/, 'so' ],
			[ /\bthoee\b/, 'those' ],

			// el -> d
			[ /\belyn/, 'dyn' ],
			[ /itel\b/, 'ited' ], // cited, united,...

			// -eney -> -ency (sad for Sweeny Todd)
			[ /eney\b/, 'ency' ],

			// er -> ev
			[ /\berery/, 'every' ],

			// é -> c
			[ /([aeiou])é(t)/, '$1c$2' ],

			// f -> nothing
			[ /\bhighfer/, 'higher' ],

			// f -> i
			[ /anfes\b/, 'anies' ],
			[ /stfan/, 'stian' ],

			// f -> l
			[ /(?<=[Aa])farm/, 'larm' ],

			// f -> t
			[ /\b(|in)difterent/, 'different' ],
			[ /\bfwo/, 'two' ],

			// f -> r
			[ /(?<=\bB)[ft]it(?=ish|ain)/, 'rit' ],

			// ff -> fl
			[ /\bff(ood)\b/, 'fl$1' ],

			// ff -> ñ
			[ /(?<=[Ss])paf[ifl]a\b/, 'paña' ],

			// g -> ç
			[ /(?<=Mendon?)ga\b/, 'ça' ],
			[ /(?<=Gu?on?)g(?=all?o)\b/, 'ç' ],
			[ /Lorengo/, 'Lorenço' ],

			// G -> 6
			[ /\bG([0-9]*)th\b/, '6$1th' ],

			// h -> b
			[ /([Dd])ouht/, '$1oubt' ],
			[ /\bhe(en)\b/, 'be$1' ],
			[ /(Oo])hser/, '$1bser' ], // observe
			[ /\bhio/, 'bio' ],
			[ /\bemh/, 'emb' ],
			[ /\bheyo/, 'beyo' ],
			[ /\bohs\B/, 'obs' ],
			[ /\bhy\b/, 'by' ],
			[ /\bhe(?=ings?|en\b|an\b)/, 'be' ],
			[ /\bhene(?!icos|n|q)/, 'bene' ],

			// h -> c
			[ /\bhareful(|ly)/, 'careful$1' ],

			// h -> im
			[ /\bh(?=nony|nonies)\b/, 'im' ],

			// h/U -> li
			[ /\b(h|U)(fe|ke|ttle)\b/, 'li$2' ],
			[ /nghs([ht])/, 'nglis$1' ], // English, etc

			// h -> n
			[ /\bih(?![ilr])/, 'in' ],
			[ /lahd(?='?s?\b|ing'?s?\b)/, 'land' ],

			// h -> li
			[ /\bhv[ec](?=s|)\b/, 'live' ],
			[ /(?=\b[Aa])hve\b/, 'live' ],
			[ /hng(?=s|ly)?\b/, 'ling' ],
			[ /dehc/, 'delic' ], // delicate, etc

			// h -> lt
			[ /cuh(?=(|y)\b)/, 'cult' ], // difficult(y), etc

			// H -> li
			[ /\bHke/, 'like' ],

			// H -> ll
			[ /(?<=\bA|[a-z])H/, 'll' ],

			// hv -> lw
			[ /(?<=[Aa]|ai|l)hvay/, 'lway' ], // always, railway, spillway

			// convert i9 to 19, etc.
			[ /[il]([0-9])/, '1$1' ],

			// i -> 1
			[ /\b[Il][Iil]th\b/, '11th' ],
			[ /(?<=[0-9])ist\b/, '1st' ],

			// I -> 1
			[ /\bIst\b/, '1st', { notLangs: [ 'de' ] } ],

			// i -> nothing
			[ /\bsomie/, 'some' ],
			[ /sielf/, 'self' ],
			[ /\b([Tt])hi(ey|ese)\b/, '$1h$2' ],
			[ /senise/, 'sense' ],
			[ /(?<=[Ff])irom/, 'rom' ],

			// I -> nothing
			// See also T -> nothing

			// i -> a
			[ /\bnime(ed|ly)/, 'namely' ],

			// i -> f
			[ /\bior(\b|m)/, 'for$1' ],
			[ /(I|i)nior/, '$1nfor' ],
			[ /([^m])afi(a|o)/, '$1aff$2' ],
			[ /\ba[ií]f/, 'aff' ],
			[ /([rhlf])iei(s|ly|)\b/, '$1ief$2' ], // brief

			// i -> j
			[ /(in|b|con|de|a)iect/, '$1ject' ],
			[ /\biett(y|ies)/, 'jett$1' ],

			// i -> l
			[ /([a-z])abie\b/, '$1able' ],
			[ /ficuit(|y)/, 'ficult$1' ],
			[ /enerai/, 'eneral' ],
			[ /\biab(o|ou)r/, 'lab$1r' ],
			[ /cicar/, 'clear' ],
			[ /shali(\b|ow)/, 'shall$1' ],
			[ /(i)abie\b/, '$1able' ], // reliable, ...
			[ /reiig/, 'relig' ],
			[ /([aeiou])riy\b/, '$1rly' ],
			[ /\b(un|)iaw/, '$1law' ],
			[ /\bgloi(y|ious)/, 'glor$1' ],
			[ /tiy\b/, 'tly' ],
			[ /iais\b/, 'ials' ], // materials...
			[ /\b(Ii)li(s?\b|ness)/, '$1ll$2' ],
			[ /(?<=[Ss]e)if/, 'lf' ], // self

			// -isli -> -ish
			[ /(\w)isli\b/, '$1ish' ],

			// i -> r
			[ /eiy(?![ua])/, 'ery' ],
			[ /([Ff])iist/, '$1irst' ],
			[ /([Gg])ieat/, '$1reat' ],
			[ /\b([Pp])oit(?![ior])/, '$ort' ], // port/ion
			[ /beied\b/, 'bered' ],

			// i -> t
			[ /(a|o|i)iion/, '$1tion' ],
			[ /leci\b/, 'lect' ],
			[ /aier/, 'ater' ], // material
			[ /\bmulii/, 'multi' ],
			[ /\bihe/, 'the' ], // the, there...
			[ /nir(ies|y)/, 'ntr$1' ], // country
			[ /\bio(|wards?|gether)\b/, 'to$1' ],
			[ /\bihat\b/, 'that' ],
			[ /enily\b/, 'ently' ],
			[ /ciion/, 'ction' ],
			[ /(?<=[Bb]u)i/, 't' ],
			[ /Stewari/, 'Stewart' ],

			// i' in a word -> r (not 's)
			[ /(?<=[a-z])i'(?=[a-rt-z]|s\w)/, 'r' ],

			// i^ > r
			[ /(?<=[a-z])i\^/, 'r' ],

			// i- -> r (be more careful than ^, - can be right)
			[ /(?<=Yo)i-/, 'r' ],

			// I -> f
			[ /\bIor([^gim]|\b)/, 'for$1' ],

			// I -> l
			[ /\b[l1I]' ?(?=[AEIOUÉÈaeiouéè]\w)/, 'l\'' ],

			// I' at word start -> f (except I'd. I'm, I'll, etc)
			[ /\bI'([a-ce-kn-uw-z])/, 'f$1' ],

			// I- -> L
			[ /\bI-ord/, 'Lord' ],

			// I^ -> P
			[ /\bI\^/, 'P' ],

			// id -> nl
			[ /\boidy/, 'only' ],

			// id -> ul
			[ /\bshoidd/, 'should' ],

			// if -> i
			[ /(?<=\b[Oo])if\b/, 'f' ],

			// If -> N (happens in cap'd words)
			[ /\b([A-Z]+)If\b/, '$1N' ],

			// ii -> a
			[ /\biind\b/, 'and' ],
			[ /\biimount/, 'amount' ],

			// II -> H
			[ /\bII(e|[a-z]{2,})\b/, 'H$1' ],

			// ii -> h
			[ /tiie/, 'the' ],
			[ /hicii/, 'hich' ], // which

			// II -> M
			[ /II(?=r|s)/, 'M' ],

			// ii -> n
			[ /aiis(?!m)/, 'ans' ],
			[ /co(?:ii|tt)c/, 'conc' ],

			// ii -> u
			[ /(?<=\b[SsBbMm])ii/, 'u' ],
			[ /\bii(?!\b|i)/, 'u' ], // avoid roman nums iii
			[ /iiim(?=s?\b)/, 'ium' ],
			[ /(?<=[Yy])oii/, 'ou' ],

			// ii -> ü
			[ /(?<=\bHs?)iian\b/, 'üan' ],
			[ /\bMiiller/, 'Müller' ],
			[ /\bYii(?=n\b|an\b)/, 'Yü' ],
			[ /\bTriib/, 'Trüb' ],

			// -iiig -> -ing
			[ /iiig\b/, 'ing' ],

			// ij -> h
			[ /tija(?!j)/, 'tha' ],
			[ /([Tt])ij([ae])/, '$1h$2' ],

			// il -> H
			[ /(\W |\n)il(e|im|er)/, '$1 H$2' ],

			// Il -> H
			[ /\bIlo(?![ck]no|ilo|ko|na|ne\b|ngot|nka|rin|ts?\b|tycin|well)/, 'Ho' ],

			// in -> m
			[ /soine/, 'some' ],
			[ /inod(er|[^e])/, 'mod$1' ], // avoid ..node...
			[ /ninent/, 'nment' ], // government/s
			[ /\bcomin([au])/, 'commu$1' ], // community, communication, command
			[ /\biny(|self)\b/, 'my$1' ],
			[ /\binen\b/, 'men' ],
			[ /([^mst])inent/, '$1ment' ], // document...
			[ /(to|for|by|with|told|tell|let|g[ia]ve|from|towards|[oui]nto|under) ine\b/, '$1 me' ], // ine could be a suffix, so hit the common ones by ngram
			[ /\bimined/, 'immed' ],
			[ /\binean(|s)\b/, 'means' ],
			[ /\bMohainn/, 'Mohamm' ],
			[ /sinug/, 'smug' ],
			[ /inforin/, 'inform' ],
			[ /\bhiin(self|)\b/, 'him$1' ],
			[ /\b([Ee])nin(i|e)/, '$1nm$2' ], // enmity, enmesh..
			[ /\b([Ff])roin\b/, '$1rom' ],
			[ /([Mm])einb/, '$1emb' ],

			// in -> th
			[ /(?<=(?:[Ii]n|[Tt]o|[Ff]or) )ine(?=\b|re\b|se\b|ir\b)/, 'the' ],

			// io -> w
			[ /\bneio(|ly)\b/, 'new$1' ],

			// ir -> n
			[ /\biir/, 'in' ],

			// it -> n
			[ /meitt/, 'ment' ],

			// iv -> j
			[ /\biv(?=st\b)/, 'ju' ],

			// iv -> w
			[ /\bneiv(|ly)\b/, 'new$1' ],
			[ /tiveen/, 'tween' ],

			// IVI -> M
			[ /\bIVI(?=[a-z])/, 'M' ],

			// j -> f
			[ /\boj\b/, 'of' ],

			// j -> i
			[ /thjs/, 'this' ],

			// J -> I
			[ /\bJowa/, 'Iowa' ],

			// J -> G
			[ /\b\(J(?=uide)/, 'G' ],

			// J -> l
			[ /\bJibert/, 'libert' ],
			[ /\b(?<=[Bb])jood/, 'lood' ], // blood

			[ /ojher/, 'other' ],

			// j -> y
			[ /ojal/, 'oyal' ],
			[ /\b([Mm])anj\b/, '$1any' ],
			[ /\b([Tt])hej\b/, '$1hey' ],

			// Ji -> h
			[ /Jiave/, 'have' ],
			[ /tJie/, 'the' ],

			// jl -> d
			[ /arjl/, 'ard' ],

			// jj -> g
			[ /jjht/, 'ght' ],

			// j}3^ -> y
			[ /(3|j|\})\^/, 'y' ],

			// k -> ic
			[ /whkh/, 'which' ],

			// kl -> d
			[ /Eklinb/, 'Edinb' ],

			// K -> E
			[ /Kng/, 'Eng' ],

			// l -> nothing
			[ /\b(|in)diflferent/, '$1different' ],
			[ /\beitlher\b/, 'either' ],
			[ /eaclh/, 'each' ],
			[ /Clhin(a|ese)/, 'Chin$1' ],
			[ /(?<=[Ff]l|[Dd]r|ang|[Qq]|iq|)uild/, 'uid' ], // fluid etc
			[ /(?<=\b[Tt])(?:lh|hl|jh|hj)(?=[ieo])/, 'h' ], // the, these, those, etc

			// l -> d
			[ /listor/, 'distor' ], // distort...

			// l -> f
			[ /\bol\b/, 'of' ],
			[ /\bl(orm)\b/, 'f$1' ],

			// l -> i
			[ /fui(\b|ness\b)/, 'ful$1' ],
			[ /(d|D)ipio/, '$1iplo' ],
			[ /(P|p)arll/, '$1arli' ],
			[ /\bWilllam/, 'William' ],
			[ /\b([Ff])lc/, '$1ic' ], // fiction
			[ /\b(Tt])helr/, '$1heir' ],
			[ /(?<=[Rr]|[Vv]|[Dd]|[Tt]|[g]|[Ff]|[Mm])ellc/, 'elic' ], // relic, delicate,

			// l -> I
			[ /"\blon(a|ian)/, 'Ion$1' ],
			[ /\bl'(ve|ll)\b/, "I'$1" ],
			[ /\blt('?s|self)\b/, 'it$1' ],

			// l -> h
			[ /(a|o)rslip/, '$1rship' ], // scholarship, warships, worship
			[ /\b([Ww])hicl/, 'which' ],
			[ /(\w)encl\b/, 'ench' ], // french, bench...

			// l ->li
			[ /\blke/, 'like' ],

			// l -> t
			[ /([0-9])lh\b/, '$1th' ],
			[ /\boul/, 'out' ],
			[ /([Aa])fler/, '$1fter' ],
			[ /ifl(?=\b|ness|ly)/, 'ift' ], // swift

			// la -> h
			[ /\bthrougla/, 'through' ],
			[ /\btla(?<!c)/, 'th' ],

			// li -> b
			[ /\blio([^n])/, 'bio$1' ], // not lion...
			[ /liject/, 'bject' ], // subject

			// li -> lh
			[ /\botlier(|s|wise)/, 'others' ],
			[ /\b([Mm])onarcli(|s|y)/, '$1onarch$2' ],

			// lT -> ff
			[ /di(lT|flP)ere/, 'differe' ],

			// l) -> b
			[ /al\) ?le\b/, 'able' ],

			// l^ -> f
			[ /l\^(?=[a-z])/, 'f' ],

			// li -> b
			[ /\bliy\b/, 'by' ],

			// li -> h ... "the", "them", "their", "with", "much", "here" and whe etcetera
			[ /([tT][Jl]i)(e|at|is|an|em|ear|eir|en|ither|ose|rough|ree)\b/i, 'th$2' ],
			[ /\b([SsWw])lie/, '$1he' ], // she, when...
			[ /\b([Ww])li(at|ole)/, '$1h$2' ], // what, whole
			[ /(wlicli|ivhic(li|h)|wliich|wiiich|whicli)/, 'which' ],
			[ /liurcli/, 'hurch' ],
			[ /\bli(ave|ere|is|ad|ard)/, 'h$1' ],
			[ /\bIl(is)\b/, 'H$1' ],
			[ /witli/, 'with' ],
			[ /mucli\b/, 'much ' ],
			[ /\blias/, ' has' ],
			[ /\bwlio/, 'who' ],
			[ /\b(an|)otlier\b/, '$1other' ],
			[ /ealtli/, 'ealth' ],
			[ /([Cc])lii/, '$1hi' ], // China/ese...
			[ /([SsMu]ucli)/, '$1uch' ],
			[ /cliann/, 'chann' ],
			[ /ubhs/, 'ublis' ], // publish
			[ /\bliate/, 'hate' ],
			[ /liion/, 'hion' ], // fashion
			[ /(?<=[Tt])liing/, 'hing' ], // thing
			[ /(?<=[Nn]e|[Ee])itlier/, 'ither' ], // either, neither
			[ /(?<=[Cc]|\b)liarm/, 'harm' ],

			// li -> k
			[ /([LlBb])ooli(\b|s)/, '$1ook\b' ],

			// llt -> th
			[ /\bllt(e)\b/, 'th$1' ],

			// lli -> th
			[ /\blli(at|e)\b/, 'th$1' ],

			// ln -> b
			[ /suln/, 'sub' ],
			[ /([Hh])md/, '$1ind' ],

			// lu -> hi
			[ /(?<=[a-z][^li])lucal/, 'hical' ], // -graphical

			// m -> in
			[ /mg\b/, 'ing' ],
			[ /\bopm/, 'opin' ],
			[ /Chm(a|ese)/, 'Chin$1' ],
			[ /(?<=\b[Pp]la)m/, 'in' ],

			// m -> n
			[ /\bFramce/, 'France' ],
			[ /\bFremch/, 'French' ],
			[ /\bJume\b/, 'June' ],

			// m -> on
			[ /atim\b/, 'ation' ],
			[ /\b(V|v)erbation\b/, '$1erbatim' ], // fix verbatim

			// m -> rn
			[ /ceming\b/, 'cerning' ],
			[ /\b([Un]w|[Ww])om\b/, '$1orn' ],
			[ /(?<=[Nn]orth|[Ss]outh|[Ee]ast|[Ww]est)em\b/, 'ern' ],
			[ /(?<=B[ij[oö])m\b/, 'rn' ],
			[ /Foumier/, 'Fournier' ],

			// m -> un
			[ /\bmorth/, 'unorth' ],

			// m -> w
			[ /\b([Nn])em([^aeo]|\b)/, '$1ew$2' ], // new, newly, news

			// mn -> nm
			[ /mnent/, 'nment' ],

			// mu -> nm
			[ /\bumu(?=[aeiou])/, 'unm' ],

			// M -> N
			[ /\bNongol/, 'Mongol' ],

			// n -> a
			[ /(G|g)rent/, '$1reat' ],
			[ /\bns/, 'as' ],
			[ /ncknow/, 'acknow' ],

			// n -> h
			[ /\btn(e|a)/, 'th$1' ],
			[ /\bwn/, 'wh' ],
			[ /([Ss])mitn/, '$1mith' ],

			// n -> in
			[ /(?<=[^Eaeiou])ng\b/, 'ing' ], // -ing

			// n -> m
			[ /(?<=I|i)nperi/, 'mperi' ], // imperial
			[ /(?<=H|h)inse/, 'imse' ], // himself
			[ /iun\b/, 'ium' ],
			[ /(?<=\b[a-z]\w+l)don/, 'dom' ], // no lowercase ends ldon
			[ /(?<=[Nn])unber/, 'umber' ],
			[ /stanp/, 'stamp' ],
			[ /\bn(?=ores?\b|oreover)/, 'm' ],

			// n -> o
			[ /\bnf/, 'of' ],

			// n -> ri
			[ /scnb/, 'scrib' ],

			// n -> u
			[ /\bont (of|the|to|in|a|that|and|for|with|by)\b/, 'out $1' ], // ont may be suffix, filter by common ngram
			[ /([Nn])nm(?!a)/, 'num' ],
			[ /snb/, 'sub' ],
			[ /onsly\b/, 'ously' ],
			[ /(C|c|w|W|Sh|sh)onld/, '$1ould' ],
			[ /\b([Th])h(r?)ongh/, '$1h$2ouogh' ], // though, through-
			[ /\b([Aa])bont\b/, '$1bout' ],
			[ /thongh/, 'though' ],
			[ /\b([Cc])onrt/, '$1ourt' ], // court

			// na -> m
			[ /\b([Hh])ina(|self)\b/, '$1im$2' ],

			// ni -> m
			[ /(?<=\b|[Hh]ere-?|[Hh]ence-?)froni(?=\b|age|ward)/, 'from' ],
			[ /(?<=\b[Ww])honi/, 'hom' ],
			[ /\bhini/, 'him' ],
			[ /(?<=in|)hunian/, 'human' ],
			[ /\bnian(?=u|ly|kind)/, 'man' ], // not too general, mind pinyin
			[ /\brenio/, 'remo' ],
			[ /\bni(?=ak)/, 'm' ],
			[ /niouth/, 'mouth' ], // mouth, Plymouth, etc
			[ /(?<=[Cc]o)ni(?=plet)/, 'm' ], // complete

			// ni -> m
			[ /\bnie\b/, 'me', { notLangs: [ 'de', 'pl', 'zh-pinyin' ] } ],
			[ /\bnian/, 'man', { notLangs: [ 'zh-pinyin' ] } ],
			[ /\btians/, 'trans', { notLangs: [ 'zh-pinyin' ] } ],

			// nn -> rm
			[ /(?<=[Ff])onn(?!ish)/, 'orm' ], // formula, form, etc

			// nv -> rw
			[ /nva(?=y|rd)/, 'rwa' ], // afterward, Norway

			// o -> a
			[ /\bouth(or|en)/, 'auth$1' ], // authority...
			[ /fovo(u?)r/, 'favo$1r' ],
			[ /\b([Cc])ous([ae])/, '$1aus$2' ], // cause

			// o -> c
			[ /jeot/, 'ject' ],
			[ /(?<=[Oo])oo(?=as|i[cp]|u[pl]|lu)/, 'cc' ],
			[ /(?<=[Oo])co(?=asi|lus|lud|upa|upi|ur)/, 'cc' ], // occasion, occur,
			[ /(?<=[Ss]uc)oe/, 'ce' ], // success
			[ /(?<=[Aa]c)o(?=us[ae]|ept|iden|ord)/, 'c' ], // accuse, accept
			[ /(?<=[Aa]r|ac)oh(?=[io])/, 'ch' ], // archi..., Gracchi,

			// o -> e
			[ /(?<=dis|\b)rospect/, 'respect' ],
			[ /turo\b/, 'ture' ],
			[ /([d])loss/, '$1less' ], // endless
			[ /\b([Mm])ako\b/, '$1ake' ],
			[ /\b([Mm])ado\b/, '$1ade' ],
			[ /noss(?=\b|es|like)/, 'ness' ],
			[ /\bcomo\b/, 'come', { notLangs: [ 'es' ] } ],

			// o -> n
			[ /tioos/, 'tions' ], // could be o -> u, but choose one
			[ /iog(|s)\b/, 'ing$1' ],

			// o -> u
			[ /egolar/, 'egular' ], // regular

			// ol -> d
			[ /nolix/, 'ndix' ],

			// p -> d
			[ /ecorp([^o]?)\b/, 'ecord$1' ],

			// p -> f
			[ /\bop\b/, 'of' ],

			// P -> F
			[ /\bP(ee)\b/, 'F$1' ],
			[ /\bOP\b/, 'OF' ],

			// p -> g
			[ /inp\b/, 'ing' ],
			[ /(?<!u)prap/, 'grap' ],

			// p -> n
			[ /apd\b/, 'and' ],

			// p -> o
			[ /prth/, 'orth' ],

			// P -> ?
			[ /([a-z])P\b/, '$1?' ],

			// q -> o
			[ /qf/, 'of' ],

			// Q -> G
			[ /\bGu(?=ite?|ee[rn]|i[dzvxp]|ir[^o]|in[tq]|iet|ick|ibb)/, 'Qu' ],

			// r -> c
			[ /jert/, 'ject' ], // object, etc
			[ /(\w)reive/, '$1ceive' ], // perceive, receive, etc

			[ /anrs\b/, 'ani\'s' ], // names ending in ani + 's

			// r -> i'
			[ /prs\b/, 'pi\'s' ],

			// r -> n
			[ /\bupor\b/, 'upon' ],

			// r -> v
			[ /(he|[iasolurn])sire/, '$1sive' ],
			[ /siveless/, 'siveness' ], // after sire->sive
			[ /\b(M|m)orement/, '$1ovement' ],
			[ /\b(G|g)orernment/, '$1overnment' ],
			[ /\b([Oo])bserr/, '$1bserv' ],

			// r -> t
			[ /(?<=\b[Ii])r\b/, 't' ],

			// r^ -> p
			[ /\br\^/, 'p' ],

			// ri -> n
			[ /(?<=\b[Mm]e)ri\b/, 'n' ],

			// ri -> u
			[ /ectrial/, 'ectual' ],

			// rj -> n
			[ /\birj/, 'in' ],

			// rn -> m
			[ /([aie])urn\b/, '$1um' ],
			[ /\brern/, 'rem' ],
			[ /ernent/, 'ement' ],
			[ /\brn/, 'm' ],

			// s -> a
			[ /grsph/, 'graph' ],
			[ /csuse/, 'cause' ],

			// s -> m
			[ /\b([Ss])ees(ing|ingly|ed|s)\b/, '$1eem$2' ], // seemed

			// sb -> sh
			[ /\bsb(e|all)\b/, 'sh$1' ],

			// sc -> g
			[ /insc\b/, 'ing' ],

			// t-> c
			[ /ettual/, 'ectual' ],
			[ /fetted/, 'fected' ],

			// t -> f
			[ /\bot\b/, 'of' ],
			[ /fitty/, 'fifty' ],

			// t -> i
			[ /shtp/, 'ship' ],
			[ /(?<=[Bb]u|[Cc]h|[Mm])tld/, 'ild' ],
			[ /(?<=[Bb]u|[Gg]u?|[Tt]|[Ss]|[Ff]|[Ww])tlt/, 'ilt' ],
			[ /\btn\b/, 'in' ],

			// T -> nothing (and some I -> nothing)
			[ /\bw [IT] (?=as|hich|hen|hether|ho)/, 'w' ], // w T as > was, etc

			// T -> I
			[ /(?<!\bw )\bT(?=\b|t)/, 'I' ],
			[ /T(?=reland|rish)/, 'I' ],

			// t -> l
			[ /abte\b/, 'able' ],
			[ /(?<=[WwCc]|[Ss]h)outd/, 'ould' ],

			// t -> r
			[ /\b(?<=[Ff])ot(?!h|o|i|u|m|c)/, 'or' ],
			[ /\b(?<=[Ff])t(ance|ench)/, 'r' ],
			[ /ntt(?=y|ies)/, 'ntr' ], // country
			[ /(?<=[Ll]ive)t(?=s|p|\b)/, 'r' ], // liver, Liverpool

			// T -> Y
			[ /\b(?<=JUL|JOURNE|M|WA)T\b/, 'Y' ],
			[ /\b(?<=MON|TUES|WEDNES|THURS|FRI|SATUR|SUN|)DAT\b/, 'DAY' ],

			// ti -> h
			[ /\b([Oo])ttier(?=\b|[^eis])/, '$1ther' ],

			// ti -> n
			[ /tioti/, 'tion' ],

			// ti -> u
			[ /\btipon/, 'upon' ],

			// to -> h
			[ /\bttoe(?![ds]\b)/, 'the' ],

			// U -> li, see h/U
			[ /(?<=\b|[a-z])Uon(?=s?)/, 'lion' ],
			[ /(?<=[a-z])Ung(?=s?)/, 'ling' ],

			// u -> a
			[ /Junu([^b])/, 'Janu$1' ],
			[ /\bund\b/, 'and' ],

			// u -> c
			[ /([Dd])ouum/, '$1ocum' ],

			// u -> h
			[ /(?<=\b[Tt])u(?=e[^s]|at\b)/, 'h' ], // the, there, these, etc (not Tuesday)

			// u -> n
			[ /\baud\b/, 'and' ],
			[ /meut(\b|[^e])/, 'ment$1' ],
			[ /siau(|s)\b/, 'sian$1' ], // Persians...
			[ /\b(P|p)ersou(|s)/, '$1erson$2' ],
			[ /erument/, 'ernment' ],
			[ /([Jj])uuc/, 'junc' ],
			[ /taiu/, 'tain' ],
			[ /\biu(|to|ward)\b/, 'in$1' ],
			[ /\bauy(|where|body)\b/, 'any' ],
			[ /\biuto\b/, 'into' ],
			[ /kuow/, 'know' ],
			[ /iug(s|ed|ly|)\b/, 'ing$1' ],
			[ /auswer/, 'answer' ],

			// u -> ii
			// [ /(?<=\b[clxv]*)u(?=i*)/, 'ii' ], // roman numerals

			// "U" -> "ll" when preceded by a lowercase letter.
			// "U" -> "li"
			[ /(?<=[a-z])U(?=c)/, 'li' ], // relic
			[ /(?<=[a-z])U(?!c)/, 'll' ], // not relic

			// un -> m
			[ /\bimuned/, 'immed' ],

			// ui -> m ... "must", etc
			[ /\bui(ust)\b/, 'm$1' ],

			// v -> r
			[ /[Mm]emov/, 'memor' ],

			// v -> u
			[ /\b([Nn])vm/, '$1um' ],

			// v -> y
			[ /\bv(ear|our|ou)s?\b/, 'y$1' ],
			[ /\b(B|b|M|m|the)v/, '$1y' ],
			[ /\b(A|a)nv(\b|w)/i, '$1ny$2' ],
			[ /vield/, 'yield' ],
			[ /encv\b/, 'ency' ],
			[ /\b(?<=[GgHh])aye\b/, 'ave' ],
			[ /([Aa])bbev/, '$1bbey' ],
			[ /demv\b/, 'demy' ],
			[ /mplov/, 'mploy' ], // employ-...
			[ /itv\b/, 'ity' ],
			[ /(?<=[Vv])erv\b/, 'ery' ],
			[ /(?<=(Mon|Tues|Wednes|Thurs|Fri|\b)da)v(?=s?\b)/, 'y' ],

			// v -> w
			[ /\bvr/, 'wr' ],

			// v^ -> w
			[ /\bv[\^/]([a-z])/, 'w$1' ],

			// vc -> we
			[ /\bvc\b/, 'we' ],

			// vd -> wi
			[ /vd(ll|th)/, 'wi$1' ],

			// V -> m
			[ /\bV(iss|rs|r)\b/, 'M$1' ],

			// Vh ->Wh
			[ /\bVh/, 'Wh' ],

			// V' -> W
			[ /\bV'/, 'W' ],

			// Vi -> M
			[ /\bVir\b/, 'Mr' ],

			// vir -> w
			[ /hovir(?!u)/, 'how' ],

			// vn -> wi
			[ /vn(ll|th)/, 'wi$1' ],

			// VV -> W
			[ /\bVV(e)\b/, 'W$1' ],

			// w -> m
			[ /mewt(?!tide)/, 'ment' ],

			// w r -> w (not sure what this is about)
			[ /\bw r (?=e\b|[aeoiu]\w)/, 'w' ],

			// X -> N
			[ /\bX(?=o)/, 'N' ],

			// xv -> w
			[ /xvho/, 'who' ],
			[ /xvay/, 'way' ],
			[ /txvo/, 'two' ],

			// y -> v
			[ /([Ss])ery(a|i)/, '$1erv$2' ],
			[ /tiye(|ly|ness|nesses|s)\b/, 'tive$1' ],
			[ /eyies\b/, 'evies' ],
			[ /(?<=\b(?:[Hh]a|[BbGg]ra))ye\b/, 've' ], // have, grave, brave
			[ /\b([Oo])by(?=\B)/, '$1bv' ],
			[ /\b(?<=Gene)ya/, 'va' ],
			[ /\bevent/, 'event' ],
			[ /vent(?=\b|s|ed|or|ing|y\b|ies|ral|ro|ur|il|ri)/, 'vent' ],

			// Y -> T
			[ /\bY(?=he)/, 'T' ],

			// Y -> V
			[ /\b(?<=GENE)YA/, 'VA' ],
			[ /\bEYENT/, 'EVENT' ],
			[ /VENT(?=\b|S|ED|OR|ING|Y\b|IES|RAL|RO|UR|IL|RI)/, 'VENT' ],

			// z -> x
			[ /\bezc/, 'exc' ],

			// -> Rome/Roman
			[ /(E|K)om(e|an|ish)([ .,\n])/, 'Rom$2$3' ],

			// d', l', m', n' (not s', or english possesives get messed with)
			[ /(^|\s)([MmDdLlNnJjSsCc]|[Qq]u|[Jj]usqu)(' | ')(?=[AaEeIiOoUuÁáÀàéÉèÈ])/, "$1$2'" ]
		];

		process_editor( editor, new PartialWordRegexProcessor( reps ) );
	};

	/**
	 * Fix errors that involve more than one word: dropped spaces, spurious
	 * spaces, missing hyphens and word fragments that OCR has split off.
	 *
	 * Runs five passes over the editor text, in order: partial-word regex
	 * replacements, banned suffixes, banned prefixes, orphaned prefixes and
	 * orphaned suffixes.
	 *
	 * @param {Object} editor the templatescript editor object
	 */
	const do_multiword_fixes = function ( editor ) {

		let reps = [

			// hyphens more likely to be em-dash
			// (lookbehind, so that only the hyphen and "the" are consumed)
			[ /(?<=[a-z])-(the)\b/, '—$1' ],

			// Missing spaces
			// theCap unlikely to be right
			[ /\b(a|an|of|by|the)(?=[A-Z])/, '$1 ' ],

			// single cap in a word probably a dropped space
			// watch for Mc/Mac
			// needs lookbehind really
			[ /\b(\w[a-z]*[abd-z])([A-Z][a-z]+\b)/, '$1 $2' ],

			// ance is a suffix when it's not ancestor's prefix
			[ /[\s-]ance(?! st[or])\b/, 'ance' ],

			[ /\bal though/, 'although' ],

			// and<dropped space>
			// not many words start and
			[ /\band((?=[a-z])[^raoei])/, 'and $1' ],

			[ /\bbet ween/, 'between' ],

			// I
			[ /I(am\b|had|was|will|can|shall|did)/, 'I $1' ],

			// he
			[ /([Hh]e)(had|did|can|will|was)/, '$1 $2' ],

			// him
			[ /(?<=\b([Hh]im))t/, ' t' ], // e.g. himto -> him to

			[ /notbe/, 'not be' ], // cannot be, not being, ...

			[ /([deos])n(' | ')t\b/, '$1n\'t' ],

			[ /\bcom m/, 'comm' ],
			// "com par" at a word start, or after an "in" prefix (incomparable)
			[ /(?<=in|\b)com par/, 'compar' ],

			// government can only be -a, -s, -e
			[ /(overnment)((?=\w)[^sae])/, '$1 $2' ],

			[ /((?=\w)[^sa])may/, '$1 may' ], // dismay/gamay are the only words end in may

			[ /\bme(of|to|for|that)\b/, 'me $1' ],

			[ /(s|t)my\b/, '$1 my' ], // -my isn't always a likely suffix

			[ /\bof(a|b|c|d|g|m|n|p|s|w)/, 'of $1' ], // of my/self, etc words that can't start of-

			[ /\bof(our|my|some|him|her|his)\b/, 'of $1' ],

			// of merged left, careful of Russian names...
			[ /(Earl|Duke|Queen|King|Baron|most|all|some|many)of/, '$1 of' ],

			[ /([a-z])which/, '$1 which' ], // only wrong for everwhich

			// no word ends -many except overmany
			[ /([^Oo]?[^v]?[^e]?[^r\s])many/, '$1 many' ],

			// she
			[ /([Ss]he)(had|did|will|was)/, '$1 $2' ],

			[ /\bthus(?!ly|\b)/, 'thus ' ], // no words start thus

			// some obvious loss of spaces after 'the'
			[ /\bthe(?=h|me[nm]|mer[c]|mo|im|un|wh)/, 'the ' ],

			// and before 'the'
			[ /\b(\w[^aoniy\s])the\b/, '$1 the' ],

			// before 'to'
			[ /\b(thing)to\b/, '$1 to' ],

			[ /(u|n|r) (dices?)\b/, '$1$2' ],

			[ /\bun der/, 'under' ],

			[ /\brene w(ed|al|abl)\b/, 'renew$1' ],
			[ /\bre turn/, 'return' ],

			// words ending in cious that lost a space
			[ /cious((?=[a-z])[^enl])/, 'cious $1' ],

			// Spurious spaces
			[ /\b(P|p)ro ceed/, '$1roceed' ],
			[ /\b(P|p)ro ced/, '$1roced' ],
			[ /(C|c)on cl/, '$1oncl' ], // con clude
			[ /(un)?ans wer(a|e|s|\b)/, '$1answer$2' ],
			[ /same(a|b|c|f|g|h|i|j|k|m|o|p|q|u|v|w|x|y|z)/, 'same $1' ],
			[ /\bho w/, 'how' ], // however...

			[ /\b(dis|)satis fact/, '$1satisfact' ],
			[ /\bendo (wed|wing|wments?)/, 'endo$1' ],

			[ /\bre[ -](quest|quire|solute)/, 're$1' ],

			[ /\bwasnot\b/, 'was not' ],

			[ /\b(ly)(worked)\b/, '$1-$2' ],

			// missing hyphens
			[ /\binchief(?=s?\b)/, 'in-chief' ],
			[ /(?<=y)public(?=s?\b)/, '-public' ], // notary-public, ...

			// Lone quotes at the start of a quotation
			[ /(?<=(said|answered|replied|shouted|thought|whispered|murmured|muttered|), ") /, '' ],

			// spurious punctuation, eg why. not, but avoid e.g. i.e. etc
			[ /([a-z]{3,})\. ([a-z])/, '$1 $2' ]
		];

		process_editor( editor, new PartialWordRegexProcessor( reps ) );

		// These are things that are never suffixes
		// eg. hecould -> he could
		reps = [
			/(c|sh|w)ould(n't)?/
		];

		process_editor( editor, new BannedSuffixProcessor( reps ) );

		// These can never be prefixes
		// so insert spaces after them
		reps = [
			/[Aa](?=number|bond\b|comm|rece|reci[^b])/,
			/a(?=dele)/,
			/be(?=my)/,
			/but(?=al)/, // but all, but always
			/come(?=to)/,
			/great(?=m|p|r)/,
			/[HhSsGg]ave(?=my)/, // h/gave my/self
			/me(?=wit|tow)/,
			/means/,
			/of(?=the)/,
			/sent(?=as)/,
			/some(?=[cm])/,
			/that(?=can|d|w)/, // that will
			/the(?=mes|tr|e\w)/,
			/(?:un|)usual(?!s|ness|ly)/,
			/I(?=h[eiou])/,
			/I(?=ha[^b])/, // I have/had
			/with(?=a\b|a[^lm]|all)/,
			/with(?=his|her|it|th|ha)/
		];

		process_editor( editor, new BannedPrefixProcessor( reps ) );

		// if we see these on their own, they are prefixes of the next word
		// These can be slightly aggressive, as they only fire if the prefix is
		// already isolated - they won't break up existing words
		let orphans = [
			/(a|fo)llo/, // allocate, follow
			/(un|)acknow/,
			/(|[Ii]n)conse/, // consequence, consecrate
			/circum/,
			/combin?/,
			/(|[Ii]n)compa/,
			/(|[iI]n)comple/,
			/(|[Ii]n)corp/,
			/\w*corres?/,
			/diffi/, // difficult, diffident
			/dis/, // very few words end dis, so an orphan is likely a prefix
			/decla?/, // ration can't be a simple suffix
			/ered/,
			/exper?/,
			/helio/,
			/inex/,
			/medi/, // medicine/s, medical
			/misbe/,
			/(|in)oppor/,
			/(|dis|co-?|acc|in|sub|super)ordin?/,
			/[Pp]arti/,
			/[Pp]hilo/,
			/(|im|mal)prac/,
			/(|im)practi/,
			/pre/, // pre is occasionally a suffix, but it's
			/(|un)[Pp]rinci/,
			/reca/,
			/(|p|un|under)recom/, // recommend
			/repre/,
			/(|un|tran)sub/,
			/suc/, // success...
			/(|un)sug/, // suggest, sugary../
			/sur/, // sur-
			/trans/,
			/undis/,
			/whatso/
		];

		process_editor( editor, new OrphanPrefixProcessor( orphans ) );

		// if we see these on their own, they're suffixes of the prior word
		orphans = [
			/astic/,
			/ated/,
			/atory/,
			/(|ond|ti)ar(y|ies)/,
			/tably/,
			/butors?/,
			/cating(|ly)/,
			/cellation(|s)/,
			/cien(cy|t)/,
			/ciples?/,
			/dences?/,
			/derable/,
			/digent(|s)/,
			/dit(y|ies)/,
			/drawals?/, // only withdrawal
			/ested(|ly|ness)/,
			/esque(|ly)/,
			/ficial\w*/,
			/geous(|ly|ness|nesses)/,
			/gences?/,
			/hend(|s|ing)/,
			/iast\w*/,
			/ings?/, // ing is rarely a prefix, much more likely to be -ing if it occurs alone
			/lants/,
			/lated/,
			/lative(s|ly|)/, // comp-, decla-
			/ligent(|ly|sia|sias)/,
			/mations?/, // not motions
			/munication?/,
			/ments?/,
			/mence\w*/, // commmence
			/mitted(|ly|ness)/,
			/nect(ed|ions?)/,
			/nence/,
			/nese/,
			/nien(ce|ces|ced|t)/,
			/m?on(ing|ed)/, // summoned, commisioned...
			/pan(y|ies)/,
			/pensat\w+/, // compensate
			/plet(ed|ion|ions)/,
			/politan\w*/,
			/pl?oration(|s|al)?/,
			/rative(s|ly|)/, // comp-, decla-
			/rit(ies|y)/,
			/rence(|d|s)/,
			/saries/, // anniversaries...
			/sion\w*/,
			/siderable\w*/, // avoid sideral/sideration
			/sume(\b|[^r]\w*|r[^i]\w*)/, // avoid -sumeria
			/stantly/,
			/tain(ed|s)/,
			/[as]tr[au]ction(|s|al|ary|ally)/,
			/[szt]?[aoiue]?tion(|s|al|ally)/, // not ration
			/tages?/,
			/ti[vn]ely/,
			/tinual(|ly|ness|ity)/,
			/tinuous(|ly|ness)/,
			/b?ilit(ies|y)/,
			/vid(es|ing)/,
			/wered/
		];

		process_editor( editor, new OrphanSuffixProcessor( orphans ) );
	};

	/**
	 * Hand a fixed list of common Latin phrases to an ItaliciseProcessor.
	 *
	 * Every pattern is bounded to whole words so that it cannot match inside
	 * a longer word ("made factory"), and the "ad ...um/...em" pattern is
	 * restricted to a single following word so that it cannot swallow the
	 * rest of a line.
	 *
	 * @param {Object} editor the templatescript editor object
	 */
	const do_foreign_italics = function ( editor ) {
		const reps = [
			/\bad (hoc|\w*um|\w*em)\b/, // ad hoc, ad infinitum, ad valorem, ...
			/\bde facto\b/,
			/\bquid pro quo\b/,
			/\blocum tenens\b/,
			/\b[Ii]bid\b/
		];

		process_editor( editor, new ItaliciseProcessor( reps ) );
	};

	/**
	 * Run the built-in whole-word replacements over the editor text.
	 *
	 * The built-in rule list is currently empty; the processor is still run
	 * with it.
	 *
	 * @param {Object} editor the templatescript editor object
	 */
	const do_whole_words_reps = function ( editor ) {
		// simple whole-word replacements (none defined)
		const wholeWordRules = [];
		const processor = new WholeWordRegexProcessor( wholeWordRules );

		process_editor( editor, processor );
	};

	/**
	 * Repair text where a long s (ſ) was OCR'd as "f" (or as ƒ / ʃ).
	 *
	 * The rules are partial-word regex replacements, passed as one list to
	 * a PartialWordRegexProcessor. Some rules rely on earlier ones having
	 * already run (see the inline "do before" / "after" notes), so the
	 * order matters.
	 *
	 * @param {Object} editor the templatescript editor object
	 */
	const doLongSReplacements = function ( editor ) {
		const long_s_reps = [
			// fix bad long s replacements
			[ /ƒ/, 'f' ],
			[ /ʃ/, 's' ],

			[ /([^i])fic\b/, '$1sic' ],
			[ /([Ee])aft/, '$1ast' ],
			[ /([W])eft/, '$1est' ], // assume Weft is West, but weft is like fabric
			[ /(af|un)?focia/, '$1socia' ],
			[ /(?<=[Aa])ff(embl|ign)/, 'ss$1' ], // assign, assemble..
			[ /(A|a)nfwer/, '$1nswer' ],
			[ /(ef)?fent/, '$1sent' ], // essential, sent, sentinel
			[ /(other|like)wife/, '$1wise' ],
			[ /\bfide\b/, 'side' ],
			[ /\bfo\b/, 'so' ],
			[ /\breft/, 'rest' ],
			[ /([Aa])bfo/, '$1bso' ],
			[ /ccef[fs]/, 'ccess' ],
			[ /bfurd/, 'bsurd' ],
			[ /affif/, 'assist' ],
			[ /aff(um|ur|er)/, 'ass$1' ], // assume, assure
			[ /(?<=A|a)fc/, 'sc' ], // ascent
			[ /Afia/, 'Asia' ],
			[ /(?<=A|a)fk/, 'sk' ], // ask
			[ /aftard/, 'astard' ],
			[ /aftic/, 'astic' ],
			[ /afty/, 'asty' ],
			[ /([Aa])lfo/, '$1lso' ],
			[ /([Aa])pfe/, '$1pse' ],
			[ /([Aa])ufp/, '$1usp' ],
			[ /baffy/, 'bassy' ],
			[ /([Bb])afe/, '$1ase' ],
			[ /([Bb]|[Cc]r)eft/, '$1est' ],
			[ /([Cc])afua/, '$1asua' ],
			[ /([Cc])auf/, '$1aus' ],
			[ /([Cc])eaf(?!a)/, '$1eas' ],
			[ /ceff/, 'cess' ], // necessary
			[ /cefs\b/, 'cess' ], // princess, process
			[ /([Cc])heft/, '$1hest' ],
			[ /Chrif/, 'Chris' ],
			[ /cife/, 'cise' ],
			[ /([Cc])laf[fs]/, '$1lass' ],
			[ /([Cc])lofe/, '$1lose' ],
			[ /([Cc])onf(id|t|eq)/, '$1ons$2' ], // const, conseq...
			[ /([Cc])ourfe/, '$1ourse' ],
			[ /([Cc])oft/, '$1ost' ],
			[ /([Cc])roff\B/, '$1ross' ], // cross-
			[ /([Cc])rofs\b/, '$1ross' ], // cross
			[ /([Dd])efcr/, '$1escr' ],
			[ /dorf(e|es|ed|ing|ings|ment)/, 'dors$1' ],
			[ /efer([vt])/, 'eser$1' ], // deserve-, desert-
			[ /([dD])if([ocprgqst]|ad)/, '$1is$2' ], // dis-
			[ /\b([dD])if([^f]\w)/, '$1is$2' ],
			[ /([Dd])iffol/, '$1issol' ],
			[ /([Dd])efir/, '$1esir' ],
			[ /efour/, 'esour' ],
			[ /offef[fs]/, 'ossess' ],
			[ /feffion/, 'session' ], // session (possesion comes later)
			[ /(?<![A-Z]|ff|\b)eff(|ed|ion|ing|ly)/, 'ess$1' ], // express, etc
			[ /([Ee])fpe/, '$1spe' ], // especial
			[ /([Ee])fq/, '$1sq' ],
			[ /(?<=R|r|t|l|p)egift/, 'egist' ], // regist.., strategist, etc
			[ /(?<=en)lift/, 'list' ],
			[ /fenf(e|es|ed|ing|ings)\b/, 'sens$1' ],
			[ /enf(e|es|ed|ing|ings)\b/, 'ens$1' ],
			// keep the suffix ($2), not a second copy of the initial letter
			[ /([Bb])eft(\b|ed|ing)/, '$1est$2' ],
			[ /([^kgrdw])eft\b/, '$1est' ], // -est
			[ /efide/, 'eside' ],
			[ /(?<=R|r)efort/, 'esort' ],
			[ /([Ee])fta/, '$1sta' ], // establish
			[ /([Ee])fti/, '$1sti' ], // estimate
			[ /enfes/, 'enses' ],
			[ /ennf/, 'enns' ], // Pennsylv etc
			[ /erfal/, 'ersal' ],
			[ /erfon/, 'erson' ],
			[ /erfua/, 'ersua' ],
			[ /erfue/, 'ersue' ],
			[ /erfui/, 'ersui' ],
			[ /eruf/, 'erus' ],
			[ /fa(cr|fe|ga|id|le|lut|lt|tis|w\b|nds?\b)/, 'sa$1' ],
			[ /\bfay/, 'say' ],
			[ /\bfa(ve|vi)/, 'sa$1' ],
			[ /(?<=F|\bf)alf/, 'als' ], // false
			[ /fatif(?!e)/, 'satis' ],
			[ /fca([^s])/, 'sca$1' ], // scarce, scant, etc (not briefcase)
			[ /fchem/, 'schem' ],
			[ /fc(ie|ious|ure|en|rib|rip)/, 'sc$1' ], // science, conscious, secure
			[ /fenf/, 'sens' ],
			[ /fe(a\b|af|cl|co|iz)/, 'se$1' ], // season, seclude, second
			[ /fee(m|n|ing)/, 'see$1' ], // seen, seem
			[ /fe(ek|gr|duc)/, 'se$1' ],
			[ /felec/, 'selec' ],
			[ /fel(f|v)/, 'sel$1' ],
			[ /(?<=[Aa]b|[Ii]n)fence/, 'sence' ],
			[ /fepar/, 'separ' ],
			[ /feri([eo])/, 'seri$1' ],
			[ /fervi/, 'servi' ],
			// keep the captured ending: fetting -> setting, fets -> sets
			[ /\bfet(|ting|s|ter)\b/, 'set$1' ],
			[ /fettle(\b|m|s)/, 'settle$1' ], // fettle is a word, but settle is way more common
			[ /feve(ra|n)/, 'seve$1' ], // several, seven
			[ /fhew/, 'shew' ],
			[ /(?<=\ba?)fide(?=s?\b)/, 'side' ],
			[ /fing(le|u)/, 'sing$1' ], // single, singular
			[ /fis\b/, 'sis' ], // -sis
			[ /ffidu/, 'ssidu' ], // Assiduous
			[ /fh(al|ut|ip|o)/, 'sh$1' ],
			[ /inifter/, 'inister' ],
			[ /fidera/, 'sidera' ], // considerable/ation/ate
			[ /fift(?!h)/, 'sist' ], // subsist, consist
			[ /filen/, 'silen' ],
			[ /fign/, 'sign' ],
			[ /fimi/, 'simi' ],
			[ /fince/, 'since' ],
			[ /fion/, 'sion' ],
			[ /firft/, 'first' ],
			[ /fite\b/, 'site' ],
			[ /fitive/, 'sitive' ],
			[ /fitu/, 'situ' ],
			[ /flaught/, 'slaught' ],
			[ /flowl/, 'slowl' ],
			[ /flowne/, 'slowne' ],
			[ /fm(an|en|all|oth|ooth)/, 'sm$1' ], // small, helmsmen, smooth
			[ /focie/, 'socie' ],
			[ /fole/, 'sole' ],
			[ /foli/, 'soli' ],
			[ /folv/, 'solv' ],
			[ /fome/, 'some' ],
			[ /foon/, 'soon' ],
			[ /foph/, 'soph' ], // -sopher/y
			[ /fourc/, 'sourc' ],
			// NOTE(review): always capitalises the result - confirm that is intended
			[ /fouth/, 'South' ],
			[ /fov/, 'sov' ],
			[ /fpade/, 'spade' ],
			[ /fpawn/, 'spawn' ],
			[ /fpeak/, 'speak' ],
			[ /fpec/, 'spec' ],
			[ /fpee/, 'spee' ],
			[ /fpir/, 'spir' ], // spirit, spiral,
			[ /ft(air|an|at|eem|ep|ill|on|oo|r|ud|y)/, 'st$1' ],
			[ /\bft(\w)/, 'st$1' ],
			[ /fubf/, 'subs' ], // do before fub
			[ /fub/, 'sub' ],
			[ /fucc/, 'succ' ],
			[ /fuch/, 'such' ],
			[ /fued/, 'sued' ],
			[ /\bfu(e|es|ings?)\b/, 'su$1' ],
			[ /fuf(p)/, 'sus$1' ],
			[ /fuff/, 'suff' ],
			[ /fund(?!rais)/, 'sund' ],
			[ /fumm/, 'summ' ], // summit, summary
			[ /fuit/, 'suit' ],
			[ /fuper/, 'super' ],
			[ /fupp/, 'supp' ],
			[ /fu(re|rv)/, 'su$1' ],
			[ /fw(ay|ear|orn)/, 'sw$1' ],
			[ /fyf/, 'sys' ],
			[ /fym/, 'sym' ],
			[ /grefs/, 'gress' ],
			[ /hift/, 'hist' ],
			[ /(?<=[Hh])(ea|o|oa|ou)rf/, '$1rs' ], // house, hearse, horse
			[ /i[sf]cuff/, 'iscuss' ],
			[ /ifh/, 'ish' ],
			[ /ifm\b/, 'ism' ],
			[ /ifo\b/, 'iso' ],
			[ /ifon/, 'ison' ],
			[ /iftic/, 'istic' ],
			[ /([Ii])ffu/, '$1ssu' ],
			[ /illuf/, 'illus' ],
			[ /(I|i)nft/, '$1nst' ],
			[ /\b(?<=i|I)fl/, 'sl' ], // isle, island
			[ /Jefus/, 'Jesus' ],
			[ /(?<=J|j|I|i)urif/, 'uris' ],
			[ /([Jj])uft/, '$1ust' ],
			[ /([Ll])aft/, '$1ast' ], // last, lastly, etc
			[ /lefia/, 'lesia' ],
			[ /([Ll])egif/, '$1egis' ], // legislation...
			[ /([^ie])efs/, '$1ess' ], // -ess
			[ /(?<=l|L)eff/, 'less' ], // -ess-
			[ /lifle/, 'lisle' ],
			[ /lifh/, 'lish' ],
			[ /lufiv/, 'lusiv' ],
			[ /([MmPp])afs\b/, '$1ass' ],
			[ /([Mm])i(fs\b|ff\B)/, '$1iss' ], // miss, missing
			[ /([Mm])i(f\B)/, '$1is' ], // mistake
			[ /Missifippi/, 'Missisippi' ],
			[ /Missiffippi/, 'Mississippi' ],
			[ /([Mm])oft/, '$1ost' ], // preserve the capital: Moft -> Most
			[ /mongft/, 'mongst' ],
			[ /([Mm])uft/, '$1ust' ], // preserve the capital: Muft -> Must
			[ /nefe/, 'nese' ],
			[ /nefs/, 'ness' ],
			[ /nfate/, 'nsate' ],
			[ /nfel(?=\b|s|led|l[oe]rs?)/, 'nsel' ],
			[ /nfive/, 'nsive' ],
			[ /oaft/, 'oast' ], // coast, etc
			[ /obf/, 'obs' ],
			[ /([Oo])bfe/, '$1bse' ], // observ
			[ /ofed/, 'osed' ],
			[ /offi/, 'ossi' ], // possible
			[ /ofition/, 'osition' ], // position, etc.
			[ /ofity/, 'osity' ],
			[ /oftil/, 'ostil' ], // hostile
			[ /ouf\b/, 'ous' ],
			[ /oufly/, 'ously' ],
			[ /([Pp])aft/, '$1ast' ],
			[ /hraf/, 'hras' ], // phrase
			[ /paff/, 'pass' ], // pass/age, for pafs, see mafs
			[ /([Pp])leaf/, '$1leas' ],
			[ /([Pp])of(e|t)/, '$1os$2' ], // post, pose, compose...
			[ /(?<=P|p)urfu/, 'ursu' ],
			[ /(?<=R|r)ef([pfs]|en|ume|ump)/, 'es$1' ],
			[ /([Rr])eleaf/, '$1eleas' ],
			[ /(?<=R|r)aif(e|i)/, 'ais$1' ], // raising, raised/r
			[ /\b([Aa]r|[Rr])if([ie])/, '$1is$2' ], // a/rising/ed/es
			[ /rofec/, 'rosec' ], // prosecute
			[ /rofef([sf])/, 'rofess' ],
			[ /rofp/, 'rosp' ],
			[ /urpof/, 'urpos' ],
			[ /([Qq])ueft/, '$1uest' ],
			[ /reafo/, 'reaso' ],
			[ /refea/, 'resea' ],
			[ /refi/, 'resi' ],
			[ /([Tt])afte/, '$1aste' ],
			[ /(?<=T|t)eft/, 'est' ],
			[ /terfect/, 'tersect' ], // intersect, but not perfect, etc
			[ /hefe/, 'hese' ], // these
			[ /([Hh])ofe/, '$1ose' ], // those, whose
			[ /tereft/, 'terest' ],
			[ /traft/, 'trast' ],
			[ /ranf/, 'rans' ], // trans-
			[ /ufe/, 'use' ],
			[ /uftom/, 'ustom' ],
			[ /vaft/, 'vast' ],
			[ /(?<=V|v)erf/, 'ers' ], // verse, versus
			[ /([Vv])eff/, '$1ess' ], // preserve the capital: Veffel -> Vessel
			[ /verf([eyo])/, 'vers$1' ], // verse, verso -versy
			[ /vife/, 'vise' ], // advise..
			[ /([Vv])ifi/, '$1isi' ],
			[ /ifdom/, 'isdom' ],
			[ /xift/, 'xist' ]
		];

		process_editor( editor, new PartialWordRegexProcessor( long_s_reps ) );
	};

	/**
	 * Expand template shortcuts and tidy some markup in the page body,
	 * header and footer fields.
	 *
	 * @param {Object} editor the templatescript editor object; the header
	 *  and footer are reached through editor.forField()
	 */
	const template_cleanup = function ( editor ) {
		const header = editor.forField( '#wpHeaderTextbox' );
		const footer = editor.forField( '#wpFooterTextbox' );

		// {{c}} to {{center}}
		editor.replace( /{{c\|/g, '{{center|' );
		header.replace( /{{c\|/g, '{{center|' );
		footer.replace( /{{c\|/g, '{{center|' );

		// {{rh}} to {{RunningHeader}}
		header.replace( /\n?{{rh\|/gi, '{{RunningHeader|' );

		// more cleanup
		editor
		// {{hws}} & {{hwe}} expanded
			.replace( /{{hws\|/g, '{{hyphenated word start|' )
			.replace( /{{hwe\|/g, '{{hyphenated word end|' )

		// {{di}} expanded
			.replace( /{{di\|/g, '{{dropinitial|' )

		// {{hi}} expanded
			.replace( /{{hi\|/g, '{{hanging indent|' )

		// {{sm}} expanded
			.replace( /{{sm\|/g, '{{smaller|' )

		// expand diacritical templates
		// (the braces are split so that the literal subst marker never
		// appears in this script's own source)
		// eslint-disable-next-line no-useless-concat
			.replace( /{{(ae|oe|\w[:`'~^-])}}/g, '{' + '{subst:$1}}' )

		// convert {{—}} to —
			.replace( /{{—}}/g, '—' );

		// M<sup>c</sup> to {{Mc}}
		editor.replace( /M<sup>c<\/sup>/g, '{{Mc}}' );
		header.replace( /M<sup>c<\/sup>/g, '{{Mc}}' );

		// section tag fix: quote a bare section name. The name is matched
		// lazily and trailing whitespace is dropped, so "begin=s1 />"
		// becomes begin="s1" rather than begin="s1 ".
		editor.replace( /<section (begin|end)=(\w[^/>]*?)\s*\/>/g,
			'<section $1="$2"/>' );

		// refs don't have space before them. \b keeps this from touching
		// <references/>, and \s+ removes the whole run of whitespace.
		editor.replace( /\s+<ref\b/g, '<ref' );
	};

	/**
	 * Call every function in Cleanup.cleanupFunctions, in list order.
	 *
	 * Each function is called with the body editor and the header and
	 * footer field editors.
	 *
	 * @param {Object} editor the templatescript editor object
	 */
	const do_extra_functions = function ( editor ) {
		const headerField = editor.forField( '#wpHeaderTextbox' );
		const footerField = editor.forField( '#wpFooterTextbox' );

		Cleanup.cleanupFunctions.forEach( ( extraFn ) => {
			extraFn( editor, headerField, footerField );
		} );
	};

	/**
	 * Turn curly quotes into straight ones.
	 *
	 * For each quote type, a space just inside an opening or closing quote
	 * is removed first; then every remaining curly quote is straightened.
	 *
	 * @param {Object} editor the templatescript editor object
	 */
	const do_replaceSmartQuotes = function ( editor ) {
		// applied strictly in this order
		const quoteRules = [
			[ /“ /g, '"' ],
			[ / ”/g, '"' ],
			[ /[“”]/g, '"' ],
			[ /‘ /g, "'" ],
			[ / ’/g, "'" ],
			[ /[‘’]/g, "'" ]
		];

		quoteRules.reduce( function ( chained, rule ) {
			return chained.replace( rule[ 0 ], rule[ 1 ] );
		}, editor );
	};

	/**
	 * Join hard-wrapped lines into paragraphs.
	 *
	 * Does nothing if the text contains a <poem> tag. Otherwise:
	 * 1. if Cleanup.shortLineThreshold is positive, a short line that ends
	 *    in sentence-final punctuation and is followed by a line that
	 *    starts like a new sentence gets a paragraph break after it;
	 * 2. single line breaks are replaced by spaces, unless a tag, template
	 *    or table/list/heading syntax sits on either side;
	 * 3. runs of spaces are collapsed, and '" "' is split into paragraphs.
	 *
	 * @param {Object} editor the templatescript editor object
	 */
	const collapse_line_breaks = function ( editor ) {
		// stuff to do only if the page doesn't contain a <poem> tag:
		if ( editor.get().includes( '<poem>' ) ) {
			return;
		}

		// first, a hack! [T230415]
		const short_line_thresh = Cleanup.shortLineThreshold;

		if ( short_line_thresh > 0 ) {
			const ends_sentence = /[.!?'"”’—]\s*$/;
			// anchored: the NEXT line has to START with a quote, capital or
			// digit - one appearing somewhere later in the line is no
			// evidence of a new paragraph
			const starts_sentence = /^\s*['"“‘A-Z0-9]/;

			const lines = editor.get().split( /\r?\n/ );

			for ( let i = 0; i < lines.length - 1; i++ ) {
				if ( lines[ i ].length < short_line_thresh &&
						ends_sentence.test( lines[ i ] ) &&
						starts_sentence.test( lines[ i + 1 ] ) ) {
					lines[ i ] += '\n';
				}
			}

			editor.set( lines.join( '\n' ) );
		}

		editor

		// remove single line breaks; preserve multiple.
		// not if there's a tag, template, table syntax either side of line break
			.replace( /([^>}\n])\n(?!(?: *\||[{}<]|\n|=|\*|#))/g, '$1 ' )

		// collapse sequences of spaces into a single space
			.replace( /  +/g, ' ' )

		// two quotes are probably two lines
			.replace( /" "/g, '"\n\n"' );
	};

	/**
	 * Merge a paragraph into the one before it when it starts with a
	 * lowercase letter, since such a break is probably bogus.
	 *
	 * Text containing a <poem> tag is left untouched.
	 *
	 * @param {Object} editor the templatescript editor object
	 */
	const collapseSuspiciousParagraphs = function ( editor ) {
		const hasPoem = editor.get().indexOf( '<poem>' ) !== -1;

		if ( hasPoem ) {
			return;
		}

		// a run of two or more newlines directly before a-z becomes one space
		editor.replace( /\n\n+(?=[a-z])/g, ' ' );
	};

	/**
	 * Wrap the given abbreviations in the small-caps template, normalising
	 * them to "X.Y." form (e.g. "A. D" -> "{{asc|A.D.}}").
	 *
	 * An abbreviation only matches when it is preceded and followed by
	 * whitespace; each of its characters may be followed by "." or "," and
	 * an optional space.
	 *
	 * @param {Object} editor the templatescript editor object
	 * @param {string[]} abbr_list abbreviations, as bare letters (e.g. 'AD')
	 */
	const do_small_abbrs = function ( editor, abbr_list ) {

		// NOTE(review): Cleanup.smallAbbrTemplate exists as a config option
		// but this value has always been hard-coded - confirm which is intended.
		const smallAbbrTemplate = 'asc';

		// Characters are interpolated into a RegExp, so regex
		// metacharacters have to be matched literally.
		const escape_for_regex = function ( ch ) {
			return String( ch ).replace( /[.*+?^${}()|[\]\\]/g, '\\$&' );
		};

		for ( const abbr of abbr_list ) {

			let re_str = '';
			let good = '';

			for ( const ch of abbr ) {
				re_str += escape_for_regex( ch ) + '[.,]? ?';
				good += ch + '.';
			}

			re_str = '(\\s)' + re_str + '(?=\\s)'; // new word, but not in template
			const re = new RegExp( re_str, 'g' );

			good = `$1{{${smallAbbrTemplate}|${good}}}`;

			editor.replace( re, good );
		}
	};

	/**
	 * Click the input inside span.quality3.
	 * NOTE(review): presumably the "Proofread" page-status control - confirm.
	 */
	const markProofread = function () {
		// eslint-disable-next-line no-jquery/no-global-selector
		const $proofreadInput = $( 'span.quality3 input' );

		$proofreadInput.trigger( 'click' );
	};

	/**
	 * Set the value of the #wpSummary field, replacing whatever was there.
	 *
	 * @param {string} summary_text the new summary
	 */
	const set_summary = function ( summary_text ) {
		// eslint-disable-next-line no-jquery/no-global-selector
		const $summaryField = $( '#wpSummary' );

		$summaryField.val( summary_text );
	};

	/**
	 * Mark the page as proofread and, when Cleanup.editSummary is set,
	 * overwrite the edit summary with it.
	 */
	const do_markProofread = function () {
		// if doing cleanup, must be proofreading
		markProofread();

		const summary = Cleanup.editSummary;

		if ( !summary ) {
			// nothing configured: leave the existing summary alone
			return;
		}

		set_summary( summary ); // replaces any old summary
	};

	/**
	 * The main cleanup function: runs every cleanup stage over the editor
	 * text in a fixed order. Stages guarded by a Cleanup.* option are
	 * skipped when that option is falsy (or, for lists, empty).
	 *
	 * @param {Object} editor the templatescript editor object
	 */
	function do_cleanup( editor ) {

		// Any cleanups that need the context of the old line breaks
		do_pre_collapse_cleanup( editor );

		// Do this before line collapses
		if ( Cleanup.remove_running_header ) {
			process_editor( editor,
				new RunningHeaderProcessor( Cleanup.runningHeaderPatterns ) );
		}

		// Collapse line breaks before any word-level fixes, so that words
		// can be corrected across what used to be line breaks
		collapse_line_breaks( editor );

		if ( Cleanup.collapseSuspiciousParagraphs ) {
			collapseSuspiciousParagraphs( editor );
		}

		// Generic cleanup
		do_generic_cleanup( editor );

		// OCR and scanno fixing

		// Do the user's simple replacements first, as it's easier to write
		// these if you don't have to guess what intermediate state the page
		// is in
		if ( Cleanup.additionalOcrReplacements.length > 0 ) {
			process_editor( editor,
				new PartialWordRegexProcessor( Cleanup.additionalOcrReplacements ) );
		}

		do_ocr_fixes( editor );
		do_multiword_fixes( editor );

		if ( Cleanup.italiciseForeign ) {
			do_foreign_italics( editor );
		}

		// User-configured words/phrases to italicise
		if ( Cleanup.italicWords.length > 0 ) {
			process_editor( editor, new ItaliciseProcessor( Cleanup.italicWords ) );
		}

		do_whole_words_reps( editor );

		if ( Cleanup.doLongSReplacements ) {
			doLongSReplacements( editor );
		}

		if ( Cleanup.doTemplateCleanup ) {
			template_cleanup( editor );
		}

		if ( Cleanup.replaceSmartQuotes ) {
			do_replaceSmartQuotes( editor );
		}

		do_small_abbrs( editor, Cleanup.smallAbbreviations );

		// Any extra (user-supplied) functions
		do_extra_functions( editor );

		// Finally, set the page status and edit summary
		if ( Cleanup.markProofread ) {
			do_markProofread();
		}
	}

	/**
	 * Entry point used for the TemplateScript action: run do_cleanup() and
	 * log, rather than propagate, anything it throws.
	 *
	 * @param {Object} editor the templatescript editor object
	 */
	function do_cleanup_wrapper( editor ) {
		log( DEBUG, 'Cleaning up...' );

		try {
			do_cleanup( editor );
		} catch ( cleanupError ) {
			// a failing rule is reported but never escapes to the caller
			log( ERROR, cleanupError );
		}

		log( DEBUG, 'Cleanup done.' );
	}

	/**
	 * Find the first index at which two strings (or arrays) differ.
	 *
	 * @param {string|Array} a
	 * @param {string|Array} b
	 * @return {number} index of the first differing element; the length of
	 *  the shorter input if one is a strict prefix of the other; -1 if the
	 *  two are identical
	 */
	function find_first_diff_pos( a, b ) {
		const limit = Math.min( a.length, b.length );
		let pos = 0;

		// walk the common prefix
		while ( pos < limit && a[ pos ] === b[ pos ] ) {
			pos++;
		}

		if ( pos < limit ) {
			return pos;
		}

		return a.length === b.length ? -1 : limit;
	}

	/**
	 * Transpose a list of arrays: element i of the result holds element i
	 * of every input array.
	 *
	 * The result is as long as the LONGEST input; shorter inputs contribute
	 * undefined for the missing positions, so trailing elements are never
	 * silently dropped.
	 *
	 * @param {Array[]} arrays the arrays to zip together
	 * @return {Array[]} one tuple per index; [] when no arrays are given
	 */
	function zip( arrays ) {
		const longest = arrays.reduce( function ( max, array ) {
			return Math.max( max, array.length );
		}, 0 );

		return Array.from( { length: longest }, function ( _, i ) {
			return arrays.map( function ( array ) {
				return array[ i ];
			} );
		} );
	}

	// Text of the page before the last test run, for "Restore pre-cleanup"
	let test_test_to_restore = null;

	/**
	 * Run the cleanup on the current text and compare the result with the
	 * content of the "<page>/expected" subpage.
	 *
	 * The editor gets a green outline on a match and a red one on a
	 * mismatch; mismatching lines are logged. If the expected page cannot
	 * be loaded, that is logged and the outline is left alone.
	 *
	 * @param {Object} editor the templatescript editor object
	 */
	function do_cleanup_test( editor ) {

		const text = editor.get();
		test_test_to_restore = text;

		do_cleanup( editor );

		const cleaned = editor.get();

		// Load the "expected" subpage and see if the text matches
		const expectedTitle = mw.config.get( 'wgPageName' ) + '/expected';

		const compare_with_expected = function ( data ) {
			const page = data && data.query && data.query.pages &&
				data.query.pages[ 0 ];

			// a missing or invalid page comes back without any revisions
			if ( !page || !page.revisions || page.revisions.length === 0 ) {
				log( ERROR, `Could not load expected text from '${expectedTitle}'` );
				return;
			}

			const expected = page.revisions[ 0 ].slots.main.content;

			let colour = 'green';

			if ( expected !== cleaned ) {
				log( ERROR, "Expected text doesn't match!" );

				const pairs = zip( [ expected.split( '\n' ), cleaned.split( '\n' ) ] );

				for ( const pr of pairs ) {
					if ( pr[ 0 ] !== pr[ 1 ] ) {
						log( ERROR, 'Line mismatch' );
						log( ERROR, `Expected: '${pr[ 0 ]}', Got: '${pr[ 1 ]}'` );

						if ( pr[ 0 ] && pr[ 1 ] ) {
							const indx = find_first_diff_pos( pr[ 0 ], pr[ 1 ] );

							log( ERROR, pr[ 0 ].slice( indx ) );
							log( ERROR, pr[ 1 ].slice( indx ) );
						}
					}
				}

				colour = 'red';
			}

			// eslint-disable-next-line no-jquery/no-global-selector
			$( '.wikiEditor-ui' ).css( 'outline', '2px solid ' + colour );
		};

		mw.loader.using( 'mediawiki.api' ).done( function () {
			const api = new mw.Api();
			api.get( {
				action: 'query',
				titles: expectedTitle,
				prop: 'revisions',
				rvprop: 'content',
				rvslots: 'main',
				formatversion: 2,
				rvlimit: 1
			} ).done( compare_with_expected ).fail( function ( code ) {
				log( ERROR, `Failed to fetch '${expectedTitle}': ${code}` );
			} );
		} ).fail( function () {
			log( ERROR, 'Failed to load mediawiki.api' );
		} ); // end using
	}

	/**
	 * Put back the text saved by the last cleanup test run (if there was
	 * one) and remove the result outline from the editor.
	 *
	 * @param {Object} editor the templatescript editor object
	 */
	function do_cleanup_test_restore( editor ) {

		// Check for a saved string rather than truthiness: an empty page
		// is a valid snapshot and must be restored too.
		if ( typeof test_test_to_restore === 'string' ) {
			editor.set( test_test_to_restore );
		}

		// eslint-disable-next-line no-jquery/no-global-selector
		$( '.wikiEditor-ui' ).css( 'outline', '' );
	}

	/**
	 * Load the TemplateScript library and register the cleanup action
	 * (plus the two test actions when Cleanup.enableTesting is set).
	 *
	 * If the library cannot be loaded, the failure is logged and nothing
	 * is registered.
	 */
	function add_templatescript() {

		$.ajax( '//tools-static.wmflabs.org/meta/scripts/pathoschild.templatescript.js', {
			dataType: 'script',
			cache: true
		} ).then( function () {

			const cleanup_entry = {
				name: Cleanup.actionTitle,
				position: 'cursor',
				script: do_cleanup_wrapper,
				enabled: true
			};

			if ( Cleanup.cleanupAccesskey ) {
				cleanup_entry.accessKey = Cleanup.cleanupAccesskey;
			}

			const entries = [
				cleanup_entry
			];

			if ( Cleanup.enableTesting ) {
				entries.push( {
					name: 'Test cleanup',
					script: do_cleanup_test
				} );

				entries.push( {
					name: 'Restore pre-cleanup',
					script: do_cleanup_test_restore
				} );
			}

			// eslint-disable-next-line no-undef
			pathoschild.TemplateScript.add(
				entries, {
					category: Cleanup.portletCategory,
					forNamespaces: Cleanup.activeNamespaces
				} // common fields
			);
		}, function ( jqXHR, textStatus, errorThrown ) {
			// without this the tool would just silently never appear
			log( ERROR, `Failed to load TemplateScript: ${textStatus} ${errorThrown}` );
		} );
	}

	/**
	 * Fire the "<signature>.config" hook with the Cleanup settings object,
	 * then install the tool unless Cleanup.enable has been switched off.
	 */
	function really_run() {
		log( DEBUG, 'Really_run' );

		// hook handlers receive the Cleanup object itself
		const configHook = mw.hook( signature + '.config' );
		configHook.fire( Cleanup );

		if ( !Cleanup.enable ) {
			log( DEBUG, 'Cleanup disabled' );
			return;
		}

		add_templatescript();
	}

	/**
	 * Start the tool at most once: the first call sets Cleanup.started and
	 * runs really_run(); later calls do nothing.
	 */
	function run() {
		if ( !Cleanup.started ) {
			Cleanup.started = true;
			really_run();
		}
	}

	// Start once the 'user' module load and DOM-ready have both settled.
	// .always() means run() is called whether or not that succeeds.
	// NOTE(review): presumably waits for 'user' so that the user's own
	// script can register its config hook handler first - confirm.
	$.when( mw.loader.using( 'user' ), $.ready ).always( run );

// eslint-disable-next-line no-undef
}( jQuery, mediaWiki ) );
// Source: User:Inductiveload/cleanup.js (retrieved via Wikimore)