/*
* OCR cleanup script
*
* Mostly a bunch of regexes and prayer
*/
/* eslint-disable camelcase, no-restricted-syntax */
( function ( $, mw ) {
'use strict';
// Script version string and identifying signature.
// NOTE(review): where these two are consumed is not visible in this section.
const version = '0.1';
const signature = 'wsCleanup';
// Log levels, in increasing severity. log() compares a message's level
// numerically against Cleanup.logLevel, and routes anything at ERROR or
// above to console.error.
const DEBUG = 0;
const INFO = 1;
const ERROR = 2;
/*
 * Script-wide configuration. Only the default values are set here; most
 * options are consumed by code outside this section, so the notes below are
 * limited to what the visible code demonstrates.
 */
const Cleanup = {
// minimum level a message must have for log() to print it
logLevel: ERROR,
enable: true,
testFunctions: [],
// true when the current page title (wgTitle) ends with 'cleanup-test'
enableTesting: mw.config.get( 'wgTitle' ).endsWith( 'cleanup-test' ),
portletCategory: 'page',
activeNamespaces: [ 'page' ],
actionTitle: 'WsCleanup',
additionalOcrReplacements: [],
// entries are [ regex, replacement? ]: PartialWordRegexProcessor skips any
// rule with the same regex source (and, if a replacement is given here,
// the same replacement)
disabledReplacements: [],
cleanupFunctions: [],
italicWords: [],
doLongSReplacements: false,
doTemplateCleanup: true,
remove_running_header: true,
replaceSmartQuotes: true,
collapseSuspiciousParagraphs: true,
shortLineThreshold: 45,
// languages the page may contain; checked by pageMayHaveLangs() against a
// rule's notLangs / onlyLangs options
possibleLanguages: [ 'en' ], // 'fr', 'es', 'de', 'zh-pinyin' ],
italiciseForeign: true,
smallAbbreviations: [],
// line patterns for running headers
// NOTE(review): presumably handed to RunningHeaderProcessor - confirm at the call site
runningHeaderPatterns: [
/^([ivxlcIVLXC.,]+|[iI0-9.,]+)\s+([A-Z[\]\s^*\-–—.,]*)\s*$/,
/^([A-Z\s[\]^*\-–—.,]*)\s+([ivxlcIVLXC.,]+|[iI0-9.,]+)\s*$/,
/^\s*(\d+|[A-Z[\] ]+)\s*$/
],
smallAbbrTemplate: 'smaller',
editSummary: '/* Proofread */',
markProofread: true,
cleanupAccesskey: 'c'
};
/**
 * Print a message to the console, prefixed with 'Cleanup: '.
 *
 * Messages below Cleanup.logLevel are dropped. Messages at ERROR level or
 * above go to console.error, everything else to console.log.
 *
 * @param {number} level one of DEBUG, INFO, ERROR
 * @param {*} s the message to print
 */
function log( level, s ) {
	if ( !( level >= Cleanup.logLevel ) ) {
		return;
	}
	/* eslint-disable no-console */
	const emit = level >= ERROR ? console.error : console.log;
	/* eslint-enable no-console */
	emit( 'Cleanup: ', s );
}
/**
 * Base class for text processors.
 *
 * Subclasses override both methods; the versions here only throw, so a
 * subclass that forgets one fails loudly when it is used.
 */
class CleanupProcessor {
	/**
	 * @return {string} a human-readable name for the processor
	 * @throws {Error} always, in the base class
	 */
	name() {
		throw new Error( 'Processors must implement name()' );
	}

	/**
	 * Transform a block of text (the base class takes no named parameter).
	 *
	 * @return {string} the transformed text
	 * @throws {Error} always, in the base class
	 */
	process() {
		throw new Error( 'Processors must implement process()' );
	}
}
/**
 * Run one processor over the editor's text and store the result back.
 *
 * The text is read first, then the processor's name is logged at INFO level,
 * then the text is processed and written back.
 *
 * @param {Object} editor object exposing get() and set( text )
 * @param {CleanupProcessor} processor object exposing name() and process( text )
 */
function process_editor( editor, processor ) {
	const original = editor.get();
	log( INFO, `Processing editor with ${processor.name()}` );
	editor.set( processor.process( original ) );
}
/**
 * Applies a list of whole-word replacements.
 *
 * Each entry is [ bad, good ]: `bad` is regex source text, which is wrapped
 * in \b...\b and applied globally; `good` is the replacement.
 */
class WholeWordRegexProcessor extends CleanupProcessor {
	/**
	 * @param {Array[]} reps list of [ bad, good ] pairs
	 */
	constructor( reps ) {
		super();
		this.reps = reps;
	}

	name() {
		return 'Generic whole word regexes';
	}

	/**
	 * @param {string} text
	 * @return {string} text with every pair applied, in list order
	 */
	process( text ) {
		log( DEBUG, `Making ${this.reps.length} replacements` );
		let result = text;
		for ( const pair of this.reps ) {
			const wholeWord = new RegExp( '\\b' + pair[ 0 ] + '\\b', 'g' );
			result = result.replace( wholeWord, pair[ 1 ] );
		}
		return result;
	}
}
/**
 * Check whether any of the page's possible languages is in the given list.
 *
 * @param {string[]} deniedLangs language codes to look for
 * @return {boolean} true if at least one entry of Cleanup.possibleLanguages
 *  appears in deniedLangs
 */
function pageMayHaveLangs( deniedLangs ) {
	return Cleanup.possibleLanguages.some(
		( lang ) => deniedLangs.includes( lang )
	);
}
/**
 * Applies a list of regex replacements anywhere in the text.
 *
 * Each rule is [ regex, replacement, options? ]. The regex is always applied
 * globally; its other flags are kept. A rule is skipped when:
 *  - it is listed in Cleanup.disabledReplacements (same regex source and
 *    either no replacement given there, or the same replacement), or
 *  - options.notLangs contains a language the page may have, or
 *  - options.onlyLangs contains none of the languages the page may have.
 */
class PartialWordRegexProcessor extends CleanupProcessor {
	/**
	 * @param {Array[]} reps list of [ regex, replacement, options? ] rules
	 */
	constructor( reps ) {
		super();
		this.reps = reps;
	}

	name() {
		return 'Generic partial word regexes';
	}

	/**
	 * @param {string} text
	 * @return {string} text with every active rule applied, in list order
	 * @throws {Error} rethrows whatever a replacement throws, after logging
	 *  which rule failed
	 */
	process( text ) {
		log( DEBUG, `Making ${this.reps.length} replacements` );

		return this.reps.reduce( ( current, rule, index ) => {
			const opts = rule[ 2 ];

			// every disabled entry is inspected (no short-circuit)
			const disabledBy = Cleanup.disabledReplacements.filter(
				// an entry without a replacement disables every rule for that
				// regex; otherwise the replacement has to match as well
				( dv ) => dv[ 0 ].source === rule[ 0 ].source &&
					( !dv[ 1 ] || dv[ 1 ] === rule[ 1 ] )
			);
			if ( disabledBy.length > 0 ) {
				log( DEBUG, `Skipped disabled replacement: ${rule[ 0 ].source} -> ${rule[ 1 ]}` );
				return current;
			}

			if ( opts && opts.notLangs && pageMayHaveLangs( opts.notLangs ) ) {
				log( DEBUG, `Skipped replacement with denied language: ${rule[ 0 ].source} (due to ${opts.notLangs})` );
				return current;
			}

			if ( opts && opts.onlyLangs && !pageMayHaveLangs( opts.onlyLangs ) ) {
				log( DEBUG, `Skipped replacement as no allowed language: ${rule[ 0 ].source} (due to ${opts.onlyLangs})` );
				return current;
			}

			try {
				// force the global flag, keeping any others the rule has
				const flags = 'g' + rule[ 0 ].flags.replace( 'g', '' );
				const globalRegex = new RegExp( rule[ 0 ].source, flags );
				return current.replace( globalRegex, rule[ 1 ] );
			} catch ( error ) {
				log( ERROR, `Error in ${index}th replacement: ${rule}` );
				throw error;
			}
		}, text );
	}
}
/**
 * Split off things that cannot be the end of a word: when a pattern is
 * found directly after one or more word characters, a space is inserted
 * before it (i.e. a space has gone missing _before_ the match).
 */
class BannedSuffixProcessor extends CleanupProcessor {
	/**
	 * @param {RegExp[]} suffix_list patterns that must start a new word
	 */
	constructor( suffix_list ) {
		super();
		this.suffix_list = suffix_list;
	}

	name() {
		return 'Banned suffixes';
	}

	/**
	 * @param {string} text
	 * @return {string} text with a space inserted before each banned suffix
	 */
	process( text ) {
		return this.suffix_list.reduce( ( current, suffix ) => {
			// always global, plus whatever other flags the pattern carries
			const flags = 'g' + suffix.flags.replace( 'g', '' );
			const joined = new RegExp( '(\\w+)(' + suffix.source + ')', flags );
			return current.replace( joined, '$1 $2' );
		}, text );
	}
}
/**
 * Split off things that cannot be the start of a longer word: when a pattern
 * is directly followed by one or more word characters, a space is inserted
 * after it (i.e. a space has gone missing _after_ the match).
 */
class BannedPrefixProcessor extends CleanupProcessor {
	/**
	 * @param {RegExp[]} prefix_list patterns that must end their word
	 */
	constructor( prefix_list ) {
		super();
		this.prefix_list = prefix_list;
	}

	name() {
		return 'Banned prefixes';
	}

	/**
	 * @param {string} text
	 * @return {string} text with a space inserted after each banned prefix
	 */
	process( text ) {
		let result = text;
		this.prefix_list.forEach( ( prefix ) => {
			// always global, plus whatever other flags the pattern carries
			const flags = 'g' + prefix.flags.replace( 'g', '' );
			const joined = new RegExp( '(' + prefix.source + ')(\\w+)', flags );
			result = result.replace( joined, '$1 $2' );
		} );
		return result;
	}
}
/**
 * Re-attach word endings that cannot stand alone: when a pattern is preceded
 * by a single whitespace character or hyphen and ends at a word boundary,
 * that separator is dropped (i.e. a space has been inserted _before_ the
 * match).
 */
class OrphanSuffixProcessor extends CleanupProcessor {
	/**
	 * @param {RegExp[]} reps suffix patterns
	 */
	constructor( reps ) {
		super();
		this.reps = reps;
	}

	name() {
		return 'Orphan suffixes';
	}

	/**
	 * @param {string} text
	 * @return {string} text with orphaned suffixes joined to the previous word
	 */
	process( text ) {
		return this.reps.reduce( ( current, suffix ) => {
			// always global, plus whatever other flags the pattern carries
			const flags = 'g' + suffix.flags.replace( 'g', '' );
			const orphan = new RegExp( '[\\s\\-](' + suffix.source + '\\b)', flags );
			return current.replace( orphan, '$1' );
		}, text );
	}
}
/**
 * Make replacements for words that cannot stand alone, but would most likely
 * be prefixes of following words (i.e. a space has been inserted _after_ the
 * match): when a pattern starts at a word boundary and is followed by a
 * single whitespace character or hyphen, that separator is dropped.
 *
 * Matching is always global and case-insensitive.
 */
class OrphanPrefixProcessor extends CleanupProcessor {
	/**
	 * @param {RegExp[]} reps prefix patterns
	 */
	constructor( reps ) {
		super();
		this.reps = reps;
	}

	/**
	 * @param {string} text
	 * @return {string} text with orphaned prefixes joined to the following word
	 */
	process( text ) {
		for ( const v of this.reps ) {
			// Strip *every* g and i before prepending 'gi'. A non-global
			// replace only removes the first one, so a /.../gi pattern used
			// to produce the invalid flag string 'gii' and throw.
			const newflags = 'gi' + v.flags.replace( /[gi]/g, '' );
			text = text.replace( new RegExp( '(\\b' + v.source + ')[\\s\\-]', newflags ), '$1' );
		}
		return text;
	}

	name() {
		return 'Orphan prefixes';
	}
}
/**
 * Wrap selected matches in wiki italics (''...''), unless the match is
 * already directly preceded by ''.
 */
class ItaliciseProcessor extends CleanupProcessor {
	/**
	 * @param {RegExp[]} reps patterns to italicise
	 */
	constructor( reps ) {
		super();
		this.reps = reps;
	}

	name() {
		return 'Italics';
	}

	/**
	 * @param {string} text
	 * @return {string} text with each match wrapped in ''
	 */
	process( text ) {
		return this.reps.reduce( ( current, pattern ) => {
			// NOTE(review): only the FIRST g or i flag is removed here, so
			// /x/i becomes case-sensitive while /x/gi stays
			// case-insensitive - confirm which is intended.
			const flags = 'g' + pattern.flags.replace( /[gi]/, '' );
			const unitalicised = new RegExp( '(?<!\'\')(' + pattern.source + ')', flags );
			return current.replace( unitalicised, "''$1''" );
		}, text );
	}
}
/*
* These functions need the original line breaks
*/
/**
 * Line-break-sensitive cleanup: strips trailing spaces, normalises hyphen
 * look-alikes, and re-joins words hyphenated across a line break.
 *
 * @param {Object} editor editor handed on to process_editor()
 */
const do_pre_collapse_cleanup = function ( editor ) {
	const lineBreakRules = new PartialWordRegexProcessor( [
		// drop spaces that sit right before a newline
		[ / +\n/, '\n' ],

		// these symbols stand in for hyphens
		[ /[⌐¬]/, '-' ],

		// Words hyphenated across a line break are joined up again
		// ("|-" table syntax is left alone).
		// Before a capital the hyphen stays, e.g. non-European...
		[ /([^|])-\n(?=[ÁÀA-ZÉÈÖ])/, '$1-' ],
		// ...before any other word character it goes
		[ /([^|])-\n(?=[\w])/, '$1' ]
	] );
	process_editor( editor, lineBreakRules );
};
/**
 * Removes a running header from the top of the text.
 *
 * Starting from the first line, every blank line and every line matching one
 * of the header patterns is dropped, up to (not including) the first line
 * that is neither. Line endings in the result are always '\n'.
 */
class RunningHeaderProcessor extends CleanupProcessor {
	/**
	 * @param {RegExp[]} rh_patterns patterns tested against each leading line
	 */
	constructor( rh_patterns ) {
		super();
		this.rh_patterns = rh_patterns;
	}

	name() {
		return 'Trim running header patterns';
	}

	/**
	 * @param {string} text
	 * @return {string} the text from the first non-header line onwards, or
	 *  '' if every line is blank or a header
	 */
	process( text ) {
		const lines = text.split( /\r?\n/ );

		const isBlank = ( line ) => line.trim().length === 0;
		const isHeader = ( line ) => this.rh_patterns.some(
			( pattern ) => pattern.test( line )
		);

		const firstBodyLine = lines.findIndex(
			( line ) => !isBlank( line ) && !isHeader( line )
		);
		const keepFrom = firstBodyLine === -1 ? lines.length : firstBodyLine;

		return lines.slice( keepFrom ).join( '\n' );
	}
}
/**
 * General-purpose text cleanup: scanner watermarks, stray symbols, spacing
 * around punctuation, dashes, a few common abbreviations and ligatures.
 *
 * The rules are applied in list order, each one globally, by a
 * PartialWordRegexProcessor run through process_editor().
 *
 * @param {Object} editor editor handed on to process_editor()
 */
const do_generic_cleanup = function ( editor ) {
	// various cleanup
	const reps = [
		// Digitized by Google (kill)
		[ /\s?D[ijl]g[ijl]t[ijl][sz][eco]d\s+by[^\n]*\s+([6G][Oo0Q]{2}g[lIf][eco])?/, '' ],
		[ /\bG[oO0]{2}gle\b/, '' ],
		// Remove highly suspicious chars
		[ /[■•]/, '' ],
		// remove trailing whitespace preceding a hard line break
		[ / +<br *\/?>/, '<br />' ],
		// remove trailing whitespace at the end of page text
		[ /\s+$/, '' ],
		// remove trailing spaces at the end of refs
		[ / +<\/ref>/, '</ref>' ],
		// remove trailing spaces at the end of template calls
		[ / +}}/, '}}' ],
		// lines containing only punctuation are likely junk
		[ /^[.,^]$/m, '' ],
		// convert double-hyphen to mdash (avoiding breaking HTML comment syntax)
		[ /([^!])--([^>])/, '$1—$2' ],
		// Remove spaces around hyphens between words
		// Eg. pack -house -> pack-house
		[ /(\w) ?- ?(\w)/, '$1-$2' ],
		// remove unwanted spaces before punctuation marks
		[ / ([);:?!,.])/, '$1' ],
		// ensure spaces after punctuation marks
		[ /([);:?!,.])([^ 0-9\n}|"'’”])/, '$1 $2' ],
		// ...but double punctuation doesn't get any spaces
		[ /([);:?!,.]) +([\n);:?!,.\]]|$)/, '$1$2' ],
		// Double full-stop is probably just (3 or 4 is OK - ellipsis)
		[ /(\w)\.\. (?=\w)/, '$1. ' ],
		// no spaces for inter-numeric punctuation
		[ /([0-9][,]) +([0-9]{3}(?![0-9]))/, '$1$2' ],
		// quotes at start of line can't be a close
		[ /^(['"]) (?=[A-Za-z])/m, '$1' ],
		// quotes at end of line can't be an open
		[ / (['"])$/m, '$1' ],
		// no space in "'s"
		[ / ?' ?s([\n ])/, '\'s$1' ],
		[ /\( +/, '(' ],
		[ / +\)/, ')' ],
		[ / *— */, '—' ],
		// Date ranges
		[ /([0-9]{3,4})-([0-9]{2,4})/, '$1–$2' ],
		// figures
		[ / ?, ?ooo/, ',000' ],
		// q.v. to q. v.
		[ /q\.v\./, 'q. v.' ],
		// i.e.
		[ /\bi\.? ?e\.(?!')/, "''i.e.''" ],
		// & c. to &c.
		[ / ?& ?[coe][.,]([,]?)/, ' &c.$1' ],
		// this is an old pound notation
		// with a slash after a space
		[ /([0-9]) ?[/]\.(?=\s)/, "$1''l.''" ],
		// No spaces between num and st/nd/rd
		[ /([0-9]) (st|nd|rd)\b/, '$1$2' ],
		[ /ty(one|two|three|four|five|six|seven|eight|nine|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth)/, 'ty-$1' ],
		// fi ligature (U+FB01) to plain "fi"; written as an escape so the
		// single-character ligature cannot be silently normalised into the
		// two letters, which would turn this rule into a no-op
		[ /\uFB01/, 'fi' ],
		[ /ſ/, 'f' ],
		[ /_/, ' ' ]
	];
	process_editor( editor, new PartialWordRegexProcessor( reps ) );
};
/**
 * Fix common OCR misreads.
 *
 * The table below is a list of [ regex, replacement, options? ] rules,
 * grouped by "misread -> intended" comments. They are applied in list order,
 * each one globally, by a PartialWordRegexProcessor run through
 * process_editor(); later rules therefore see the output of earlier ones.
 *
 * Conventions the rules rely on:
 *  - a word-start condition on a lookbehind goes *inside* it, as in
 *    (?<=\b[Tt]); a \b placed between two letters can never match
 *  - $n in a replacement only means something if the regex has that group
 *  - anything a capture group consumes has to be put back by the replacement
 *
 * @param {Object} editor editor handed on to process_editor()
 */
const do_ocr_fixes = function ( editor ) {
	const reps = [
		// some apostrophes probably bogus at word start
		[ /\b([vw])'([a-z])/, '$1$2' ],
		// some mis-read full-stops
		[ /\b(?<=Mr|Mrs|Mssrs|Ms)'/, '.' ],
		// ^ -> '' : delete spurious carets
		[ /(?<=w)\^/, '' ],
		// ! -> l
		[ /ua!(?=\s)/, 'ual' ],
		// / -> f
		[ /\/ellow/, 'fellow' ],
		// / -> t
		[ /(\s)\/he\b/, '$1the' ],
		// £ -> f
		[ /£f\b/, 'ff' ],
		// « -> s
		[ /(?<=\w)«(?=\s)/, 's' ],
		// $ -> s
		[ /(?<=[a-z])\$/, 's' ],
		// }' -> y
		[ /r}'/, 'ry' ],
		// ' -> y
		[ /(?<=\b[Vv]er)'/, 'ery' ],
		[ />(?=['"])/, '?' ],
		// } -> ?
		[ /(?<=[a-z]) }/, '?' ],
		[ /\('(?=yc)/, 'C' ],
		// 'I' -> T
		[ /(?<=\W)'[IJ]'(?=\w)/, 'T' ],
		// 0 -> O
		[ /\b0[*']([BNR])/, "O'$1" ], // Irish names
		// 1 -> i
		[ /(?<=\. )1(?=n|s|t)/, 'I' ],
		[ /1(?=n|s|t)/, 'i' ], // hard to tell In or in
		// avoid units, dates, and "1 of", "1 to" and "1 in"
		[ / 1 (?![0-9A-Z]|(or|to|in|of)\b|inch|mi\b|mile|ft|foot|cm|cent(i|\b)|dollar|pound|yard|metr|mm|km|kilo|acre|hect[ao])/, ' I ' ],
		// 4 -> d
		[ /4oor/, 'door' ],
		[ /e4\b/, 'ed' ],
		// 6 -> o
		[ /\b6(?=[a-z])/, 'o' ], // 6n, 6f, etc
		// 8 -> S
		// only when the rest of the word is lowercase letters, so plain
		// numbers starting with 8 are left alone
		[ /\b8(?=[a-z]+\b)/, 'S' ], // 8o, etc, but not 8o00
		// 8i -> th
		[ /\b8i/, 'th' ],
		// a -> e
		[ /(?<=[Jj]udg)a/, 'e' ],
		// a -> f
		[ /\baf\b/, 'of' ],
		// a -> n
		[ /\baad/, 'and' ],
		[ /upoa/, 'upon' ],
		[ /\bia\b/, 'in' ],
		[ /(?<=[Rr])emaia/, 'emain' ],
		// a -> s
		[ /riaon/, 'rison' ],
		[ /wera\b/, 'wers' ],
		[ /\beap/, 'esp' ],
		// AA/AV -> W (the misread letters are replaced, not kept)
		[ /\b(AA|AV)(?=[a-z]{2})/, 'W' ],
		[ /\bnat\b/, 'not' ],
		// ae -> nc
		[ /aaee(|s|d)\b/, 'ance$1' ],
		// Av -> w
		[ /Av(ill|ith|ere\b|est|here|hat|as\b|ould|ho|or|hich|hen|ell|eigh|ise|eak|rit|ron)/,
			'w$1' ],
		[ /AV(ill|ith|ere\b|est|here|hat|as\b|ould|ho|or|hich|hen|ell|eigh|ise|eak|rit|ron)/,
			'W$1' ],
		[ /(?<=[a-z])AV\b/, 'w' ],
		// Avli -> wh
		[ /\bAvli(ich|om?(ever)?|en|ere|ether|y)\b/, 'wh$1' ],
		[ /\bAVli(ich|om?(ever)?|en|ere|ether|y)\b/, 'Wh$1' ],
		// b -> e
		[ /\b([Tt])hb/, '$1he' ],
		// b -> h
		[ /\bbow(so|ever|itz|beit)/, 'how$1' ], // watch for bowl...
		[ /(?<=\b[Tt])be(?=y\b|a\b|se\b|ir\b)/, 'he' ],
		[ /(?<=\b[Ww])b(?=i|e)/, 'h' ], // which, when
		[ /\bbas(|n't|ten)\b/, 'has$1' ],
		[ /\bber(|self|eto)\b/, 'her$1' ],
		[ /\bbim(|self)\b/, 'him$1' ],
		[ /([Ww])hicb/, '$1hich' ],
		[ /\b([Ss])bow/, '$1how' ],
		// b -> o
		[ /(?<=\b[Ss])b/, 'o' ],
		// b -> r
		[ /mbeb\b/, 'mber' ],
		[ /dmibal/, 'dmiral' ],
		[ /xtba/, 'xtra' ],
		[ /Victobia/, 'Victoria' ],
		// B -> E
		[ /(?<=\b(?:TH|THR))B/, 'E' ],
		// B -> R
		[ /Bailw/, 'Railw' ],
		[ /Boyal/, 'Royal' ],
		[ /\bFBO/, 'FRO' ],
		// c -> e
		[ /cx(?![ivxcdm]+\b)/, 'ex' ], // mind roman numerals
		[ /becn/, 'been' ],
		[ /\bbcen/, 'been' ],
		[ /(C|c)lcar/, '$1lear' ],
		[ /(a|u|o|p)pces\b/, '$1pees' ], // rupees,...
		[ /(C|c)asc(\b|(?=\w)[^a])/, '$1ase$2' ],
		[ /\bwc\b/, 'we' ],
		[ /(?<=[Ss]t|\b[Tt])cam/, 'eam' ],
		[ /(S|s)evc/, '$1eve' ], // several/severe
		[ /([Gg])rcat/, '$1reat' ],
		[ /([fvh])crence/, '$1erence' ],
		[ /(?<=\b[Hh])c\b/, 'e' ], // hc -> he
		[ /\bcn(?!i)/, 'en' ],
		[ /\bmcn\b/, 'men' ],
		[ /((?=\w)[^ao]|\b)rcs/, '$1res' ], // avoid arcs/orcs
		[ /\Borcs\b/, 'ores' ], // but it can be a suffix of ores
		[ /\bpcople(|s)\b/, 'people$1' ],
		[ /\b&e\.(?=\s|$)/, '&c.' ],
		[ /catc(|d)\b/, 'cate$1' ],
		[ /\bcight/, 'eight' ],
		[ /nccessar/, 'necessar' ],
		[ /\b([Ww])cr/, '$1er' ],
		[ /([^Aaeou])rcat/, '$1reat' ],
		[ /\b([Oo])nc(|s)\b/, '$1ne$2' ],
		[ /(?<=\b[Ss])[ec][ec](?=m|ing)/, 'ee' ], // seem, seeing
		[ /(?<=g)mics\b/, 'mies' ],
		[ /(?<=\b[Ss])tr[ce][ce]ct/, 'treet' ], // street
		[ /ocict/, 'ociet' ], // society
		[ /cither/, 'either' ], // cither exists, but...
		[ /(?<=\b[Ss])[ce][ce](?=d|\b|ing|m)/, 'ee' ], // see, seed, seeing
		[ /(?<=\b[Ss])c(?=er|ct)/, 'e' ], // seer... (not secretary)
		[ /(?<![ln])icf/, 'ief' ], // grief
		[ /c(?=ver|lectr)/, 'e' ], // ever, every, electric
		[ /(?<=[Pp])copl[ce]/, 'eople' ], // people
		[ /(?<=[Gg]rac|[Rr]os)c/, 'e' ], // grace, rose
		[ /(?<=[Cc]ru|[Yy]i)cl/, 'el' ], // cruel, yield, etc
		[ /cl(?=\b|l|f)/, 'el' ], // inc. scfl -> self
		[ /ncral(?=(?:s|ly|ity|ities)\b)/, 'neral' ], // general-
		[ /cth(?!ood|eroy|roat|yma|esis|etic|lip|idro|i\b)/, 'eth' ], // maketh, etc
		[ /tcd\b/, 'ted' ],
		[ /\b(t|tsz|sz)c\b/, '$1e' ], // chinese
		// ce -> œ
		[ /(?<=[Mm]an)ce(?=u)/, 'œ' ],
		// ci -> d
		[ /(P|p)rociu/, '$1rodu' ],
		[ /\bacidition(s|)\b/, 'addition$1' ],
		// ci -> ici
		[ /offci/, 'offici' ],
		// cnce: ence
		[ /cnce\b/, 'ence' ],
		[ /clves\b/, 'elves' ],
		// bom to born
		[ /\bbom\b/, 'born' ],
		// c -> d
		[ /aciva/, 'adva' ], // advantag...
		// c -> g
		[ /(\B[^\bzlp])inc\b/, '$1ing' ],
		// c -> o
		[ /\bcwn/, 'own' ],
		[ /cc(?=ln|ld|mp|lum|n|resp|s)/, 'co' ], // Lincoln, cold, company, ...
		[ /\bcc(?=urt)/, 'co' ], // court, not accurtation
		[ /\bcught/, 'ought' ],
		// c -> s
		[ /\b([dD])icre/, '$1isre' ], // disregard
		// ci -> d
		[ /eci\b/, 'ed' ],
		// d -> i
		[ /\bwdth/, 'with' ],
		// d -> o
		[ /d(?=mp|wn)/, 'o' ], // eg. compose, town
		[ /fdr/, 'for' ],
		// dl -> 31
		[ /\b[Sd3]lst\b/, '31st' ],
		// e -> a
		[ /\bscele(|s|d)\b/, 'scale$1' ],
		// e -> c
		[ /\be(ome)\b/, 'c$1' ],
		[ /rcet/, 'rect' ], // direct...
		[ /struet/, 'struct' ],
		[ /enee\b/, 'ence' ],
		[ /expeet/, 'expect' ],
		[ /((?=\B)[^n]|[oi]n)speet/, '$1spect' ], // avoid speet and Nunspeet
		[ /taeh/, 'tach' ], // detach
		[ /\bwhieh(|ever)\b/, 'which$1' ],
		[ /\bfec\b/, 'fee' ],
		[ /execpt/, 'except' ],
		[ /([^q])uet(ing|ed)\b/, '$1uct$2' ], // conducted, conducting
		[ /&e\./, '&c.' ],
		[ /(?<=[Uu]n)ele(?=s?\b)/, 'cle' ],
		// é -> è
		[ /ére\b/, 'ère' ], // No words end with acute-e ére
		// E -> F
		[ /E(rom )/, 'F$1' ],
		// e -> o
		[ /\bef\b/, 'of' ],
		[ /\bfrem\b/, 'from' ],
		[ /\bse\b/, 'so', { notLangs: [ 'es', 'fr', 'zh-pinyin' ] } ],
		// e -> r
		[ /rthee(?!ls)/, 'rther' ], // further, northern
		[ /outhee(?!ls|l\b)/, 'outher' ], // southern/ly
		[ /([^r])eoad/, '$1road' ], // broad
		// e -> s
		[ /\beo(|uth)\b/, 'so$1' ], // so, south
		[ /\bthoee\b/, 'those' ],
		// el -> d
		[ /\belyn/, 'dyn' ],
		[ /itel\b/, 'ited' ], // cited, united,...
		// -eney -> -ency (sad for Sweeny Todd)
		[ /eney\b/, 'ency' ],
		// er -> ev
		[ /\berery/, 'every' ],
		// é -> c
		[ /([aeiou])é(t)/, '$1c$2' ],
		// f -> nothing
		[ /\bhighfer/, 'higher' ],
		// f -> i
		[ /anfes\b/, 'anies' ],
		[ /stfan/, 'stian' ],
		// f -> l
		[ /(?<=[Aa])farm/, 'larm' ],
		// f -> t
		[ /\b(|in)difterent/, '$1different' ],
		[ /\bfwo/, 'two' ],
		// f -> r
		[ /(?<=\bB)[ft]it(?=ish|ain)/, 'rit' ],
		// ff -> fl
		[ /\bff(ood)\b/, 'fl$1' ],
		// ff -> ñ
		[ /(?<=[Ss])paf[ifl]a\b/, 'paña' ],
		// g -> ç
		[ /(?<=Mendon?)ga\b/, 'ça' ],
		[ /(?<=Gu?on?)g(?=all?o\b)/, 'ç' ],
		[ /Lorengo/, 'Lorenço' ],
		// G -> 6
		[ /\bG([0-9]*)th\b/, '6$1th' ],
		// h -> b
		[ /([Dd])ouht/, '$1oubt' ],
		[ /\bhe(en)\b/, 'be$1' ],
		[ /([Oo])hser/, '$1bser' ], // observe
		[ /\bhio/, 'bio' ],
		[ /\bemh/, 'emb' ],
		[ /\bheyo/, 'beyo' ],
		[ /\bohs\B/, 'obs' ],
		[ /\bhy\b/, 'by' ],
		[ /\bhe(?=ings?|en\b|an\b)/, 'be' ],
		[ /\bhene(?!icos|n|q)/, 'bene' ],
		// h -> c
		[ /\bhareful(|ly)/, 'careful$1' ],
		// h -> im
		// NOTE(review): the trailing \b sits between "h" and "n", so this
		// rule cannot match as written - intent unclear, left unchanged
		[ /\bh(?=nony|nonies)\b/, 'im' ],
		// h/U -> li
		[ /\b(h|U)(fe|ke|ttle)\b/, 'li$2' ],
		[ /nghs([ht])/, 'nglis$1' ], // English, etc
		// h -> n
		[ /\bih(?![ilr])/, 'in' ],
		[ /lahd(?='?s?\b|ing'?s?\b)/, 'land' ],
		// h -> li
		[ /\bhv[ec](?=s|)\b/, 'live' ],
		[ /(?<=\b[Aa])hve\b/, 'live' ], // alive
		[ /hng(?=s|ly)?\b/, 'ling' ],
		[ /dehc/, 'delic' ], // delicate, etc
		// h -> lt
		[ /cuh(?=(|y)\b)/, 'cult' ], // difficult(y), etc
		// H -> li
		[ /\bHke/, 'like' ],
		// H -> ll
		[ /(?<=\bA|[a-z])H/, 'll' ],
		// hv -> lw
		[ /(?<=[Aa]|ai|l)hvay/, 'lway' ], // always, railway, spillway
		// convert i9 to 19, etc.
		[ /[il]([0-9])/, '1$1' ],
		// i -> 1
		[ /\b[Il][Iil]th\b/, '11th' ],
		[ /(?<=[0-9])ist\b/, '1st' ],
		// I -> 1
		[ /\bIst\b/, '1st', { notLangs: [ 'de' ] } ],
		// i -> nothing
		[ /\bsomie/, 'some' ],
		[ /sielf/, 'self' ],
		[ /\b([Tt])hi(ey|ese)\b/, '$1h$2' ],
		[ /senise/, 'sense' ],
		[ /(?<=[Ff])irom/, 'rom' ],
		// I -> nothing
		// See also T -> nothing
		// i -> a
		[ /\bnime(ed|ly)/, 'namely' ],
		// i -> f
		[ /\bior(\b|m)/, 'for$1' ],
		[ /(I|i)nior/, '$1nfor' ],
		[ /([^m])afi(a|o)/, '$1aff$2' ],
		[ /\ba[ií]f/, 'aff' ],
		[ /([rhlf])iei(s|ly|)\b/, '$1ief$2' ], // brief
		// i -> j
		[ /(in|b|con|de|a)iect/, '$1ject' ],
		[ /\biett(y|ies)/, 'jett$1' ],
		// i -> l
		[ /([a-z])abie\b/, '$1able' ],
		[ /ficuit(|y)/, 'ficult$1' ],
		[ /enerai/, 'eneral' ],
		[ /\biab(o|ou)r/, 'lab$1r' ],
		[ /cicar/, 'clear' ],
		[ /shali(\b|ow)/, 'shall$1' ],
		[ /(i)abie\b/, '$1able' ], // reliable, ...
		[ /reiig/, 'relig' ],
		[ /([aeiou])riy\b/, '$1rly' ],
		[ /\b(un|)iaw/, '$1law' ],
		[ /\bgloi(y|ious)/, 'glor$1' ],
		[ /tiy\b/, 'tly' ],
		[ /iais\b/, 'ials' ], // materials...
		[ /\b([Ii])li(s?\b|ness)/, '$1ll$2' ], // ill, ills, illness
		[ /(?<=[Ss]e)if/, 'lf' ], // self
		// -isli -> -ish
		[ /(\w)isli\b/, '$1ish' ],
		// i -> r
		[ /eiy(?![ua])/, 'ery' ],
		[ /([Ff])iist/, '$1irst' ],
		[ /([Gg])ieat/, '$1reat' ],
		[ /\b([Pp])oit(?![ior])/, '$1ort' ], // port/ion
		[ /beied\b/, 'bered' ],
		// i -> t
		[ /(a|o|i)iion/, '$1tion' ],
		[ /leci\b/, 'lect' ],
		[ /aier/, 'ater' ], // material
		[ /\bmulii/, 'multi' ],
		[ /\bihe/, 'the' ], // the, there...
		[ /nir(ies|y)/, 'ntr$1' ], // country
		[ /\bio(|wards?|gether)\b/, 'to$1' ],
		[ /\bihat\b/, 'that' ],
		[ /enily\b/, 'ently' ],
		[ /ciion/, 'ction' ],
		[ /(?<=[Bb]u)i/, 't' ],
		[ /Stewari/, 'Stewart' ],
		// i' in a word -> r (not 's)
		[ /(?<=[a-z])i'(?=[a-rt-z]|s\w)/, 'r' ],
		// i^ > r
		[ /(?<=[a-z])i\^/, 'r' ],
		// i- -> r (be more careful than ^, - can be right)
		[ /(?<=Yo)i-/, 'r' ],
		// I -> f
		[ /\bIor([^gim]|\b)/, 'for$1' ],
		// I -> l
		[ /\b[l1I]' ?(?=[AEIOUÉÈaeiouéè]\w)/, 'l\'' ],
		// I' at word start -> f (except I'd. I'm, I'll, etc)
		[ /\bI'([a-ce-kn-uw-z])/, 'f$1' ],
		// I- -> L
		[ /\bI-ord/, 'Lord' ],
		// I^ -> P
		[ /\bI\^/, 'P' ],
		// id -> nl
		[ /\boidy/, 'only' ],
		// id -> ul
		[ /\bshoidd/, 'should' ],
		// if -> i
		[ /(?<=\b[Oo])if\b/, 'f' ],
		// If -> N (happens in cap'd words)
		[ /\b([A-Z]+)If\b/, '$1N' ],
		// ii -> a
		[ /\biind\b/, 'and' ],
		[ /\biimount/, 'amount' ],
		// II -> H
		[ /\bII(e|[a-z]{2,})\b/, 'H$1' ],
		// ii -> h
		[ /tiie/, 'the' ],
		[ /hicii/, 'hich' ], // which
		// II -> M
		[ /II(?=r|s)/, 'M' ],
		// ii -> n
		[ /aiis(?!m)/, 'ans' ],
		[ /co(?:ii|tt)c/, 'conc' ],
		// ii -> u
		[ /(?<=\b[SsBbMm])ii/, 'u' ],
		[ /\bii(?!\b|i)/, 'u' ], // avoid roman nums iii
		[ /iiim(?=s?\b)/, 'ium' ],
		[ /(?<=[Yy])oii/, 'ou' ],
		// ii -> ü
		[ /(?<=\bHs?)iian\b/, 'üan' ],
		[ /\bMiiller/, 'Müller' ],
		[ /\bYii(?=n\b|an\b)/, 'Yü' ],
		[ /\bTriib/, 'Trüb' ],
		// -iiig -> -ing
		[ /iiig\b/, 'ing' ],
		// ij -> h
		[ /tija(?!j)/, 'tha' ],
		[ /([Tt])ij([ae])/, '$1h$2' ],
		// il -> H
		[ /(\W |\n)il(e|im|er)/, '$1H$2' ],
		// Il -> H
		[ /\bIlo(?![ck]no|ilo|ko|na|ne\b|ngot|nka|rin|ts?\b|tycin|well)/, 'Ho' ],
		// in -> m
		[ /soine/, 'some' ],
		[ /inod(er|[^e])/, 'mod$1' ], // avoid ..node...
		[ /ninent/, 'nment' ], // government/s
		[ /\bcomin([au])/, 'comm$1' ], // community, communication, command
		[ /\biny(|self)\b/, 'my$1' ],
		[ /\binen\b/, 'men' ],
		[ /([^mst])inent/, '$1ment' ], // document...
		[ /(to|for|by|with|told|tell|let|g[ia]ve|from|towards|[oui]nto|under) ine\b/, '$1 me' ], // ine could be a suffix, so hit the common ones by ngram
		[ /\bimined/, 'immed' ],
		[ /\binean(|s)\b/, 'mean$1' ],
		[ /\bMohainn/, 'Mohamm' ],
		[ /sinug/, 'smug' ],
		[ /inforin/, 'inform' ],
		[ /\bhiin(self|)\b/, 'him$1' ],
		[ /\b([Ee])nin(i|e)/, '$1nm$2' ], // enmity, enmesh..
		[ /\b([Ff])roin\b/, '$1rom' ],
		[ /([Mm])einb/, '$1emb' ],
		// in -> th
		[ /(?<=(?:[Ii]n|[Tt]o|[Ff]or) )ine(?=\b|re\b|se\b|ir\b)/, 'the' ],
		// io -> w
		[ /\bneio(|ly)\b/, 'new$1' ],
		// ir -> n
		[ /\biir/, 'in' ],
		// it -> n
		[ /meitt/, 'ment' ],
		// iv -> j
		[ /\biv(?=st\b)/, 'ju' ],
		// iv -> w
		[ /\bneiv(|ly)\b/, 'new$1' ],
		[ /tiveen/, 'tween' ],
		// IVI -> M
		[ /\bIVI(?=[a-z])/, 'M' ],
		// j -> f
		[ /\boj\b/, 'of' ],
		// j -> i
		[ /thjs/, 'this' ],
		// J -> I
		[ /\bJowa/, 'Iowa' ],
		// J -> G
		[ /\b\(J(?=uide)/, 'G' ],
		// J -> l
		[ /\bJibert/, 'libert' ],
		[ /(?<=\b[Bb])jood/, 'lood' ], // blood
		[ /ojher/, 'other' ],
		// j -> y
		[ /ojal/, 'oyal' ],
		[ /\b([Mm])anj\b/, '$1any' ],
		[ /\b([Tt])hej\b/, '$1hey' ],
		// Ji -> h
		[ /Jiave/, 'have' ],
		[ /tJie/, 'the' ],
		// jl -> d
		[ /arjl/, 'ard' ],
		// jj -> g
		[ /jjht/, 'ght' ],
		// j}3^ -> y
		[ /(3|j|\})\^/, 'y' ],
		// k -> ic
		[ /whkh/, 'which' ],
		// kl -> d
		[ /Eklinb/, 'Edinb' ],
		// K -> E
		[ /Kng/, 'Eng' ],
		// l -> nothing
		[ /\b(|in)diflferent/, '$1different' ],
		[ /\beitlher\b/, 'either' ],
		[ /eaclh/, 'each' ],
		[ /Clhin(a|ese)/, 'Chin$1' ],
		// no empty alternative in the lookbehind: it would make this apply
		// to every "uild" (build, guild)
		[ /(?<=[Ff]l|[Dd]r|ang|[Qq]|iq)uild/, 'uid' ], // fluid etc
		[ /(?<=\b[Tt])(?:lh|hl|jh|hj)(?=[ieo])/, 'h' ], // the, these, those, etc
		// l -> d
		[ /listor/, 'distor' ], // distort...
		// l -> f
		[ /\bol\b/, 'of' ],
		[ /\bl(orm)\b/, 'f$1' ],
		// l -> i
		[ /fui(\b|ness\b)/, 'ful$1' ],
		[ /(d|D)ipio/, '$1iplo' ],
		[ /(P|p)arll/, '$1arli' ],
		[ /\bWilllam/, 'William' ],
		[ /\b([Ff])lc/, '$1ic' ], // fiction
		[ /\b([Tt])helr/, '$1heir' ],
		[ /(?<=[Rr]|[Vv]|[Dd]|[Tt]|[g]|[Ff]|[Mm])ellc/, 'elic' ], // relic, delicate,
		// l -> I
		[ /\blon(a|ian)/, 'Ion$1' ],
		[ /\bl'(ve|ll)\b/, "I'$1" ],
		[ /\blt('?s|self)\b/, 'it$1' ],
		// l -> h
		[ /(a|o)rslip/, '$1rship' ], // scholarship, warships, worship
		[ /\b([Ww])hicl/, '$1hich' ],
		[ /(\w)encl\b/, '$1ench' ], // french, bench...
		// l ->li
		[ /\blke/, 'like' ],
		// l -> t
		[ /([0-9])lh\b/, '$1th' ],
		[ /\boul/, 'out' ],
		[ /([Aa])fler/, '$1fter' ],
		[ /ifl(?=\b|ness|ly)/, 'ift' ], // swift
		// la -> h
		[ /\bthrougla/, 'through' ],
		// NOTE(review): (?<!c) directly after "a" is always true; possibly
		// meant as a lookahead - left unchanged
		[ /\btla(?<!c)/, 'th' ],
		// li -> b
		[ /\blio([^n])/, 'bio$1' ], // not lion...
		[ /liject/, 'bject' ], // subject
		// li -> lh
		[ /\botlier(|s|wise)/, 'other$1' ],
		[ /\b([Mm])onarcli(|s|y)/, '$1onarch$2' ],
		// lT -> ff
		[ /di(lT|flP)ere/, 'differe' ],
		// l) -> b
		[ /al\) ?le\b/, 'able' ],
		// l^ -> f
		[ /l\^(?=[a-z])/, 'f' ],
		// li -> b
		[ /\bliy\b/, 'by' ],
		// li -> h ... "the", "them", "their", "with", "much", "here" and whe etcetera
		[ /([tT][Jl]i)(e|at|is|an|em|ear|eir|en|ither|ose|rough|ree)\b/i, 'th$2' ],
		[ /\b([SsWw])lie/, '$1he' ], // she, when...
		[ /\b([Ww])li(at|ole)/, '$1h$2' ], // what, whole
		[ /(wlicli|ivhic(li|h)|wliich|wiiich|whicli)/, 'which' ],
		[ /liurcli/, 'hurch' ],
		[ /\bli(ave|ere|is|ad|ard)/, 'h$1' ],
		[ /\bIl(is)\b/, 'H$1' ],
		[ /witli/, 'with' ],
		[ /mucli\b/, 'much ' ],
		[ /\blias/, ' has' ],
		[ /\bwlio/, 'who' ],
		[ /\b(an|)otlier\b/, '$1other' ],
		[ /ealtli/, 'ealth' ],
		[ /([Cc])lii/, '$1hi' ], // China/ese...
		[ /([SsMm])ucli/, '$1uch' ], // such, much
		[ /cliann/, 'chann' ],
		[ /ubhs/, 'ublis' ], // publish
		[ /\bliate/, 'hate' ],
		[ /liion/, 'hion' ], // fashion
		[ /(?<=[Tt])liing/, 'hing' ], // thing
		[ /(?<=[Nn]e|[Ee])itlier/, 'ither' ], // either, neither
		[ /(?<=[Cc]|\b)liarm/, 'harm' ],
		// li -> k
		[ /([LlBb])ooli(\b|s)/, '$1ook$2' ],
		// llt -> th
		[ /\bllt(e)\b/, 'th$1' ],
		// lli -> th
		[ /\blli(at|e)\b/, 'th$1' ],
		// ln -> b
		[ /suln/, 'sub' ],
		[ /([Hh])md/, '$1ind' ],
		// lu -> hi
		[ /(?<=[a-z][^li])lucal/, 'hical' ], // -graphical
		// m -> in
		[ /mg\b/, 'ing' ],
		[ /\bopm/, 'opin' ],
		[ /Chm(a|ese)/, 'Chin$1' ],
		[ /(?<=\b[Pp]la)m/, 'in' ],
		// m -> n
		[ /\bFramce/, 'France' ],
		[ /\bFremch/, 'French' ],
		[ /\bJume\b/, 'June' ],
		// m -> on
		[ /atim\b/, 'ation' ],
		[ /\b(V|v)erbation\b/, '$1erbatim' ], // fix verbatim
		// m -> rn
		[ /ceming\b/, 'cerning' ],
		[ /\b([Uu]nw|[Ww])om\b/, '$1orn' ], // worn, unworn
		[ /(?<=[Nn]orth|[Ss]outh|[Ee]ast|[Ww]est)em\b/, 'ern' ],
		[ /(?<=B[ij][oö])m\b/, 'rn' ], // Bjorn, Björn
		[ /Foumier/, 'Fournier' ],
		// m -> un
		[ /\bmorth/, 'unorth' ],
		// m -> w
		[ /\b([Nn])em([^aeo]|\b)/, '$1ew$2' ], // new, newly, news
		// mn -> nm
		[ /mnent/, 'nment' ],
		// mu -> nm
		[ /\bumu(?=[aeiou])/, 'unm' ],
		// N -> M
		[ /\bNongol/, 'Mongol' ],
		// n -> a
		[ /(G|g)rent/, '$1reat' ],
		[ /\bns/, 'as' ],
		[ /ncknow/, 'acknow' ],
		// n -> h
		[ /\btn(e|a)/, 'th$1' ],
		[ /\bwn/, 'wh' ],
		[ /([Ss])mitn/, '$1mith' ],
		// n -> in
		[ /(?<=[^Eaeiou])ng\b/, 'ing' ], // -ing
		// n -> m
		[ /(?<=I|i)nperi/, 'mperi' ], // imperial
		[ /(?<=H|h)inse/, 'imse' ], // himself
		[ /iun\b/, 'ium' ],
		[ /(?<=\b[a-z]\w+l)don/, 'dom' ], // no lowercase ends ldon
		[ /(?<=[Nn])unber/, 'umber' ],
		[ /stanp/, 'stamp' ],
		[ /\bn(?=ores?\b|oreover)/, 'm' ],
		// n -> o
		[ /\bnf/, 'of' ],
		// n -> ri
		[ /scnb/, 'scrib' ],
		// n -> u
		[ /\bont (of|the|to|in|a|that|and|for|with|by)\b/, 'out $1' ], // ont may be suffix, filter by common ngram
		[ /([Nn])nm(?!a)/, '$1um' ],
		[ /snb/, 'sub' ],
		[ /onsly\b/, 'ously' ],
		[ /(C|c|w|W|Sh|sh)onld/, '$1ould' ],
		[ /\b([Tt])h(r?)ongh/, '$1h$2ough' ], // though, through-
		[ /\b([Aa])bont\b/, '$1bout' ],
		[ /thongh/, 'though' ],
		[ /\b([Cc])onrt/, '$1ourt' ], // court
		// na -> m
		[ /\b([Hh])ina(|self)\b/, '$1im$2' ],
		// ni -> m
		[ /(?<=\b|[Hh]ere-?|[Hh]ence-?)froni(?=\b|age|ward)/, 'from' ],
		[ /(?<=\b[Ww])honi/, 'hom' ],
		[ /\bhini/, 'him' ],
		[ /(?<=in|)hunian/, 'human' ],
		[ /\bnian(?=u|ly|kind)/, 'man' ], // not too general, mind pinyin
		[ /\brenio/, 'remo' ],
		[ /\bni(?=ak)/, 'm' ],
		[ /niouth/, 'mouth' ], // mouth, Plymouth, etc
		[ /(?<=[Cc]o)ni(?=plet)/, 'm' ], // complete
		// ni -> m
		[ /\bnie\b/, 'me', { notLangs: [ 'de', 'pl', 'zh-pinyin' ] } ],
		[ /\bnian/, 'man', { notLangs: [ 'zh-pinyin' ] } ],
		[ /\btians/, 'trans', { notLangs: [ 'zh-pinyin' ] } ],
		// nn -> rm
		[ /(?<=[Ff])onn(?!ish)/, 'orm' ], // formula, form, etc
		// nv -> rw
		[ /nva(?=y|rd)/, 'rwa' ], // afterward, Norway
		// o -> a
		[ /\bouth(or|en)/, 'auth$1' ], // authority...
		[ /fovo(u?)r/, 'favo$1r' ],
		[ /\b([Cc])ous([ae])/, '$1aus$2' ], // cause
		// o -> c
		[ /jeot/, 'ject' ],
		[ /(?<=[Oo])oo(?=as|i[cp]|u[pl]|lu)/, 'cc' ],
		[ /(?<=[Oo])co(?=asi|lus|lud|upa|upi|ur)/, 'cc' ], // occasion, occur,
		[ /(?<=[Ss]uc)oe/, 'ce' ], // success
		[ /(?<=[Aa]c)o(?=us[ae]|ept|iden|ord)/, 'c' ], // accuse, accept
		[ /(?<=[Aa]r|ac)oh(?=[io])/, 'ch' ], // archi..., Gracchi,
		// o -> e
		[ /(?<=dis|\b)rospect/, 'respect' ],
		[ /turo\b/, 'ture' ],
		[ /([d])loss/, '$1less' ], // endless
		[ /\b([Mm])ako\b/, '$1ake' ],
		[ /\b([Mm])ado\b/, '$1ade' ],
		[ /noss(?=\b|es|like)/, 'ness' ],
		[ /\bcomo\b/, 'come', { notLangs: [ 'es' ] } ],
		// o -> n
		[ /tioos/, 'tions' ], // could be o -> u, but choose one
		[ /iog(|s)\b/, 'ing$1' ],
		// o -> u
		[ /egolar/, 'egular' ], // regular
		// ol -> d
		[ /nolix/, 'ndix' ],
		// p -> d
		[ /ecorp([^o]?)\b/, 'ecord$1' ],
		// p -> f
		[ /\bop\b/, 'of' ],
		// P -> F
		[ /\bP(ee)\b/, 'F$1' ],
		[ /\bOP\b/, 'OF' ],
		// p -> g
		[ /inp\b/, 'ing' ],
		[ /(?<!u)prap/, 'grap' ],
		// p -> n
		[ /apd\b/, 'and' ],
		// p -> o
		[ /prth/, 'orth' ],
		// P -> ?
		[ /([a-z])P\b/, '$1?' ],
		// q -> o
		[ /qf/, 'of' ],
		// G -> Q
		[ /\bGu(?=ite?|ee[rn]|i[dzvxp]|ir[^o]|in[tq]|iet|ick|ibb)/, 'Qu' ],
		// r -> c
		[ /jert/, 'ject' ], // object, etc
		[ /(\w)reive/, '$1ceive' ], // perceive, receive, etc
		[ /anrs\b/, 'ani\'s' ], // names ending in ani + 's
		// r -> i'
		[ /prs\b/, 'pi\'s' ],
		// r -> n
		[ /\bupor\b/, 'upon' ],
		// r -> v
		[ /(he|[iasolurn])sire/, '$1sive' ],
		[ /siveless/, 'siveness' ], // after sire->sive
		[ /\b(M|m)orement/, '$1ovement' ],
		[ /\b(G|g)orernment/, '$1overnment' ],
		[ /\b([Oo])bserr/, '$1bserv' ],
		// r -> t
		[ /(?<=\b[Ii])r\b/, 't' ],
		// r^ -> p
		[ /\br\^/, 'p' ],
		// ri -> n
		[ /(?<=\b[Mm]e)ri\b/, 'n' ],
		// ri -> u
		[ /ectrial/, 'ectual' ],
		// rj -> n
		[ /\birj/, 'in' ],
		// rn -> m
		[ /([aie])urn\b/, '$1um' ],
		[ /\brern/, 'rem' ],
		[ /ernent/, 'ement' ],
		[ /\brn/, 'm' ],
		// s -> a
		[ /grsph/, 'graph' ],
		[ /csuse/, 'cause' ],
		// s -> m
		[ /\b([Ss])ees(ing|ingly|ed|s)\b/, '$1eem$2' ], // seemed
		// sb -> sh
		[ /\bsb(e|all)\b/, 'sh$1' ],
		// sc -> g
		[ /insc\b/, 'ing' ],
		// t-> c
		[ /ettual/, 'ectual' ],
		[ /fetted/, 'fected' ],
		// t -> f
		[ /\bot\b/, 'of' ],
		[ /fitty/, 'fifty' ],
		// t -> i
		[ /shtp/, 'ship' ],
		[ /(?<=[Bb]u|[Cc]h|[Mm])tld/, 'ild' ],
		[ /(?<=[Bb]u|[Gg]u?|[Tt]|[Ss]|[Ff]|[Ww])tlt/, 'ilt' ],
		[ /\btn\b/, 'in' ],
		// T -> nothing (and some I -> nothing)
		[ /\bw [IT] (?=as|hich|hen|hether|ho)/, 'w' ], // w T as > was, etc
		// T -> I
		[ /(?<!\bw )\bT(?=\b|t)/, 'I' ],
		[ /T(?=reland|rish)/, 'I' ],
		// t -> l
		[ /abte\b/, 'able' ],
		[ /(?<=[WwCc]|[Ss]h)outd/, 'ould' ],
		// t -> r
		[ /(?<=\b[Ff])ot(?!h|o|i|u|m|c)/, 'or' ],
		[ /(?<=\b[Ff])t(ance|ench)/, 'r$1' ], // France, French
		[ /ntt(?=y|ies)/, 'ntr' ], // country
		[ /(?<=[Ll]ive)t(?=s|p|\b)/, 'r' ], // liver, Liverpool
		// T -> Y
		[ /(?<=\b(?:JUL|JOURNE|M|WA))T\b/, 'Y' ],
		[ /(?<=\b(?:MON|TUES|WEDNES|THURS|FRI|SATUR|SUN|))DAT\b/, 'DAY' ],
		// ti -> h
		[ /\b([Oo])ttier(?=\b|[^eis])/, '$1ther' ],
		// ti -> n
		[ /tioti/, 'tion' ],
		// ti -> u
		[ /\btipon/, 'upon' ],
		// to -> h
		[ /\bttoe(?![ds]\b)/, 'the' ],
		// U -> li, see h/U
		[ /(?<=\b|[a-z])Uon(?=s?)/, 'lion' ],
		[ /(?<=[a-z])Ung(?=s?)/, 'ling' ],
		// u -> a
		[ /Junu([^b])/, 'Janu$1' ],
		[ /\bund\b/, 'and' ],
		// u -> c
		[ /([Dd])ouum/, '$1ocum' ],
		// u -> h
		[ /(?<=\b[Tt])u(?=e[^s]|at\b)/, 'h' ], // the, there, these, etc (not Tuesday)
		// u -> n
		[ /\baud\b/, 'and' ],
		[ /meut(\b|[^e])/, 'ment$1' ],
		[ /siau(|s)\b/, 'sian$1' ], // Persians...
		[ /\b(P|p)ersou(|s)/, '$1erson$2' ],
		[ /erument/, 'ernment' ],
		[ /([Jj])uuc/, '$1unc' ],
		[ /taiu/, 'tain' ],
		[ /\biu(|to|ward)\b/, 'in$1' ],
		[ /\bauy(|where|body)\b/, 'any$1' ],
		[ /\biuto\b/, 'into' ],
		[ /kuow/, 'know' ],
		[ /iug(s|ed|ly|)\b/, 'ing$1' ],
		[ /auswer/, 'answer' ],
		// u -> ii
		// [ /(?<=\b[clxv]*)u(?=i*)/, 'ii' ], // roman numerals
		// "U" -> "ll" when preceded by a lowercase letter.
		// "U" -> "li"
		[ /(?<=[a-z])U(?=c)/, 'li' ], // relic
		[ /(?<=[a-z])U(?!c)/, 'll' ], // not relic
		// un -> m
		[ /\bimuned/, 'immed' ],
		// ui -> m ... "must", etc
		[ /\bui(ust)\b/, 'm$1' ],
		// v -> r
		[ /([Mm])emov/, '$1emor' ],
		// v -> u
		[ /\b([Nn])vm/, '$1um' ],
		// v -> y
		[ /\bv(ear|our|ou)(?=s?\b)/, 'y$1' ], // year(s), your(s), you
		[ /\b(B|b|M|m|the)v/, '$1y' ],
		[ /\b(A|a)nv(\b|w)/i, '$1ny$2' ],
		[ /vield/, 'yield' ],
		[ /encv\b/, 'ency' ],
		[ /(?<=\b[GgHh])aye\b/, 'ave' ],
		[ /([Aa])bbev/, '$1bbey' ],
		[ /demv\b/, 'demy' ],
		[ /mplov/, 'mploy' ], // employ-...
		[ /itv\b/, 'ity' ],
		[ /(?<=[Vv])erv\b/, 'ery' ],
		[ /(?<=(Mon|Tues|Wednes|Thurs|Fri|\b)da)v(?=s?\b)/, 'y' ],
		// v -> w
		[ /\bvr/, 'wr' ],
		// v^ -> w
		[ /\bv[\^/]([a-z])/, 'w$1' ],
		// vc -> we
		[ /\bvc\b/, 'we' ],
		// vd -> wi
		[ /vd(ll|th)/, 'wi$1' ],
		// V -> m
		[ /\bV(iss|rs|r)\b/, 'M$1' ],
		// Vh ->Wh
		[ /\bVh/, 'Wh' ],
		// V' -> W
		[ /\bV'/, 'W' ],
		// Vi -> M
		[ /\bVir\b/, 'Mr' ],
		// vir -> w
		[ /hovir(?!u)/, 'how' ],
		// vn -> wi
		[ /vn(ll|th)/, 'wi$1' ],
		// VV -> W
		[ /\bVV(e)\b/, 'W$1' ],
		// w -> m
		[ /mewt(?!tide)/, 'ment' ],
		// w r -> w (not sure what this is about)
		[ /\bw r (?=e\b|[aeoiu]\w)/, 'w' ],
		// X -> N
		[ /\bX(?=o)/, 'N' ],
		// xv -> w
		[ /xvho/, 'who' ],
		[ /xvay/, 'way' ],
		[ /txvo/, 'two' ],
		// y -> v
		[ /([Ss])ery(a|i)/, '$1erv$2' ],
		[ /tiye(|ly|ness|nesses|s)\b/, 'tive$1' ],
		[ /eyies\b/, 'evies' ],
		[ /(?<=\b(?:[Hh]a|[BbGg]ra))ye\b/, 've' ], // have, grave, brave
		[ /\b([Oo])by(?=\B)/, '$1bv' ],
		[ /(?<=\bGene)ya/, 'va' ],
		[ /\beyent/, 'event' ],
		[ /yent(?=\b|s|ed|or|ing|y\b|ies|ral|ro|ur|il|ri)/, 'vent' ],
		// Y -> T
		[ /\bY(?=he)/, 'T' ],
		// Y -> V
		[ /(?<=\bGENE)YA/, 'VA' ],
		[ /\bEYENT/, 'EVENT' ],
		[ /YENT(?=\b|S|ED|OR|ING|Y\b|IES|RAL|RO|UR|IL|RI)/, 'VENT' ],
		// z -> x
		[ /\bezc/, 'exc' ],
		// -> Rome/Roman
		[ /(E|K)om(e|an|ish)([ .,\n])/, 'Rom$2$3' ],
		// d', l', m', n' (not s', or english possesives get messed with)
		[ /(^|\s)([MmDdLlNnJjSsCc]|[Qq]u|[Jj]usqu)(' | ')(?=[AaEeIiOoUuÁáÀàéÉèÈ])/, "$1$2'" ]
	];
	process_editor( editor, new PartialWordRegexProcessor( reps ) );
};
/**
 * Repair scannos that involve word boundaries: dropped spaces, spurious
 * spaces, missing hyphens and stray punctuation.
 *
 * Five passes are run over the editor text, in this order:
 *  1. explicit [ pattern, replacement ] pairs (PartialWordRegexProcessor)
 *  2. fragments handed to BannedSuffixProcessor
 *  3. fragments handed to BannedPrefixProcessor
 *  4. fragments handed to OrphanPrefixProcessor
 *  5. fragments handed to OrphanSuffixProcessor
 *
 * How each processor turns its fragments into edits is defined by the
 * processor classes elsewhere in this file; the comments on each list record
 * the intent.
 *
 * @param {Object} editor editor object passed through to process_editor()
 */
const do_multiword_fixes = function ( editor ) {
	let reps = [
		// hyphens more likely to be em-dash
		// (look-behind was mistyped as "(<?=", which made this rule dead)
		[ /(?<=[a-z])-(the)\b/, '—$1' ],
		// Missing spaces
		// theCap unlikely to be right
		[ /\b(a|an|of|by|the)(?=[A-Z])/, '$1 ' ],
		// single cap in a word probably a dropped space
		// watch for Mc/Mac
		// needs lookbehind really
		[ /\b(\w[a-z]*[abd-z])([A-Z][a-z]+\b)/, '$1 $2' ],
		// ance is a suffix when it's not ancestor's prefix
		[ /[\s-]ance(?! st[or])\b/, 'ance' ],
		[ /\bal though/, 'although' ],
		// and<dropped space>
		// not many words start and
		[ /\band((?=[a-z])[^raoei])/, 'and $1' ],
		[ /\bbet ween/, 'between' ],
		// I
		[ /I(am\b|had|was|will|can|shall|did)/, 'I $1' ],
		// he
		[ /([Hh]e)(had|did|can|will|was)/, '$1 $2' ],
		// him
		[ /(?<=\b([Hh]im))t/, ' t' ], // e.g. himto -> him to
		[ /notbe/, 'not be' ], // cannot be, not being, ...
		[ /([deos])n(' | ')t\b/, '$1n\'t' ],
		[ /\bcom m/, 'comm' ],
		// compar- at a word start or after "in" (incomparable);
		// look-behind was mistyped as "(<?=", so the "in" case never matched
		[ /(?<=in|\b)com par/, 'compar' ],
		// government can only be -a, -s, -e
		[ /(overnment)((?=\w)[^sae])/, '$1 $2' ],
		[ /((?=\w)[^sa])may/, '$1 may' ], // dismay/gamay are the only words end in may
		[ /\bme(of|to|for|that)\b/, 'me $1' ],
		[ /(s|t)my\b/, '$1 my' ], // -my isn't always a likely suffix
		[ /\bof(a|b|c|d|g|m|n|p|s|w)/, 'of $1' ], // of my/self, etc words that can't start of-
		[ /\bof(our|my|some|him|her|his)\b/, 'of $1' ],
		// of merged left, careful of Russian names...
		[ /(Earl|Duke|Queen|King|Baron|most|all|some|many)of/, '$1 of' ],
		[ /([a-z])which/, '$1 which' ], // only wrong for everwhich
		// no word ends -many except overmany
		[ /([^Oo]?[^v]?[^e]?[^r\s])many/, '$1 many' ],
		// she
		[ /([Ss]he)(had|did|will|was)/, '$1 $2' ],
		[ /\bthus(?!ly|\b)/, 'thus ' ], // no words start thus
		// some obvious loss of spaces after 'the'
		[ /\bthe(?=h|me[nm]|mer[c]|mo|im|un|wh)/, 'the ' ],
		// and before 'the'
		[ /\b(\w[^aoniy\s])the\b/, '$1 the' ],
		// before 'to'
		[ /\b(thing)to\b/, '$1 to' ],
		[ /(u|n|r) (dices?)\b/, '$1$2' ],
		[ /\bun der/, 'under' ],
		[ /\brene w(ed|al|abl)\b/, 'renew$1' ],
		[ /\bre turn/, 'return' ],
		// words ending in cious that lost a space
		[ /cious((?=[a-z])[^enl])/, 'cious $1' ],
		// Spurious spaces
		[ /\b(P|p)ro ceed/, '$1roceed' ],
		[ /\b(P|p)ro ced/, '$1roced' ],
		[ /(C|c)on cl/, '$1oncl' ], // con clude
		[ /(un)?ans wer(a|e|s|\b)/, '$1answer$2' ],
		[ /same(a|b|c|f|g|h|i|j|k|m|o|p|q|u|v|w|x|y|z)/, 'same $1' ],
		[ /\bho w/, 'how' ], // however...
		[ /\b(dis|)satis fact/, '$1satisfact' ],
		[ /\bendo (wed|wing|wments?)/, 'endo$1' ],
		[ /\bre[ -](quest|quire|solute)/, 're$1' ],
		[ /\bwasnot\b/, 'was not' ],
		[ /\b(ly)(worked)\b/, '$1-$2' ],
		// missing hyphens
		[ /\binchief(?=s?\b)/, 'in-chief' ],
		[ /(?<=y)public(?=s?\b)/, '-public' ], // notary-public, ...
		// Lone quotes at the start of a quotation
		[ /(?<=(said|answered|replied|shouted|thought|whispered|murmured|muttered|), ") /, '' ],
		// spurious punctuation, eg why. not, but avoid e.g. i.e. etc
		[ /([a-z]{3,})\. ([a-z])/, '$1 $2' ]
	];
	process_editor( editor, new PartialWordRegexProcessor( reps ) );

	// These are things that are never suffixes
	// eg. hecould -> he could
	reps = [
		/(c|sh|w)ould(n't)?/
	];
	process_editor( editor, new BannedSuffixProcessor( reps ) );

	// These can never be prefixes
	// so insert spaces after them
	reps = [
		/[Aa](?=number|bond\b|comm|rece|reci[^b])/,
		/a(?=dele)/,
		/be(?=my)/,
		/but(?=al)/, // but all, but always
		/come(?=to)/,
		/great(?=m|p|r)/,
		/[HhSsGg]ave(?=my)/, // h/gave my/self
		/me(?=wit|tow)/,
		/means/,
		/of(?=the)/,
		/sent(?=as)/,
		/some(?=[cm])/,
		/that(?=can|d|w)/, // that will
		/the(?=mes|tr|e\w)/,
		/(?:un|)usual(?!s|ness|ly)/,
		/I(?=h[eiou])/,
		/I(?=ha[^b])/, // I have/had
		/with(?=a\b|a[^lm]|all)/,
		/with(?=his|her|it|th|ha)/
	];
	process_editor( editor, new BannedPrefixProcessor( reps ) );

	// if we see these on their own, they are prefixes of the next word
	// These can be slightly aggressive, as they only fire if the prefix is
	// already isolated - they won't break up existing words
	let orphans = [
		/(a|fo)llo/, // allocate, follow
		/(un|)acknow/,
		/(|[Ii]n)conse/, // consequence, consecrate
		/circum/,
		/combin?/,
		/(|[Ii]n)compa/,
		/(|[iI]n)comple/,
		/(|[Ii]n)corp/,
		/\w*corres?/,
		/diffi/, // difficult, diffident
		/dis/, // very few words end dis, so an orphan is likely a prefix
		/decla?/, // ration can't be a simple suffix
		/ered/,
		/exper?/,
		/helio/,
		/inex/,
		/medi/, // medicine/s, medical
		/misbe/,
		/(|in)oppor/,
		/(|dis|co-?|acc|in|sub|super)ordin?/,
		/[Pp]arti/,
		/[Pp]hilo/,
		/(|im|mal)prac/,
		/(|im)practi/,
		/pre/, // pre is occasionally a suffix, but it's
		/(|un)[Pp]rinci/,
		/reca/,
		/(|p|un|under)recom/, // recommend
		/repre/,
		/(|un|tran)sub/,
		/suc/, // success...
		/(|un)sug/, // suggest, sugary../
		/sur/, // sur-
		/trans/,
		/undis/,
		/whatso/
	];
	process_editor( editor, new OrphanPrefixProcessor( orphans ) );

	// if we see these on their own, they're suffixes of the prior word
	orphans = [
		/astic/,
		/ated/,
		/atory/,
		/(|ond|ti)ar(y|ies)/,
		/tably/,
		/butors?/,
		/cating(|ly)/,
		/cellation(|s)/,
		/cien(cy|t)/,
		/ciples?/,
		/dences?/,
		/derable/,
		/digent(|s)/,
		/dit(y|ies)/,
		/drawals?/, // only withdrawal
		/ested(|ly|ness)/,
		/esque(|ly)/,
		/ficial\w*/,
		/geous(|ly|ness|nesses)/,
		/gences?/,
		/hend(|s|ing)/,
		/iast\w*/,
		/ings?/, // ing is rarely a prefix, much more likely to be -ing if it occurs alone
		/lants/,
		/lated/,
		/lative(s|ly|)/, // comp-, decla-
		/ligent(|ly|sia|sias)/,
		/mations?/, // not motions
		/munication?/,
		/ments?/,
		/mence\w*/, // commence
		/mitted(|ly|ness)/,
		/nect(ed|ions?)/,
		/nence/,
		/nese/,
		/nien(ce|ces|ced|t)/,
		/m?on(ing|ed)/, // summoned, commissioned...
		/pan(y|ies)/,
		/pensat\w+/, // compensate
		/plet(ed|ion|ions)/,
		/politan\w*/,
		/pl?oration(|s|al)?/,
		/rative(s|ly|)/, // comp-, decla-
		/rit(ies|y)/,
		/rence(|d|s)/,
		/saries/, // anniversaries...
		/sion\w*/,
		/siderable\w*/, // avoid sideral/sideration
		/sume(\b|[^r]\w*|r[^i]\w*)/, // avoid -sumeria
		/stantly/,
		/tain(ed|s)/,
		/[as]tr[au]ction(|s|al|ary|ally)/,
		/[szt]?[aoiue]?tion(|s|al|ally)/, // not ration
		/tages?/,
		/ti[vn]ely/,
		/tinual(|ly|ness|ity)/,
		/tinuous(|ly|ness)/,
		/b?ilit(ies|y)/,
		/vid(es|ing)/,
		/wered/
	];
	process_editor( editor, new OrphanSuffixProcessor( orphans ) );
};
/**
 * Hand a fixed list of common Latin phrases to ItaliciseProcessor.
 *
 * @param {Object} editor editor object passed through to process_editor()
 */
const do_foreign_italics = function ( editor ) {
	const reps = [
		// "ad hoc", "ad infinitum", "ad valorem", ...
		// The word after "ad" is limited to a single word (\w*): the previous
		// ".*um" / ".*em" was greedy to the last "um"/"em" on the line and
		// could swallow a whole clause ("ad for the museum").
		/\bad (hoc|\w*um|\w*em)\b/,
		/de facto/,
		/quid pro quo/,
		/locum tenens/,
		/\b[Ii]bid\b/
	];
	process_editor( editor, new ItaliciseProcessor( reps ) );
};
/**
 * Run the whole-word replacement pass. The built-in replacement list is
 * currently empty, so this only invokes WholeWordRegexProcessor with no rules.
 *
 * @param {Object} editor editor object passed through to process_editor()
 */
const do_whole_words_reps = ( editor ) => {
	// simple whole-word replacements (none defined yet)
	const wholeWordRules = [];
	const processor = new WholeWordRegexProcessor( wholeWordRules );
	process_editor( editor, processor );
};
/**
 * Fix words in which a long s was transcribed as "f" (e.g. "moft" -> "most").
 *
 * All rules are [ pattern, replacement ] pairs handed to
 * PartialWordRegexProcessor in one pass. Several later rules rely on earlier
 * ones having already run (see the "do before" / "comes later" notes), so the
 * order of the table is preserved.
 *
 * Fixes relative to the previous table:
 *  - "beft(ed|ing)" reused $1 for the suffix ('$1est$1'), giving "bestb"
 *  - "fet(ting|s|ter)" dropped the suffix, turning "fetting" into "set"
 *  - "Moft", "Muft" and "Veff" lost their capital letter
 *
 * @param {Object} editor editor object passed through to process_editor()
 */
const doLongSReplacements = function ( editor ) {
	const long_s_reps = [
		// fix bad long s replacements
		[ /ƒ/, 'f' ],
		[ /ʃ/, 's' ],
		[ /([^i])fic\b/, '$1sic' ],
		[ /([Ee])aft/, '$1ast' ],
		[ /([W])eft/, '$1est' ], // assume Weft is West, but weft is like fabric
		[ /(af|un)?focia/, '$1socia' ],
		[ /(?<=[Aa])ff(embl|ign)/, 'ss$1' ], // assign, assemble..
		[ /(A|a)nfwer/, '$1nswer' ],
		[ /(ef)?fent/, '$1sent' ], // essential, sent, sentinel
		[ /(other|like)wife/, '$1wise' ],
		[ /\bfide\b/, 'side' ],
		[ /\bfo\b/, 'so' ],
		[ /\breft/, 'rest' ],
		[ /([Aa])bfo/, '$1bso' ],
		[ /ccef[fs]/, 'ccess' ],
		[ /bfurd/, 'bsurd' ],
		[ /affif/, 'assist' ],
		[ /aff(um|ur|er)/, 'ass$1' ], // assume, assure
		[ /(?<=A|a)fc/, 'sc' ], // ascent
		[ /Afia/, 'Asia' ],
		[ /(?<=A|a)fk/, 'sk' ], // ask
		[ /aftard/, 'astard' ],
		[ /aftic/, 'astic' ],
		[ /afty/, 'asty' ],
		[ /([Aa])lfo/, '$1lso' ],
		[ /([Aa])pfe/, '$1pse' ],
		[ /([Aa])ufp/, '$1usp' ],
		[ /baffy/, 'bassy' ],
		[ /([Bb])afe/, '$1ase' ],
		[ /([Bb]|[Cc]r)eft/, '$1est' ],
		[ /([Cc])afua/, '$1asua' ],
		[ /([Cc])auf/, '$1aus' ],
		[ /([Cc])eaf(?!a)/, '$1eas' ],
		[ /ceff/, 'cess' ], // necessary
		[ /cefs\b/, 'cess' ], // princess, process
		[ /([Cc])heft/, '$1hest' ],
		[ /Chrif/, 'Chris' ],
		[ /cife/, 'cise' ],
		[ /([Cc])laf[fs]/, '$1lass' ],
		[ /([Cc])lofe/, '$1lose' ],
		[ /([Cc])onf(id|t|eq)/, '$1ons$2' ], // const, conseq...
		[ /([Cc])ourfe/, '$1ourse' ],
		[ /([Cc])oft/, '$1ost' ],
		[ /([Cc])roff\B/, '$1ross' ], // cross-
		[ /([Cc])rofs\b/, '$1ross' ], // cross
		[ /([Dd])efcr/, '$1escr' ],
		[ /dorf(e|es|ed|ing|ings|ment)/, 'dors$1' ],
		[ /efer([vt])/, 'eser$1' ], // deserve-, desert-
		[ /([dD])if([ocprgqst]|ad)/, '$1is$2' ], // dis-
		[ /\b([dD])if([^f]\w)/, '$1is$2' ],
		[ /([Dd])iffol/, '$1issol' ],
		[ /([Dd])efir/, '$1esir' ],
		[ /efour/, 'esour' ],
		[ /offef[fs]/, 'ossess' ],
		[ /feffion/, 'session' ], // session (possession comes later)
		[ /(?<![A-Z]|ff|\b)eff(|ed|ion|ing|ly)/, 'ess$1' ], // express, etc
		[ /([Ee])fpe/, '$1spe' ], // especial
		[ /([Ee])fq/, '$1sq' ],
		[ /(?<=R|r|t|l|p)egift/, 'egist' ], // regist.., strategist, etc
		[ /(?<=en)lift/, 'list' ],
		[ /fenf(e|es|ed|ing|ings)\b/, 'sens$1' ],
		[ /enf(e|es|ed|ing|ings)\b/, 'ens$1' ],
		// $2 carries the suffix; the old '$1est$1' appended the initial letter
		[ /([Bb])eft(\b|ed|ing)/, '$1est$2' ],
		[ /([^kgrdw])eft\b/, '$1est' ], // -est
		[ /efide/, 'eside' ],
		[ /(?<=R|r)efort/, 'esort' ],
		// (repeat of the egift rule above, kept so the pass order is unchanged)
		[ /(?<=R|r|t|l|p)egift/, 'egist' ], // regist.., strategist, etc
		[ /([Ee])fta/, '$1sta' ], // establish
		[ /([Ee])fti/, '$1sti' ], // estimate
		[ /enfes/, 'enses' ],
		[ /ennf/, 'enns' ], // Pennsylv etc
		[ /erfal/, 'ersal' ],
		[ /erfon/, 'erson' ],
		[ /erfua/, 'ersua' ],
		[ /erfue/, 'ersue' ],
		[ /erfui/, 'ersui' ],
		[ /eruf/, 'erus' ],
		[ /fa(cr|fe|ga|id|le|lut|lt|tis|w\b|nds?\b)/, 'sa$1' ],
		[ /\bfay/, 'say' ],
		[ /\bfa(ve|vi)/, 'sa$1' ],
		[ /(?<=F|\bf)alf/, 'als' ], // false
		[ /fatif(?!e)/, 'satis' ],
		[ /fca([^s])/, 'sca$1' ], // scarce, scant, etc (not briefcase)
		[ /fchem/, 'schem' ],
		[ /fc(ie|ious|ure|en|rib|rip)/, 'sc$1' ], // science, conscious, secure
		[ /fenf/, 'sens' ],
		[ /fe(a\b|af|cl|co|iz)/, 'se$1' ], // season, seclude, second
		[ /fee(m|n|ing)/, 'see$1' ], // seen, seem
		[ /fe(ek|gr|duc)/, 'se$1' ],
		[ /felec/, 'selec' ],
		[ /fel(f|v)/, 'sel$1' ],
		[ /(?<=[Aa]b|[Ii]n)fence/, 'sence' ],
		[ /fepar/, 'separ' ],
		[ /feri([eo])/, 'seri$1' ],
		[ /fervi/, 'servi' ],
		// keep the suffix: fetting -> setting, fets -> sets, fetter -> setter
		[ /\bfet(|ting|s|ter)\b/, 'set$1' ],
		[ /fettle(\b|m|s)/, 'settle$1' ], // fettle is a word, but settle is way more common
		[ /feve(ra|n)/, 'seve$1' ], // several, seven
		[ /fhew/, 'shew' ],
		[ /(?<=\ba?)fide(?=s?\b)/, 'side' ],
		[ /fing(le|u)/, 'sing$1' ], // single, singular
		[ /fis\b/, 'sis' ], // -sis
		[ /ffidu/, 'ssidu' ], // Assiduous
		[ /fh(al|ut|ip|o)/, 'sh$1' ],
		[ /inifter/, 'inister' ],
		[ /fidera/, 'sidera' ], // considerable/ation/ate
		[ /fift(?!h)/, 'sist' ], // subsist, consist
		[ /filen/, 'silen' ],
		[ /fign/, 'sign' ],
		[ /fimi/, 'simi' ],
		[ /fince/, 'since' ],
		[ /fion/, 'sion' ],
		[ /firft/, 'first' ],
		[ /fite\b/, 'site' ],
		[ /fitive/, 'sitive' ],
		[ /fitu/, 'situ' ],
		[ /flaught/, 'slaught' ],
		[ /flowl/, 'slowl' ],
		[ /flowne/, 'slowne' ],
		[ /fm(an|en|all|oth|ooth)/, 'sm$1' ], // small, helmsmen, smooth
		[ /focie/, 'socie' ],
		[ /fole/, 'sole' ],
		[ /foli/, 'soli' ],
		[ /folv/, 'solv' ],
		[ /fome/, 'some' ],
		[ /foon/, 'soon' ],
		[ /foph/, 'soph' ], // -sopher/y
		[ /fourc/, 'sourc' ],
		// NOTE(review): this capitalises a lower-case match - confirm intended
		[ /fouth/, 'South' ],
		[ /fov/, 'sov' ],
		[ /fpade/, 'spade' ],
		[ /fpawn/, 'spawn' ],
		[ /fpeak/, 'speak' ],
		[ /fpec/, 'spec' ],
		[ /fpee/, 'spee' ],
		[ /fpir/, 'spir' ], // spirit, spiral,
		[ /ft(air|an|at|eem|ep|ill|on|oo|r|ud|y)/, 'st$1' ],
		[ /\bft(\w)/, 'st$1' ],
		[ /fubf/, 'subs' ], // do before fub
		[ /fub/, 'sub' ],
		[ /fucc/, 'succ' ],
		[ /fuch/, 'such' ],
		[ /fued/, 'sued' ],
		[ /\bfu(e|es|ings?)\b/, 'su$1' ],
		[ /fuf(p)/, 'sus$1' ],
		[ /fuff/, 'suff' ],
		[ /fund(?!rais)/, 'sund' ],
		[ /fumm/, 'summ' ], // summit, summary
		[ /fuit/, 'suit' ],
		[ /fuper/, 'super' ],
		[ /fupp/, 'supp' ],
		[ /fu(re|rv)/, 'su$1' ],
		[ /fw(ay|ear|orn)/, 'sw$1' ],
		[ /fyf/, 'sys' ],
		[ /fym/, 'sym' ],
		[ /grefs/, 'gress' ],
		[ /hift/, 'hist' ],
		[ /(?<=[Hh])(ea|o|oa|ou)rf/, '$1rs' ], // house, hearse, horse
		[ /i[sf]cuff/, 'iscuss' ],
		[ /ifh/, 'ish' ],
		[ /ifm\b/, 'ism' ],
		[ /ifo\b/, 'iso' ],
		[ /ifon/, 'ison' ],
		[ /iftic/, 'istic' ],
		[ /([Ii])ffu/, '$1ssu' ],
		[ /illuf/, 'illus' ],
		[ /(I|i)nft/, '$1nst' ],
		[ /\b(?<=i|I)fl/, 'sl' ], // isle, island
		[ /Jefus/, 'Jesus' ],
		[ /(?<=J|j|I|i)urif/, 'uris' ],
		[ /([Jj])uft/, '$1ust' ],
		[ /([Ll])aft/, '$1ast' ], // last, lastly, etc
		[ /lefia/, 'lesia' ],
		[ /([Ll])egif/, '$1egis' ], // legislation...
		[ /([^ie])efs/, '$1ess' ], // -ess
		[ /(?<=l|L)eff/, 'less' ], // -ess-
		[ /lifle/, 'lisle' ],
		[ /lifh/, 'lish' ],
		[ /lufiv/, 'lusiv' ],
		[ /([MmPp])afs\b/, '$1ass' ],
		[ /([Mm])i(fs\b|ff\B)/, '$1iss' ], // miss, missing
		[ /([Mm])i(f\B)/, '$1is' ], // mistake
		[ /Missifippi/, 'Missisippi' ],
		[ /Missiffippi/, 'Mississippi' ],
		[ /([Mm])oft/, '$1ost' ], // keep the original capitalisation
		[ /mongft/, 'mongst' ],
		[ /([Mm])uft/, '$1ust' ], // keep the original capitalisation
		[ /nefe/, 'nese' ],
		[ /nefs/, 'ness' ],
		[ /nfate/, 'nsate' ],
		[ /nfel(?=\b|s|led|l[oe]rs?)/, 'nsel' ],
		[ /nfive/, 'nsive' ],
		[ /oaft/, 'oast' ], // coast, etc
		[ /obf/, 'obs' ],
		[ /([Oo])bfe/, '$1bse' ], // observ
		[ /ofed/, 'osed' ],
		[ /offi/, 'ossi' ], // possible
		[ /ofition/, 'osition' ], // position, etc.
		[ /ofity/, 'osity' ],
		[ /oftil/, 'ostil' ], // hostile
		[ /ouf\b/, 'ous' ],
		[ /oufly/, 'ously' ],
		[ /([Pp])aft/, '$1ast' ],
		[ /hraf/, 'hras' ], // phrase
		[ /paff/, 'pass' ], // pass/age, for pafs, see mafs
		[ /([Pp])leaf/, '$1leas' ],
		[ /([Pp])of(e|t)/, '$1os$2' ], // post, pose, compose...
		[ /(?<=P|p)urfu/, 'ursu' ],
		[ /(?<=R|r)ef([pfs]|en|ume|ump)/, 'es$1' ],
		[ /([Rr])eleaf/, '$1eleas' ],
		[ /(?<=R|r)aif(e|i)/, 'ais$1' ], // raising, raised/r
		[ /\b([Aa]r|[Rr])if([ie])/, '$1is$2' ], // a/rising/ed/es
		[ /rofec/, 'rosec' ], // prosecute
		[ /rofef([sf])/, 'rofess' ],
		[ /rofp/, 'rosp' ],
		[ /urpof/, 'urpos' ],
		[ /([Qq])ueft/, '$1uest' ],
		[ /reafo/, 'reaso' ],
		[ /refea/, 'resea' ],
		[ /refi/, 'resi' ],
		[ /([Tt])afte/, '$1aste' ],
		[ /(?<=T|t)eft/, 'est' ],
		[ /terfect/, 'tersect' ], // intersect, but not perfect, etc
		[ /hefe/, 'hese' ], // these
		[ /([Hh])ofe/, '$1ose' ], // those, whose
		[ /tereft/, 'terest' ],
		[ /traft/, 'trast' ],
		[ /ranf/, 'rans' ], // trans-
		[ /ufe/, 'use' ],
		[ /uftom/, 'ustom' ],
		[ /vaft/, 'vast' ],
		[ /(?<=V|v)erf/, 'ers' ], // verse, versus
		[ /([Vv])eff/, '$1ess' ], // keep the original capitalisation
		[ /verf([eyo])/, 'vers$1' ], // verse, verso -versy
		[ /vife/, 'vise' ], // advise..
		[ /([Vv])ifi/, '$1isi' ],
		[ /ifdom/, 'isdom' ],
		[ /xift/, 'xist' ]
	];
	process_editor( editor, new PartialWordRegexProcessor( long_s_reps ) );
};
/**
 * Expand shorthand templates to their full names and tidy a few bits of
 * markup in the page body, header and footer.
 *
 * @param {Object} editor editor for the page body; must provide forField()
 *  and a chainable replace( regex, replacement )
 */
const template_cleanup = function ( editor ) {
	const header = editor.forField( '#wpHeaderTextbox' );
	const footer = editor.forField( '#wpFooterTextbox' );

	// {{c}} to {{center}}
	editor.replace( /{{c\|/g, '{{center|' );
	header.replace( /{{c\|/g, '{{center|' );
	footer.replace( /{{c\|/g, '{{center|' );

	// {{rh}} to {{RunningHeader}} (also drops a newline directly before it)
	header.replace( /\n?{{rh\|/gi, '{{RunningHeader|' );

	// more cleanup
	editor
		// {{hws}} & {{hwe}} expanded
		.replace( /{{hws\|/g, '{{hyphenated word start|' )
		.replace( /{{hwe\|/g, '{{hyphenated word end|' )
		// {{di}} expanded
		.replace( /{{di\|/g, '{{dropinitial|' )
		// {{hi}} expanded
		.replace( /{{hi\|/g, '{{hanging indent|' )
		// {{sm}} expanded
		.replace( /{{sm\|/g, '{{smaller|' )
		// expand diacritical templates
		// NOTE(review): the literal is split in two, presumably so the wiki
		// does not act on "subst:" when this script page itself is saved
		// eslint-disable-next-line no-useless-concat
		.replace( /{{(ae|oe|\w[:`'~^-])}}/g, '{' + '{subst:$1}}' )
		// convert {{—}} to —
		.replace( /{{—}}/g, '—' );

	// M<sup>c</sup> to {{Mc}}
	editor.replace( /M<sup>c<\/sup>/g, '{{Mc}}' );
	header.replace( /M<sup>c<\/sup>/g, '{{Mc}}' );

	// section tag fix: quote an unquoted section name.
	// The name may be a single character, and whitespace before "/>" is
	// dropped rather than being folded into the quoted name.
	editor.replace( /<section (begin|end)=(\w[^/]*?)\s*\/>/g,
		'<section $1="$2"/>' );

	// refs don't have space before them
	editor.replace( /\s<ref/g, '<ref' );
};
/**
 * Call every user-supplied function in Cleanup.cleanupFunctions, in order,
 * with the body editor and the header and footer field editors.
 *
 * @param {Object} editor editor for the page body; must provide forField()
 */
const do_extra_functions = ( editor ) => {
	const headerField = editor.forField( '#wpHeaderTextbox' );
	const footerField = editor.forField( '#wpFooterTextbox' );
	Cleanup.cleanupFunctions.forEach( ( extraFn ) => {
		extraFn( editor, headerField, footerField );
	} );
};
/**
 * Replace curly quotes with straight ones. A space just inside a curly quote
 * (after an opening one, before a closing one) is removed together with it.
 *
 * @param {Object} editor editor with a chainable replace( regex, replacement )
 */
const do_replaceSmartQuotes = ( editor ) => {
	// Order matters: the spaced forms must be handled before the bare ones.
	const quoteRules = [
		[ /“ /g, '"' ],
		[ / ”/g, '"' ],
		[ /[“”]/g, '"' ],
		[ /‘ /g, "'" ],
		[ / ’/g, "'" ],
		[ /[‘’]/g, "'" ]
	];
	// Each call is made on the value returned by the previous one.
	quoteRules.reduce(
		( target, [ curly, straight ] ) => target.replace( curly, straight ),
		editor
	);
};
/**
 * Join hard-wrapped lines into paragraphs. Pages containing a <poem> tag
 * are left untouched.
 *
 * Steps:
 *  1. If Cleanup.shortLineThreshold > 0, a line shorter than the threshold
 *     that ends in sentence punctuation and is followed by a line that
 *     *starts* with a quote, capital or digit is treated as a paragraph end
 *     and gets an extra line break.
 *  2. Remaining single line breaks become spaces, unless markup sits on
 *     either side of the break.
 *  3. Runs of spaces are collapsed, and `" "` is split into two paragraphs.
 *
 * @param {Object} editor editor with get(), set() and a chainable replace()
 */
const collapse_line_breaks = function ( editor ) {
	// stuff to do only if the page doesn't contain a <poem> tag:
	if ( editor.get().includes( '<poem>' ) ) {
		return;
	}

	// first, a hack! [T230415]
	const short_line_thresh = Cleanup.shortLineThreshold;
	if ( short_line_thresh > 0 ) {
		const ends_sentence = /[.!?'"”’—]\s*$/;
		// Anchored at the line start: the leading \s* only makes sense with
		// "^". Unanchored, any capital or digit anywhere in the next line
		// counted as a new paragraph.
		const starts_sentence = /^\s*['"“‘A-Z0-9]/;
		const lines = editor.get().split( /\r?\n/ );
		for ( let i = 0; i < lines.length - 1; i++ ) {
			if ( lines[ i ].length < short_line_thresh &&
				ends_sentence.test( lines[ i ] ) &&
				starts_sentence.test( lines[ i + 1 ] ) ) {
				// joined with '\n' below, this yields a blank line
				lines[ i ] += '\n';
			}
		}
		editor.set( lines.join( '\n' ) );
	}

	editor
		// remove single line breaks; preserve multiple.
		// not if there's a tag, template, table syntax either side of line break
		.replace( /([^>}\n])\n(?!(?: *\||[{}<]|\n|=|\*|#))/g, '$1 ' )
		// collapse sequences of spaces into a single space
		.replace( / +/g, ' ' )
		// two quotes are probably two lines
		.replace( /" "/g, '"\n\n"' );
};
/**
 * Merge a paragraph into the previous one when it begins with a lowercase
 * letter, since such a break is probably bogus. Pages containing a <poem>
 * tag are skipped.
 *
 * @param {Object} editor editor with get() and replace( regex, replacement )
 */
const collapseSuspiciousParagraphs = ( editor ) => {
	const containsPoem = editor.get().indexOf( '<poem>' ) !== -1;
	if ( containsPoem ) {
		return;
	}
	// two or more line breaks directly before a lowercase letter -> one space
	editor.replace( /\n\n+(?=[a-z])/g, ' ' );
};
/**
 * Wrap abbreviations such as "A. D." in the {{asc}} template, normalised to
 * "A.D." (each letter followed by a full stop, no spaces).
 *
 * For each abbreviation a pattern is built that allows an optional "." or ","
 * and an optional space after every letter. It must be preceded by
 * whitespace (kept) and followed by whitespace (not consumed).
 *
 * NOTE(review): the template name is fixed to 'asc' here and
 * Cleanup.smallAbbrTemplate is not consulted - confirm whether that is
 * intended.
 *
 * @param {Object} editor editor with replace( regex, replacement )
 * @param {string[]} abbr_list abbreviations as bare letters, e.g. 'AD'
 */
const do_small_abbrs = function ( editor, abbr_list ) {
	const smallAbbrTemplate = 'asc';
	for ( const abbr of abbr_list ) {
		const letters = [];
		for ( let pos = 0; pos < abbr.length; pos += 1 ) {
			letters.push( abbr[ pos ] );
		}
		const loosePattern = letters.map( ( ch ) => ch + '[.,]? ?' ).join( '' );
		const dotted = letters.map( ( ch ) => ch + '.' ).join( '' );
		// new word, but not in template
		const matcher = new RegExp( '(\\s)' + loosePattern + '(?=\\s)', 'g' );
		editor.replace( matcher, `$1{{${smallAbbrTemplate}|${dotted}}}` );
	}
};
/**
 * Click the input inside the "quality3" span, i.e. select that page-quality
 * option in the edit form.
 */
const markProofread = () => {
	// eslint-disable-next-line no-jquery/no-global-selector
	const $qualityInput = $( 'span.quality3 input' );
	$qualityInput.trigger( 'click' );
};
/**
 * Overwrite the contents of the edit-summary field (#wpSummary).
 *
 * @param {string} summary_text new summary text
 */
const set_summary = ( summary_text ) => {
	// eslint-disable-next-line no-jquery/no-global-selector
	const $summaryField = $( '#wpSummary' );
	$summaryField.val( summary_text );
};
/**
 * Mark the page as proofread and, when Cleanup.editSummary is non-empty,
 * replace the edit summary with it.
 */
const do_markProofread = () => {
	// if doing cleanup, must be proofreading
	markProofread();

	const summary = Cleanup.editSummary;
	if ( !summary ) {
		return;
	}
	// overwrite whatever summary was there before
	set_summary( summary );
};
/**
 * The main cleanup function: runs every cleanup stage in a fixed order,
 * honouring the switches in the Cleanup configuration object.
 *
 * @param {Object} editor the TemplateScript editor object
 */
function do_cleanup( editor ) {
	// shorthand for running one processor over the editor text
	const apply = function ( processor ) {
		process_editor( editor, processor );
	};

	// --- stages that need the original line breaks ---
	do_pre_collapse_cleanup( editor );
	if ( Cleanup.remove_running_header ) {
		apply( new RunningHeaderProcessor( Cleanup.runningHeaderPatterns ) );
	}

	// --- join lines, so later stages can fix words split across a break ---
	collapse_line_breaks( editor );
	if ( Cleanup.collapseSuspiciousParagraphs ) {
		collapseSuspiciousParagraphs( editor );
	}

	// --- generic cleanup ---
	do_generic_cleanup( editor );

	// --- OCR and scanno fixing ---
	// User-supplied replacements go first: they are easier to write when
	// you don't have to guess what intermediate state the page is in.
	if ( Cleanup.additionalOcrReplacements.length > 0 ) {
		apply( new PartialWordRegexProcessor( Cleanup.additionalOcrReplacements ) );
	}
	do_ocr_fixes( editor );
	do_multiword_fixes( editor );

	// --- italics ---
	if ( Cleanup.italiciseForeign ) {
		do_foreign_italics( editor );
	}
	if ( Cleanup.italicWords.length > 0 ) {
		apply( new ItaliciseProcessor( Cleanup.italicWords ) );
	}

	do_whole_words_reps( editor );

	// --- optional stages ---
	if ( Cleanup.doLongSReplacements ) {
		doLongSReplacements( editor );
	}
	if ( Cleanup.doTemplateCleanup ) {
		template_cleanup( editor );
	}
	if ( Cleanup.replaceSmartQuotes ) {
		do_replaceSmartQuotes( editor );
	}

	do_small_abbrs( editor, Cleanup.smallAbbreviations );

	// --- user-supplied extra functions ---
	do_extra_functions( editor );

	if ( Cleanup.markProofread ) {
		do_markProofread();
	}
}
/**
 * Entry point used for the cleanup action: runs do_cleanup() and logs any
 * exception at ERROR level instead of letting it propagate.
 *
 * @param {Object} editor the TemplateScript editor object
 */
function do_cleanup_wrapper( editor ) {
	log( DEBUG, 'Cleaning up...' );

	try {
		do_cleanup( editor );
	} catch ( cleanupError ) {
		// report the failure; the closing message below is still logged
		log( ERROR, cleanupError );
	}

	log( DEBUG, 'Cleanup done.' );
}
/**
 * Find the index of the first position at which two strings differ.
 *
 * @param {string} a
 * @param {string} b
 * @return {number} index of the first differing character; the length of the
 *  shorter string if one is a proper prefix of the other; -1 if they are
 *  identical
 */
function find_first_diff_pos( a, b ) {
	const commonLength = Math.min( a.length, b.length );

	let pos = 0;
	while ( pos < commonLength && a[ pos ] === b[ pos ] ) {
		pos += 1;
	}

	if ( pos < commonLength ) {
		// stopped on a mismatch inside the common range
		return pos;
	}
	return a.length === b.length ? -1 : commonLength;
}
/**
 * Transpose a list of arrays: row i of the result holds element i of every
 * input array.
 *
 * The result has as many rows as the *longest* input; shorter inputs
 * contribute undefined. (Previously the first array's length was used, so
 * extra elements in later arrays were silently dropped, and an empty list of
 * arrays threw.)
 *
 * @param {Array[]} arrays the arrays to zip together
 * @return {Array[]} one row per index, each with one entry per input array
 */
function zip( arrays ) {
	const rowCount = arrays.reduce( function ( longest, array ) {
		return Math.max( longest, array.length );
	}, 0 );

	const rows = [];
	for ( let i = 0; i < rowCount; i++ ) {
		rows.push( arrays.map( function ( array ) {
			return array[ i ];
		} ) );
	}
	return rows;
}
// Editor text as it was before the last test run; null until a test has run.
let test_test_to_restore = null;

/**
 * Self-test action: run the cleanup on the current text, then fetch the
 * "<page name>/expected" subpage and compare. The editor is outlined green
 * when the texts match and red otherwise; mismatching lines are logged.
 *
 * A missing or unreadable expected page, or a failed API request, is logged
 * and shown as a red outline rather than throwing inside the callback.
 *
 * @param {Object} editor the TemplateScript editor object
 */
function do_cleanup_test( editor ) {
	const text = editor.get();
	test_test_to_restore = text;
	do_cleanup( editor );
	const cleaned = editor.get();

	const show_result = function ( colour ) {
		// eslint-disable-next-line no-jquery/no-global-selector
		$( '.wikiEditor-ui' ).css( 'outline', '2px solid ' + colour );
	};

	// Log every differing line pair, plus the tails from the first difference
	const report_mismatches = function ( expected ) {
		log( ERROR, "Expected text doesn't match!" );
		const pairs = zip( [ expected.split( '\n' ), cleaned.split( '\n' ) ] );
		for ( const pr of pairs ) {
			if ( pr[ 0 ] !== pr[ 1 ] ) {
				log( ERROR, 'Line mismatch' );
				log( ERROR, `Expected: '${pr[ 0 ]}', Got: '${pr[ 1 ]}'` );
				if ( pr[ 0 ] && pr[ 1 ] ) {
					const indx = find_first_diff_pos( pr[ 0 ], pr[ 1 ] );
					log( ERROR, pr[ 0 ].slice( indx ) );
					log( ERROR, pr[ 1 ].slice( indx ) );
				}
			}
		}
	};

	// Load the "expected" subpage and see if the text matches
	mw.loader.using( 'mediawiki.api' ).done( function () {
		const api = new mw.Api();
		api.get( {
			action: 'query',
			titles: mw.config.get( 'wgPageName' ) + '/expected',
			prop: 'revisions',
			rvprop: 'content',
			rvslots: 'main',
			formatversion: 2,
			rvlimit: 1
		} ).done(
			function ( data ) {
				const pages = data && data.query && data.query.pages;
				const page = pages && pages[ 0 ];
				const revision = page && page.revisions && page.revisions[ 0 ];
				const slot = revision && revision.slots && revision.slots.main;
				if ( !slot || typeof slot.content !== 'string' ) {
					// e.g. the /expected subpage does not exist
					log( ERROR, 'Could not read the expected text subpage' );
					show_result( 'red' );
					return;
				}
				const expected = slot.content;
				let colour = 'green';
				if ( expected !== cleaned ) {
					report_mismatches( expected );
					colour = 'red';
				}
				show_result( colour );
			}
		).fail( function ( code ) {
			log( ERROR, `Failed to load the expected text: ${code}` );
			show_result( 'red' );
		} );
	} ); // end using
}
/**
 * Undo a test run: put back the text saved by the last cleanup test (if any)
 * and remove the pass/fail outline from the editor.
 *
 * @param {Object} editor the TemplateScript editor object
 */
function do_cleanup_test_restore( editor ) {
	// Compare against null rather than testing truthiness, so that a saved
	// empty string is restored too.
	if ( test_test_to_restore !== null ) {
		editor.set( test_test_to_restore );
	}
	// eslint-disable-next-line no-jquery/no-global-selector
	$( '.wikiEditor-ui' ).css( 'outline', '' );
}
/**
 * Load the TemplateScript library and register the cleanup action with it.
 * When Cleanup.enableTesting is set, the self-test and restore actions are
 * registered too. A failure to load the library is logged at ERROR level.
 */
function add_templatescript() {
	$.ajax( '//tools-static.wmflabs.org/meta/scripts/pathoschild.templatescript.js', {
		dataType: 'script',
		cache: true
	} ).then( function () {
		const cleanup_entry = {
			name: Cleanup.actionTitle,
			position: 'cursor',
			script: do_cleanup_wrapper,
			enabled: true
		};
		if ( Cleanup.cleanupAccesskey ) {
			cleanup_entry.accessKey = Cleanup.cleanupAccesskey;
		}
		const entries = [
			cleanup_entry
		];
		if ( Cleanup.enableTesting ) {
			entries.push( {
				name: 'Test cleanup',
				script: do_cleanup_test
			} );
			entries.push( {
				name: 'Restore pre-cleanup',
				script: do_cleanup_test_restore
			} );
		}
		// eslint-disable-next-line no-undef
		pathoschild.TemplateScript.add(
			entries, {
				category: Cleanup.portletCategory,
				forNamespaces: Cleanup.activeNamespaces
			} // common fields
		);
	}, function () {
		// Without the library no action can be registered; say so rather
		// than failing silently.
		log( ERROR, 'Failed to load TemplateScript; cleanup actions not added' );
	} );
}
/**
 * Fire the "<signature>.config" hook with the Cleanup object, so hook
 * handlers can adjust it, then register the actions unless Cleanup.enable
 * has been turned off.
 */
function really_run() {
	log( DEBUG, 'Really_run' );

	const configHook = mw.hook( signature + '.config' );
	configHook.fire( Cleanup );

	// checked after the hook has fired, so a handler can disable the gadget
	if ( !Cleanup.enable ) {
		log( DEBUG, 'Cleanup disabled' );
		return;
	}
	add_templatescript();
}
/**
 * Start the gadget at most once: the first call sets Cleanup.started and
 * calls really_run(); later calls do nothing.
 */
function run() {
	if ( !Cleanup.started ) {
		Cleanup.started = true;
		really_run();
	}
}
// Kick off run() once both the 'user' module request and $.ready have
// settled. .always() is used, so run() is called whether the combined
// promise resolves or rejects.
$.when( mw.loader.using( 'user' ), $.ready ).always( run );
// eslint-disable-next-line no-undef
}( jQuery, mediaWiki ) );