User:SgtLion/PathfinderHTMLtoWikiPage

From D&D Wiki

Jump to: navigation, search

Since there's no reasonable place to put this, I'll stick it on another Userpage. This is Python 3.x code that will take the HTML of a page such as a Pathfinder Reference Document (PRD) page and turn it into a complete wiki page, tables and all. Regex is fun.

To Use[edit]

  1. Save the HTML section you need converted into a file called "input.txt", in the same folder as the script
  2. Run the python script (open command prompt, type "python <filenameofthiscode>.py")
  3. output.txt will be ready to paste into dandwiki

The only caveat is that the footer template added at the bottom ({{PFSRD Footer}}) will likely need changing depending on the article. I can't regex the intent of a page, sadly.

import re, os

def finddir():
    """Return the directory that contains this script, ending in a separator.

    The result is meant to be concatenated directly with a file name
    (``finddir() + 'input.txt'``), so it always ends with the platform's
    path separator.

    The earlier implementation counted backslashes by hand, which only
    worked for Windows-style paths; on other platforms it returned an empty
    string and files were resolved against the current working directory.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    # Joining with '' appends a trailing separator only when one is missing
    # (e.g. it leaves a drive or filesystem root untouched).
    return os.path.join(script_dir, '')

# Load the HTML to convert from input.txt; bytes that cannot be decoded are
# dropped (errors='ignore').
with open(finddir()+'input.txt','r',errors='ignore') as inpfile:
    inp = inpfile.read()

buffer = ''
print(inp.count("<blockquote>"))

# Flatten each <blockquote>...</blockquote> pair: the tags are removed and
# every line break inside gets a ':' prefix (wiki indentation).
for a in range(inp.count("<blockquote>")): #For every blockquote block there is
    open_at = inp.find("<blockquote>")
    # Search for the closing tag *after* the opening one.  Searching from the
    # start of the text could return a stray earlier tag, or -1 when the tag
    # is missing, and either case made the slices below duplicate text.
    close_at = inp.find("</blockquote>", open_at + 12)
    if close_at == -1:
        # Unbalanced markup: leave the remainder untouched.
        break
    buffer = inp[open_at+12:close_at]
    print('---------------\n' + buffer + '---------------\n')
    buffer = buffer.replace('\n', '\n:')
    inp = inp[:open_at] + buffer + inp[close_at+13:]

# ---------------------------------------------------------------------------
# Tag conversion chain.  Every statement rewrites `inp` in place and later
# statements rely on what earlier ones produced or removed, so the order of
# these lines matters.
# ---------------------------------------------------------------------------

# -- Headings --
# Text of a stat-block-title paragraph is wrapped in '====' markers; the
# class attribute is consumed and the <p>/</p> tags are left for later.
inp = re.sub(' class="stat\-block\-title">([ a-zA-Z0-9()="\',#./-]+)(?=</p>)',    r"> ====\1====",          inp)
# Opening and closing h1/h2/h3 tags become '==', '===' and '===='.
inp = re.sub('<\/*h1[ a-zA-Z0-9="\'#./\-\(\)\+&]*>',    '==',                                               inp)
inp = re.sub('<\/*h2[ a-zA-Z0-9="\'#./\-\(\)\+&]*>',    '===',                                              inp)
inp = re.sub('<\/*h3[ a-zA-Z0-9="\'#./\-\(\)\+&]*>',    '====',                                             inp)
# Only the first two '==' occurrences are shortened to '='.
# NOTE(review): presumably this turns the page's first heading into a
# level-1 title -- confirm.
inp = inp.replace('==',                                 '=',                                                2)
# Remove tabs/spaces that sit directly in front of a '==' run.
inp = re.sub('[\t ]+(?===)',                            '',                                                 inp)

# -- Inline markup --
inp = re.sub('<br[ a-zA-Z0-9="#./-]*>',                 "\n",                                               inp)
inp = re.sub('</*blockquote[ a-zA-Z0-9="#./-]*>',       "",                                                 inp)
# The <b ...> pattern's character class would also match <br> and
# <blockquote>, which is why those two are handled before it.
inp = re.sub('<b[ a-zA-Z0-9="#./-]*>',                  "\n'''",                                            inp)
inp = re.sub('</b[ a-zA-Z0-9="#./-]*>',                 "'''",                                              inp)
inp = re.sub('<i[ a-zA-Z0-9="#./-]*>',                  "\n''",                                             inp)
inp = re.sub('</i *>',                                  "''",                                               inp)
inp = re.sub('</*em>',                                  "''",                                               inp)
inp = re.sub('</*strong>',                              "'''",                                              inp)

# -- Links and wrappers --
# The text immediately preceding a </a> becomes a [[PFSRD:text|text]] link;
# the <a ...> and </a> tags themselves are then deleted.
inp = re.sub('([ a-zA-Z0-9()="\',#./-]+)(?=</a>)',      r"[[PFSRD:\1|\1]]",                                 inp)
inp = re.sub('</*a[ a-zA-Z0-9=()"\'#_.,/-]*>',          "",                                                 inp)
inp = re.sub('</*div[ a-zA-Z0-9="#./-]*>',              "",                                                 inp)
inp = re.sub('</*nobr[ a-zA-Z0-9="#./-]*>',             "",                                                 inp)
# Strip indentation (tabs/spaces) at the start of every line.
inp = re.sub('\n[\t ]+',                                '\n',                                               inp)

# -- Tables --
inp = re.sub('</*thead[ a-zA-Z0-9="#./-]*>',            "",                                                 inp)
# An opening <tbody> becomes a row separator; the tbody tags that remain
# afterwards (the closing ones) are dropped.
inp = re.sub('<tbody[ a-zA-Z0-9="#./-]*>',              "|-",                                               inp)
inp = re.sub('</*tbody[ a-zA-Z0-9="#./-]*>',            "",                                                 inp)
inp = re.sub('</*tfoot[ a-zA-Z0-9="#./-]*>',            "",                                                 inp)
# <table> opens a wiki table of class "pathfinder" followed by a '|+'
# caption marker.
inp = re.sub('<table[ a-zA-Z0-9=";:#./-]*>',            '\n\n{|class="pathfinder"\n|+',                     inp)
# Caption text is wrapped as <div>{{#anc:...}}</div> and followed by '|-'.
inp = re.sub('<caption[ a-zA-Z0-9="#./-]*>',            '\n<div>{{#anc:',                                   inp)
inp = re.sub('</caption *>',                            '}}</div>\n|-',                                     inp)
# A </tr> followed (after newlines) by <tr> becomes a row break plus the
# first cell marker.
inp = re.sub('</tr *>\n\n*<tr *>',                      "\n|-\n|",                                          inp)
# Adjacent cells are separated with '||'; the following <td is kept so that
# the attribute-handling rules further down can still see it.
inp = re.sub('</td *><td',                              "||<td",                                            inp)
inp = re.sub('</*td *>',                                "",                                                 inp)
inp = re.sub('</tr[ a-zA-Z0-9="\'#./-]*>',              "",                                                 inp)
inp = re.sub('</table>',                                "\n|}",                                             inp)
# Any <tr> still present at this point becomes a '!' header marker.
inp = re.sub('<tr[ a-zA-Z0-9="\'#./-]*>',               "!",                                                inp)
inp = re.sub('</th *><th *>',                           "\n!",                                              inp)
inp = re.sub('<th *>',                                  "!",                                                inp)
inp = re.sub('</th *>',                                 "",                                                 inp)
# Cells carrying colspan/rowspan keep their attributes in wiki form
# ("attributes | ").  In the third rule the attribute group is optional
# because of its trailing '*'.
inp = re.sub('<td +(colspan *= *"[0-9]0*"[ a-zA-Z0-9=:;"#./-]*)>',           r"\1 | ",                      inp)
inp = re.sub('<th +(rowspan *= *"[0-9]0*"[ a-zA-Z0-9=:;"#./-]*)>',           r"\n! \1 | ",                  inp)
inp = re.sub('<th +(colspan *= *"[0-9]0*"[ a-zA-Z0-9=:;"#./-]*)*>',           r"\n! \1 | ",                 inp)
inp = re.sub('<th>',                                    '!',                                                inp)
# Remove a '!' that is directly followed by a line break.
inp = re.sub('!\n',                                     "",                                                 inp)
# Remaining <td ...> tags (those with other attributes) are dropped.
inp = re.sub('<td[ :;a-zA-Z0-9="#./-]*>',               "",                                                 inp)

# -- Lists --
inp = re.sub('<li[ a-zA-Z0-9=";:#./-]*>',               '*',                                                inp)
inp = re.sub('</li[ a-zA-Z0-9=";:#./-]*>',              '',                                                 inp)
inp = re.sub('</*ul>',                                  '',                                                 inp)

# -- Whitespace tidy-up after the conversions above --
inp = re.sub('\n\n\n',                                  "\n\n",                                             inp)
# Put the first header row on its own line after the '|+' caption marker.
inp = re.sub('\|\+\!',                                  "|+\n!",                                            inp)
# Pull an italic opener back onto the bullet / preceding text, undoing the
# line break that the <i> rule inserted.
inp = re.sub("\*\n\'\'",                                "*''",                                              inp)
inp = re.sub(" \n''",                                   " ''",                                              inp)
inp = re.sub('\n ',                                     '\n',                                               inp)
# Drop the line break that directly follows an opening <p ...> tag, keeping
# the tag itself.  The replacement must be the raw string r'\1' (a group
# backreference); the plain literal '\1' is the control character U+0001,
# which replaced the tag and ended up in the output.
inp = re.sub('(<p[\' a-zA-Z0-9="#.\'/,\-\*]*>)\n',      r'\1',                                              inp)
# Every opening or closing <p ...> tag then becomes a single line break.
inp = re.sub('</*p[ a-zA-Z0-9\*=\'"#.,+/\)\:\(\-]*>',   '\n',                                               inp)
# Wrap the page in the header and footer templates, then normalise the
# whitespace between each template and the page body.
inp = '{{OGL Top}}\n' + inp + '\n{{PFSRD Footer}}'
inp = re.sub('{{OGL Top}}[\n ]+',                       "{{OGL Top}}\n",                                    inp)
inp = re.sub('\n+{{PFSRD Footer}}',                     "\n{{PFSRD Footer}}",                               inp)
# Remove the U+FFFD replacement character.
inp = re.sub('�',                                       '',                                                 inp)
# Ability-type tags: a link that swallowed the parentheses is rewritten so
# the parentheses sit outside it, then bare (Sp)/(Su)/(Ex) are linked too.
inp = re.sub('\[\[PFSRD\:\(Ex\)\|\(Ex\)\]\]',           "([[PFSRD:Ex|Ex]])",                                inp)
inp = re.sub('\[\[PFSRD\:\(Su\)\|\(Su\)\]\]',           "([[PFSRD:Su|Su]])",                                inp)
inp = re.sub('\[\[PFSRD\:\(Sp\)\|\(Sp\)\]\]',           "([[PFSRD:Sp|Sp]])",                                inp)
inp = re.sub('\(Sp\)',                                  "([[PFSRD:Sp|Sp]])",                                inp)
inp = re.sub('\(Su\)',                                  "([[PFSRD:Su|Su]])",                                inp)
inp = re.sub('\(Ex\)',                                  "([[PFSRD:Ex|Ex]])",                                inp)
# A bullet character followed by two spaces and a line break becomes a wiki
# list marker.
inp = re.sub('•  \n',                    "* ",                                               inp)
# Strip the copyright notice (matched as one three-line pattern).
inp = re.sub('''©2002-2016 Paizo, Inc.®
Paizo Inc., Paizo, Pathfinder, and the Pathfinder logo are registered trademarks of Paizo Inc., and
Pathfinder Roleplaying Game and Pathfinder Campaign Setting are trademarks of Paizo Inc.''','',             inp)
# Strip the links generated from the "Report a Problem" and
# "Open Game License" anchors.
inp = re.sub('\[\[PFSRD\:Report a Problem\|Report a Problem\]\]', '',                                       inp)
inp = re.sub('\[\[PFSRD\:Open Game License\|Open Game License\]\]\.', '',                                   inp)
# Collapse runs of three or more line breaks down to two.
inp = re.sub('\n\n\n+',                                 "\n\n",                                             inp)
# Ordinals that start a line are pulled up into one row: '1st' (after a
# '|' line) starts a header cell and 2nd-9th are appended as '||' cells,
# each with a superscript suffix.
# NOTE(review): the 2nd-9th rules match any line that begins with that
# ordinal, not only table rows -- confirm that is acceptable.
inp = re.sub('\|\n1st',                                 '! 1<sup>st</sup>',                                 inp)
inp = re.sub('\n2nd',                                   ' || 2<sup>nd</sup>',                               inp)
inp = re.sub('\n3rd',                                   ' || 3<sup>rd</sup>',                               inp)
inp = re.sub('\n4th',                                   ' || 4<sup>th</sup>',                               inp)
inp = re.sub('\n5th',                                   ' || 5<sup>th</sup>',                               inp)
inp = re.sub('\n6th',                                   ' || 6<sup>th</sup>',                               inp)
inp = re.sub('\n7th',                                   ' || 7<sup>th</sup>',                               inp)
inp = re.sub('\n8th',                                   ' || 8<sup>th</sup>',                               inp)
inp = re.sub('\n9th',                                   ' || 9<sup>th</sup>',                               inp)
# A '!' right after a row separator becomes '|'; a row whose first cell
# marker is immediately followed by a line break loses that marker; doubled
# '!' collapse to one.
inp = re.sub('\|-\n!',                                  '|-\n|',                                            inp)
inp = re.sub('\|-\n\|\n',                               '|-\n',                                             inp)
inp = re.sub('!!',                                      '!',                                                inp)
# Turn each parenthesised ability-score abbreviation, e.g. "(Str)", into a
# PFSRD link with the parentheses kept outside: "([[PFSRD:Str|Str]])".
for ability in ('Str', 'Dex', 'Con', 'Wis', 'Cha', 'Int'):
    inp = re.sub(r'\({0}\)'.format(ability), '([[PFSRD:{0}|{0}]])'.format(ability), inp)
# -- Fix-ups for bold/italic markers that ended up inside link brackets --
# Move a leading ''' or '' from just inside "[[PFSRD:" to in front of the
# link, and a trailing one from just before "]]" to after it.
inp = re.sub(r"\[\[PFSRD\:'''",                         r"'''[[PFSRD:",                                     inp)
inp = re.sub(r"\[\[PFSRD\:''",                          r"''[[PFSRD:",                                      inp)
inp = re.sub(r"''']]",                                  r"]]'''",                                           inp)
inp = re.sub(r"'']]",                                   r"]]''",                                            inp)
# Remove apostrophe runs on both sides of a pipe (at least one apostrophe is
# required on each side for the rule to match).
inp = re.sub("''*\|''*",                                "|",                                                inp)
# Join a line consisting only of ''' with the line that follows it.
inp = re.sub("\n'''\n",                                 "\n'''",                                            inp)
inp = re.sub("\n\|\!",                                  "\n!",                                              inp)
# Delete an italic pair that wraps nothing but ", " at the start of a line.
inp = re.sub("\n\'\'\,\ \'\'",                          "",                                                 inp)
# Remaining bullet characters (U+2022) become wiki list markers.
inp = re.sub(u'\u2022 ',                                '* ',                                               inp)
# An italic opener placed before a list marker is moved after it.
inp = re.sub("\'\'\* ",                                 "* ''",                                             inp)
inp = re.sub('</*span[ a-zA-Z0-9="#./-]*>',             "",                                                 inp)
# Italic text that was pushed onto a new line after a cell or row separator
# is pulled back; after a row separator a '|' cell marker is added.
inp = re.sub("\|\|\n''",                                "||''",                                             inp)
inp = re.sub("\|\-\n''",                                "|-\n|''",                                          inp)
# Six apostrophes in a row collapse to three.
inp = re.sub("''''''",                                  "'''",                                              inp)
# Where a five-apostrophe (bold italic) opener is followed by text and only
# a two-apostrophe closer, the closer is widened to five apostrophes.
inp = re.sub("'''''([ a-zA-Z]*'?[ a-zA-Z\[\]\:\|\(\)]*)''([ a-zA-Z\*=\"#.,+/\)\:\(\-])", r"'''''\1'''''\2", inp)
# Remove spaces that follow a line break.
inp = re.sub("\n *",                                    "\n",                                               inp)
# Drop any line that consists only of '''.
inp = re.sub("\n'''\n",                                 "\n",                                               inp)
# NOTE(review): in the published listing this pattern was a string literal
# containing a raw line break, which is a SyntaxError and stops the whole
# script from running.  The original pattern appears to have been lost when
# the page was rendered.  Matching "\n" would strip every line break from the
# output, so this assumes the target was a carriage return, either as a raw
# character or as the HTML entity "&#13;" -- confirm against the author's
# original script.
inp = re.sub("&#13;|\r",                              "",                                                 inp)
# As written, the pattern and the replacement are the same text ("[[["), so
# this substitution changes nothing.
# NOTE(review): the replacement may have been altered when the page was
# rendered -- confirm the intended output.
inp = re.sub("\[\[\[",                                  "[[[",                             inp)
# Remove spaces that come directly before a full stop or a semicolon.
inp = re.sub("\ +\.",                                   ".",                                                inp)
inp = re.sub("\ +\;",                                   ";",                                                inp)
# Remove the line breaks that directly follow a colon, so the next line's
# text continues on the same line.
inp = re.sub(":\n+",                                    ":",                                                inp)
# -- Category tagging --
# Append a "[[Category:<Class> <level>]]" line for every "<class> <digit>"
# pair found in the page text.
for class_name in ['bard','sorcerer/wizard','cleric','druid','ranger','alchemist','antipaladin','bloodrager','inquisitor','magus','paladin','shaman','summoner','witch','elementalist wizard']:
    for level in ['0','1','2','3','4','5','6','7','8','9']:
        # The class name must not be the tail of a longer word.  A plain
        # substring test let "antipaladin 2" also add a Paladin category.
        if re.search('(?<![A-Za-z])' + re.escape(class_name + ' ' + level), inp):
            inp = inp + '\n[[Category:' + class_name.title() + ' ' + level + ']]'
# A school name within the first 200 characters adds its School category.
for school in ['evocation','abjuration','enchantment','divination','conjuration','necromancy','universal','illusion']:
    if school in inp[:200]:
        inp = inp + '\n[[Category:' + school.title() + ' School]]'
# A bracketed descriptor within the first 200 characters adds an Effect
# category named after the descriptor without its brackets.
for descriptor in ['[light]']:
    if descriptor in inp[:200]:
        inp = inp + '\n[[Category:' + descriptor.strip('[').strip(']').title() + ' Effect]]'
# This category line is appended unconditionally.
inp = inp + '\n[[Category:None]]'
# Echo the converted page to the console as a best-effort preview.  Console
# failures (e.g. UnicodeEncodeError, which is a ValueError, or a closed or
# broken stdout) must not prevent the file from being written, but a bare
# "except:" would also swallow KeyboardInterrupt and SystemExit.
try:
    print(inp)
except (ValueError, OSError):
    pass
# Write the result to output.txt in the directory returned by finddir().
with open(finddir()+'output.txt','w') as outfile:
    outfile.write(inp)
    
''' PathfinderHTMLtoWikiPage is a simple HTML to mediawiki format conversion program.
    Copyright (C) 2014, 2015, 2016, 2017 SgtLion

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>. -SgtLion'''
Personal tools
Home of user-generated,
homebrew pages!
system reference documents
admin area
Terms and Conditions for Non-Human Visitors