ソース
# Google Colab only: mount the user's Google Drive at ./drive so the script
# can read its input from, and write its output to, Drive.
# NOTE(review): the file paths used later are absolute
# ('/content/drive/My Drive/...'); this presumably relies on the notebook's
# working directory being /content -- confirm.
from google.colab import drive
drive.mount('./drive')
import re
import unicodedata
def remove_control_characters(s: str) -> str:
    """Return *s* with every character of Unicode general category "C*" removed.

    A character is dropped when the first letter of its
    ``unicodedata.category`` is ``"C"`` (Cc, Cf, Cs, Co, Cn).  This includes
    line terminators such as ``"\\n"`` and ``"\\r"``, so a line read from a
    file comes back without its trailing newline.

    Args:
        s: The text to clean.

    Returns:
        A new string containing only the characters that were kept, in their
        original order.
    """
    return "".join(ch for ch in s if not unicodedata.category(ch).startswith("C"))
# Ordered (compiled pattern, replacement) rules used by strset().  The order
# matters: whole elements are dropped first, then generic tags are unwrapped,
# and the whitespace clean-up runs last.  Raw strings are used so that regex
# escapes such as \s are not treated as (invalid) Python string escapes.
_STRSET_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Elements removed together with their content.
        (r'<blockquote[\s\S]*?</blockquote>?', ''),
        (r'<pre[\s\S]*?</pre>?', ''),
        (r'<table[\s\S]*?</table>?', ''),
        (r'<img [^>]+>?', ''),
        # Links keep their inner text only.
        (r'<a [^>]+>([\s\S]*?)</a>?', r'\1'),
        (r'<iframe [^>]+>[\s\S]*?</iframe>?', ''),
        (r'<cite[\s\S]*?</cite>?', ''),
        (r'<video[\s\S]*?</video>?', ''),
        # Any other tag pair is unwrapped.  The closing part matches a run of
        # printable ASCII after "</", so ASCII text that directly follows the
        # closing tag is consumed as well.
        (r'<[^>]+?>([\s\S]*?)</[ -~]+>?', r'\1'),
        # Line breaks and any leftover tags become newlines.
        (r'<br />', '\n'),
        (r'<[^>]+>?', '\n'),
        # Drop the first line when it is empty or printable ASCII only.  There
        # is no MULTILINE flag, so "^" anchors at the start of the string only.
        (r'^[ -~]*\n', ''),
        # Collapse runs of blank lines.
        (r'\n{2,}', '\n'),
    )
)


def strset(outstr: str) -> str:
    """Strip HTML markup from *outstr* and return the remaining text.

    The rules in ``_STRSET_RULES`` are applied in order:

    * ``blockquote``, ``pre``, ``table``, ``img``, ``iframe``, ``cite`` and
      ``video`` elements are removed entirely;
    * ``a`` elements and other tag pairs are replaced by their inner text;
    * ``<br />`` and any remaining tags are replaced by a newline;
    * a leading line that is empty or printable ASCII only is removed;
    * two or more consecutive newlines are collapsed into one.

    Args:
        outstr: Text that may contain HTML markup.

    Returns:
        The text with the markup removed.
    """
    for pattern, replacement in _STRSET_RULES:
        outstr = pattern.sub(replacement, outstr)
    return outstr
# ---------------------------------------------------------------------------
# Main script: turn the exported blog posts into a plain-text training file.
#
# NOTE(review): the markers handled below ("AUTHOR:", "CATEGORY:", "COMMENT:",
# "-----", "--------") look like the Movable Type export format -- confirm.
# ---------------------------------------------------------------------------

# outFlg : True while the current section's lines may be emitted.
# postFlg: False while the current post is being skipped because of its
#          category.
outFlg = False
postFlg = True

# Read the whole export and strip the HTML markup in one pass.
with open('/content/drive/My Drive/blogpost.txt', 'r', encoding="utf-8") as f:
    t = f.read()
t = strset(t)

# The cleaned text is written to tmp.txt and read back line by line.
with open('tmp.txt', mode='w+', encoding="utf-8") as f:
    f.write(t)

# Hoisted out of the loop: both patterns are loop-invariant.
_skip_category = re.compile(r'^CATEGORY: [C-Wptv].*')
_ascii_only = re.compile(r'[ -~]+')

# Collect the output pieces in a list and join once at the end instead of
# growing a string inside the loop.
_pieces = []
with open('tmp.txt', encoding="utf-8") as f:
    for line in f:
        # Also removes the trailing newline, so the marker comparisons below
        # can use exact equality.
        line = remove_control_characters(line)
        if line.startswith("COMMENT:"):
            # Comment sections are never emitted.
            outFlg = False
        elif line.startswith("AUTHOR:"):
            # A new post starts: stop emitting and reset the category skip.
            outFlg = False
            postFlg = True
        elif _skip_category.fullmatch(line):
            # Category whose name starts with C-W, p, t or v: skip this post.
            outFlg = False
            postFlg = False
        elif line == '-----':
            # Section separator: the following lines may be emitted.
            outFlg = True
        elif line == '--------':
            # Post separator: re-enable output and record a topic-change
            # marker, which is turned into <|endoftext|> below.
            postFlg = True
            outFlg = True
            _pieces.append('\n話題変更\n')
        elif postFlg and outFlg:
            # Keep only lines that are not made up purely of printable ASCII
            # (empty lines are kept and collapsed later).
            if _ascii_only.fullmatch(line) is None:
                _pieces.append(strset(line) + '\n')

outstr = "".join(_pieces)
# Remove leftover hexadecimal character references, collapse blank lines and
# replace the topic-change marker with the end-of-text token.
outstr = re.sub(r'&#x[0-9A-F]+;', '', outstr)
outstr = re.sub(r'\n{2,}', '\n', outstr)
outstr = re.sub('\n話題変更\n', '<|endoftext|>\n', outstr)

with open('/content/drive/My Drive/blogpostData.txt', mode='w+', encoding="utf-8") as f:
    f.write(outstr)
何も考えないで実行したんで、削除したカテゴリが
Excel/ Outlook/ vba/ Word/ Microsoft Office/ python3/ PowerPoint/ iPad Pro/ iPhone/ DQ11/ JavaScript/ jQuery/ C#/ wordpress/ MHW/ twitter/ php/ google/ FSO/ WSH
意図しないものも消えてます(;´Д`)