/opt/alt/python37/lib64/python3.7
"""Shared support for scanning document type declarations in HTML and XHTML. This module is used as a foundation for the html.parser module. It has no documented public API and should not be used directly. """ import re _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match _declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match _commentclose = re.compile(r'--\s*>') _markedsectionclose = re.compile(r']\s*]\s*>') # An analysis of the MS-Word extensions is available at # http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf _msmarkedsectionclose = re.compile(r']\s*>') del re class ParserBase: """Parser base class which provides some common support methods used by the SGML/HTML and XHTML parsers.""" def __init__(self): if self.__class__ is ParserBase: raise RuntimeError( "_markupbase.ParserBase must be subclassed") def error(self, message): raise NotImplementedError( "subclasses of ParserBase must override error()") def reset(self): self.lineno = 1 self.offset = 0 def getpos(self): """Return current line number and offset.""" return self.lineno, self.offset # Internal -- update line number and offset. This should be # called for each piece of data exactly once, in order -- in other # words the concatenation of all the input strings to this # function should be exactly the entire input. def updatepos(self, i, j): if i >= j: return j rawdata = self.rawdata nlines = rawdata.count("\n", i, j) if nlines: self.lineno = self.lineno + nlines pos = rawdata.rindex("\n", i, j) # Should not fail self.offset = j-(pos+1) else: self.offset = self.offset + j-i return j _decl_otherchars = '' # Internal -- parse declaration (for use by subclasses). def parse_declaration(self, i): # This is some sort of declaration; in "HTML as # deployed," this should only be the document type # declaration ("<!DOCTYPE html...>"). # ISO 8879:1986, however, has more complex # declaration syntax for elements in <!...>, including: # --comment-- # [marked section] # name in the following list: ENTITY, DOCTYPE, ELEMENT, # ATTLIST, NOTATION, SHORTREF, USEMAP, # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM rawdata = self.rawdata j = i + 2 assert rawdata[i:j] == "<!", "unexpected call to parse_declaration" if rawdata[j:j+1] == ">": # the empty comment <!> return j + 1 if rawdata[j:j+1] in ("-", ""): # Start of comment followed by buffer boundary, # or just a buffer boundary. return -1 # A simple, practical version could look like: ((name|stringlit) S*) + '>' n = len(rawdata) if rawdata[j:j+2] == '--': #comment # Locate --.*-- as the body of the comment return self.parse_comment(i) elif rawdata[j] == '[': #marked section # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA # Note that this is extended by Microsoft Office "Save as Web" function # to include [if...] and [endif]. return self.parse_marked_section(i) else: #all other declaration elements decltype, j = self._scan_name(j, i) if j < 0: return j if decltype == "doctype": self._decl_otherchars = '' while j < n: c = rawdata[j] if c == ">": # end of declaration syntax data = rawdata[i+2:j] if decltype == "doctype": self.handle_decl(data) else: # According to the HTML5 specs sections "8.2.4.44 Bogus # comment state" and "8.2.4.45 Markup declaration open # state", a comment token should be emitted. # Calling unknown_decl provides more flexibility though. self.unknown_decl(data) return j + 1 if c in "\"'": m = _declstringlit_match(rawdata, j) if not m: return -1 # incomplete j = m.end() elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ": name, j = self._scan_name(j, i) elif c in self._decl_otherchars: j = j + 1 elif c == "[": # this could be handled in a separate doctype parser if decltype == "doctype": j = self._parse_doctype_subset(j + 1, i) elif decltype in {"attlist", "linktype", "link", "element"}: # must tolerate []'d groups in a content model in an element declaration # also in data attribute specifications of attlist declaration # also link type declaration subsets in linktype declarations # also link attribute specification lists in link declarations self.error("unsupported '[' char in %s declaration" % decltype) else: self.error("unexpected '[' char in declaration") else: self.error( "unexpected %r char in declaration" % rawdata[j]) if j < 0: return j return -1 # incomplete # Internal -- parse a marked section # Override this to handle MS-word extension syntax <![if word]>content<![endif]> def parse_marked_section(self, i, report=1): rawdata= self.rawdata assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()" sectName, j = self._scan_name( i+3, i ) if j < 0: return j if sectName in {"temp", "cdata", "ignore", "include", "rcdata"}: # look for standard ]]> ending match= _markedsectionclose.search(rawdata, i+3) elif sectName in {"if", "else", "endif"}: # look for MS Office ]> ending match= _msmarkedsectionclose.search(rawdata, i+3) else: self.error('unknown status keyword %r in marked section' % rawdata[i+3:j]) if not match: return -1 if report: j = match.start(0) self.unknown_decl(rawdata[i+3: j]) return match.end(0) # Internal -- parse comment, return length or -1 if not terminated def parse_comment(self, i, report=1): rawdata = self.rawdata if rawdata[i:i+4] != '<!--': self.error('unexpected call to parse_comment()') match = _commentclose.search(rawdata, i+4) if not match: return -1 if report: j = match.start(0) self.handle_comment(rawdata[i+4: j]) return match.end(0) # Internal -- scan past the internal subset in a <!DOCTYPE declaration, # returning the index just past any whitespace following the trailing ']'. def _parse_doctype_subset(self, i, declstartpos): rawdata = self.rawdata n = len(rawdata) j = i while j < n: c = rawdata[j] if c == "<": s = rawdata[j:j+2] if s == "<": # end of buffer; incomplete return -1 if s != "<!": self.updatepos(declstartpos, j + 1) self.error("unexpected char in internal subset (in %r)" % s) if (j + 2) == n: # end of buffer; incomplete return -1 if (j + 4) > n: # end of buffer; incomplete return -1 if rawdata[j:j+4] == "<!--": j = self.parse_comment(j, report=0) if j < 0: return j continue name, j = self._scan_name(j + 2, declstartpos) if j == -1: return -1 if name not in {"attlist", "element", "entity", "notation"}: self.updatepos(declstartpos, j + 2) self.error( "unknown declaration %r in internal subset" % name) # handle the individual names meth = getattr(self, "_parse_doctype_" + name) j = meth(j, declstartpos) if j < 0: return j elif c == "%": # parameter entity reference if (j + 1) == n: # end of buffer; incomplete return -1 s, j = self._scan_name(j + 1, declstartpos) if j < 0: return j if rawdata[j] == ";": j = j + 1 elif c == "]": j = j + 1 while j < n and rawdata[j].isspace(): j = j + 1 if j < n: if rawdata[j] == ">": return j self.updatepos(declstartpos, j) self.error("unexpected char after internal subset") else: return -1 elif c.isspace(): j = j + 1 else: self.updatepos(declstartpos, j) self.error("unexpected char %r in internal subset" % c) # end of buffer reached return -1 # Internal -- scan past <!ELEMENT declarations def _parse_doctype_element(self, i, declstartpos): name, j = self._scan_name(i, declstartpos) if j == -1: return -1 # style content model; just skip until '>' rawdata = self.rawdata if '>' in rawdata[j:]: return rawdata.find(">", j) + 1 return -1 # Internal -- scan past <!ATTLIST declarations def _parse_doctype_attlist(self, i, declstartpos): rawdata = self.rawdata name, j = self._scan_name(i, declstartpos) c = rawdata[j:j+1] if c == "": return -1 if c == ">": return j + 1 while 1: # scan a series of attribute descriptions; simplified: # name type [value] [#constraint] name, j = self._scan_name(j, declstartpos) if j < 0: return j c = rawdata[j:j+1] if c == "": return -1 if c == "(": # an enumerated type; look for ')' if ")" in rawdata[j:]: j = rawdata.find(")", j) + 1 else: return -1 while rawdata[j:j+1].isspace(): j = j + 1 if not rawdata[j:]: # end of buffer, incomplete return -1 else: name, j = self._scan_name(j, declstartpos) c = rawdata[j:j+1] if not c: return -1 if c in "'\"": m = _declstringlit_match(rawdata, j) if m: j = m.end() else: return -1 c = rawdata[j:j+1] if not c: return -1 if c == "#": if rawdata[j:] == "#": # end of buffer return -1 name, j = self._scan_name(j + 1, declstartpos) if j < 0: return j c = rawdata[j:j+1] if not c: return -1 if c == '>': # all done return j + 1 # Internal -- scan past <!NOTATION declarations def _parse_doctype_notation(self, i, declstartpos): name, j = self._scan_name(i, declstartpos) if j < 0: return j rawdata = self.rawdata while 1: c = rawdata[j:j+1] if not c: # end of buffer; incomplete return -1 if c == '>': return j + 1 if c in "'\"": m = _declstringlit_match(rawdata, j) if not m: return -1 j = m.end() else: name, j = self._scan_name(j, declstartpos) if j < 0: return j # Internal -- scan past <!ENTITY declarations def _parse_doctype_entity(self, i, declstartpos): rawdata = self.rawdata if rawdata[i:i+1] == "%": j = i + 1 while 1: c = rawdata[j:j+1] if not c: return -1 if c.isspace(): j = j + 1 else: break else: j = i name, j = self._scan_name(j, declstartpos) if j < 0: return j while 1: c = self.rawdata[j:j+1] if not c: return -1 if c in "'\"": m = _declstringlit_match(rawdata, j) if m: j = m.end() else: return -1 # incomplete elif c == ">": return j + 1 else: name, j = self._scan_name(j, declstartpos) if j < 0: return j # Internal -- scan a name token and the new position and the token, or # return -1 if we've reached the end of the buffer. def _scan_name(self, i, declstartpos): rawdata = self.rawdata n = len(rawdata) if i == n: return None, -1 m = _declname_match(rawdata, i) if m: s = m.group() name = s.strip() if (i + len(s)) == n: return None, -1 # end of buffer return name.lower(), m.end() else: self.updatepos(declstartpos, i) self.error("expected name token at %r" % rawdata[declstartpos:declstartpos+20]) # To be overridden -- handlers for unknown objects def unknown_decl(self, data): pass
.
Edit
..
Edit
__future__.py
Edit
__phello__.foo.py
Edit
__pycache__
Edit
_bootlocale.py
Edit
_collections_abc.py
Edit
_compat_pickle.py
Edit
_compression.py
Edit
_dummy_thread.py
Edit
_markupbase.py
Edit
_osx_support.py
Edit
_py_abc.py
Edit
_pydecimal.py
Edit
_pyio.py
Edit
_sitebuiltins.py
Edit
_strptime.py
Edit
_sysconfigdata_dm_linux_x86_64-linux-gnu.py
Edit
_sysconfigdata_m_linux_x86_64-linux-gnu.py
Edit
_threading_local.py
Edit
_weakrefset.py
Edit
abc.py
Edit
aifc.py
Edit
antigravity.py
Edit
argparse.py
Edit
ast.py
Edit
asynchat.py
Edit
asyncio
Edit
asyncore.py
Edit
base64.py
Edit
bdb.py
Edit
binhex.py
Edit
bisect.py
Edit
bz2.py
Edit
cProfile.py
Edit
calendar.py
Edit
cgi.py
Edit
cgitb.py
Edit
chunk.py
Edit
cmd.py
Edit
code.py
Edit
codecs.py
Edit
codeop.py
Edit
collections
Edit
colorsys.py
Edit
compileall.py
Edit
concurrent
Edit
config-3.7m
Edit
configparser.py
Edit
contextlib.py
Edit
contextvars.py
Edit
copy.py
Edit
copyreg.py
Edit
crypt.py
Edit
csv.py
Edit
ctypes
Edit
curses
Edit
dataclasses.py
Edit
datetime.py
Edit
dbm
Edit
decimal.py
Edit
difflib.py
Edit
dis.py
Edit
distutils
Edit
doctest.py
Edit
dummy_threading.py
Edit
email
Edit
encodings
Edit
ensurepip
Edit
enum.py
Edit
filecmp.py
Edit
fileinput.py
Edit
fnmatch.py
Edit
formatter.py
Edit
fractions.py
Edit
ftplib.py
Edit
functools.py
Edit
genericpath.py
Edit
getopt.py
Edit
getpass.py
Edit
gettext.py
Edit
glob.py
Edit
gzip.py
Edit
hashlib.py
Edit
heapq.py
Edit
hmac.py
Edit
html
Edit
http
Edit
idlelib
Edit
imaplib.py
Edit
imghdr.py
Edit
imp.py
Edit
importlib
Edit
inspect.py
Edit
io.py
Edit
ipaddress.py
Edit
json
Edit
keyword.py
Edit
lib-dynload
Edit
lib2to3
Edit
linecache.py
Edit
locale.py
Edit
logging
Edit
lzma.py
Edit
macpath.py
Edit
mailbox.py
Edit
mailcap.py
Edit
mimetypes.py
Edit
modulefinder.py
Edit
multiprocessing
Edit
netrc.py
Edit
nntplib.py
Edit
ntpath.py
Edit
nturl2path.py
Edit
numbers.py
Edit
opcode.py
Edit
operator.py
Edit
optparse.py
Edit
os.py
Edit
pathlib.py
Edit
pdb.py
Edit
pickle.py
Edit
pickletools.py
Edit
pipes.py
Edit
pkgutil.py
Edit
platform.py
Edit
plistlib.py
Edit
poplib.py
Edit
posixpath.py
Edit
pprint.py
Edit
profile.py
Edit
pstats.py
Edit
pty.py
Edit
py_compile.py
Edit
pyclbr.py
Edit
pydoc.py
Edit
pydoc_data
Edit
queue.py
Edit
quopri.py
Edit
random.py
Edit
re.py
Edit
reprlib.py
Edit
rlcompleter.py
Edit
runpy.py
Edit
sched.py
Edit
secrets.py
Edit
selectors.py
Edit
shelve.py
Edit
shlex.py
Edit
shutil.py
Edit
signal.py
Edit
site-packages
Edit
site.py
Edit
smtpd.py
Edit
smtplib.py
Edit
sndhdr.py
Edit
socket.py
Edit
socketserver.py
Edit
sqlite3
Edit
sre_compile.py
Edit
sre_constants.py
Edit
sre_parse.py
Edit
ssl.py
Edit
stat.py
Edit
statistics.py
Edit
string.py
Edit
stringprep.py
Edit
struct.py
Edit
subprocess.py
Edit
sunau.py
Edit
symbol.py
Edit
symtable.py
Edit
sysconfig.py
Edit
tabnanny.py
Edit
tarfile.py
Edit
telnetlib.py
Edit
tempfile.py
Edit
test
Edit
textwrap.py
Edit
this.py
Edit
threading.py
Edit
timeit.py
Edit
token.py
Edit
tokenize.py
Edit
trace.py
Edit
traceback.py
Edit
tracemalloc.py
Edit
tty.py
Edit
types.py
Edit
typing.py
Edit
unittest
Edit
urllib
Edit
uu.py
Edit
uuid.py
Edit
venv
Edit
warnings.py
Edit
wave.py
Edit
weakref.py
Edit
webbrowser.py
Edit
wsgiref
Edit
xdrlib.py
Edit
xml
Edit
xmlrpc
Edit
zipapp.py
Edit
zipfile.py
Edit