Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
273 changes: 267 additions & 6 deletions htmltree/htmltree.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
# -*- coding: utf-8 -*-

# __pragma__('kwargs')
# __pragma__('xglobs')
# __pragma__('tconv')
# __pragma__('opov')

"""
Description: Provides a general html tree class, HtmlElement and wrapper
functions for most standard non-obsolete HTML tags.
Expand All @@ -22,7 +28,7 @@
Copyright 2017 Ellis & Grant, Inc.
License: MIT License
"""
# __pragma__('kwargs')


def KWElement(tag, *content, **attrs):
"""
Expand Down Expand Up @@ -1053,6 +1059,253 @@ def Textarea(*content, **attrs):
"""
return KWElement('textarea', *content, **attrs)

#######################################################################
## HTML parser
#######################################################################

# Global variables
__tagsDict = None

def __buildTagsDict():
"""
Internal function for building a mapping of all available tags.
"""
global __tagsDict
__tagsDict = {}

for cname in globals().keys():
if cname[0] not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
continue

cl = globals()[cname]

try:
inst = cl()

if isinstance(inst, HtmlElement):
__tagsDict[cname.lower()] = cl

except:
pass

# print(__tagsDict)

def parseHtml(html):
"""
Parses the provided HTML code according to the objects defined in the htmltree-library.
"""

def scanWhite(l):
"""
Scan and return whitespace.
"""

ret = ""
while l and l[0] in " \t\r\n\v":
ret += l.pop(0)

return ret

def scanWord(l):
"""
Scan and return a word.
"""

ret = ""
while l and l[0] not in " \t\r\n\v" + "<>=\"'":
ret += l.pop(0)

return ret

def buildElem(elem):
try:
elem = elem[0](*elem[3], **elem[2])
except:
print(elem[2])
elem = elem[0](**elem[2])

return elem

global __tagsDict

# Obtain tag descriptions
if __tagsDict is None:
__buildTagsDict()

# Prepare stack and input
stack = []
ret = []
html = [ch for ch in html]

# Parse
while html:
tag = None
text = ""

# ugly...
singles = []
while stack and stack[-1][1] in ["br", "input", "img", "meta"]:
singles.append(buildElem(stack.pop()))

if stack:
stack[-1][3].extend(singles)
else:
ret.extend(singles)

while html:
#print("html", html)
#print(stack)

ch = html.pop(0)

# Comment
if html and ch == "<" and "".join(html[:3]) == "!--":
html = html[3:]
while html and "".join(html[:3]) != "-->":
html.pop(0)

html = html[3:]

# Opening tag
elif html and ch == "<" and html[0] != "/":
# Append plain text (if not only whitespace)
if (text and ((len(text) == 1 and text in ["\t "])
or not all([ch in " \t\r\n\v" for ch in text]))):
if stack:
stack[-1][3].append(text)
else:
ret.append(text)

tag = scanWord(html)
if tag.lower() in __tagsDict:
break

text += ch + tag

# Closing tag
elif html and stack and stack[-1][1] and ch == "<" and html[0] == "/":
# Append plain text (if not only whitespace)
if (text and ((len(text) == 1 and text in ["\t "])
or not all([ch in " \t\r\n\v" for ch in text]))):
if stack:
stack[-1][3].append(text)
else:
ret.append(text)

junk = ch
junk += html.pop(0)

tag = scanWord(html)
junk += tag

# print("/", tag.lower(), stack[-1][1].lower())
if stack[-1][1].lower() == tag.lower():
junk += scanWhite(html)
if html and html[0] == ">":
html.pop(0)

elem = buildElem(stack.pop())

if not stack:
ret.append(elem)
else:
stack[-1][3].append(elem)

tag = None
break

text += junk
tag = None

else:
text += ch

# Create tag
if tag and tag.lower() in __tagsDict:
#print(tag)
stack.append((__tagsDict[tag.lower()], tag.lower(), {}, []))

# print("tag", tag)

while html:
scanWhite(html)
if not html:
break

# End of tag >
if html[0] == ">":
html.pop(0)
break

# Closing tag at end />
elif html[0] == "/":
html.pop(0)
scanWhite(html)

if html[0] == ">":
elem = buildElem(stack.pop())

if not stack:
ret.append(elem)
else:
stack[-1][3].append(elem)

html.pop(0)
break

att = scanWord(html)
att = att.lower() #fixme: This is split into two lines due to a Transcrypt bug
val = att

if not att:
html.pop(0)
continue

# Attribute
scanWhite(html)
if html[0] == "=":
html.pop(0)
scanWhite(html)

if html[0] in "\"'":
ch = html.pop(0)

val = ""
while html and html[0] != ch:
val += html.pop(0)

html.pop(0)

if att == "style":
if "style" not in stack[-1][2]:
stack[-1][2]["style"] = {}

for dfn in val.split(";"):
if not ":" in dfn:
continue

att, val = dfn.split(":", 1)

# print(tag, "style", att.strip(), val.strip())
stack[-1][2]["style"][att.strip()] = val.strip()
else:
stack[-1][2][att] = val

continue

# Unclosed tags?
while stack:
ret.append(buildElem(stack.pop()))

if ret:
if len(ret) > 1:
return ret

return ret[0]

return None


#######################################################################
## Interactive Elememts (Experimental. Omitted for now.)
#######################################################################
Expand All @@ -1061,16 +1314,24 @@ def Textarea(*content, **attrs):
## Web Components (Experimental. Omitted for now.)
#######################################################################

# __pragma__('nokwargs')

## The 'skip' pragma tells the Transcrypt Python to JS transpiler to
## ignore a section of code. It's needed here because the 'run as script'
## idiom causes an error in Transcrypt and has no meaning in that context.
## Putting the pragmas in comments means they'll be ignored and cause no
## problems in a real python interpreter.

# __pragma__ ('skip')

if __name__ == '__main__':
import doctest
doctest.testmod()
# __pragma__ ('noskip')
print(parseHtml("""
<html>
<head>
<meta content="Your Name Here" name="author">
</head>
<body style="background-color:black;">
<h1 class="banner" style="color:green;">
Hello, htmltree!
</h1>
</body>
</html>
""").render())