Michael-F-Ellis · phorward · Sep 18, 2018 · Sep 20, 2018
diff --git a/htmltree/htmltree.py b/htmltree/htmltree.py
@@ -1,4 +1,10 @@
 # -*- coding: utf-8 -*-
+
+# __pragma__('kwargs')
+# __pragma__('xglobs')
+# __pragma__('tconv')
+# __pragma__('opov')
+
 """
 Description: Provides a general html tree class, HtmlElement and wrapper
 functions for most standard non-obsolete HTML tags.
@@ -22,7 +28,7 @@
 Copyright 2017 Ellis & Grant, Inc.
 License: MIT License
 """
-# __pragma__('kwargs')
+
 
 def KWElement(tag, *content, **attrs):
     """
@@ -1053,6 +1059,253 @@ def Textarea(*content, **attrs):
     """
     return KWElement('textarea', *content, **attrs)
 
+#######################################################################
+## HTML parser
+#######################################################################
+
+# Global variables
+__tagsDict = None
+
+def __buildTagsDict():
+    """
+    Internal function for building a mapping of all available tags.
+    """
+    global __tagsDict
+    __tagsDict = {}
+
+    for cname in globals().keys():
+        if cname[0] not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
+            continue
+
+        cl = globals()[cname]
+
+        try:
+            inst = cl()
+
+            if isinstance(inst, HtmlElement):
+                __tagsDict[cname.lower()] = cl
+
+        except:
+            pass
+
+    # print(__tagsDict)
+
+def parseHtml(html):
+    """
+	Parses the provided HTML code according to the objects defined in the htmltree-library.
+	"""
+
+    def scanWhite(l):
+        """
+		Scan and return whitespace.
+		"""
+
+        ret = ""
+        while l and l[0] in " \t\r\n\v":
+            ret += l.pop(0)
+
+        return ret
+
+    def scanWord(l):
+        """
+		Scan and return a word.
+		"""
+
+        ret = ""
+        while l and l[0] not in " \t\r\n\v" + "<>=\"'":
+            ret += l.pop(0)
+
+        return ret
+
+    def buildElem(elem):
+        try:
+            elem = elem[0](*elem[3], **elem[2])
+        except:
+            print(elem[2])
+            elem = elem[0](**elem[2])
+
+        return elem
+
+    global __tagsDict
+
+    # Obtain tag descriptions
+    if __tagsDict is None:
+        __buildTagsDict()
+
+    # Prepare stack and input
+    stack = []
+    ret = []
+    html = [ch for ch in html]
+
+    # Parse
+    while html:
+        tag = None
+        text = ""
+
+        # ugly...
+        singles = []
+        while stack and stack[-1][1] in ["br", "input", "img", "meta"]:
+            singles.append(buildElem(stack.pop()))
+
+        if stack:
+            stack[-1][3].extend(singles)
+        else:
+            ret.extend(singles)
+
+        while html:
+            #print("html", html)
+            #print(stack)
+
+            ch = html.pop(0)
+
+            # Comment
+            if html and ch == "<" and "".join(html[:3]) == "!--":
+                html = html[3:]
+                while html and "".join(html[:3]) != "-->":
+                    html.pop(0)
+
+                html = html[3:]
+
+            # Opening tag
+            elif html and ch == "<" and html[0] != "/":
+                # Append plain text (if not only whitespace)
+                if (text and ((len(text) == 1 and text in ["\t "])
+                              or not all([ch in " \t\r\n\v" for ch in text]))):
+                    if stack:
+                        stack[-1][3].append(text)
+                    else:
+                        ret.append(text)
+
+                tag = scanWord(html)
+                if tag.lower() in __tagsDict:
+                    break
+
+                text += ch + tag
+
+            # Closing tag
+            elif html and stack and stack[-1][1] and ch == "<" and html[0] == "/":
+                # Append plain text (if not only whitespace)
+                if (text and ((len(text) == 1 and text in ["\t "])
+                              or not all([ch in " \t\r\n\v" for ch in text]))):
+                    if stack:
+                        stack[-1][3].append(text)
+                    else:
+                        ret.append(text)
+
+                junk = ch
+                junk += html.pop(0)
+
+                tag = scanWord(html)
+                junk += tag
+
+                # print("/", tag.lower(), stack[-1][1].lower())
+                if stack[-1][1].lower() == tag.lower():
+                    junk += scanWhite(html)
+                    if html and html[0] == ">":
+                        html.pop(0)
+
+                        elem = buildElem(stack.pop())
+
+                        if not stack:
+                            ret.append(elem)
+                        else:
+                            stack[-1][3].append(elem)
+
+                        tag = None
+                        break
+
+                text += junk
+                tag = None
+
+            else:
+                text += ch
+
+        # Create tag
+        if tag and tag.lower() in __tagsDict:
+            #print(tag)
+            stack.append((__tagsDict[tag.lower()], tag.lower(), {}, []))
+
+            # print("tag", tag)
+
+            while html:
+                scanWhite(html)
+                if not html:
+                    break
+
+                # End of tag >
+                if html[0] == ">":
+                    html.pop(0)
+                    break
+
+                # Closing tag at end />
+                elif html[0] == "/":
+                    html.pop(0)
+                    scanWhite(html)
+
+                    if html[0] == ">":
+                        elem = buildElem(stack.pop())
+
+                        if not stack:
+                            ret.append(elem)
+                        else:
+                            stack[-1][3].append(elem)
+
+                        html.pop(0)
+                        break
+
+                att = scanWord(html)
+                att = att.lower() #fixme: This is split into two lines due to a Transcrypt bug
+                val = att
+
+                if not att:
+                    html.pop(0)
+                    continue
+
+                # Attribute
+                scanWhite(html)
+                if html[0] == "=":
+                    html.pop(0)
+                    scanWhite(html)
+
+                    if html[0] in "\"'":
+                        ch = html.pop(0)
+
+                        val = ""
+                        while html and html[0] != ch:
+                            val += html.pop(0)
+
+                        html.pop(0)
+
+                if att == "style":
+                    if "style" not in stack[-1][2]:
+                        stack[-1][2]["style"] = {}
+
+                    for dfn in val.split(";"):
+                        if not ":" in dfn:
+                            continue
+
+                        att, val = dfn.split(":", 1)
+
+                        # print(tag, "style", att.strip(), val.strip())
+                        stack[-1][2]["style"][att.strip()] = val.strip()
+                else:
+                    stack[-1][2][att] = val
+
+                continue
+
+    # Unclosed tags?
+    while stack:
+        ret.append(buildElem(stack.pop()))
+
+    if ret:
+        if len(ret) > 1:
+            return ret
+
+        return ret[0]
+
+    return None
+
+
 #######################################################################
 ## Interactive Elememts (Experimental. Omitted for now.)
 #######################################################################
@@ -1061,16 +1314,24 @@ def Textarea(*content, **attrs):
 ## Web Components (Experimental. Omitted for now.)
 #######################################################################
 
-# __pragma__('nokwargs')
 
 ## The 'skip' pragma tells the Transcrypt Python to JS transpiler to
 ## ignore a section of code. It's needed here because the 'run as script'
 ## idiom causes an error in Transcrypt and has no meaning in that context.
 ## Putting the pragmas in comments means they'll be ignored and cause no
 ## problems in a real python interpreter.
 
-# __pragma__ ('skip')
+
 if __name__ == '__main__':
-    import doctest
-    doctest.testmod()
-# __pragma__ ('noskip')
+    print(parseHtml("""
+    <html>
+      <head>
+        <meta content="Your Name Here" name="author">
+      </head>
+      <body style="background-color:black;">
+        <h1 class="banner" style="color:green;">
+          Hello, htmltree!
+        </h1>
+      </body>
+    </html>
+    """).render())