Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/stencil/ooxml.clj
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

;; Content Break: http://officeopenxml.com/WPtextSpecialContent-break.php
(def br :xmlns.http%3A%2F%2Fschemas.openxmlformats.org%2Fwordprocessingml%2F2006%2Fmain/br)
;; Tab element (<w:tab/>) in the same wordprocessingml namespace as br.
(def tab :xmlns.http%3A%2F%2Fschemas.openxmlformats.org%2Fwordprocessingml%2F2006%2Fmain/tab)

;; NOTE(review): this var is named `type`, which shadows clojure.core/type unless the
;; ns form (not visible here) excludes it — confirm.
(def type :xmlns.http%3A%2F%2Fschemas.openxmlformats.org%2Fwordprocessingml%2F2006%2Fmain/type)

Expand Down
46 changes: 32 additions & 14 deletions src/stencil/postprocess/whitespaces.clj
Original file line number Diff line number Diff line change
@@ -1,44 +1,62 @@
(ns stencil.postprocess.whitespaces
(:require [clojure.string :refer [includes? starts-with? ends-with? index-of]]
"Logics of handling whitespace characters in OOXML text runs.

The code visits all <w/t> elements and modifies content where necessary:
- If content starts or ends with whitespace, add a space=preserve attribute.
- Replace \n characters with <w/br /> elements.
- Replace \t characters with <w/tab /> elements."
(:require [clojure.string :refer [starts-with? ends-with? index-of]]
[clojure.zip :as zip]
[stencil.ooxml :as ooxml]
[stencil.util :refer :all]))
[stencil.util :refer [dfs-walk-xml-node zipper?]]))

;; Finds the earliest position in s at which c1 or c2 occurs.
;; Yields nil when neither of them is present. s must be a string.
(defn- first-index-of [s c1 c2]
  (assert (string? s))
  ;; Collect the positions that exist (index-of gives nil for a miss) and take the lowest one.
  (let [hits (keep #(index-of s %) [c1 c2])]
    (when (seq hits)
      (apply min hits))))

;; Predicate selecting the nodes that need whitespace fixing: truthy when `element` is a map
;; tagged ooxml/t with non-empty :content whose first item starts with a space, whose last
;; item ends with a space, or which contains a separator character somewhere.
;; NOTE(review): the two consecutive `some` lines below look like the before/after versions
;; of one line from a diff rendered without +/- markers (the first looks only for "\n", the
;; second for \newline or \tab) — confirm which one belongs in the file.
(defn- should-fix? [element]
(when (and (map? element)
(= ooxml/t (:tag element))
(not-empty (:content element)))
(or (starts-with? (first (:content element)) " ")
(ends-with? (last (:content element)) " ")
(some #(includes? % "\n") (:content element)))))
(some #(first-index-of (str %) \newline \tab) (:content element)))))

;; Swaps the node at loc for the first of items and inserts the remaining items after it,
;; one by one, as right siblings. Returns the zipper positioned on the last inserted item.
;; loc must be a zipper and items must not be empty.
(defn- multi-replace [loc items]
  (assert (zipper? loc))
  (assert (not-empty items))
  (let [[head & tail] items
        start (zip/replace loc head)]
    ;; Insert each remaining item to the right of the cursor, then step onto it.
    (reduce (fn [cursor item]
              (-> cursor
                  (zip/insert-right item)
                  (zip/right)))
            start
            tail)))

;; (defn- lines-of [s] (enumeration-seq (java.util.StringTokenizer. s "\n" true)))
;; (defn- lines-of [s] (remove #{""} (interpose "\n" (clojure.string/split s "\n" -1))))

;; NOTE(review): this span interleaves two revisions of one function, as rendered by a diff
;; view without +/- markers: `lines-of` (looks for "\n" only and recurses into itself) and
;; `split-str` (looks for \newline or \tab via first-index-of and recurses into itself).
;; The forms are not balanced as shown — confirm which lines belong in the file.
(defn- lines-of [s]
(if-let [idx (index-of s "\n")]
;; Returns a lazy seq of substrings split by \t or \n, separators included.
(defn- split-str [s]
(assert (string? s))
(if-let [idx (first-index-of s \newline \tab)]
;; A separator at position 0 is emitted on its own; otherwise the text before it comes first.
(if (zero? idx)
(cons "\n" (lazy-seq (lines-of (subs s 1))))
(list* (subs s 0 idx) "\n" (lazy-seq (lines-of (subs s (inc idx))))))
(list* (subs s 0 1) (lazy-seq (split-str (subs s 1))))
(list* (subs s 0 idx) (subs s idx (inc idx)) (lazy-seq (split-str (subs s (inc idx))))))
;; No separator left: the remainder is a single piece, or nothing at all for an empty string.
(if (empty? s) [] (list s))))

;; Turns one string piece into an element map:
;; - "\n" becomes an ooxml/br element,
;; - "\t" becomes an ooxml/tab element,
;; - text starting or ending with a space becomes an ooxml/t element carrying the
;;   ooxml/space "preserve" attribute,
;; - any other text becomes a plain ooxml/t element.
;; NOTE(review): the two `defn-` headers below look like the old and new name of the same
;; function from a diff rendered without +/- markers — confirm which one belongs in the file.
(defn- item->elem [item]
(defn- str->element [item]
(cond (= "\n" item)
,,,{:tag ooxml/br}
(= "\t" item)
,,,{:tag ooxml/tab}
(or (starts-with? item " ") (ends-with? item " "))
,,,{:tag ooxml/t :content [item] :attrs {ooxml/space "preserve"}}
:else
,,,{:tag ooxml/t :content [item]}))

;; Rewrites the node at zipper location `loc`: joins its :content into one string, splits
;; that string into pieces, converts every piece into an element map and puts those
;; elements in place of the node with multi-replace.
;; NOTE(review): two versions of the threading pipeline are interleaved below (one starting
;; from `(apply str ...)` and using lines-of / item->elem, one starting from `(:content ...)`
;; and using split-str / str->element), as in a diff rendered without +/- markers — confirm
;; which lines belong in the file.
(defn- fix-elem-node [loc]
(->> (apply str (:content (zip/node loc)))
(lines-of)
(map item->elem)
(->> (:content (zip/node loc))
(apply str)
(split-str)
(map str->element)
(multi-replace loc)))

(defn fix-whitespaces
  "Entry point of the whitespace postprocessing. Passes xml-tree to
  dfs-walk-xml-node together with should-fix? (selects the nodes to touch) and
  fix-elem-node (rewrites a selected node) and returns whatever that call returns.
  The traversal itself is implemented in stencil.util."
  [xml-tree]
  (dfs-walk-xml-node xml-tree should-fix? fix-elem-node))

22 changes: 16 additions & 6 deletions test/stencil/postprocess/whitespaces_test.clj
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
(ns stencil.postprocess.whitespaces-test
(:require [clojure.test :refer [deftest is are testing]]
(:require [clojure.test :refer [deftest is testing]]
[clojure.data.xml :as xml]
[clojure.zip]
[stencil.eval :as eval]
[stencil.process :refer :all]
[stencil.process]
[stencil.util]
[stencil.model :as model]))


Expand Down Expand Up @@ -30,6 +32,11 @@
"<w:a xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\"><w:t>two lines: first</w:t><w:br/><w:t>second</w:t><w:t xml:space=\"preserve\"> </w:t></w:a>"
"<x:a xmlns:x=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\"><x:t>two lines: {%=x </x:t><x:t>%} </x:t></x:a>"
{"x" "first\nsecond"}))
;; A \t inside the substituted value "first\tsecond" is expected to come out as a <w:tab/>
;; element sitting between two separate <w:t> runs.
;; NOTE(review): argument order of test-equals looks like expected, template, data — the
;; helper is not visible here, confirm.
(testing "tabulator"
(test-equals
"<w:a xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\"><w:t>two entries: first</w:t><w:tab/><w:t>second</w:t></w:a>"
"<x:a xmlns:x=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\"><x:t>two entries: {%=x %}</x:t></x:a>"
{"x" "first\tsecond"}))
(testing "existing space=preserve attributes are kept intact"
(test-equals
"<w:a xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\" xml:space=\"preserve\"> Hello </w:a>"
Expand All @@ -43,10 +50,13 @@
:when (and (var? v) (= target (.ns v)))]
(eval `(defn ~(symbol (str "-" k)) [~'& args#] (apply (deref ~v) args#)))))

;; Table-driven check of -lines-of: each row is an input string and the pieces expected
;; from it, with every line break kept as its own "\n" item.
(deftest test-lines-of
  (doseq [[input pieces] [["ab\n\nbc" ["ab" "\n" "\n" "bc"]]
                          ["\nxy\n" ["\n" "xy" "\n"]]
                          ["" ()]]]
    (is (= pieces (-lines-of input)))))
;; Forward declarations for the "-"-prefixed wrappers that the eval loop above defines at
;; load time for the vars of the target namespace.
;; NOTE(review): presumably here so that the symbols resolve for static analysis tools — confirm.
(declare -split-str -multi-replace)

;; Table-driven check of -split-str: each row is an input string and the pieces expected
;; from it, with every line break and tab kept as its own single-character item.
(deftest test-split-str
  (doseq [[input pieces] [["ab\n\nbc" ["ab" "\n" "\n" "bc"]]
                          ["\nxy\n" ["\n" "xy" "\n"]]
                          ["a\t\n b" ["a" "\t" "\n" " b"]]
                          ["" ()]]]
    (is (= pieces (-split-str input)))))

(deftest test-multi-replace
(let [tree (stencil.util/xml-zip {:tag :a :content ["x" "y" "z"]})
Expand Down