1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 """Functions for parsing, processing, and formatting Python structured text.
17
18 See "Structured Text Formatting
19 Rules":http://www.python.org/sigs/doc-sig/stext.html for more
20 information.
21 """
22
23
24
25
26
27 import cStringIO
28 import htmlentitydefs
29 import re
30 import string
31 import sys
32
33
34
35
36
# HTML fragment containing a short end-user introduction to structured
# text.  The string is stored verbatim, including its leading newline;
# this module only defines it.
html_help_text = '''
<h4>Structured Text</h4>

<p>Structured text is a simple set of conventions for formatting
ordinary text. Usually, you can simply type ordinary text where QM
expects structured text; the resulting output will be line-wrapped, with
paragraph breaks indicated by blank lines.</p>

<p>Structured text also provides simple ways of adding elements such as
bulleted and numbered lists, bold and italics text, monospaced text, and
hyperlinks to ordinary text. For example, to obtain bold text, place a
pair of asterisks on either side of it. Thus,
<blockquote><pre>
**hello, there**
</pre></blockquote>
is displayed as
<blockquote>
<b>hello, there</b>
</blockquote>Structured text may be displayed as HTML, as plain text,
or in other formats. The exact format of the output will depend on the
capabilities of the output system.</p>

<p>For a complete description of structured text rules, see the
<a href="http://www.python.org/sigs/doc-sig/stext.html">Structured Text
Formatting Rules</a>.</p>
'''
63
64
65
66
67
96
97
98
100 """Formatter for generating plain text from structured text."""
101
# Plain-text marker string associated with each inline style name.
# NOTE(review): the methods that consume this table are not visible in
# this view -- presumably the marker is written before and after the
# styled text; confirm against the style start/end methods.
__style_markers = {
    "emphasized" : "*",
    "strong" : "**",
    "underlined" : "_",
    "literal" : "'",
    "verbatim" : "'''",
    }
109
110
def __init__(self,
             output_file=sys.stdout,
             width=78,
             indent_size=2,
             indent=0,
             list_bullet="-"):
    """Set up a formatter that renders structured text as plain text.

    'output_file' -- A file object to which the formatted text is
    written.

    'width' -- The column limit; a word that would extend past it is
    moved to a new line.

    'indent_size' -- The number of columns by which the indentation
    changes when a paragraph is nested inside a list.

    'indent' -- The initial indentation column.

    'list_bullet' -- The text written at the start of each item of
    an unordered list."""

    # Settings supplied by the caller.
    self.__output_file = output_file
    self.__width = width
    self.__indent_size = indent_size
    self.__indent = indent
    self.__list_bullet = list_bullet

    # Rendering state: the current output column, the list nesting
    # depth, the targets of the links written so far (in order of
    # first use), and the target of the link currently open, if any.
    self.__col = 0
    self.__list_depth = 0
    self.__link_targets = []
    self.__current_link_target = None
131
132
134 """End the processed text document."""
135
136
137
138 if self.__link_targets:
139 self.__NextLine()
140 for index in range(0, len(self.__link_targets)):
141
142
143 target = self.__link_targets[index]
144 self.WriteText("[%d] %s" % (index + 1, target))
145 self.__NextLine()
146
147
149 """Write ordinary text."""
150
151
152
153
154 words = re.split("( )", text)
155
156 words = filter(None, words)
157
158 start_of_line = 0
159 for word in words:
160
161 if self.__col + len(word) > self.__width:
162
163 self.__NextLine()
164 self.__IndentTo(self.__indent)
165 start_of_line = 1
166
167 if start_of_line:
168 if string.strip(word) == "":
169
170 continue
171 else:
172
173 start_of_line = 0
174
175 self.__Write(word)
176
177
179 """Start a list environment of type 'type'."""
180
181
182
183 if type == "paragraph" and self.__list_depth > 0:
184 self.__indent = self.__indent + self.__indent_size
185
186 self.__list_depth = self.__list_depth + 1
187
188
190 """End a list environment of type 'type'."""
191
192
193 self.__list_depth = self.__list_depth - 1
194
195
196 if type == "paragraph" and self.__list_depth > 0:
197 self.__indent = self.__indent - self.__indent_size
198
199
201 """Begin an element to the environment of type 'type'.
202
203 'label' -- If type is "ordered list", this is the label for
204 this list element."""
205
206 self.__IndentTo(self.__indent)
207
208 if type == "ordered list":
209 self.__Write("%s " % label)
210 elif type == "unordered list":
211 self.__Write("%s " % self.__list_bullet)
212 elif type == "definition list":
213 pass
214
215
217 """Finish the definition of a term in a definition list."""
218
219 self.__Write(" -- ");
220
221
223 """End an element in the environment of type 'type'."""
224
225 if type == "paragraph":
226
227
228 if self.__col > self.__indent:
229 self.__NextLine()
230
231 self.__NextLine()
232
233
238
239
244
245
247 """Begin a hyperlink to 'target'."""
248
249
250 assert self.__current_link_target is None
251
252
253 self.__current_link_target = target
254
255
257 """End a hyperlink."""
258
259
260
261 target = self.__current_link_target
262 assert target is not None
263 self.__current_link_target = None
264
265
266 try:
267 reference_number = self.__link_targets.index(target) + 1
268 except ValueError:
269
270
271
272 self.__link_targets.append(target)
273 reference_number = len(self.__link_targets)
274
275 self.__Write(" [%d]" % reference_number)
276
277
278
279
281 if col > self.__col:
282 self.__Write(" " * (col - self.__col))
283
284
286 self.__output_file.write(text)
287 self.__col = self.__col + len(text)
288
289
291 self.__Write("\n")
292 self.__col = 0
293
294
295
431
432
433
435 """Parser and formatter for Python structured text."""
436
437
# Regex character class matching one punctuation character (or a
# space) that may follow the closing delimiter of styled text.
__punctuation = "[%s]" % "][)(.,!?;:'\" "

# Marker that introduces an unordered list item: '-', 'o' or '*'
# followed by at least one space.
__bullet_regex = re.compile("^[-o*] +")

# Label that introduces an ordered list item: one or more groups of
# letters followed by a period, or digits optionally followed by a
# period, then at least one space.  Group 1 holds the last such group.
# Raw strings are used wherever a pattern contains a backslash that is
# not a valid Python string escape.
__sequence_regex = re.compile(r"^([A-Za-z]+\.|[0-9]+\.?)+ +")

# Defined term of a definition list item: arbitrary text (group 1)
# followed by ' -- '.
__definition_regex = re.compile("^(.*) +-- +")

# A line break together with the spaces around it; used to join the
# lines of a paragraph.
__collapse_regex = re.compile(" *\n *", re.MULTILINE)

# Leading spaces.  Without re.MULTILINE, '^' anchors only at the very
# start of the string.
__indent_regex = re.compile("^ *")

# Inline style patterns.  In each, group 1 is the preceding spaces (or
# the start of the text), group 2 is the styled text, and group 3 is
# the following punctuation (or the end of the text).

# Literal text: 'text'
__literal_regex = re.compile("( +|^)'([^']+)'(%s+|$)" % __punctuation)

# Strong text: **text**
__strong_regex = re.compile(r"( +|^)\*\*([^*]+)\*\*(%s+|$)" % __punctuation)

# Emphasized text: *text*
__emph_regex = re.compile(r"( +|^)\*([^*]+)\*(%s+|$)" % __punctuation)

# Underlined text: _text_
__underline_regex = re.compile("( +|^)_([^_]+)_(%s+|$)" % __punctuation)

# Candidate link text: anything between a pair of double quotes.
__link_regex = re.compile('"([^"]*)"')

# Link footnote: a line starting with '..', then the quoted link text
# (group 1), then the link target (group 2).  The rest of the line is
# matched as well so that the whole footnote can be removed.
__link_footnote_regex = re.compile('\n\\.\\. *"([^"]*)" *([^ \n]*)[^\n]*')

# Environment types whose current item is ended before another
# environment is opened inside them.
__non_nestable_types = [
    "paragraph",
    ]
490
491
def __init__(self, formatter):
    """Initialize a processor that renders through 'formatter'.

    'formatter' -- The formatter object that is used to generate the
    output."""

    self.__formatter = formatter
    # Open environments, innermost last, as (type, indentation) pairs.
    self.__stack = []
    # Maps link text to link target.
    self.__hyperlinks = {}
500
501
def NormalizeSpaces(self, text):
    """Return 'text' with spaces normalized.

    Each tab is replaced by a space, leading and trailing whitespace
    is removed, and a single space is appended.

    'text' -- The string to normalize.

    returns -- The normalized string, which always ends with exactly
    one trailing space."""

    # String methods are used instead of the deprecated 'string'
    # module functions, which do not exist in Python 3.
    text = text.replace("\t", " ")
    return text.strip() + " "
509
510
def __call__(self, text):
    """Process structured text 'text'.

    Link footnotes are removed from 'text' and recorded so that quoted
    link text can later be turned into hyperlinks.  The remaining text
    is split into paragraphs, and each paragraph is sent to the
    formatter as verbatim text, as a list item, or as an ordinary
    paragraph.

    'text' -- A string of structured text."""

    # Extract link footnotes.  The regex is applied to the whole string
    # starting at 'position', so all match offsets are absolute.
    position = 0
    while position < len(text):
        match = self.__link_footnote_regex.search(text, position)
        if match is None:
            # No more footnotes.
            break
        # Record the link.
        link_text = match.group(1).strip()
        link_target = match.group(2)
        self.__hyperlinks[link_text] = link_target
        # Cut the footnote out of the text, and resume scanning at the
        # point where it started.
        text = text[:match.start()] + text[match.end():]
        position = match.start()

    # The list item markers to try, in order, on the first line of a
    # paragraph: (regex, environment type, whether group 1 is a label).
    item_kinds = (
        (self.__bullet_regex, "unordered list", 0),
        (self.__sequence_regex, "ordered list", 1),
        (self.__definition_regex, "definition list", 1),
        )

    for paragraph in get_paragraphs(text):
        # A verbatim paragraph is written out unprocessed, at the
        # indentation of the innermost open environment.
        match = _verbatim_regexp.match(paragraph)
        if match:
            if self.__stack:
                indentation = self.__stack[-1][1]
            else:
                indentation = 0
            self.__SetType("verbatim", indentation)
            self.__formatter.StartStyle("verbatim")
            # Drop the three-quote delimiters at either end.
            self.__formatter.WriteText(match.group(1)[3:-3])
            self.__formatter.EndStyle("verbatim")
            continue

        # Determine the paragraph's indentation, and remove it.
        indents = self.__indent_regex.findall(paragraph)
        indentation = min(map(len, indents))
        paragraph = paragraph[indentation:]

        # Skip empty paragraphs.
        if paragraph == "":
            continue

        # List item markers are recognized on the first line only.
        first_line = paragraph.split("\n", 1)[0]

        for regex, list_type, has_label in item_kinds:
            match = regex.match(first_line)
            if match is None:
                continue
            # This paragraph starts a list item.
            if has_label:
                self.__SetType(list_type, indentation,
                               label=match.group(1))
            else:
                self.__SetType(list_type, indentation)
            # The item's text is a paragraph nested inside the item,
            # indented by the width of the marker.
            match_length = len(match.group(0))
            indentation = indentation + match_length
            paragraph = paragraph[match_length:]
            break

        # Join the paragraph's lines and normalize its spacing.
        paragraph = self.__collapse_regex.sub(" ", paragraph)
        paragraph = self.NormalizeSpaces(paragraph)

        self.__SetType("paragraph", indentation)
        self.__WriteText(paragraph)
612
613
615 """Stop processing text, and do any necessary cleanup."""
616
617
618 while self.__stack:
619 top_type, top_indentation = self.__stack[-1]
620
621 self.__formatter.EndItem(top_type)
622
623 self.__PopType()
624
625 self.__formatter.End()
626
627
628
629
def __PushType(self, type, indentation):
    """Open a new environment of 'type' at 'indentation'.

    If the innermost open environment is of a non-nestable type, its
    current item is ended first.  The formatter is then told to start
    the new environment, which becomes the innermost one."""

    if self.__stack:
        enclosing_type = self.__stack[-1][0]
        if enclosing_type in self.__non_nestable_types:
            self.__formatter.EndItem(enclosing_type)

    self.__formatter.StartList(type)
    self.__stack.append((type, indentation))
644
645
def __PopType(self):
    """Close the innermost environment and drop it from the stack.

    If the environment exposed by doing so is of a non-nestable type,
    a new item is started in it."""

    # Tell the formatter first, then discard the stack entry.
    closed_type = self.__stack[-1][0]
    self.__formatter.EndList(closed_type)
    del self.__stack[-1]

    if not self.__stack:
        return
    exposed_type = self.__stack[-1][0]
    if exposed_type in self.__non_nestable_types:
        self.__formatter.StartItem(exposed_type)
663
664
def __SetType(self, type, indentation, label=None):
    """Set the environment type and indentation level.

    Open environments are closed, and a new one is opened if needed,
    until the innermost environment is of 'type' at 'indentation'.  A
    new item is then started in it.

    'type' -- The environment type for the new item.

    'indentation' -- The indentation level of the new item.

    'label' -- The item's label, passed on to the formatter.  For a
    "definition list" item it is also written as the defined term."""

    while 1:
        # An empty stack behaves like an environment that encloses
        # every indentation level.
        if len(self.__stack) == 0:
            top_indentation = -1
        else:
            top_type, top_indentation = self.__stack[-1]

        if indentation > top_indentation:
            # Deeper than the innermost environment: nest a new one.
            self.__PushType(type, indentation)
            break

        # Same or shallower indentation: the innermost environment's
        # current item is finished.
        self.__formatter.EndItem(top_type)
        if indentation < top_indentation:
            # Shallower: close the environment and look again.
            self.__PopType()
        elif top_type != type:
            # Same indentation but a different type: replace the
            # environment.  The new environment has no item yet, so
            # stop here; looping again would end an item that was
            # never started.
            self.__PopType()
            self.__PushType(type, indentation)
            break
        else:
            # Same indentation and same type: nothing to change.
            break

    # Start the new item.
    self.__formatter.StartItem(type, label)
    if type == "definition list":
        self.__WriteText(label)
        self.__formatter.FinishDefinedTerm()
705
706
def __WriteText(self, text):
    """Write paragraph text, interpreting inline markup.

    The first inline style found in 'text' is applied, and the text
    before, inside and after it is processed recursively.  If there is
    none, the first quoted phrase that is a known link text is written
    as a hyperlink, with the text around it processed recursively.
    Text without markup is passed to the formatter unchanged.

    'text' -- The paragraph text to write."""

    # Styles are tried in this order.
    for regex, style in [
        (self.__literal_regex, "literal"),
        (self.__strong_regex, "strong"),
        (self.__emph_regex, "emphasized"),
        (self.__underline_regex, "underlined"),
        ]:
        match = regex.search(text)
        if match is None:
            continue
        # Write everything up to and including the spaces (group 1)
        # that precede the opening delimiter.
        self.__WriteText(text[:match.end(1)])
        self.__formatter.StartStyle(style)
        if style == "literal":
            # Literal text is written as is, with no markup processing.
            self.__formatter.WriteText(match.group(2))
        else:
            self.__WriteText(match.group(2))
        self.__formatter.EndStyle(style)
        # Continue with whatever follows the closing delimiter.
        self.__WriteText(text[match.start(3):])
        return

    # Look for a quoted phrase that is a known link text.  All quoted
    # phrases are examined, so that an ordinary quotation does not hide
    # a link that follows it.
    position = 0
    while 1:
        match = self.__link_regex.search(text, position)
        if match is None:
            break
        link_text = match.group(1).strip()
        # Targets are always strings, so None means "not a link".
        link_target = self.__hyperlinks.get(link_text)
        if link_target is not None:
            # Write the text before the link, the link itself, and
            # then the text after the closing quote.
            self.__WriteText(text[:match.start(0)])
            self.__formatter.StartLink(link_target)
            self.__WriteText(match.group(1))
            self.__formatter.EndLink()
            self.__WriteText(text[match.end(0):])
            return
        # An ordinary quotation; keep looking after it.
        position = match.end(0)

    # No markup: write the text unchanged.
    self.__formatter.WriteText(text)
765
766
767
768
769
770
775
776
783
784
786 """Return 'structured_text' formatted as HTML."""
787
788
789 output_string = cStringIO.StringIO()
790 formatter = HtmlFormatter(output_string)
791
792 __format(structured_text, formatter)
793
794 return output_string.getvalue()
795
796
def to_text(structured_text, width=78, indent=0):
    """Render 'structured_text' as plain text and return the result.

    'width' -- The width of the text (including the indentation).

    'indent' -- The width of the block indentation of the formatted
    output.

    returns -- The formatted text, as a string."""

    # Collect the formatter's output in an in-memory file.
    text_buffer = cStringIO.StringIO()
    __format(structured_text,
             TextFormatter(text_buffer, width=width, indent=indent))
    return text_buffer.getvalue()
812
813
815 """Return the first line of 'structured_text'.
816
817 By convention, the first line of a structured text description is a
818 short summary."""
819
820 return string.split(structured_text, "\n", 1)[0]
821
822
824 """Return the contents of 'structured_text' minus the first line."""
825
826 parts = string.split(structured_text, "\n", 1)
827
828 if len(parts) > 0:
829 return parts[1]
830 else:
831 return ""
832
833
835 """Split 'structured_text' into paragraphs.
836
837 'structured_text' -- A string consisting of structured text.
838
839 returns -- A sequence of paragraphs of structured text. Each
840 element in the sequence corresponds to a successive paragraph
841 in the 'structured_text'. If 'structured_text' is the empty
842 string, the sequence returned will consist of a single
843 paragraph, itself empty."""
844
845
846 paragraphs = []
847
848 begin = 0
849
850 end = 0
851
852 while end < len(structured_text):
853
854
855 if (len(structured_text) - end >= 6
856 and structured_text[end:end+3] == "'''"):
857 end = string.find(structured_text, "'''", end + 3)
858 if end > 0:
859 end = end + 3
860
861 paragraphs.append(structured_text[begin:end])
862 begin = end
863 continue
864 else:
865
866
867 while end < len(structured_text):
868
869 match = __paragraph_regexp.match(structured_text, end)
870 if match:
871
872 paragraphs.append(structured_text[begin:end])
873
874
875 begin = match.end()
876 end = begin
877 break
878 else:
879
880 end = end + 1
881
882
883 if begin != end:
884 paragraphs.append(structured_text[begin:end])
885
886 return paragraphs
887
888
890 """Return the first paragraph of 'structured_text'.
891
892 'structured_text' -- A string consisting of structured text.
893
894 returns -- A string of structured text that is the first paragraph
895 of the 'structured_text'."""
896
897 return get_paragraphs(structured_text)[0]
898
899
900
901
902
903
904
# Map each character that is the value of an HTML entity to the
# corresponding entity reference.  Only single-character entity values
# are considered.
__entity_char_replacement = {}
for entity, character in htmlentitydefs.entitydefs.items():
    if len(character) == 1:
        __entity_char_replacement[character] = "&%s;" % entity

# A regex matching any one of those characters.
__entity_char_regex = re.compile(
    "[" + "".join(__entity_char_replacement.keys()) + "]")

# Rebind the name to a function that returns the entity reference for
# a match of '__entity_char_regex'.  The map built above is captured
# as a default argument, which is evaluated before the name is rebound.
def __entity_char_replacement(match,
                              replacement_map=__entity_char_replacement):
    return replacement_map[match.group(0)]

# The separator between paragraphs: one or more blank lines, possibly
# containing spaces.
__paragraph_regexp = re.compile("(?:\n *)+\n")

# A verbatim paragraph: text enclosed in three-quote delimiters
# (group 1), followed by a paragraph separator or by the end of the
# text.
_verbatim_regexp = re.compile("('''.*''')(?:(?:\n *)+\n|\n?$)", re.DOTALL)
930
931
932
933
934
935
936
# When run as a script, format the files named on the command line (or
# standard input, if there are none) and write the result to standard
# output.  The options '--html' and '--text' select the output format;
# the default is plain text, and the last option given wins.
if __name__ == "__main__":
    import getopt

    long_options = [
        "html",
        "text",
        ]
    options, arguments = getopt.getopt(sys.argv[1:], "", long_options)

    # Choose the formatter.
    formatter = None
    for option, option_argument in options:
        if option == "--html":
            formatter = HtmlFormatter()
        elif option == "--text":
            formatter = TextFormatter()
    if formatter is None:
        formatter = TextFormatter()

    processor = StructuredTextProcessor(formatter)

    if len(arguments) == 0:
        # No files were named; read standard input.
        processor(sys.stdin.read())
    else:
        # Process each file in turn, closing it as soon as it has been
        # read so that no file handles are left open.
        for file_name in arguments:
            input_file = open(file_name, "rt")
            try:
                contents = input_file.read()
            finally:
                input_file.close()
            processor(contents)

    # Finish the output document.
    processor.End()

    sys.exit(0)
977
978
979
980
981
982
983
984
985