9
from calibre import _ent_pat, walk, xml_entity_to_unicode
9
10
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
10
11
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
11
12
from calibre.ebooks.chardet import detect
12
13
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
13
14
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
14
15
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
15
normalize_line_endings, convert_textile
16
from calibre import _ent_pat, xml_entity_to_unicode
16
normalize_line_endings, convert_textile, remove_indents, block_to_single_line, \
17
separate_hard_scene_breaks
18
from calibre.utils.zipfile import ZipFile
18
20
class TXTInput(InputFormatPlugin):
21
23
author = 'John Schember'
22
24
description = 'Convert TXT files to HTML'
23
file_types = set(['txt'])
25
file_types = set(['txt', 'txtz', 'text'])
26
28
OptionRecommendation(name='paragraph_type', recommended_value='auto',
27
choices=['auto', 'block', 'single', 'print', 'unformatted'],
29
choices=['auto', 'block', 'single', 'print', 'unformatted', 'off'],
28
30
help=_('Paragraph structure.\n'
29
'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n'
31
'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\', \'off\']\n'
30
32
'* auto: Try to auto detect paragraph type.\n'
31
33
'* block: Treat a blank line as a paragraph break.\n'
32
34
'* single: Assume every line is a paragraph.\n'
33
35
'* print: Assume every line starting with 2+ spaces or a tab '
35
'* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')),
36
'starts a paragraph.\n'
37
'* unformatted: Most lines have hard line breaks, few/no blank lines or indents. '
38
'Tries to determine structure and reformat the differentiate elements.\n'
39
'* off: Don\'t modify the paragraph structure. This is useful when combined with '
40
'Markdown or Textile formatting to ensure no formatting is lost.')),
36
41
OptionRecommendation(name='formatting_type', recommended_value='auto',
37
choices=['auto', 'none', 'heuristic', 'textile', 'markdown'],
42
choices=['auto', 'plain', 'heuristic', 'textile', 'markdown'],
38
43
help=_('Formatting used within the document.'
39
44
'* auto: Automatically decide which formatting processor to use.\n'
40
'* none: Do not process the document formatting. Everything is a '
45
'* plain: Do not process the document formatting. Everything is a '
41
46
'paragraph and no styling is applied.\n'
42
47
'* heuristic: Process using heuristics to determine formatting such '
43
48
'as chapter headings and italic text.\n'
70
91
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
71
92
txt = txt.decode(ienc, 'replace')
73
95
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
75
97
# Normalize line endings
76
98
txt = normalize_line_endings(txt)
78
if options.formatting_type == 'auto':
79
options.formatting_type = detect_formatting_type(txt)
81
if options.formatting_type == 'heuristic':
82
setattr(options, 'enable_heuristics', True)
83
setattr(options, 'markup_chapter_headings', True)
84
setattr(options, 'italicize_common_cases', True)
85
setattr(options, 'fix_indents', True)
86
setattr(options, 'delete_blank_paragraphs', True)
87
setattr(options, 'format_scene_breaks', True)
88
setattr(options, 'dehyphenate', True)
90
100
# Determine the paragraph type of the document.
91
101
if options.paragraph_type == 'auto':
92
102
options.paragraph_type = detect_paragraph_type(txt)
97
107
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
110
if options.formatting_type == 'auto':
111
options.formatting_type = detect_formatting_type(txt)
112
log.debug('Auto detected formatting as %s' % options.formatting_type)
114
if options.formatting_type == 'heuristic':
115
setattr(options, 'enable_heuristics', True)
116
setattr(options, 'unwrap_lines', False)
117
setattr(options, 'smarten_punctuation', True)
119
# Reformat paragraphs to block formatting based on the detected type.
120
# We don't check for block because the processor assumes block.
121
# single and print are transformed to block for processing.
122
if options.paragraph_type == 'single':
123
txt = separate_paragraphs_single_line(txt)
124
elif options.paragraph_type == 'print':
125
txt = separate_hard_scene_breaks(txt)
126
txt = separate_paragraphs_print_formatted(txt)
127
txt = block_to_single_line(txt)
128
elif options.paragraph_type == 'unformatted':
129
from calibre.ebooks.conversion.utils import HeuristicProcessor
130
# unwrap lines based on punctuation
131
docanalysis = DocAnalysis('txt', txt)
132
length = docanalysis.line_length(.5)
133
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
134
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
135
txt = separate_paragraphs_single_line(txt)
136
elif options.paragraph_type == 'block':
137
txt = separate_hard_scene_breaks(txt)
138
txt = block_to_single_line(txt)
140
if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
141
docanalysis = DocAnalysis('txt', txt)
143
length = docanalysis.line_length(.5)
144
dehyphenator = Dehyphenator(options.verbose, log=self.log)
145
txt = dehyphenator(txt,'txt', length)
147
# User requested transformation on the text.
148
if options.txt_in_remove_indents:
149
txt = remove_indents(txt)
99
151
# Preserve spaces will replace multiple spaces with a space
100
152
# followed by the entity.
101
153
if options.preserve_spaces:
102
154
txt = preserve_spaces(txt)
104
# Get length for hyphen removal and punctuation unwrap
105
docanalysis = DocAnalysis('txt', txt)
106
length = docanalysis.line_length(.5)
156
# Process the text using the appropriate text processor.
108
158
if options.formatting_type == 'markdown':
109
log.debug('Running text though markdown conversion...')
159
log.debug('Running text through markdown conversion...')
111
161
html = convert_markdown(txt, disable_toc=options.markdown_disable_toc)
112
162
except RuntimeError:
113
163
raise ValueError('This txt file has malformed markup, it cannot be'
114
164
' converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
115
165
elif options.formatting_type == 'textile':
116
log.debug('Running text though textile conversion...')
166
log.debug('Running text through textile conversion...')
117
167
html = convert_textile(txt)
121
dehyphenator = Dehyphenator(options.verbose, log=self.log)
122
txt = dehyphenator(txt,'txt', length)
124
# We don't check for block because the processor assumes block.
125
# single and print are transformed to block for processing.
127
if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted':
128
txt = separate_paragraphs_single_line(txt)
129
elif options.paragraph_type == 'print':
130
txt = separate_paragraphs_print_formatted(txt)
132
if options.paragraph_type == 'unformatted':
133
from calibre.ebooks.conversion.utils import HeuristicProcessor
136
# unwrap lines based on punctuation
137
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
138
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
169
log.debug('Running text through basic conversion...')
140
170
flow_size = getattr(options, 'flow_size', 0)
141
171
html = convert_basic(txt, epub_split_size_kb=flow_size)
173
# Run the HTMLized text through the html processing plugin.
143
174
from calibre.customize.ui import plugin_for_input_format
144
175
html_input = plugin_for_input_format('html')
145
176
for opt in html_input.options:
146
177
setattr(options, opt.option.name, opt.recommended_value)
147
178
options.input_encoding = 'utf-8'
148
179
base = os.getcwdu()
149
if hasattr(stream, 'name'):
180
if file_ext != 'txtz' and hasattr(stream, 'name'):
150
181
base = os.path.dirname(stream.name)
151
182
fname = os.path.join(base, 'index.html')