~marcusbritanicus/newbreeze/master : revision 75

1

/*
    Released under the MIT License.
    See the provided LICENSE.TXT file for details.
*/

6

7

#include "markdown.hpp"

8

#include "markdown-tokens.hpp"

9

10

#include <sstream>

11

#include <cassert>

12

13

#include <boost/regex.hpp>

14

#include <boost/lexical_cast.hpp>

15

#include <boost/algorithm/string/case_conv.hpp>

16

17

using std::cerr;

18

using std::endl;

19

20

using boost::optional;

21

using boost::none;

22

using markdown::TokenPtr;

23

using markdown::CTokenGroupIter;

24

25

namespace {

26

27

// Result of recognising a single HTML tag at some position in a line.
struct HtmlTagInfo {
    // Tag name as captured by the tag expression (e.g. "div").
    std::string tagName;
    // Attribute text captured after the tag name; left empty when the
    // expression captured none.
    std::string extra;
    // True when the tag was written with a leading '/'.
    bool isClosingTag;
    // Length of the whole matched tag, measured in the original string.
    size_t lengthOfToken;
};

32

33

const std::string cHtmlTokenSource("<((/?)([a-zA-Z0-9]+)(?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\5))*? */? *))>");

34

const boost::regex cHtmlTokenExpression(cHtmlTokenSource),

35

cStartHtmlTokenExpression("^"+cHtmlTokenSource),

36

cOneHtmlTokenExpression("^"+cHtmlTokenSource+"$");

37

38

// Selects which tag expression parseHtmlTag() applies.
enum ParseHtmlTagFlags {
    cAlone,  // the text must be exactly one tag
    cStarts  // the text must begin with a tag
};

39

40

// Tries to recognise an HTML tag in [begin, end).
//
// flags chooses the expression: cAlone uses cOneHtmlTokenExpression (the
// range must be a single tag), anything else uses cStartHtmlTokenExpression
// (the range must start with a tag).
//
// Returns the populated HtmlTagInfo on a match, otherwise none.
optional<HtmlTagInfo> parseHtmlTag(std::string::const_iterator begin,
    std::string::const_iterator end, ParseHtmlTagFlags flags)
{
    const boost::regex& expression=(flags==cAlone ?
        cOneHtmlTokenExpression : cStartHtmlTokenExpression);

    boost::smatch match;
    if (!boost::regex_search(begin, end, match, expression)) return none;

    HtmlTagInfo info;
    info.tagName=match[3];
    if (match[4].matched) info.extra=match[4];
    info.isClosingTag=(match[2].length()>0);
    info.lengthOfToken=match[0].length();
    return info;
}

56

57

// Splits one line of inline HTML into tokens: every tag found by
// cHtmlTokenExpression becomes an HtmlTag (built from capture 1), and the
// text between tags becomes InlineHtmlContents. The text after the last tag
// (possibly empty) is always emitted as InlineHtmlContents with a trailing
// newline appended.
markdown::TokenGroup parseInlineHtmlText(const std::string& src) {
    markdown::TokenGroup tokens;
    std::string::const_iterator cursor=src.begin();
    std::string::const_iterator srcEnd=src.end();

    boost::smatch tagMatch;
    while (boost::regex_search(cursor, srcEnd, tagMatch, cHtmlTokenExpression)) {
        // Anything between the previous position and this tag is plain
        // contents.
        if (cursor!=tagMatch[0].first) {
            tokens.push_back(TokenPtr(new markdown::token::InlineHtmlContents(std::string(cursor, tagMatch[0].first))));
        }
        tokens.push_back(TokenPtr(new markdown::token::HtmlTag(tagMatch[1])));
        cursor=tagMatch[0].second;
    }

    // Remainder of the line, terminated with a newline.
    std::string remainder(cursor, srcEnd);
    remainder+='\n';
    tokens.push_back(TokenPtr(new markdown::token::InlineHtmlContents(remainder)));

    return tokens;
}

83

84

bool isHtmlCommentStart(std::string::const_iterator begin,

85

std::string::const_iterator end)

86

{

87

// It can't be a single-line comment, those will already have been parsed

88

// by isBlankLine.

89

static const boost::regex cExpression("^<!--");

90

return boost::regex_search(begin, end, cExpression);

91

}

92

93

bool isHtmlCommentEnd(std::string::const_iterator begin,

94

std::string::const_iterator end)

95

{

96

static const boost::regex cExpression(".*-- *>$");

97

return boost::regex_match(begin, end, cExpression);

98

}

99

100

bool isBlankLine(const std::string& line) {

101

static const boost::regex cExpression(" {0,3}(<--(.*)-- *> *)* *");

102

return boost::regex_match(line, cExpression);

103

}

104

105

// Tries to collect a block of inline HTML, or a multi-line HTML comment,
// beginning at the token *i.
//
// On success returns an InlineHtmlBlock token and leaves i on the last token
// that was consumed (the caller is expected to advance past it). When the
// line is neither a recognised tag nor a comment start, returns none; in the
// single-line span-tag case i is restored to the first line before returning
// none.
//
// NOTE(review): the exact meaning of isValidTag()'s return value is defined
// elsewhere; here it is only compared against 1.
optional<TokenPtr> parseInlineHtml(CTokenGroupIter& i, CTokenGroupIter end) {
    // Preconditions: Previous line was blank, or this is the first line.
    if ((*i)->text()) {
        const std::string& line(*(*i)->text());

        // Classify the first line: a valid tag takes precedence over a
        // comment opener.
        bool tag=false, comment=false;
        optional<HtmlTagInfo> tagInfo=parseHtmlTag(line.begin(), line.end(), cStarts);
        if (tagInfo && markdown::token::isValidTag(tagInfo->tagName)>1) {
            tag=true;
        } else if (isHtmlCommentStart(line.begin(), line.end())) {
            comment=true;
        }

        if (tag) {
            // Block continues until an HTML tag (alone) on a line followed by a
            // blank line.
            markdown::TokenGroup contents;
            CTokenGroupIter firstLine=i, prevLine=i;
            size_t lines=0;

            bool done=false;
            do {
                // Text lines are split into tag / non-tag tokens by
                // parseInlineHtmlText() so that tags are handled as units;
                // tokens without text are passed through unchanged.
                if ((*i)->text()) {
                    markdown::TokenGroup t=parseInlineHtmlText(*(*i)->text());
                    contents.splice(contents.end(), t);
                } else contents.push_back(*i);

                prevLine=i;
                ++i;
                ++lines;

                // Only a blank line following a text line can end the block:
                // either straight after the first line, or after a line that
                // is a single tag on its own.
                if (i!=end && (*i)->isBlankLine() && (*prevLine)->text()) {
                    if (prevLine==firstLine) {
                        done=true;
                    } else {
                        const std::string& text(*(*prevLine)->text());
                        if (parseHtmlTag(text.begin(), text.end(), cAlone)) done=true;
                    }
                }
            } while (i!=end && !done);

            if (lines>1 || markdown::token::isValidTag(tagInfo->tagName, true)>1) {
                // Leave i on the last consumed line.
                i=prevLine;
                return TokenPtr(new markdown::token::InlineHtmlBlock(contents));
            } else {
                // Single-line HTML "blocks" whose initial tags are span-tags
                // don't qualify as inline HTML.
                i=firstLine;
                return none;
            }
        } else if (comment) {
            // Comment continues until a closing tag is found; at present, it
            // also has to be the last thing on the line, and has to be
            // immediately followed by a blank line too.
            markdown::TokenGroup contents;
            CTokenGroupIter firstLine=i, prevLine=i;

            bool done=false;
            do {
                // Each text line becomes its own InlineHtmlComment token with
                // a newline appended.
                if ((*i)->text()) contents.push_back(TokenPtr(new markdown::token::InlineHtmlComment(*(*i)->text()+'\n')));
                else contents.push_back(*i);

                prevLine=i;
                ++i;

                // Same termination rule as for tags, except that a later line
                // must end with the comment closer.
                if (i!=end && (*i)->isBlankLine() && (*prevLine)->text()) {
                    if (prevLine==firstLine) {
                        done=true;
                    } else {
                        const std::string& text(*(*prevLine)->text());
                        if (isHtmlCommentEnd(text.begin(), text.end())) done=true;
                    }
                }
            } while (i!=end && !done);
            // Leave i on the last consumed line.
            i=prevLine;
            return TokenPtr(new markdown::token::InlineHtmlBlock(contents));
        }
    }

    return none;
}

188

189

// Tests whether the token at i belongs to an indented code block.
//
// A markup-capable text line that starts with four spaces qualifies: i is
// advanced past it and the line minus those four spaces is returned.
// A blank line qualifies only when the token after it (recursively)
// qualifies; the result then starts with "\n". In every other case i is left
// where it was and none is returned.
optional<std::string> isCodeBlockLine(CTokenGroupIter& i, CTokenGroupIter end) {
    if ((*i)->isBlankLine()) {
        // If we get here, we're already in a code block.
        ++i;
        if (i!=end) {
            optional<std::string> following=isCodeBlockLine(i, end);
            if (following) return std::string("\n"+*following);
        }
        // The blank line did not lead into more code; undo the advance.
        --i;
        return none;
    }

    if (!(*i)->text() || !(*i)->canContainMarkup()) return none;

    const std::string& text(*(*i)->text());
    if (text.length()<4) return none;
    if (text.compare(0, 4, "    ")!=0) return none;

    ++i;
    return std::string(text.begin()+4, text.end());
}

211

212

// Collects consecutive code-block lines starting at i into one CodeBlock
// token, each line followed by a newline. A block cannot start on a blank
// line. Returns none (i untouched) when the first line is not a code line.
optional<TokenPtr> parseCodeBlock(CTokenGroupIter& i, CTokenGroupIter end) {
    if ((*i)->isBlankLine()) return none;

    optional<std::string> codeLine=isCodeBlockLine(i, end);
    if (!codeLine) return none;

    std::ostringstream code;
    for (;;) {
        code << *codeLine << '\n';
        if (i==end) break;
        codeLine=isCodeBlockLine(i, end);
        if (!codeLine) break;
    }
    return TokenPtr(new markdown::token::CodeBlock(code.str()));
}

228

229

230

231

// Returns how many '>' characters occur in prefixString, i.e. the nesting
// depth expressed by a block-quote prefix.
size_t countQuoteLevel(const std::string& prefixString) {
    size_t level=0;
    const std::string::size_type length=prefixString.length();
    for (std::string::size_type pos=0; pos<length; ++pos) {
        if (prefixString[pos]=='>') level+=1;
    }
    return level;
}

238

239

// Tries to parse a block quote that starts at the token *i.
//
// The first line must be a non-blank, markup-capable text line made of one
// or more '>' markers, a space and the content. Following lines are added
// while they carry the same number of markers. A blank line stays inside the
// quote only when the line after it is again a properly prefixed quote line.
//
// Returns a BlockQuote token holding the collected lines (prefixes removed),
// or none when *i does not start a quote. On success i is left on the first
// token that was not consumed (or on end).
//
// Tokens without text (for example already-parsed blocks) end the quote
// instead of being dereferenced.
optional<TokenPtr> parseBlockQuote(CTokenGroupIter& i, CTokenGroupIter end) {
    static const boost::regex cBlockQuoteExpression("^((?: {0,3}>)+) (.*)$");
    // Useful captures: 1=prefix, 2=content

    if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup()) {
        const std::string& line(*(*i)->text());
        boost::smatch m;
        if (boost::regex_match(line, m, cBlockQuoteExpression)) {
            // Continuation lines must repeat exactly this many '>' markers.
            size_t quoteLevel=countQuoteLevel(m[1]);
            boost::regex continuationExpression=boost::regex("^((?: {0,3}>){"+boost::lexical_cast<std::string>(quoteLevel)+"}) ?(.*)$");

            markdown::TokenGroup subTokens;
            subTokens.push_back(TokenPtr(new markdown::token::RawText(m[2])));

            // The next line can be a continuation of this quote or a blank
            // line. Blank lines are treated as part of this quote if the
            // following line is a properly-prefixed quote line too,
            // otherwise they terminate the quote.
            ++i;
            while (i!=end) {
                if ((*i)->isBlankLine()) {
                    CTokenGroupIter ii=i;
                    ++ii;
                    if (ii==end) {
                        i=ii;
                        break;
                    } else if (!(*ii)->text()) {
                        // A token without text cannot continue the quote.
                        break;
                    } else {
                        const std::string& nextLine(*(*ii)->text());
                        if (boost::regex_match(nextLine, m, continuationExpression)) {
                            if (m[1].matched && m[1].length()>0) {
                                i=++ii;
                                subTokens.push_back(TokenPtr(new markdown::token::BlankLine));
                                subTokens.push_back(TokenPtr(new markdown::token::RawText(m[2])));
                            } else break;
                        } else break;
                    }
                } else if (!(*i)->text()) {
                    // A token without text cannot continue the quote.
                    break;
                } else {
                    const std::string& currentLine(*(*i)->text());
                    if (boost::regex_match(currentLine, m, continuationExpression)) {
                        assert(m[2].matched);
                        // Quoted content that is itself blank is kept as a
                        // BlankLine so nested paragraphs split correctly.
                        if (!isBlankLine(m[2])) subTokens.push_back(TokenPtr(new markdown::token::RawText(m[2])));
                        else subTokens.push_back(TokenPtr(new markdown::token::BlankLine(m[2])));
                        ++i;
                    } else break;
                }
            }

            return TokenPtr(new markdown::token::BlockQuote(subTokens));
        }
    }
    return none;
}

292

293

// Tries to parse an ordered or unordered list that starts at the token *i.
//
// i    - in/out position; on success it is left on the first token that was
//        not consumed (or on end). When the candidate turns out not to be a
//        list, i is restored to where it started.
// end  - end of the token sequence.
// sub  - true when called recursively for a nested list; lifts the
//        "indented by fewer than four spaces" restriction on the first item.
//
// Returns an UnorderedList or OrderedList token, or none. A single,
// unindented item is not treated as a list.
//
// NOTE(review): the recursive calls assume that any line matching
// startSublistExpression is accepted by the recursive parse (assert(p));
// confirm that this always holds, e.g. for an item whose text begins with
// '*' or '-'.
optional<TokenPtr> parseListBlock(CTokenGroupIter& i, CTokenGroupIter end, bool sub=false) {
    static const boost::regex cUnorderedListExpression("^( *)([*+-]) +([^*-].*)$");
    static const boost::regex cOrderedListExpression("^( *)([0-9]+)\\. +(.*)$");

    enum ListType { cNone, cUnordered, cOrdered };
    ListType type=cNone;
    if (!(*i)->isBlankLine() && (*i)->text() && (*i)->canContainMarkup()) {
        boost::regex nextItemExpression, startSublistExpression;
        size_t indent=0;

        const std::string& line(*(*i)->text());

        //cerr << "IsList? " << line << endl;

        // subItemTokens collects the lines of the item being built;
        // subTokens collects the finished ListItem tokens.
        markdown::TokenGroup subTokens, subItemTokens;

        boost::smatch m;
        if (boost::regex_match(line, m, cUnorderedListExpression)) {
            indent=m[1].length();
            if (sub || indent<4) {
                type=cUnordered;
                char startChar=*m[2].first;
                subItemTokens.push_back(TokenPtr(new markdown::token::RawText(m[3])));

                // Further items must use the same indent and bullet char.
                std::ostringstream next;
                next << "^" << std::string(indent, ' ') << "\\" << startChar << " +([^*-].*)$";
                nextItemExpression=next.str();
            }
        } else if (boost::regex_match(line, m, cOrderedListExpression)) {
            indent=m[1].length();
            if (sub || indent<4) {
                type=cOrdered;
                subItemTokens.push_back(TokenPtr(new markdown::token::RawText(m[3])));

                // Further items must use the same indent; any number is OK.
                std::ostringstream next;
                next << "^" << std::string(indent, ' ') << "[0-9]+\\. +(.*)$";
                nextItemExpression=next.str();
            }
        }

        if (type!=cNone) {
            CTokenGroupIter originalI=i;
            size_t itemCount=1;
            // A sub-list item is any list marker indented deeper than this
            // list.
            std::ostringstream sublistPattern;
            sublistPattern << "^" << std::string(indent, ' ') << " +(([*+-])|([0-9]+\\.)) +.*$";
            startSublistExpression=sublistPattern.str();

            // There are several options for the next line. It's another item in
            // this list (in which case this one is done); it's a continuation
            // of this line (collect it and keep going); it's the first item in
            // a sub-list (call this function recursively to collect it), it's
            // the next item in the parent list (this one is ended); or it's
            // blank.
            //
            // A blank line requires looking ahead. If the next line is an item
            // for this list, then switch this list into paragraph-items mode
            // and continue processing. If it's indented by four or more spaces
            // (more than the list itself), then it's another continuation of
            // the current item. Otherwise it's either a new paragraph (and this
            // list is ended) or the beginning of a sub-list.
            static const boost::regex cContinuedItemExpression("^ *([^ ].*)$");

            boost::regex continuedAfterBlankLineExpression("^ {"+
                boost::lexical_cast<std::string>(indent+4)+"}([^ ].*)$");
            boost::regex codeBlockAfterBlankLineExpression("^ {"+
                boost::lexical_cast<std::string>(indent+8)+"}(.*)$");

            enum NextItemType { cUnknown, cEndOfList, cAnotherItem };
            NextItemType nextItem=cUnknown;
            bool setParagraphMode=false;

            ++i;
            while (i!=end) {
                if ((*i)->isBlankLine()) {
                    // Look at the line after the blank one to decide.
                    CTokenGroupIter ii=i;
                    ++ii;
                    if (ii==end) {
                        i=ii;
                        nextItem=cEndOfList;
                    } else if ((*ii)->text()) {
                        const std::string& line(*(*ii)->text());
                        if (boost::regex_match(line, startSublistExpression)) {
                            setParagraphMode=true;
                            ++itemCount;
                            i=ii;
                            optional<TokenPtr> p=parseListBlock(i, end, true);
                            assert(p);
                            subItemTokens.push_back(*p);
                            continue;
                        } else if (boost::regex_match(line, m, nextItemExpression)) {
                            setParagraphMode=true;
                            i=ii;
                            nextItem=cAnotherItem;
                        } else if (boost::regex_match(line, m, continuedAfterBlankLineExpression)) {
                            assert(m[1].matched);
                            subItemTokens.push_back(TokenPtr(new markdown::token::BlankLine()));
                            subItemTokens.push_back(TokenPtr(new markdown::token::RawText(m[1])));
                            i=++ii;
                            continue;
                        } else if (boost::regex_match(line, m, codeBlockAfterBlankLineExpression)) {
                            setParagraphMode=true;
                            ++itemCount;
                            assert(m[1].matched);
                            subItemTokens.push_back(TokenPtr(new markdown::token::BlankLine()));

                            // Gather the whole code block belonging to this
                            // item.
                            std::string codeBlock=m[1]+'\n';
                            ++ii;
                            while (ii!=end) {
                                if ((*ii)->isBlankLine()) {
                                    CTokenGroupIter iii=ii;
                                    ++iii;
                                    // A blank line at the very end, or one
                                    // followed by a token without text,
                                    // ends the code block.
                                    if (iii==end || !(*iii)->text()) break;
                                    const std::string& nextLine(*(*iii)->text());
                                    if (boost::regex_match(nextLine, m, codeBlockAfterBlankLineExpression)) {
                                        codeBlock+='\n'+m[1]+'\n';
                                        ii=iii;
                                    } else break;
                                } else if ((*ii)->text()) {
                                    const std::string& line(*(*ii)->text());
                                    if (boost::regex_match(line, m, codeBlockAfterBlankLineExpression)) {
                                        codeBlock+=m[1]+'\n';
                                    } else break;
                                } else break;
                                ++ii;
                            }

                            subItemTokens.push_back(TokenPtr(new markdown::token::CodeBlock(codeBlock)));
                            i=ii;
                            continue;
                        } else {
                            nextItem=cEndOfList;
                        }
                    } else break;
                } else if ((*i)->text()) {
                    const std::string& line(*(*i)->text());
                    if (boost::regex_match(line, startSublistExpression)) {
                        ++itemCount;
                        optional<TokenPtr> p=parseListBlock(i, end, true);
                        assert(p);
                        subItemTokens.push_back(*p);
                        continue;
                    } else if (boost::regex_match(line, m, nextItemExpression)) {
                        nextItem=cAnotherItem;
                    } else {
                        if (boost::regex_match(line, m, cUnorderedListExpression)
                            || boost::regex_match(line, m, cOrderedListExpression))
                        {
                            // Belongs to the parent list
                            nextItem=cEndOfList;
                        } else {
                            // Plain continuation of the current item.
                            boost::regex_match(line, m, cContinuedItemExpression);
                            assert(m[1].matched);
                            subItemTokens.push_back(TokenPtr(new markdown::token::RawText(m[1])));
                            ++i;
                            continue;
                        }
                    }
                } else nextItem=cEndOfList;

                // The current item is complete; store it.
                if (!subItemTokens.empty()) {
                    subTokens.push_back(TokenPtr(new markdown::token::ListItem(subItemTokens)));
                    subItemTokens.clear();
                }

                assert(nextItem!=cUnknown);
                if (nextItem==cAnotherItem) {
                    // m still holds the nextItemExpression match.
                    subItemTokens.push_back(TokenPtr(new markdown::token::RawText(m[1])));
                    ++itemCount;
                    ++i;
                } else { // nextItem==cEndOfList
                    break;
                }
            }

            // In case we hit the end with an unterminated item...
            if (!subItemTokens.empty()) {
                subTokens.push_back(TokenPtr(new markdown::token::ListItem(subItemTokens)));
                subItemTokens.clear();
            }

            if (itemCount>1 || indent!=0) {
                if (type==cUnordered) {
                    return TokenPtr(new markdown::token::UnorderedList(subTokens, setParagraphMode));
                } else {
                    return TokenPtr(new markdown::token::OrderedList(subTokens, setParagraphMode));
                }
            } else {
                // It looked like a list, but turned out to be a false alarm.
                i=originalI;
                return none;
            }
        }
    }
    return none;
}

487

488

// Tries to parse a link reference definition at the token *i, e.g.
//
//     [id]: http://example.com/ "Title"
//     [id]: <http://example.com/> (Title)
//
// The title may be enclosed in single quotes, double quotes or parentheses,
// and may instead sit alone on the following line. On success the definition
// is stored in idTable and true is returned; i is advanced only when the
// title was taken from the following line (it is then left on that line).
// Returns false, leaving i untouched, when the line is not a reference.
bool parseReference(CTokenGroupIter& i, CTokenGroupIter end, markdown::LinkIds &idTable) {
    if ((*i)->text()) {
        // The parenthesised alternative needs escaped literal parentheses,
        // and both title forms are grouped so the optional leading spaces
        // apply to either of them.
        static const boost::regex cReference("^ {0,3}\\[(.+)\\]: +<?([^ >]+)>?(?: *(?:(?:('|\")(.*)\\3)|(?:\\((.*)\\))))?$");
        // Useful captures: 1=id, 2=url, 4/5=title

        const std::string& line1(*(*i)->text());
        boost::smatch m;
        if (boost::regex_match(line1, m, cReference)) {
            std::string id(m[1]), url(m[2]), title;
            if (m[4].matched) title=m[4];
            else if (m[5].matched) title=m[5];
            else {
                CTokenGroupIter ii=i;
                ++ii;
                if (ii!=end && (*ii)->text()) {
                    // It could be on the next line
                    static const boost::regex cSeparateTitle("^ *(?:(?:('|\")(.*)\\1)|(?:\\((.*)\\))) *$");
                    // Useful Captures: 2/3=title

                    const std::string& line2(*(*ii)->text());
                    if (boost::regex_match(line2, m, cSeparateTitle)) {
                        // Consume the title line as well.
                        ++i;
                        title=(m[2].matched ? m[2] : m[3]);
                    }
                }
            }

            idTable.add(id, url, title);
            return true;
        }
    }
    return false;
}

521

522

void flushParagraph(std::string& paragraphText, markdown::TokenGroup&

523

paragraphTokens, markdown::TokenGroup& finalTokens, bool noParagraphs)

524

{

525

if (!paragraphText.empty()) {

526

paragraphTokens.push_back(TokenPtr(new markdown::token::RawText(paragraphText)));

527

paragraphText.clear();

528

}

529

530

if (!paragraphTokens.empty()) {

531

if (noParagraphs) {

532

if (paragraphTokens.size()>1) {

533

finalTokens.push_back(TokenPtr(new markdown::token::Container(paragraphTokens)));

534

} else finalTokens.push_back(*paragraphTokens.begin());

535

} else finalTokens.push_back(TokenPtr(new markdown::token::Paragraph(paragraphTokens)));

536

paragraphTokens.clear();

537

}

538

}

539

540

// Tries to parse a header at the token *i.
//
// Two forms are recognised: hash-mark headers ("## Title ##", level = number
// of leading hashes, i is not moved) and underlined headers (a text line
// followed by a line made only of '=' for level 1 or '-' for level 2; i is
// moved onto the underline). Returns none when neither form matches.
optional<TokenPtr> parseHeader(CTokenGroupIter& i, CTokenGroupIter end) {
    if ((*i)->isBlankLine() || !(*i)->text() || !(*i)->canContainMarkup()) return none;

    // Hash-mark type
    static const boost::regex cHashHeaders("^(#{1,6}) +(.*?) *#*$");
    const std::string& titleLine=*(*i)->text();
    boost::smatch m;
    if (boost::regex_match(titleLine, m, cHashHeaders))
        return TokenPtr(new markdown::token::Header(m[1].length(), m[2]));

    // Underlined type
    CTokenGroupIter underlineIter=i;
    ++underlineIter;
    if (underlineIter==end || (*underlineIter)->isBlankLine()
        || !(*underlineIter)->text() || !(*underlineIter)->canContainMarkup())
    {
        return none;
    }

    static const boost::regex cUnderlinedHeaders("^([-=])\\1*$");
    const std::string& underline=*(*underlineIter)->text();
    if (!boost::regex_match(underline, m, cUnderlinedHeaders)) return none;

    char typeChar=std::string(m[1])[0];
    TokenPtr header=TokenPtr(new markdown::token::Header((typeChar=='='
        ? 1 : 2), *(*i)->text()));
    i=underlineIter;
    return header;
}

566

567

// Tries to parse a horizontal rule at the token *i: up to three leading
// spaces followed by three or more '-', '*' or '_' characters, optionally
// separated by spaces. Returns an "hr/" HtmlTag token, or none. i is never
// moved.
optional<TokenPtr> parseHorizontalRule(CTokenGroupIter& i, CTokenGroupIter end) {
    if ((*i)->isBlankLine() || !(*i)->text() || !(*i)->canContainMarkup()) return none;

    static const boost::regex cHorizontalRules("^ {0,3}((?:-|\\*|_) *){3,}$");
    const std::string& candidate=*(*i)->text();
    if (!boost::regex_match(candidate, cHorizontalRules)) return none;

    return TokenPtr(new markdown::token::HtmlTag("hr/"));
}

577

578

} // namespace

579

580

581

582

namespace markdown {

583

584

// Looks up a link id (after normalising it with _scrubKey) and returns its
// target, or none when the id is unknown.
optional<LinkIds::Target> LinkIds::find(const std::string& id) const {
    const Table::const_iterator entry=mTable.find(_scrubKey(id));
    if (entry==mTable.end()) return none;
    return entry->second;
}

589

590

void LinkIds::add(const std::string& id, const std::string& url, const

591

std::string& title)

592

{

593

mTable.insert(std::make_pair(_scrubKey(id), Target(url, title)));

594

}

595

596

std::string LinkIds::_scrubKey(std::string str) {

597

boost::algorithm::to_lower(str);

598

return str;

599

}

600

601

602

603

// Width used for tabs in a line's leading whitespace.
const size_t Document::cSpacesPerInitialTab=4; // Required by Markdown format

// Default width for all other tabs; same as the leading-tab width.
const size_t Document::cDefaultSpacesPerTab=cSpacesPerInitialTab;

605

606

// Creates an empty document. spacesPerTab is the width used when expanding
// tabs that come after a line's leading whitespace.
Document::Document(size_t spacesPerTab):
    cSpacesPerTab(spacesPerTab),
    mTokenContainer(new token::Container),
    mIdTable(new LinkIds),
    mProcessed(false)
{
    // Nothing further to do.
}

612

613

// Creates a document and immediately reads all of its source text from in.
// spacesPerTab has the same meaning as in the other constructor.
Document::Document(std::istream& in, size_t spacesPerTab):
    cSpacesPerTab(spacesPerTab),
    mTokenContainer(new token::Container),
    mIdTable(new LinkIds),
    mProcessed(false)
{
    read(in);
}

619

620

// Releases the link-id table allocated by the constructors.
Document::~Document()
{
    delete mIdTable;
}

623

624

bool Document::read(const std::string& src) {

625

std::istringstream in(src);

626

return read(in);

627

}

628

629

bool Document::_getline(std::istream& in, std::string& line) {

630

// Handles \n, \r, and \r\n (and even \n\r) on any system. Also does tab-

631

// expansion, since this is the most efficient place for it.

632

line.clear();

633

634

bool initialWhitespace=true;

635

char c;

636

while (in.get(c)) {

637

if (c=='\r') {

638

if ((in.get(c)) && c!='\n') in.unget();

639

return true;

640

} else if (c=='\n') {

641

if ((in.get(c)) && c!='\r') in.unget();

642

return true;

643

} else if (c=='\t') {

644

size_t convert=(initialWhitespace ? cSpacesPerInitialTab :

645

cSpacesPerTab);

646

line+=std::string(convert-(line.length()%convert), ' ');

647

} else {

648

line.push_back(c);

649

if (c!=' ') initialWhitespace=false;

650

}

651

}

652

return !line.empty();

653

}

654

655

bool Document::read(std::istream& in) {

656

if (mProcessed) return false;

657

658

token::Container *tokens=dynamic_cast<token::Container*>(mTokenContainer.get());

659

assert(tokens!=0);

660

661

std::string line;

662

TokenGroup tgt;

663

while (_getline(in, line)) {

664

if (isBlankLine(line)) {

665

tgt.push_back(TokenPtr(new token::BlankLine(line)));

666

} else {

667

tgt.push_back(TokenPtr(new token::RawText(line)));

668

}

669

}

670

tokens->appendSubtokens(tgt);

671

672

return true;

673

}

674

675

void Document::write(std::ostream& out) {

676

_process();

677

mTokenContainer->writeAsHtml(out);

678

}

679

680

void Document::writeTokens(std::ostream& out) {

681

_process();

682

mTokenContainer->writeToken(0, out);

683

}

684

685

std::string Document::asHtml() {

686

687

_process();

688

689

std::stringstream ss( std::ios_base::in | std::ios_base::out );

690

691

mTokenContainer->writeAsHtml(ss);

692

int size = ss.tellp();

693

ss.seekp( 0 );

694

695

char *buffer = new char[ size + 1 ];

696

ss.read( buffer, size );

697

698

return std::string( buffer );

699

}

700

701

void Document::_process() {

702

if (!mProcessed) {

703

_mergeMultilineHtmlTags();

704

_processInlineHtmlAndReferences();

705

_processBlocksItems(mTokenContainer);

706

_processParagraphLines(mTokenContainer);

707

mTokenContainer->processSpanElements(*mIdTable);

708

mProcessed=true;

709

}

710

}

711

712

void Document::_mergeMultilineHtmlTags() {

713

static const boost::regex cHtmlTokenStart("<((/?)([a-zA-Z0-9]+)(?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\5))*? */? *))$");

714

static const boost::regex cHtmlTokenEnd("^ *((?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\3))*? */? *))>");

715

716

TokenGroup processed;

717

718

token::Container *tokens=dynamic_cast<token::Container*>(mTokenContainer.get());

719

assert(tokens!=0);

720

721

for (TokenGroup::const_iterator i=tokens->subTokens().begin(),

722

ie=tokens->subTokens().end(); i!=ie; ++i)

723

{

724

if ((*i)->text() && boost::regex_match(*(*i)->text(), cHtmlTokenStart)) {

725

TokenGroup::const_iterator i2=i;

726

++i2;

727

if (i2!=tokens->subTokens().end() && (*i2)->text() &&

728

boost::regex_match(*(*i2)->text(), cHtmlTokenEnd))

729

{

730

processed.push_back(TokenPtr(new markdown::token::RawText(*(*i)->text()+' '+*(*i2)->text())));

731

++i;

732

continue;

733

}

734

}

735

processed.push_back(*i);

736

}

737

tokens->swapSubtokens(processed);

738

}

739

740

void Document::_processInlineHtmlAndReferences() {

741

TokenGroup processed;

742

743

token::Container *tokens=dynamic_cast<token::Container*>(mTokenContainer.get());

744

assert(tokens!=0);

745

746

for (TokenGroup::const_iterator ii=tokens->subTokens().begin(),

747

iie=tokens->subTokens().end(); ii!=iie; ++ii)

748

{

749

if ((*ii)->text()) {

750

if (processed.empty() || processed.back()->isBlankLine()) {

751

optional<TokenPtr> inlineHtml=parseInlineHtml(ii, iie);

752

if (inlineHtml) {

753

processed.push_back(*inlineHtml);

754

if (ii==iie) break;

755

continue;

756

}

757

}

758

759

if (parseReference(ii, iie, *mIdTable)) {

760

if (ii==iie) break;

761

continue;

762

}

763

764

// If it gets down here, just store it in its current (raw text)

765

// form. We'll group the raw text lines into paragraphs in a

766

// later pass, since we can't easily tell where paragraphs

767

// end until then.

768

}

769

processed.push_back(*ii);

770

}

771

tokens->swapSubtokens(processed);

772

}

773

774

// Builds block-level items inside a container token. Each text token is
// offered, in order, to the header, horizontal-rule, list, block-quote and
// code-block parsers; the first one that succeeds replaces the consumed
// lines, and the new item is processed recursively. Nested containers are
// processed recursively too. Tokens that neither carry text nor are
// containers are not carried over. Non-container input is ignored.
//
// NOTE(review): after a successful parse the position is advanced once more
// before the next token is examined; confirm that every parser leaves the
// position on its last consumed token.
void Document::_processBlocksItems(TokenPtr inTokenContainer) {
    if (!inTokenContainer->isContainer()) return;

    token::Container *tokens=dynamic_cast<token::Container*>(inTokenContainer.get());
    assert(tokens!=0);

    TokenGroup rebuilt;

    TokenGroup::const_iterator current=tokens->subTokens().begin();
    TokenGroup::const_iterator last=tokens->subTokens().end();
    while (current!=last) {
        if ((*current)->text()) {
            optional<TokenPtr> block=parseHeader(current, last);
            if (!block) block=parseHorizontalRule(current, last);
            if (!block) block=parseListBlock(current, last);
            if (!block) block=parseBlockQuote(current, last);
            if (!block) block=parseCodeBlock(current, last);

            if (block) {
                _processBlocksItems(*block);
                rebuilt.push_back(*block);
                if (current==last) break;
            } else {
                rebuilt.push_back(*current);
            }
        } else if ((*current)->isContainer()) {
            _processBlocksItems(*current);
            rebuilt.push_back(*current);
        }
        ++current;
    }
    tokens->swapSubtokens(rebuilt);
}

806

807

// Groups consecutive raw text lines of a container into paragraphs.
//
// Nested containers are handled first (recursively). Then markup-capable
// text lines that do not inhibit paragraphs are joined with single spaces
// into one paragraph; any other token ends the current paragraph and is
// carried over unchanged. A line ending in two or more spaces forces a hard
// line break: the text so far is flushed and a "br/" tag is emitted.
//
// The hard-break expression requires two trailing spaces, as the Markdown
// syntax specifies; a single trailing space must not produce a break.
void Document::_processParagraphLines(TokenPtr inTokenContainer) {
    token::Container *tokens=dynamic_cast<token::Container*>(inTokenContainer.get());
    assert(tokens!=0);

    bool noPara=tokens->inhibitParagraphs();
    for (TokenGroup::const_iterator ii=tokens->subTokens().begin(),
        iie=tokens->subTokens().end(); ii!=iie; ++ii)
    {
        if ((*ii)->isContainer()) _processParagraphLines(*ii);
    }

    TokenGroup processed;
    std::string paragraphText;
    TokenGroup paragraphTokens;
    for (TokenGroup::const_iterator ii=tokens->subTokens().begin(),
        iie=tokens->subTokens().end(); ii!=iie; ++ii)
    {
        if ((*ii)->text() && (*ii)->canContainMarkup() && !(*ii)->inhibitParagraphs()) {
            // Capture 1 = the line without its final two spaces.
            static const boost::regex cExpression("^(.*)  $");
            if (!paragraphText.empty()) paragraphText+=" ";

            boost::smatch m;
            if (boost::regex_match(*(*ii)->text(), m, cExpression)) {
                paragraphText += m[1];
                flushParagraph(paragraphText, paragraphTokens, processed, noPara);
                processed.push_back(TokenPtr(new markdown::token::HtmlTag("br/")));
            } else paragraphText += *(*ii)->text();
        } else {
            flushParagraph(paragraphText, paragraphTokens, processed, noPara);
            processed.push_back(*ii);
        }
    }

    // Make sure the last paragraph is properly flushed too.
    flushParagraph(paragraphText, paragraphTokens, processed, noPara);

    tokens->swapSubtokens(processed);
}

843

844

} // namespace markdown