255
257
LOGDEB1(("FileInterner:: after ucomp: m_tdir %s, tfile %s\n",
256
258
m_tdir.dirname(), m_tfile.c_str()));
258
// Note: still using the original file's stat. right ?
259
l_mime = mimetype(m_fn, stp, m_cfg, usfci);
260
// Stat the uncompressed file, mainly to get the size
262
if (stat(m_fn.c_str(), &ucstat) != 0) {
263
LOGERR(("FileInterner: can't stat the uncompressed file"
264
"[%s] errno %d\n", m_fn.c_str(), errno));
267
docsize = ucstat.st_size;
269
l_mime = mimetype(m_fn, &ucstat, m_cfg, usfci);
260
270
if (l_mime.empty() && imime)
366
378
m_handlers.reserve(MAXHANDLERS);
367
379
for (unsigned int i = 0; i < MAXHANDLERS; i++)
368
380
m_tmpflgs[i] = false;
369
m_targetMType = cstr_stxtplain;
381
m_targetMType = cstr_textplain;
372
// We used a single beagle cache object to access beagle data. We protect it
384
// We use a single beagle cache object to access beagle data. We protect it
373
385
// against multiple thread access.
374
386
static PTMutexInit o_beagler_mutex;
391
403
// and use some kind of backstore object factory next time we add a
392
404
// backend (if ever).
394
map<string, string>::const_iterator it;
395
if ((it = idoc.meta.find(Rcl::Doc::keybcknd)) != idoc.meta.end())
396
backend = it->second;
406
idoc.getmeta(Rcl::Doc::keybcknd, &backend);
398
408
if (backend.empty() || !backend.compare("FS")) {
399
409
// Filesystem document. Intern from file.
400
410
// The url has to be like file://
401
if (idoc.url.find("file://") != 0) {
411
if (idoc.url.find(cstr_fileu) != 0) {
402
412
LOGERR(("FileInterner: FS backend and non fs url: [%s]\n",
403
413
idoc.url.c_str()));
413
423
init(fn, &st, cnf, flags, &idoc.mimetype);
414
424
} else if (!backend.compare("BGL")) {
426
if (!idoc.getmeta(Rcl::Doc::keyudi, &udi) || udi.empty()) {
427
LOGERR(("FileInterner:: no udi in idoc\n"));
417
map<string,string>::const_iterator it =
418
idoc.meta.find(Rcl::Doc::keyudi);
419
if (it == idoc.meta.end() || it->second.empty()) {
420
LOGERR(("FileInterner:: no udi in idoc\n"));
423
string udi = it->second;
426
434
PTMutexLocker locker(o_beagler_mutex);
427
435
// Retrieve from our webcache (beagle data). The beagler
456
#include "fsindexer.h"
457
bool FileInterner::makesig(const Rcl::Doc& idoc, string& sig)
459
if (idoc.url.empty()) {
460
LOGERR(("FileInterner::makesig:: no url!\n"));
464
idoc.getmeta(Rcl::Doc::keybcknd, &backend);
466
if (backend.empty() || !backend.compare("FS")) {
467
if (idoc.url.find(cstr_fileu) != 0) {
468
LOGERR(("FileInterner: FS backend and non fs url: [%s]\n",
472
string fn = idoc.url.substr(7, string::npos);
474
if (stat(fn.c_str(), &st) < 0) {
475
LOGERR(("FileInterner:: cannot access document file: [%s]\n",
479
FsIndexer::makesig(&st, sig);
481
} else if (!backend.compare("BGL")) {
482
// Bgl sigs are empty
486
LOGERR(("FileInterner:: unknown backend: [%s]\n", backend.c_str()));
448
492
FileInterner::~FileInterner()
594
void FileInterner::getMissingFromDescription(FIMissingStore *st, const string& in)
599
// The "missing" file is text. Each line defines a missing filter
600
// and the list of mime types actually encountered that needed it (see method
601
// getMissingDescription())
603
vector<string> lines;
604
stringToTokens(in, lines, "\n");
606
for (vector<string>::const_iterator it = lines.begin();
607
it != lines.end(); it++) {
608
// Lines from the file are like:
610
// filter name string (mime1 mime2)
612
// We can't be too sure that there will never be a parenthesis
613
// inside the filter string as this comes from the filter
614
// itself. The list part is safer, so we start from the end.
615
const string& line = *it;
616
string::size_type lastopen = line.find_last_of("(");
617
if (lastopen == string::npos)
619
string::size_type lastclose = line.find_last_of(")");
620
if (lastclose == string::npos || lastclose <= lastopen + 1)
622
string smtypes = line.substr(lastopen+1, lastclose - lastopen - 1);
623
vector<string> mtypes;
624
stringToTokens(smtypes, mtypes);
625
string filter = line.substr(0, lastopen);
630
st->m_missingExternal.insert(filter);
631
for (vector<string>::const_iterator itt = mtypes.begin();
632
itt != mtypes.end(); itt++) {
633
st->m_typesForMissing[filter].insert(*itt);
550
638
// Helper for extracting a value from a map.
551
639
static inline bool getKeyValue(const map<string, string>& docdata,
552
640
const string& key, string& value)
565
// These defs are for the Dijon meta array. Rcl::Doc predefined field
566
// names are used where appropriate. In some cases, Rcl::Doc names are
567
// used inside the Dijon metadata (ex: origcharset)
568
static const string cstr_keyau("author");
569
static const string cstr_keycs("charset");
570
static const string cstr_keyct("content");
571
static const string cstr_keyds("description");
572
static const string cstr_keyfn("filename");
573
static const string cstr_keymd("modificationdate");
574
static const string cstr_keymt("mimetype");
575
static const string cstr_keytt("title");
577
653
bool FileInterner::dijontorcl(Rcl::Doc& doc)
579
655
Dijon::Filter *df = m_handlers.back();
587
663
for (map<string,string>::const_iterator it = docdata.begin();
588
664
it != docdata.end(); it++) {
589
if (it->first == cstr_keyct) {
665
if (it->first == cstr_dj_keycontent) {
590
666
doc.text = it->second;
591
} else if (it->first == cstr_keymd) {
667
if (doc.fbytes.empty()) {
668
// It's normally set by walking the filter stack, in
669
// collectIpathAndMT, which was called before us. It
670
// can happen that the doc size is still empty at this
671
// point if the last container filter is directly
672
// returning text/plain content, so that there is no
673
// ipath-less filter at the top
675
sprintf(cbuf, "%d", int(doc.text.length()));
678
} else if (it->first == cstr_dj_keymd) {
592
679
doc.dmtime = it->second;
593
} else if (it->first == Rcl::Doc::keyoc) {
680
} else if (it->first == cstr_dj_keyorigcharset) {
594
681
doc.origcharset = it->second;
595
} else if (it->first == cstr_keymt || it->first == cstr_keycs) {
682
} else if (it->first == cstr_dj_keymt ||
683
it->first == cstr_dj_keycharset) {
596
684
// don't need/want these.
598
686
doc.meta[it->first] = it->second;
601
if (doc.meta[Rcl::Doc::keyabs].empty() && !doc.meta[cstr_keyds].empty()) {
602
doc.meta[Rcl::Doc::keyabs] = doc.meta[cstr_keyds];
603
doc.meta.erase(cstr_keyds);
689
if (doc.meta[Rcl::Doc::keyabs].empty() &&
690
!doc.meta[cstr_dj_keyds].empty()) {
691
doc.meta[Rcl::Doc::keyabs] = doc.meta[cstr_dj_keyds];
692
doc.meta.erase(cstr_dj_keyds);
608
697
// Collect the ipath from the current path in the document tree.
609
// While we're at it, we also set the mimetype and filename, which are special
610
// properties: we want to get them from the topmost doc
611
// with an ipath, not the last one which is usually text/plain
612
// We also set the author and modification time from the last doc
698
// While we're at it, we also set the mimetype and filename,
699
// which are special properties: we want to get them from the topmost
700
// doc with an ipath, not the last one which is usually text/plain. We
701
// also set the author and modification time from the last doc which
704
// The docsize is fetched from the first element without an ipath
705
// (first non container). If the last element directly returns
706
// text/plain so that there is no ipath-less element, the value will
707
// be set in dijontorcl().
709
// The whole thing is a bit messy but it's not obvious how it should
710
// be cleaned up as the "inheritance" rules inside the stack are
711
// actually complicated.
614
712
void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const
616
714
LOGDEB2(("FileInterner::collectIpathAndMT\n"));
632
730
for (vector<Dijon::Filter*>::const_iterator hit = m_handlers.begin();
633
731
hit != m_handlers.end(); hit++) {
634
732
const map<string, string>& docdata = (*hit)->get_meta_data();
635
if (getKeyValue(docdata, "ipath", ipathel)) {
733
if (getKeyValue(docdata, cstr_dj_keyipath, ipathel)) {
636
734
if (!ipathel.empty()) {
637
735
// We have a non-empty ipath
639
getKeyValue(docdata, cstr_keymt, doc.mimetype);
640
getKeyValue(docdata, cstr_keyfn, doc.utf8fn);
737
getKeyValue(docdata, cstr_dj_keymt, doc.mimetype);
738
getKeyValue(docdata, cstr_dj_keyfn, doc.utf8fn);
740
if (doc.fbytes.empty())
741
getKeyValue(docdata, cstr_dj_keydocsize, doc.fbytes);
642
743
doc.ipath += colon_hide(ipathel) + cstr_isep;
745
if (doc.fbytes.empty())
746
getKeyValue(docdata, cstr_dj_keydocsize, doc.fbytes);
644
747
doc.ipath += cstr_isep;
646
getKeyValue(docdata, cstr_keyau, doc.meta[Rcl::Doc::keyau]);
647
getKeyValue(docdata, cstr_keymd, doc.dmtime);
749
getKeyValue(docdata, cstr_dj_keyauthor, doc.meta[Rcl::Doc::keyau]);
750
getKeyValue(docdata, cstr_dj_keymd, doc.dmtime);
650
753
// Trim empty tail elements in ipath.
683
786
const map<string, string>& docdata = m_handlers.back()->get_meta_data();
684
787
string charset, mimetype;
685
getKeyValue(docdata, cstr_keycs, charset);
686
getKeyValue(docdata, cstr_keymt, mimetype);
788
getKeyValue(docdata, cstr_dj_keycharset, charset);
789
getKeyValue(docdata, cstr_dj_keymt, mimetype);
688
791
LOGDEB(("FileInterner::addHandler: next_doc is %s\n", mimetype.c_str()));
691
794
// general), we're done decoding. If we hit text/plain, we're done
693
796
if (!stringicmp(mimetype, m_targetMType) ||
694
!stringicmp(mimetype, cstr_stxtplain)) {
797
!stringicmp(mimetype, cstr_textplain)) {
695
798
m_reachedMType = mimetype;
696
799
LOGDEB1(("FileInterner::addHandler: target reached\n"));
697
800
return ADD_BREAK;
724
827
const string *txt = &ns;
726
829
map<string,string>::const_iterator it;
727
it = docdata.find(cstr_keyct);
830
it = docdata.find(cstr_dj_keycontent);
728
831
if (it != docdata.end())
729
832
txt = &it->second;
731
834
bool setres = false;
835
newflt->set_docsize(txt->length());
732
836
if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) {
733
837
setres = newflt->set_document_string(*txt);
734
838
} else if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA)) {