23
23
_ disallow system* manipulations from the database.
28
#include "mongo/db/pdfile_private.h"
30
#include "../util/mmap.h"
31
#include "../util/hashtab.h"
32
#include "../util/file_allocator.h"
33
#include "../util/processinfo.h"
34
#include "../util/file.h"
36
#include "btreebuilder.h"
26
#include "mongo/pch.h"
28
#include "mongo/db/pdfile.h"
37
30
#include <algorithm>
31
#include <boost/filesystem/operations.hpp>
32
#include <boost/optional/optional.hpp>
40
#include "dbhelpers.h"
41
#include "namespace-inl.h"
43
#include "curop-inl.h"
44
#include "background.h"
46
#include "ops/delete.h"
49
#include "memconcept.h"
35
#include "mongo/base/counter.h"
36
#include "mongo/db/auth/auth_index_d.h"
37
#include "mongo/db/auth/authorization_manager.h"
38
#include "mongo/db/pdfile_private.h"
39
#include "mongo/db/background.h"
40
#include "mongo/db/btree.h"
41
#include "mongo/db/commands/server_status.h"
42
#include "mongo/db/curop-inl.h"
43
#include "mongo/db/db.h"
44
#include "mongo/db/dbhelpers.h"
45
#include "mongo/db/extsort.h"
46
#include "mongo/db/index_update.h"
47
#include "mongo/db/instance.h"
48
#include "mongo/db/kill_current_op.h"
50
49
#include "mongo/db/lasterror.h"
51
#include "mongo/db/index_update.h"
53
#include <boost/filesystem/operations.hpp>
50
#include "mongo/db/memconcept.h"
51
#include "mongo/db/namespace-inl.h"
52
#include "mongo/db/namespacestring.h"
53
#include "mongo/db/ops/delete.h"
54
#include "mongo/db/repl.h"
55
#include "mongo/db/replutil.h"
56
#include "mongo/db/sort_phase_one.h"
57
#include "mongo/util/file.h"
58
#include "mongo/util/file_allocator.h"
59
#include "mongo/util/hashtab.h"
60
#include "mongo/util/mmap.h"
61
#include "mongo/util/processinfo.h"
62
#include "mongo/db/stats/timer_stats.h"
63
#include "mongo/db/stats/counters.h"
57
67
BOOST_STATIC_ASSERT( sizeof(Extent)-4 == 48+128 );
58
68
BOOST_STATIC_ASSERT( sizeof(DataFileHeader)-4 == 8192 );
60
void printMemInfo( const char * where ) {
65
if ( ! pi.supported() ) {
66
cout << " not supported" << endl;
70
cout << "vsize: " << pi.getVirtualMemorySize() << " resident: " << pi.getResidentSize() << " mapped: " << ( MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ) ) << endl;
70
//The oplog entries inserted
71
static TimerStats oplogInsertStats;
72
static ServerStatusMetricField<TimerStats> displayInsertedOplogEntries(
75
static Counter64 oplogInsertBytesStats;
76
static ServerStatusMetricField<Counter64> displayInsertedOplogEntryBytes(
77
"repl.oplog.insertBytes",
78
&oplogInsertBytesStats );
73
80
bool isValidNS( const StringData& ns ) {
74
81
// TODO: should check for invalid characters
76
const char * x = strchr( ns.data() , '.' );
83
size_t idx = ns.find( '.' );
84
if ( idx == string::npos )
87
if ( idx == ns.size() - 1 )
84
93
// TODO SERVER-4328
699
719
int delRecLength;
700
720
getEmptyLoc(nsname, myLoc, _length, capped, emptyLoc, delRecLength);
702
DeletedRecord *empty = getDur().writing( DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength) );
722
DeletedRecord* empty = getDur().writing(DataFileMgr::getDeletedRecord(emptyLoc));
703
723
empty->lengthWithHeaders() = delRecLength;
704
724
empty->extentOfs() = myLoc.getOfs();
725
empty->nextDeleted().Null();
709
bool Extent::validates(const DiskLoc diskLoc, BSONArrayBuilder* errors) {
710
bool extentOk = true;
711
if (magic != extentSignature) {
714
sb << "bad extent signature " << toHex(&magic, 4)
715
<< " in extent " << diskLoc.toString();
720
if (myLoc != diskLoc) {
723
sb << "extent " << diskLoc.toString()
724
<< " self-pointer is " << myLoc.toString();
729
if (firstRecord.isNull() != lastRecord.isNull()) {
732
if (firstRecord.isNull()) {
733
sb << "in extent " << diskLoc.toString()
734
<< ", firstRecord is null but lastRecord is "
735
<< lastRecord.toString();
738
sb << "in extent " << diskLoc.toString()
739
<< ", firstRecord is " << firstRecord.toString()
740
<< " but lastRecord is null";
746
if (length < minSize()) {
749
sb << "length of extent " << diskLoc.toString()
751
<< ", which is less than minimum length of " << minSize();
760
Record* Extent::newRecord(int len) {
761
if( firstEmptyRegion.isNull() )8
765
int newRecSize = len + Record::HeaderSize;
766
DiskLoc newRecordLoc = firstEmptyRegion;
767
Record *r = getRecord(newRecordLoc);
768
int left = r->netLength() - len;
771
firstEmptyRegion.Null();
775
DiskLoc nextEmpty = r->next.getNextEmpty(firstEmptyRegion);
776
r->lengthWithHeaders = newRecSize;
777
r->next.markAsFirstOrLastInExtent(this); // we're now last in the extent
778
if( !lastRecord.isNull() ) {
779
verify(getRecord(lastRecord)->next.lastInExtent()); // it was the last one
780
getRecord(lastRecord)->next.set(newRecordLoc); // until now
781
r->prev.set(lastRecord);
784
r->prev.markAsFirstOrLastInExtent(this); // we are the first in the extent
785
verify( firstRecord.isNull() );
786
firstRecord = newRecordLoc;
788
lastRecord = newRecordLoc;
790
if( left < Record::HeaderSize + 32 ) {
791
firstEmptyRegion.Null();
794
firstEmptyRegion.inc(newRecSize);
795
Record *empty = getRecord(firstEmptyRegion);
796
empty->next.set(nextEmpty); // not for empty records, unless in-use records, next and prev can be null.
798
empty->lengthWithHeaders = left;
729
bool Extent::validates(const DiskLoc diskLoc, BSONArrayBuilder* errors) {
730
bool extentOk = true;
731
if (magic != extentSignature) {
734
sb << "bad extent signature " << toHex(&magic, 4)
735
<< " in extent " << diskLoc.toString();
740
if (myLoc != diskLoc) {
743
sb << "extent " << diskLoc.toString()
744
<< " self-pointer is " << myLoc.toString();
749
if (firstRecord.isNull() != lastRecord.isNull()) {
752
if (firstRecord.isNull()) {
753
sb << "in extent " << diskLoc.toString()
754
<< ", firstRecord is null but lastRecord is "
755
<< lastRecord.toString();
758
sb << "in extent " << diskLoc.toString()
759
<< ", firstRecord is " << firstRecord.toString()
760
<< " but lastRecord is null";
766
if (length < minSize()) {
769
sb << "length of extent " << diskLoc.toString()
771
<< ", which is less than minimum length of " << minSize();
780
Record* Extent::newRecord(int len) {
781
if( firstEmptyRegion.isNull() )8
785
int newRecSize = len + Record::HeaderSize;
786
DiskLoc newRecordLoc = firstEmptyRegion;
787
Record *r = getRecord(newRecordLoc);
788
int left = r->netLength() - len;
791
firstEmptyRegion.Null();
795
DiskLoc nextEmpty = r->next.getNextEmpty(firstEmptyRegion);
796
r->lengthWithHeaders = newRecSize;
797
r->next.markAsFirstOrLastInExtent(this); // we're now last in the extent
798
if( !lastRecord.isNull() ) {
799
verify(getRecord(lastRecord)->next.lastInExtent()); // it was the last one
800
getRecord(lastRecord)->next.set(newRecordLoc); // until now
801
r->prev.set(lastRecord);
804
r->prev.markAsFirstOrLastInExtent(this); // we are the first in the extent
805
verify( firstRecord.isNull() );
806
firstRecord = newRecordLoc;
808
lastRecord = newRecordLoc;
810
if( left < Record::HeaderSize + 32 ) {
811
firstEmptyRegion.Null();
814
firstEmptyRegion.inc(newRecSize);
815
Record *empty = getRecord(firstEmptyRegion);
816
empty->next.set(nextEmpty); // not for empty records, unless in-use records, next and prev can be null.
818
empty->lengthWithHeaders = left;
805
825
int Extent::maxSize() {
806
826
int maxExtentSize = 0x7ff00000;
1246
1292
#pragma pack(1)
1247
struct IDToInsert_ {
1252
type = (char) jstOID;
1254
verify( sizeof(IDToInsert_) == 17 );
1257
struct IDToInsert : public BSONElement {
1258
IDToInsert() : BSONElement( ( char * )( &idToInsert_ ) ) {}
1302
bool needed() const { return type > 0; }
1305
type = static_cast<char>(jstOID);
1306
strcpy( id, "_id" );
1308
verify( size() == 17 );
1311
int size() const { return sizeof( IDToInsert ); }
1313
const char* rawdata() const { return reinterpret_cast<const char*>( this ); }
1262
1317
void DataFileMgr::insertAndLog( const char *ns, const BSONObj &o, bool god, bool fromMigrate ) {
1263
1318
BSONObj tmp = o;
1264
insertWithObjMod( ns, tmp, god );
1319
insertWithObjMod( ns, tmp, false, god );
1265
1320
logOp( "i", ns, tmp, 0, 0, fromMigrate );
1268
1323
/** @param o the object to insert. can be modified to add _id and thus be an in/out param
1270
DiskLoc DataFileMgr::insertWithObjMod(const char *ns, BSONObj &o, bool god) {
1325
DiskLoc DataFileMgr::insertWithObjMod(const char* ns, BSONObj& o, bool mayInterrupt, bool god) {
1271
1326
bool addedID = false;
1272
DiskLoc loc = insert( ns, o.objdata(), o.objsize(), god, true, &addedID );
1327
DiskLoc loc = insert( ns, o.objdata(), o.objsize(), mayInterrupt, god, true, &addedID );
1273
1328
if( addedID && !loc.isNull() )
1274
1329
o = BSONObj::make( loc.rec() );
1278
bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject );
1280
1333
// We are now doing two btree scans for all unique indexes (one here, and one when we've
1281
1334
// written the record to the collection). This could be made more efficient by inserting
1282
1335
// dummy data here, keeping pointers to the btree nodes holding the dummy data and then
1324
NOINLINE_DECL DiskLoc outOfSpace(const char *ns, NamespaceDetails *d, int lenWHdr, bool god, DiskLoc extentLoc) {
1377
NOINLINE_DECL DiskLoc outOfSpace(const char* ns, NamespaceDetails* d, int lenWHdr, bool god) {
1326
1379
if ( ! d->isCapped() ) { // size capped doesn't grow
1327
1380
LOG(1) << "allocating new extent for " << ns << " padding:" << d->paddingFactor() << " lenWHdr: " << lenWHdr << endl;
1328
1381
cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false, !god);
1329
loc = d->alloc(ns, lenWHdr, extentLoc);
1382
loc = d->alloc(ns, lenWHdr);
1330
1383
if ( loc.isNull() ) {
1331
log() << "warning: alloc() failed after allocating new extent. lenWHdr: " << lenWHdr << " last extent size:" << d->lastExtentSize << "; trying again\n";
1384
log() << "warning: alloc() failed after allocating new extent. lenWHdr: " << lenWHdr << " last extent size:" << d->lastExtentSize << "; trying again" << endl;
1332
1385
for ( int z=0; z<10 && lenWHdr > d->lastExtentSize; z++ ) {
1333
1386
log() << "try #" << z << endl;
1334
1387
cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false, !god);
1335
loc = d->alloc(ns, lenWHdr, extentLoc);
1388
loc = d->alloc(ns, lenWHdr);
1336
1389
if ( ! loc.isNull() )
1357
1409
uassert( 10095 , "attempt to insert in reserved database name 'system'", sys != ns);
1358
1410
if ( strstr(ns, ".system.") ) {
1359
1411
// later:check for dba-type permissions here if have that at some point separate
1360
if ( strstr(ns, ".system.indexes" ) )
1412
if (NamespaceString(ns).coll == "system.indexes")
1361
1413
wouldAddIndex = true;
1362
1414
else if ( legalClientSystemNS( ns , true ) ) {
1363
1415
if ( obuf && strstr( ns , ".system.users" ) ) {
1364
1416
BSONObj t( reinterpret_cast<const char *>( obuf ) );
1365
uassert( 14051 , "system.users entry needs 'user' field to be a string" , t["user"].type() == String );
1366
uassert( 14052 , "system.users entry needs 'pwd' field to be a string" , t["pwd"].type() == String );
1367
uassert( 14053 , "system.users entry needs 'user' field to be non-empty" , t["user"].String().size() );
1368
uassert( 14054 , "system.users entry needs 'pwd' field to be non-empty" , t["pwd"].String().size() );
1417
uassertStatusOK(AuthorizationManager::checkValidPrivilegeDocument(
1418
nsToDatabaseSubstring(ns), t));
1371
1421
else if ( !god ) {
1372
// todo this should probably uassert rather than doing this:
1373
log() << "ERROR: attempt to insert in system namespace " << ns << endl;
1422
uasserted(16459, str::stream() << "attempt to insert in system namespace '"
1407
1462
background = false;
1410
int idxNo = tableToIndex->nIndexes;
1411
IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes
1412
getDur().writingDiskLoc(idx.info) = loc;
1465
// The total number of indexes right before we write to the collection
1466
int oldNIndexes = -1;
1467
int idxNo = tableToIndex->getTotalIndexCount();
1468
std::string idxName = info["name"].valuestr();
1470
// Set curop description before setting indexBuildInProg, so that there's something
1471
// commands can find and kill as soon as indexBuildInProg is set. Only set this if it's a
1472
// killable index, so we don't overwrite commands in currentOp.
1474
cc().curop()->setQuery(info);
1414
buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background);
1478
IndexDetails& idx = tableToIndex->getNextIndexDetails(tabletoidxns.c_str());
1479
// It's important that this is outside the inner try/catch so that we never try to call
1480
// kill_idx on a half-formed disk loc (if this asserts).
1481
getDur().writingDiskLoc(idx.info) = loc;
1484
getDur().writingInt(tableToIndex->indexBuildsInProgress) += 1;
1485
buildAnIndex(tabletoidxns, tableToIndex, idx, background, mayInterrupt);
1487
catch (DBException& e) {
1488
// save our error msg string as an exception or dropIndexes will overwrite our message
1489
LastError *le = lastError.get();
1493
savecode = le->code;
1494
saveerrmsg = le->msg;
1497
savecode = e.getCode();
1498
saveerrmsg = e.what();
1501
// Recalculate the index # so we can remove it from the list in the next catch
1502
idxNo = IndexBuildsInProgress::get(tabletoidxns.c_str(), idxName);
1503
// roll back this index
1506
verify(le && !saveerrmsg.empty());
1507
setLastError(savecode,saveerrmsg.c_str());
1511
// Recompute index numbers
1512
tableToIndex = nsdetails(tabletoidxns);
1513
idxNo = IndexBuildsInProgress::get(tabletoidxns.c_str(), idxName);
1516
// Make sure the newly created index is relocated to nIndexes, if it isn't already there
1517
if (idxNo != tableToIndex->nIndexes) {
1518
log() << "switching indexes at position " << idxNo << " and "
1519
<< tableToIndex->nIndexes << endl;
1520
// We cannot use idx here, as it may point to a different index entry if it was
1521
// flipped during building
1522
IndexDetails temp = tableToIndex->idx(idxNo);
1523
*getDur().writing(&tableToIndex->idx(idxNo)) =
1524
tableToIndex->idx(tableToIndex->nIndexes);
1525
*getDur().writing(&tableToIndex->idx(tableToIndex->nIndexes)) = temp;
1527
// We also have to flip multikey entries
1528
bool tempMultikey = tableToIndex->isMultikey(idxNo);
1529
tableToIndex->setIndexIsMultikey(tabletoidxns.c_str(), idxNo,
1530
tableToIndex->isMultikey(tableToIndex->nIndexes));
1531
tableToIndex->setIndexIsMultikey(tabletoidxns.c_str(), tableToIndex->nIndexes,
1534
idxNo = tableToIndex->nIndexes;
1537
// Store the current total of indexes in case something goes wrong actually adding the
1539
oldNIndexes = tableToIndex->getTotalIndexCount();
1541
// clear transient info caches so they refresh; increments nIndexes
1542
tableToIndex->addIndex(tabletoidxns.c_str());
1543
getDur().writingInt(tableToIndex->indexBuildsInProgress) -= 1;
1545
IndexType* indexType = idx.getSpec().getType();
1546
const IndexPlugin *plugin = indexType ? indexType->getPlugin() : NULL;
1548
plugin->postBuildHook( idx.getSpec() );
1416
catch( DBException& e ) {
1417
// save our error msg string as an exception or dropIndexes will overwrite our message
1418
LastError *le = lastError.get();
1422
savecode = le->code;
1423
saveerrmsg = le->msg;
1426
savecode = e.getCode();
1427
saveerrmsg = e.what();
1430
// roll back this index
1431
string name = idx.indexName();
1434
bool ok = dropIndexes(tableToIndex, tabletoidxns.c_str(), name.c_str(), errmsg, b, true);
1436
log() << "failed to drop index after a unique key error building it: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl;
1439
verify( le && !saveerrmsg.empty() );
1440
setLastError(savecode,saveerrmsg.c_str());
1553
// Generally, this will be called as an exception from building the index bubbles up.
1554
// Thus, the index will have already been cleaned up. This catch just ensures that the
1555
// metadata is consistent on any exception. It may leak like a sieve if the index
1556
// successfully finished building and addIndex or kill_idx threw.
1558
// Check if nIndexes was incremented
1559
if (oldNIndexes != -1 && oldNIndexes != tableToIndex->nIndexes) {
1560
getDur().writingInt(tableToIndex->nIndexes) = oldNIndexes;
1563
// Move any other in prog indexes "back" one. It is important that idxNo is set
1564
// correctly so that the correct index is removed
1565
IndexBuildsInProgress::remove(tabletoidxns.c_str(), idxNo);
1566
getDur().writingInt(tableToIndex->indexBuildsInProgress) -= 1;
1445
/* if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc
1446
after the call -- that will prevent a double buffer copy in some cases (btree.cpp).
1448
@param mayAddIndex almost always true, except for invocation from rename namespace command.
1449
@param addedID if not null, set to true if adding _id element. you must assure false before calling
1453
DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, bool mayAddIndex, bool *addedID) {
1572
// indexName is passed in because index details may not be pointing to something valid at this
1574
int IndexBuildsInProgress::get(const char* ns, const std::string& indexName) {
1575
Lock::assertWriteLocked(ns);
1576
NamespaceDetails* nsd = nsdetails(ns);
1578
// Go through unfinished index builds and try to find this index
1579
for (int i=nsd->nIndexes; i<nsd->nIndexes+nsd->indexBuildsInProgress; i++) {
1580
if (indexName == nsd->idx(i).indexName()) {
1588
void IndexBuildsInProgress::remove(const char* ns, int offset) {
1589
Lock::assertWriteLocked(ns);
1590
NamespaceDetails* nsd = nsdetails(ns);
1592
for (int i=offset; i<nsd->getTotalIndexCount(); i++) {
1593
if (i < NamespaceDetails::NIndexesMax-1) {
1594
*getDur().writing(&nsd->idx(i)) = nsd->idx(i+1);
1595
nsd->setIndexIsMultikey(ns, i, nsd->isMultikey(i+1));
1598
*getDur().writing(&nsd->idx(i)) = IndexDetails();
1599
nsd->setIndexIsMultikey(ns, i, false);
1604
DiskLoc DataFileMgr::insert(const char* ns,
1454
1611
bool wouldAddIndex = false;
1455
1612
massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) );
1456
1613
uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );