diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..5bdf624 --- /dev/null +++ b/.clang-format @@ -0,0 +1,46 @@ +# +# http://clang.llvm.org/docs/ClangFormatStyleOptions.html +# +AccessModifierOffset: -4 +ConstructorInitializerIndentWidth: 4 +AlignEscapedNewlinesLeft: false +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakTemplateDeclarations: false +AlwaysBreakBeforeMultilineStrings: false +BreakBeforeBinaryOperators: false +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BinPackParameters: true +ColumnLimit: 0 +ConstructorInitializerAllOnOneLineOrOnePerLine: false +DerivePointerBinding: false +ExperimentalAutoDetectBinPacking: false +IndentCaseLabels: true +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: All +ObjCSpaceBeforeProtocolList: true +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 60 +PenaltyBreakString: 1000 +PenaltyBreakFirstLessLess: 120 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerBindsToType: false +SpacesBeforeTrailingComments: 1 +Cpp11BracedListStyle: false +Standard: Cpp03 +IndentWidth: 4 +TabWidth: 4 +UseTab: Never +BreakBeforeBraces: Allman +IndentFunctionDeclarationAfterType: false +SpacesInParentheses: false +SpacesInAngles: false +SpaceInEmptyParentheses: false +SpacesInCStyleCastParentheses: false +SpaceAfterControlStatementKeyword: true +SpaceBeforeAssignmentOperators: true +ContinuationIndentWidth: 4 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..15546a3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +build +.cache +data +index.lmdb +index.lmdb-lock +test/requirements.txt +test/.test_venv +.vscode/settings.json +__pycache__ diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..e1b025d --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,28 @@ +{ + // Use 
IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "(gdb) Attach", + "type": "cppdbg", + "request": "attach", + "program": "/home/urpc/hs5/build/hs5", + "MIMode": "gdb", + "setupCommands": [ + { + "description": "Enable pretty-printing for gdb", + "text": "-enable-pretty-printing", + "ignoreFailures": true + }, + { + "description": "Set Disassembly Flavor to Intel", + "text": "-gdb-set disassembly-flavor intel", + "ignoreFailures": true + } + ] + } + + ] +} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..f4f54dd --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,25 @@ +cmake_minimum_required(VERSION 3.15) +project(hs5 VERSION 0.1.0 LANGUAGES CXX C) + +add_executable(hs5 + lmdb/midl.cpp + lmdb/mdb.c + lmdb/cppmidl.cpp + main.cpp + s3handler.cpp + SingleFileStorage.cpp + data.cpp + os_functions.cpp + utils.cpp ) + +#set(GCC_COVERAGE_COMPILE_FLAGS "-fcoroutines") + +add_definitions(${GCC_COVERAGE_COMPILE_FLAGS}) + +find_package(folly CONFIG REQUIRED) +find_package(proxygen CONFIG REQUIRED) +find_package(zstd CONFIG REQUIRED) +target_link_libraries(hs5 PRIVATE $<IF:$<TARGET_EXISTS:zstd::libzstd_shared>,zstd::libzstd_shared,zstd::libzstd_static> Folly::folly Folly::folly_deps Folly::follybenchmark Folly::folly_test_util proxygen::proxygen proxygen::proxygencurl proxygen::proxygenhttpserver) + + +target_compile_features(hs5 PUBLIC cxx_std_20) \ No newline at end of file diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..be3f7b2 --- /dev/null +++ b/COPYING @@ -0,0 +1,661 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed.
+ + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. 
It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. 
Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. 
Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. 
If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. 
+ + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the 
material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. 
If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. 
+ + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. 
+ + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. 
+ + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code.
There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +<https://www.gnu.org/licenses/>. diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..be3f7b2 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,661 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software.
+ + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. 
The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. 
+ + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. 
+ + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. 
There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +. diff --git a/README.md b/README.md index 0c5ed6d..994f7a9 100644 --- a/README.md +++ b/README.md @@ -1 +1,11 @@ -# hs5 \ No newline at end of file +High-performance scale-up self-hosted simple storage service (hs5) + +* High performance: Designed to run with high performance + +* Scale-up: Runs only on a single node. To scale it use it on a better machine. With machines with terabytes of RAM and hundreds of terabytes of storage available this might be enough for many use cases + +* Self-hosted: You run it yourself, keeping ownership and responsibility of your data + +* Simple: Simple to setup and run. API-compatible with AWS S3 API + +* Storage Service: An object storage service like AWS S3 \ No newline at end of file diff --git a/SingleFileStorage.cpp b/SingleFileStorage.cpp new file mode 100644 index 0000000..b1d6034 --- /dev/null +++ b/SingleFileStorage.cpp @@ -0,0 +1,8378 @@ +/** + * Copyright Martin Raiber. All Rights Reserved. 
+ * SPDX-License-Identifier: AGPL-3.0-or-later + */ +#include "SingleFileStorage.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "os_functions.h" +#include "utils.h" +#include +#include + +#include +#include +#include +#include +#include "data.h" + +using namespace std::chrono_literals; + + +#ifndef _WIN32 +#include +#include +#include +#include +#else +#define aligned_alloc(allign, size) malloc(size) +int getpagesize() { return 4096; } +const int PROT_READ = 1; +const int PROT_WRITE = 2; +const int MAP_SHARED = 2; +const int MS_SYNC = 1; +void* mmap(void* addr, size_t length, int prot, int flags, + void* fd, off_t offset) { + return nullptr; +} +int munmap(void* addr, size_t length) { + return 0; +} +int msync(void* addr, size_t length, int flags); +typedef int64 ssize_t; +int fallocate64(void* fd, int mode, int64_t offset, int64_t len) { + return 1; +} +#endif +#include +#include "relaxed_atomic.h" + +#define EXT_DEBUG(x) + +namespace +{ + int64_t div_up(int64_t num, int64_t div) + { + if (num%div == 0) + { + return num / div; + } + else + { + return num / div + 1; + } + } + + const int64_t block_size = 4096; + const size_t max_extent_num = 64; + const size_t num_cached_free_exts = 1000; + const size_t max_defrag_extents = 1000000; + const int64_t data_file_copy_num_bytes = 100 * 1024 * 1024; + const size_t n_rewrite_pages = (128 * 1024) / 4096; + + /** Compare two varint lexically */ + static int + mdb_cmp_varint(const MDB_val *a, const MDB_val *b) + { + if (a->mv_size != b->mv_size) + { + return a->mv_size < b->mv_size ? 
-1 : 1; + } + + return memcmp(a->mv_data, b->mv_data, a->mv_size); + } + + static int + mdb_cmp_varint_rev(const MDB_val *a, const MDB_val *b) + { + return -1 * mdb_cmp_varint(a, b); + } + + static int + mdb_cmp_memn(const MDB_val* a, const MDB_val* b) + { + int diff; + ssize_t len_diff; + unsigned int len; + + len = a->mv_size; + len_diff = (ssize_t)a->mv_size - (ssize_t)b->mv_size; + if (len_diff > 0) { + len = b->mv_size; + len_diff = 1; + } + + diff = memcmp(a->mv_data, b->mv_data, len); + return static_cast(diff ? diff : len_diff < 0 ? -1 : len_diff); + } + + int64_t extract_num_exts(int64_t& offset) + { + int64_t exts = offset % 512; + offset -= exts; + return exts; + } + + int64_t encode_num_exts(int64_t offset, int64_t exts) + { + assert(exts < 512); + assert(offset % 512 == 0); + return offset + exts; + } + + class SingleFileStorageMigrate + { + SingleFileStorage* sfs; + public: + SingleFileStorageMigrate(SingleFileStorage* sfs) + :sfs(sfs) {} + + void operator()() + { + sfs->migrate_thread(); + delete this; + } + }; + + bool writeZeroes(int64_t offset, const folly::File& file, int64_t num, int set_val) + { + static char buf[4096] = {}; + + while (num > 0) + { + int64_t towrite = (std::min)((int64_t)4096, num); + if (folly::writeNoInt(file.fd(), buf, static_cast(towrite))!= towrite) + { + return false; + } + num -= towrite; + offset += towrite; + } + return true; + } + + + + class TmpMmapedFileBitmap + { + public: + TmpMmapedFileBitmap(int64_t n, bool init_set) + : mmap_ptr(nullptr), backing_file(folly::File::temporary()) + { + resize(n, init_set); + } + + ~TmpMmapedFileBitmap() + { + if (mmap_ptr != nullptr) + { + munmap(mmap_ptr, bitmap_size); + } + } + + void resize(int64_t n, bool init_set) + { + if (mmap_ptr != nullptr) + { + munmap(mmap_ptr, bitmap_size); + mmap_ptr = nullptr; + } + + total_size = n; + bitmap_size = static_cast(n / 8 + (n % 8 == 0 ? 
0 : 1)); + + if (fileSize(backing_file) < static_cast(bitmap_size)) + { + while (!writeZeroes(fileSize(backing_file), backing_file, bitmap_size - fileSize(backing_file), init_set ? 255 : 0)) + { + XLOG(ERR) << "Error resizing bitmap file. Retrying..."; + std::this_thread::sleep_for(1s); + } + } + + if(bitmap_size==0) + return; + + mmap_ptr = reinterpret_cast(mmap(NULL, bitmap_size, PROT_READ|PROT_WRITE, MAP_SHARED, backing_file.fd(), 0)); + + if (mmap_ptr == reinterpret_cast(-1)) + { + XLOGF(ERR, "Error creating mmap of bitmap. {}", folly::errnoStr(errno)); + abort(); + } + } + + void set(int64_t i, bool v) + { + size_t bitmap_byte = (size_t)(i / 8); + size_t bitmap_bit = i % 8; + + unsigned char b = mmap_ptr[bitmap_byte]; + + if (v == true) + b = b | (1 << (7 - bitmap_bit)); + else + b = b & (~(1 << (7 - bitmap_bit))); + + mmap_ptr[bitmap_byte] = b; + } + + size_t set_range(int64_t start, int64_t end, bool v) + { + size_t set_bits = 0; + for (; start < end; ++start) + { + if (get(start) != v) + { + set(start, v); + ++set_bits; + } + } + return set_bits; + } + + char getb(int64_t i) const + { + size_t bitmap_byte = (size_t)(i / 8); + return mmap_ptr[bitmap_byte]; + } + + bool get(int64_t i) const + { + size_t bitmap_byte = (size_t)(i / 8); + size_t bitmap_bit = i % 8; + + unsigned char b = mmap_ptr[bitmap_byte]; + + bool has_bit = ((b & (1 << (7 - bitmap_bit))) > 0); + + return has_bit; + } + + bool flush() + { + if(mmap_ptr==nullptr) + return true; + + return msync(mmap_ptr, bitmap_size, MS_SYNC) == 0; + } + + size_t count_bits() + { + size_t set_count = 0; + for (int64_t i = 0; i < total_size;) + { + if (i % 8 == 0 + && getb(i) == 0) + { + i += 8; + continue; + } + + if (get(i)) + { + ++set_count; + } + + ++i; + } + + return set_count; + } + + bool get_range(int64_t start, int64_t end) const + { + for (; start < end; ++start) + { + if (get(start)) + { + return true; + } + } + return false; + } + + int64_t size() + { + return total_size; + } + + size_t meminfo() + 
{ + return bitmap_size; + } + + + private: + folly::File backing_file; + size_t bitmap_size; + int64_t total_size; + char* mmap_ptr; + }; + + const char dbi_size_info_size = 0; + const char dbi_size_info_next_disk_id = 1; + const char dbi_size_info_migration = 2; + const char dbi_size_info_ext_freespace = 3; + +} + +#ifndef _WIN32 +void mmap_read_error(int sig, siginfo_t *si, void *unused) +{ + if (sig != SIGBUS) + { + abort(); + } + SingleFileStorage::handle_mmap_read_error(si->si_addr); +} +#endif + +std::mutex SingleFileStorage::mmap_read_error_mutex; +std::unordered_map > > SingleFileStorage::mmap_read_error_jmp; +std::vector SingleFileStorage::mmap_dbs; + +SingleFileStorage::SingleFileStorage(SFSOptions options) + : data_file(options.data_path + os_file_sep() + "data", O_RDWR|O_CREAT|O_CLOEXEC), + data_file_path(options.data_path + os_file_sep() + "data"), + data_file_max_size(0), data_file_offset(0), data_file_offset_end(-1), data_file_free(0), do_quit(false), + min_free_space(20LL * 1024 * 1024 * 1024), is_defragging(false), defrag_restart(0), db_path(options.db_path), + is_dead(false), write_offline(false), curr_transid(1), startup_finished(false), + force_freespace_check(true), stop_defrag(false), allow_defrag(true), next_disk_id(1), data_file_copy_done(-1), data_file_copy_max(0), data_file_copy_done_sync(0), + stop_data_file_copy(false), references(0), db_env(nullptr), freespace_cache_path(options.freespace_cache_path), cache_db_env(nullptr), regen_freespace_cache(false), + sync_freespace_cache(true), mdb_curr_sync(false), data_file_size_limit(options.data_file_size_limit_mb*1024*1024), alloc_chunk_size(options.alloc_chunk_size), + runtime_id(options.runtime_id), manual_commit(options.manual_commit), stop_on_error(options.stop_on_error), punch_holes(options.punch_holes) +{ + int64_t index_file_size = 0; + + int64_t total_space = os_total_space(options.data_path); + if (total_space > 0 && total_space < 100LL * 1024 * 1024 * 1024) + { + min_free_space = 2LL 
* 1024 * 1024 * 1024; + XLOGF(INFO, "Minimum free space: {}", folly::prettyPrint(min_free_space, folly::PRETTY_BYTES_IEC)); + } + + int64_t mapsize = total_space / 175; + + if (mapsize < 1LL * 1024 * 1024 * 1024) + { + mapsize = 1LL * 1024 * 1024 * 1024; + } + + mapsize = ((mapsize + 1 * 1024 * 1024) / (1 * 1024 * 1024)) * 1 * 1024 * 1024; + + std::string index_lmdb_fn = db_path + os_file_sep() + "index.lmdb"; + + XLOGF(INFO, "{} max size: {}", index_lmdb_fn, folly::prettyPrint(mapsize * 2, folly::PRETTY_BYTES_IEC)); + + { + int fd = open(index_lmdb_fn.c_str(), O_RDONLY | O_CLOEXEC); + if (fd != -1) + { + index_file_size = fileSize(fd); + } + else + { + if (std::filesystem::exists(options.data_path + os_file_sep() + "active")) + { + throw std::runtime_error("File storage was active before (file "+ options.data_path + os_file_sep() + + "active present) but there is no index file at "+ index_lmdb_fn +" (index_file_not_found)"); + } + + MDB_env* tmp_env; + int rc = mdb_env_create(&tmp_env); + if (rc) + { + throw std::runtime_error("LMDB(0): Failed to create LMDB env (" + (std::string)mdb_strerror(rc) + ")"); + } + + rc = mdb_env_set_mapsize(tmp_env, mapsize * 2); + + if (rc) + { + throw std::runtime_error("LMDB(0): Failed to set map size (" + (std::string)mdb_strerror(rc) + ")"); + } + + rc = mdb_env_open(tmp_env, index_lmdb_fn.c_str(), MDB_NOSUBDIR|MDB_NOLOCK, 0664); + + if (rc) + { + throw std::runtime_error("LMDB(0): Failed to open LMDB database file (" + (std::string)mdb_strerror(rc) + ")"); + } + + mdb_env_close(tmp_env); + + fd = open(index_lmdb_fn.c_str(), O_RDONLY | O_CLOEXEC); + if (fd != -1) + { + index_file_size = fileSize(fd); + } + else + { + throw std::runtime_error("LMDB(0): Cannot open index file to get size"); + } + } + + if (!(os_get_file_type(db_path + os_file_sep() + "index.lmdb-lock") & EFileType_Symlink)) + { + std::filesystem::remove(db_path + os_file_sep() + "index.lmdb-lock"); + if (symlink(("/tmp/index.lmdb-lock-" + random_uuid()).c_str(), 
(db_path + os_file_sep() + "index.lmdb-lock").c_str())!=0) + { + throw std::runtime_error("Error creating symlink at " + db_path + os_file_sep() + "index.lmdb-lock. " + folly::errnoStr(errno)); + } + } + } + + /*if (db_path.find("f0f42ce575714ecf80ec9324e59ef60a") != std::string::npos + && cache_path.empty()) + { + Server->Log("Enabling debug freespace cache at /var/log/freespace_cache1", LL_WARNING); + cache_path = "/var/log/freespace_cache1"; + }*/ + + bool use_dm_cache = false; + if (!options.dm_cache_path.empty() + && options.dm_cache_size > 20 * 1024 * 1024) + { + int64_t index_size = mapsize * 2; + + if (options.dm_cache_size > index_size) + options.dm_cache_size = index_size; + + int64_t cache_meta_size = options.dm_cache_size / 100; + + cache_meta_size = ((cache_meta_size + 1 * 1024 * 1024) / (1 * 1024 * 1024)) * 1 * 1024 * 1024; + options.dm_cache_size = ((options.dm_cache_size + 1 * 1024 * 1024) / (1 * 1024 * 1024)) * 1 * 1024 * 1024; + + std::error_code ec; + std::filesystem::remove(options.dm_cache_path + "/cache", ec); + int cache_fd = open((options.dm_cache_path + "/cache").c_str(), O_WRONLY | O_CREAT | O_CLOEXEC); + + std::filesystem::remove(options.dm_cache_path + "/meta", ec); + int cache_meta_fd = open((options.dm_cache_path + "/meta").c_str(), O_WRONLY | O_CREAT | O_CLOEXEC); + + std::string cache_loop, meta_loop, db_loop; + + std::filesystem::path data_path_p(options.data_path); + + std::string cache_name = data_path_p.parent_path().filename().string() + "-" + random_uuid(); + + { + int fd = open(index_lmdb_fn.c_str(), O_RDWR | O_CREAT | O_CLOEXEC); + if (fd!=-1) + { + int64_t fsize = fileSize(fd); + if (fsize > 0 + && fsize < index_size) + { + folly::checkUnixError(ftruncate64(fd, index_size), "index_lmdb_fn ftrucate failed"); + } + } + } + + std::string db_direct_io = "--direct-io=on"; +#ifndef _WIN32 + FILE* aFile; + + aFile = setmntent("/proc/mounts", "r"); + if (aFile != nullptr) + { + struct mntent* ent; + std::string maxmount; + 
std::string fsname; + while (NULL != (ent = getmntent(aFile))) + { + if (next(db_path, 0, ent->mnt_dir) + && std::string(ent->mnt_dir).size() > maxmount.size()) + { + maxmount = ent->mnt_dir; + fsname = ent->mnt_fsname; + } + } + endmntent(aFile); + + if (next(fsname, 0, "//")) + { + XLOG(WARN) << "Detected CIFS. Not using direct-io loop for cache"; + db_direct_io = ""; + } + } +#endif + + std::string table_setup_cmd; + if (cache_fd == -1) + { + XLOG(ERR) << "Error opening cache_f at " << options.dm_cache_path << "/cache"; + } + else if (cache_meta_fd == -1) + { + XLOG(ERR) << "Error opening cache_meta_f at " << options.dm_cache_path << "/meta"; + } + else if (fallocate64(cache_fd, 0, 0, options.dm_cache_size)!=0) + { + XLOG(ERR) << "Error resizing cache_f at " << options.dm_cache_path << "/cache " << folly::errnoStr(errno); + } + else if(fallocate64(cache_meta_fd, 0, 0, cache_meta_size)!=0) + { + XLOG(ERR) << "Error resizing cache_meta_f at " << options.dm_cache_path << "/meta " << folly::errnoStr(errno); + } + else if (os_popen("losetup -f --show \"" + db_path + os_file_sep() + "index.lmdb\" " + db_direct_io, db_loop) != 0) + { + XLOG(ERR) << "Error setting up db loop"; + } + else if (os_popen("losetup -f --show \"" + options.dm_cache_path + "/cache\" --direct-io=on", cache_loop) != 0) + { + XLOG(ERR) << "Error setting up cache loop"; + } + else if (os_popen("losetup -f --show \"" + options.dm_cache_path + "/meta\" --direct-io=on", meta_loop) != 0) + { + XLOG(ERR) << "Error setting up cache meta loop"; + } + else if (!(table_setup_cmd = "dmsetup create \"" + cache_name + "\" --table '0 " + std::to_string(index_size / 512) + + " cache " + folly::trimWhitespace(meta_loop).toString() + " " + folly::trimWhitespace(cache_loop).toString() + " " + + folly::trimWhitespace(db_loop).toString() + " 2048 1 writethrough default 0'").empty() + && system(table_setup_cmd.c_str())!=0) + { + XLOG(ERR) << "Error setting up dm-cache (" << table_setup_cmd << ")"; + } + else + { + 
index_lmdb_fn = "/dev/mapper/" + cache_name; + use_dm_cache = true; + + cache_loop = getafter("/dev/", folly::trimWhitespace(cache_loop).toString()); + meta_loop = getafter("/dev/", folly::trimWhitespace(meta_loop).toString()); + + //Disable flushing + folly::writeFile(std::string("write through"), ("/sys/block/" + cache_loop + "/queue/write_cache").c_str()); + folly::writeFile(std::string("write through"), ("/sys/block/" + meta_loop + "/queue/write_cache").c_str()); + } + } + + if (next(freespace_cache_path, 0, "nosync:")) + { + freespace_cache_path.erase(0, 7); + sync_freespace_cache = false; + } + +#ifdef _WIN32 + use_direct_io = false; +#endif + + if (options.use_direct_io) + { + data_file_dio = folly::File(data_file_path, O_RDONLY | O_CLOEXEC | O_DIRECT ); + } + + if (os_get_file_type(options.data_path + os_file_sep() + "new_data")!=0) + { + new_data_file = folly::File( (options.data_path + os_file_sep() + "new_data").c_str(), O_RDWR | O_CLOEXEC); + + if (options.use_direct_io) + { + new_data_file_dio = folly::File( (options.data_path + os_file_sep() + "new_data").c_str(), O_RDWR | O_CLOEXEC | O_DIRECT); + } + } + + if ( fileSize(data_file.fd()) > 0 + && index_file_size == 0) + { + throw std::runtime_error("Could not open index file at \"" + db_path + os_file_sep() + "index.lmdb"); + } + + int rc = mdb_env_create(&db_env); + if (rc) + { + throw std::runtime_error("LMDB: Failed to create LMDB env (" + (std::string)mdb_strerror(rc) + ")"); + } + + rc = mdb_env_set_maxreaders(db_env, 4094); + + if (rc) + { + throw std::runtime_error("LMDB: Failed to set max readers (" + (std::string)mdb_strerror(rc) + ")"); + } + + rc = mdb_env_set_mapsize(db_env, mapsize*2); + + if (rc) + { + throw std::runtime_error("LMDB: Failed to set map size (" + (std::string)mdb_strerror(rc) + ")"); + } + + rc = mdb_env_set_maxdbs(db_env, 7); + + if (rc) + { + throw std::runtime_error("LMDB: Failed to set max dbs (" + (std::string)mdb_strerror(rc) + ")"); + } + + if (mapsize + 2LL * 1024 * 
1024 * 1024 > min_free_space) + { + min_free_space = mapsize + 2LL * 1024 * 1024 * 1024; + } + + unsigned int mdb_flags = MDB_NOSUBDIR; + + with_rewrite = true; + + std::error_code ec; + if ( (db_path != options.data_path + || use_dm_cache) + && !std::filesystem::exists("/var/urbackup/sfs_with_readahead", ec) ) + { + mdb_flags |= MDB_NORDAHEAD; + } + + if (db_path != options.data_path + || std::filesystem::exists("/var/urbackup/sfs_no_rewrite", ec)) + { + with_rewrite = false; + } + if (std::filesystem::exists("/var/urbackup/sfs_with_rewrite", ec)) + { + with_rewrite = true; + } + + if (std::filesystem::exists("/var/urbackup/sfs_use_lmdb_writemap", ec)) + { + mdb_flags |= MDB_WRITEMAP; + } + + rc = mdb_env_open(db_env, index_lmdb_fn.c_str(), mdb_flags, 0664); + + if (rc) + { + throw std::runtime_error("LMDB: Failed to open LMDB database file (" + (std::string)mdb_strerror(rc) + ")"); + } + + THREAD_ID tid = gettid(); + setup_mmap_read_error(tid); + + MDB_txn* txn; + rc = mdb_txn_begin(db_env, NULL, MDB_RDONLY, &txn); + if (rc) + { + throw std::runtime_error("LMDB: Failed to open transaction handle for dbi handle (" + (std::string)mdb_strerror(rc) + ")"); + } + if (has_mmap_read_error_reset(tid)) + { + throw std::runtime_error("LMDB: Failed to open transaction handle for dbi handle (SIGBUS)"); + } + + int64_t current_txn_id = mdb_get_txnid(txn); + + mdb_txn_abort(txn); + + + rc = mdb_txn_begin(db_env, NULL, 0, &txn); + + if (rc) + { + throw std::runtime_error("LMDB: Failed to open transaction handle for dbi handle (" + (std::string)mdb_strerror(rc) + ")"); + } + if (has_mmap_read_error_reset(tid)) + { + throw std::runtime_error("LMDB: Failed to open transaction handle for dbi handle (SIGBUS)"); + } + + rc = mdb_dbi_open(txn, "main", MDB_CREATE, &dbi_main); + if (rc) + { + throw std::runtime_error("LMDB: Error opening main dbi handle (" + (std::string)mdb_strerror(rc) + ")"); + } + if (has_mmap_read_error_reset(tid)) + { + throw std::runtime_error("LMDB: Error 
opening main dbi handle (SIGBUS)"); + } + + rc = mdb_dbi_open(txn, "old", MDB_CREATE, &dbi_old); + if (rc) + { + throw std::runtime_error("LMDB: Error opening old dbi handle (" + (std::string)mdb_strerror(rc) + ")"); + } + if (has_mmap_read_error_reset(tid)) + { + throw std::runtime_error("LMDB: Error opening old dbi handle (SIGBUS)"); + } + + rc = mdb_dbi_open(txn, "free", MDB_CREATE, &dbi_free); + if (rc) + { + throw std::runtime_error("LMDB: Error opening free dbi handle (" + (std::string)mdb_strerror(rc) + ")"); + } + if (has_mmap_read_error_reset(tid)) + { + throw std::runtime_error("LMDB: Error opening free dbi handle (SIGBUS)"); + } + + rc = mdb_set_compare(txn, dbi_free, mdb_cmp_varint); + if (rc) + { + throw std::runtime_error("LMDB: Error setting free comparison function (" + (std::string)mdb_strerror(rc) + ")"); + } + + rc = mdb_dbi_open(txn, "holes", MDB_CREATE, &dbi_holes); + if (rc) + { + throw std::runtime_error("LMDB: Error opening holes dbi handle (" + (std::string)mdb_strerror(rc) + ")"); + } + if (has_mmap_read_error_reset(tid)) + { + throw std::runtime_error("LMDB: Error opening holes dbi handle (SIGBUS)"); + } + + rc = mdb_set_compare(txn, dbi_holes, mdb_cmp_varint); + if (rc) + { + throw std::runtime_error("LMDB: Error setting holes comparison function (" + (std::string)mdb_strerror(rc) + ")"); + } + + rc = mdb_dbi_open(txn, "size", MDB_CREATE, &dbi_size); + if (rc) + { + throw std::runtime_error("LMDB: Error opening size dbi handle (" + (std::string)mdb_strerror(rc) + ")"); + } + if (has_mmap_read_error_reset(tid)) + { + throw std::runtime_error("LMDB: Error opening size dbi handle (SIGBUS)"); + } + + rc = mdb_dbi_open(txn, "free_len", MDB_CREATE|MDB_DUPSORT, &dbi_free_len); + if (rc) + { + throw std::runtime_error("LMDB: Error opening free_len dbi handle (" + (std::string)mdb_strerror(rc) + ")"); + } + if (has_mmap_read_error_reset(tid)) + { + throw std::runtime_error("LMDB: Error opening free_len dbi handle (SIGBUS)"); + } + + rc = 
mdb_set_compare(txn, dbi_free_len, mdb_cmp_varint_rev); + if (rc) + { + throw std::runtime_error("LMDB: Error setting free_len comparison function (" + (std::string)mdb_strerror(rc) + ")"); + } + + rc = mdb_dbi_open(txn, "queue_del", MDB_CREATE, &dbi_queue_del); + if (rc) + { + throw std::runtime_error("LMDB: Error opening queue_del dbi handle (" + (std::string)mdb_strerror(rc) + ")"); + } + if (has_mmap_read_error_reset(tid)) + { + throw std::runtime_error("LMDB: Error opening queue_del dbi handle (SIGBUS)"); + } + + MDB_val val; + char ch = dbi_size_info_size; + val.mv_data = &ch; + val.mv_size = 1; + + MDB_val size_out; + + rc = mdb_get(txn, dbi_size, &val, &size_out); + + if (rc && rc != MDB_NOTFOUND) + { + throw std::runtime_error("LMDB: Error getting data file max size (" + (std::string)mdb_strerror(rc) + ")"); + } + if (has_mmap_read_error_reset(tid)) + { + throw std::runtime_error("LMDB: Error getting data file max size (SIGBUS)"); + } + + if (rc != MDB_NOTFOUND) + { + if (size_out.mv_size == sizeof(data_file_offset) * 5) + { + memcpy(&data_file_max_size, size_out.mv_data, sizeof(data_file_max_size)); + memcpy(&data_file_offset, reinterpret_cast(size_out.mv_data) + sizeof(data_file_max_size), sizeof(data_file_offset)); + memcpy(&data_file_offset_end, reinterpret_cast(size_out.mv_data) + sizeof(data_file_offset) * 2, sizeof(data_file_offset_end)); + memcpy(&data_file_free, reinterpret_cast(size_out.mv_data) + sizeof(data_file_offset) * 3, sizeof(data_file_free)); + memcpy(&curr_transid, reinterpret_cast(size_out.mv_data) + sizeof(data_file_offset) * 4, sizeof(curr_transid)); + XLOG(INFO) << "Data file max " << std::to_string(data_file_max_size) << " offset " << std::to_string(data_file_offset) << + " end " << std::to_string(data_file_offset_end) << + " free " << std::to_string(data_file_free) << + " transid " << std::to_string(curr_transid) << " fn " << data_file_path; + } + else + { + throw std::runtime_error("Size data has wrong size"); + } + + if 
(data_file_offset_end < 0) + { + data_file_max_size = 0; + data_file_offset_end = -1; + data_file_free = 0; + } + } + else if (std::filesystem::exists(options.data_path + os_file_sep() + "active", ec)) + { + throw std::runtime_error("Could not read data file information from index file \"" + +index_lmdb_fn + "\" " + "(Index file size "+folly::prettyPrint(index_file_size,folly::PRETTY_BYTES_IEC)+")"); + } + else + { + XLOG(INFO) << "New data file transid " << std::to_string(curr_transid) << " curr size "+folly::prettyPrint(index_file_size, folly::PRETTY_BYTES_IEC) << " fn " << data_file_path; + } + + ch = dbi_size_info_next_disk_id; + val.mv_data = &ch; + val.mv_size = 1; + + MDB_val next_disk_id_out; + + rc = mdb_get(txn, dbi_size, &val, &next_disk_id_out); + + if (rc && rc != MDB_NOTFOUND) + { + throw std::runtime_error("LMDB: Error getting next disk id (" + (std::string)mdb_strerror(rc) + ")"); + } + if (has_mmap_read_error_reset(tid)) + { + throw std::runtime_error("LMDB: Error getting next disk id (SIGBUS)"); + } + + if (rc != MDB_NOTFOUND) + { + if (next_disk_id_out.mv_size == sizeof(next_disk_id)) + { + memcpy(&next_disk_id, next_disk_id_out.mv_data, sizeof(next_disk_id)); + XLOG(INFO) << "Next disk id " << std::to_string(next_disk_id) << " fn " << data_file_path; + } + } + + ch = dbi_size_info_migration; + val.mv_data = &ch; + val.mv_size = 1; + + MDB_val copy_info_out; + + rc = mdb_get(txn, dbi_size, &val, ©_info_out); + + if (rc && rc != MDB_NOTFOUND) + { + throw std::runtime_error("LMDB: Error getting datafile migration info (" + (std::string)mdb_strerror(rc) + ")"); + } + if (has_mmap_read_error_reset(tid)) + { + throw std::runtime_error("LMDB: Error getting datafile migration info (SIGBUS)"); + } + + if (rc != MDB_NOTFOUND) + { + if (copy_info_out.mv_size == sizeof(data_file_copy_done_sync)) + { + memcpy(&data_file_copy_done, copy_info_out.mv_data, sizeof(data_file_copy_done)); + XLOG(INFO) << "data_file_copy_done=" << 
folly::prettyPrint(data_file_copy_done_sync, folly::PRETTY_BYTES_IEC); + data_file_copy_max = data_file_copy_done_sync; + data_file_copy_done = data_file_copy_done_sync; + + if (data_file_copy_done_sync > 0) + { + new_data_file = folly::File(data_file_path.parent_path() / "new_data", O_RDWR | O_CLOEXEC); + new_data_file_dio = folly::File(data_file_path.parent_path() / "new_data", O_RDWR | O_CLOEXEC | O_DIRECT); + } + } + } + + if (options.data_path != db_path) + { + XLOG(INFO) << "Data file metadata at " << index_lmdb_fn; + } + + ch = dbi_size_info_ext_freespace; + val.mv_data = &ch; + val.mv_size = 1; + + MDB_val ext_freespace_info; + + rc = mdb_get(txn, dbi_size, &val, &ext_freespace_info); + + if (rc && rc != MDB_NOTFOUND) + { + throw std::runtime_error("LMDB: Error getting datafile ext freespace info (" + (std::string)mdb_strerror(rc) + ")"); + } + if (has_mmap_read_error_reset(tid)) + { + throw std::runtime_error("LMDB: Error getting datafile ext freespace info (SIGBUS)"); + } + + if (rc != MDB_NOTFOUND) + { + int64_t ext_freespace_info_val; + if (ext_freespace_info.mv_size == sizeof(ext_freespace_info_val)) + { + memcpy(&ext_freespace_info_val, ext_freespace_info.mv_data, sizeof(ext_freespace_info_val)); + if (ext_freespace_info_val & 1) + { + XLOG(INFO) << "Ext freespace sync"; + } + else + { + XLOG(INFO) << "Ext freespace nosync"; + regen_freespace_cache = true; + } + + if (freespace_cache_path.empty() + || freespace_cache_path == db_path) + { + regen_freespace_cache = true; + } + } + } + + MDB_txn* freespace_txn = nullptr; + + if (!freespace_cache_path.empty() + && freespace_cache_path != db_path) + { + if (!regen_freespace_cache) + { + if (!open_cache_db(current_txn_id, mapsize, false, false, freespace_txn) + && !regen_freespace_cache) + { + if (!open_cache_db(current_txn_id, mapsize, true, false, freespace_txn)) + { + regen_freespace_cache = true; + } + } + } + + if (regen_freespace_cache) + { + bool b = open_cache_db(current_txn_id, mapsize, false, 
true, freespace_txn); + if (!b) + { + abort(); + } + } + } + + if (data_file_free == 0 && freespace_txn!=nullptr && !regen_freespace_cache) + { + regen_datafile_free(freespace_txn); + } + + XLOG(INFO) << "Free space in data file: " << folly::prettyPrint(data_file_free, folly::PRETTY_BYTES_IEC); + + if(freespace_txn!=nullptr && !regen_freespace_cache) + { + regen_free_len_idx(freespace_txn); + } + + if(freespace_txn!=txn && freespace_txn!=nullptr) + { + rc = mdb_txn_commit(freespace_txn); + if(rc) + { + throw std::runtime_error("LMDB: Error committing open_cache_db txn (" + (std::string)mdb_strerror(rc) + ")"); + } + if (has_mmap_read_error_reset(tid)) + { + throw std::runtime_error("LMDB: Error committing open_cache_db txn (SIGBUS)"); + } + } + + rc = mdb_txn_commit(txn); + if (rc) + { + throw std::runtime_error("LMDB: Error commiting txn for dbi handle (" + (std::string)mdb_strerror(rc) + ")"); + } + if (has_mmap_read_error_reset(tid)) + { + throw std::runtime_error("LMDB: Error commiting txn for dbi handle (SIGBUS)"); + } + + if (data_file_max_size < fileSize(data_file.fd())) + { + XLOG(INFO) << "Trimming " << folly::prettyPrint(fileSize(data_file.fd()) - data_file_max_size, folly::PRETTY_BYTES_IEC) << " from data file during recovery"; + folly::checkUnixError(ftruncate64(data_file.fd(), data_file_max_size), "Recovery truncate failed"); + } + + if (data_file_offset_end < 0) + { + data_file_offset_end = data_file_offset + alloc_chunk_size; + } + + { + std::scoped_lock lock(mmap_read_error_mutex); + mmap_dbs.push_back(db_env); + + if (cache_db_env != nullptr) + mmap_dbs.push_back(cache_db_env); + } + + if (std::filesystem::exists(data_file_path.parent_path() / "new_data")) + { + migrate_thread_h = std::thread([this]() { + auto cb = new SingleFileStorageMigrate(this); + (*cb)(); + }); + } +} + +SingleFileStorage::SingleFileStorage() + : data_file_max_size(0), data_file_offset(0), data_file_offset_end(-1), data_file_free(0), + do_quit(false), min_free_space(20LL * 
1024 * 1024 * 1024), is_defragging(false), defrag_restart(0), db_path(std::string()), + is_dead(true), write_offline(true), curr_transid(0), startup_finished(false), + force_freespace_check(true), stop_defrag(false), allow_defrag(false), next_disk_id(1), data_file_copy_done(-1), data_file_copy_max(0), data_file_copy_done_sync(0), + stop_data_file_copy(false), references(0), + db_env(nullptr), cache_db_env(nullptr), regen_freespace_cache(false), sync_freespace_cache(true) +{ + +} + +SingleFileStorage::~SingleFileStorage() +{ + if(commit_thread_h.joinable()) + { + { + std::scoped_lock lock(mutex); + do_quit = true; + cond.notify_all(); + } + + commit_thread_h.join(); + } + + if (migrate_thread_h.joinable()) + { + { + std::scoped_lock copy_lock(data_file_copy_mutex); + stop_data_file_copy = true; + } + + migrate_thread_h.join(); + } + + { + std::scoped_lock lock(mmap_read_error_mutex); + auto it = std::find(mmap_dbs.begin(), mmap_dbs.end(), db_env); + if (it != mmap_dbs.end()) + mmap_dbs.erase(it); + } + + for (uintptr_t addr : mmap_cleanup_addrs) + { + int page_size = getpagesize(); + if (page_size <= 0) + page_size = 4096; + +#ifndef _WIN32 + if (munmap(reinterpret_cast(addr), page_size) != 0) + { + XLOG(ERR) << "Munmap failed. 
addr=" + << std::to_string(static_cast(addr)) + << " page_size=" << std::to_string(page_size) + << " " << folly::errnoStr(errno); + abort(); + } +#endif + } + + /*if (is_dead) + mdb_env_unmap(db_env);*/ + + mdb_env_close(db_env); +} + +void SingleFileStorage::init_mutex() +{ +#ifndef _WIN32 + struct sigaction act; + act.sa_sigaction = mmap_read_error; + sigemptyset(&act.sa_mask); + act.sa_flags = SA_SIGINFO| SA_RESTART; + sigaction(SIGBUS, &act, NULL); +#endif +} + +void SingleFileStorage::handle_mmap_read_error(void* addr) +{ + std::scoped_lock lock(mmap_read_error_mutex); + auto it = mmap_read_error_jmp.find(gettid()); + if (it != mmap_read_error_jmp.end()) + { + int page_size = getpagesize(); + if (page_size <= 0) + page_size = 4096; + + uintptr_t addr_page = reinterpret_cast(addr); + if (addr_page % page_size != 0) + { + addr_page = (addr_page / page_size)*page_size; + } + + bool found = false; + for (MDB_env* env : mmap_dbs) + { + char* map_ptr; + size_t map_size; + mdb_get_map(env, &map_ptr, &map_size); + + uintptr_t map_ptr_addr = reinterpret_cast(map_ptr); + + if (addr_page >= map_ptr_addr + && addr_page < map_ptr_addr + map_size + && addr_page+page_size >= map_ptr_addr + && addr_page+page_size <= map_ptr_addr+map_size) + { + found = true; + const char* env_path; + mdb_env_get_path(env, &env_path); + XLOG(WARN) << std::string("SIGBUS occured in ") << env_path; + break; + } + } + + if (!found) + { + XLOG(ERR) << "Did not find addr=" << std::to_string(reinterpret_cast(addr)) + << " page=" << std::to_string(static_cast(addr_page)) << " in any LMDB mapping"; + for (MDB_env* env : mmap_dbs) + { + char* map_ptr; + size_t map_size; + mdb_get_map(env, &map_ptr, &map_size); + const char* env_path; + mdb_env_get_path(env, &env_path); + XLOG(ERR) << "Mapping: " << std::to_string((int64_t)map_ptr) << " size " << std::to_string(map_size) << " (" << env_path << ")"; + } + abort(); + } + +#ifndef _WIN32 + if (mmap(reinterpret_cast(addr_page), page_size, 
PROT_READ|PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + -1, 0) == MAP_FAILED) +#endif + { + XLOG(ERR) << "Failed mmap in SIGBUS signal handler. addr=" + << std::to_string(reinterpret_cast(addr)) + +" addr_page="+std::to_string(static_cast(addr_page)) + + " "+folly::errnoStr(errno); + abort(); + } +#ifndef _WIN32 + else +#endif + { + XLOG(WARN) << "SIGBUS at addr=" << std::to_string(reinterpret_cast(addr)) + + ". Mapping " + std::to_string(page_size) + " zeroes at addr " + std::to_string(static_cast(addr_page)); + } + it->second.first = true; + it->second.second.push_back(addr_page); + } + else + { + XLOG(ERR) << "Not setup to catch error in SIGBUS signal handler"; + abort(); + } +} + +int SingleFileStorage::write(const std::string & fn, const char* data, + size_t data_size, int64_t last_modified, const std::string & md5sum, + bool no_del_old, bool is_fragment, size_t max_data_fragments) +{ + if (is_dead) + { + return ENODEV; + } + + if (fn.size() > 255) + return EINVAL; + + return write_int(fn, data, data_size, last_modified, md5sum, true, no_del_old, max_data_fragments); +} + +int64_t SingleFileStorage::get_transid(int64_t disk_id) +{ + std::scoped_lock lock(mutex); + + THREAD_ID tid = gettid(); + + setup_mmap_read_error(tid); + + MDB_txn* txn; + int rc = mdb_txn_begin(db_env, NULL, MDB_RDONLY, &txn); + if (rc) + { + XLOG(ERR) << "LMDB: Error starting transaction for get_transid (" << mdb_strerror(rc) << ") txn=" << std::to_string(reinterpret_cast(txn)) << " sfs " << db_path; + clear_mmap_read_error(tid); + return -1; + } + + int64_t ret = get_disk_trans_id(txn, tid, disk_id); + + mdb_txn_abort(txn); + + return ret; +} + +std::string SingleFileStorage::meminfo() +{ + std::string ret = "##SingleFileStorage:\n"; + { + std::scoped_lock lock(mutex); + ret += " defrag_skip_items: " + std::to_string(defrag_skip_items.size()) + " * " + folly::prettyPrint(sizeof(std::string), folly::PRETTY_BYTES_IEC) + "\n"; + ret += " commit_queue: " + 
std::to_string(commit_queue.size()) + " * " + folly::prettyPrint(sizeof(SFragInfo), folly::PRETTY_BYTES_IEC) + "\n"; + ret += " commit_background_queue: " + std::to_string(commit_background_queue.size()) + " * " + folly::prettyPrint(sizeof(SFragInfo), folly::PRETTY_BYTES_IEC) + "\n"; + ret += " defrag_items: " + std::to_string(defrag_items.size()) + " * " + folly::prettyPrint(sizeof(std::string), folly::PRETTY_BYTES_IEC) + "\n"; + } + { + std::scoped_lock lock(mmap_read_error_mutex); + ret += " mmap_read_error_jmp: " + std::to_string(mmap_read_error_jmp.size()) + " * " + folly::prettyPrint(sizeof(THREAD_ID)+sizeof(bool), folly::PRETTY_BYTES_IEC) + "\n"; + } + return ret; +} + +bool SingleFileStorage::set_write_offline(bool b) +{ + XLOG(ERR) << "SFS " << db_path << " set write offline b=" << std::to_string(b); + + if (!b && write_offline) + { + write_offline = b; + do_stop_on_error(); + + std::unique_lock lock(mutex); + + SCommitInfo commit_info; + SFragInfo frag_info; + frag_info.md5sum = "reset"; + frag_info.action = FragAction::Commit; + frag_info.commit_info = &commit_info; + + if (is_dead) + return false; + + commit_queue.push_back(frag_info); + + cond.notify_all(); + commit_info.commit_done.wait(lock); + + return commit_info.commit_errors == 0; + } + + write_offline = b; + + do_stop_on_error(); + return true; +} + +bool SingleFileStorage::set_allow_defrag(bool b, int64_t disk_id) +{ + std::scoped_lock lock(mutex); + + if (!b) + { + disallow_defrag_disk_id.insert(disk_id); + } + else + { + disallow_defrag_disk_id.erase(disk_id); + } + + allow_defrag = disallow_defrag_disk_id.empty(); + + return true; +} + +bool SingleFileStorage::reset_del_log(int64_t disk_id, int64_t reset_transid) +{ + if (is_dead) + { + return false; + } + + std::unique_lock lock(mutex); + + SCommitInfo commit_info; + SFragInfo frag_info; + frag_info.offset = reset_transid; + frag_info.last_modified = disk_id; + frag_info.action = FragAction::ResetDelLog; + frag_info.commit_info = 
&commit_info; + + commit_queue.push_back(frag_info); + + cond.notify_all(); + commit_info.commit_done.wait(lock); + + return commit_info.commit_errors == 0; +} + +bool SingleFileStorage::reset_del_queue(int64_t disk_id, int64_t reset_transid) +{ + if (is_dead) + { + return false; + } + + std::unique_lock lock(mutex); + + SCommitInfo commit_info; + SFragInfo frag_info; + frag_info.offset = reset_transid; + frag_info.last_modified = disk_id; + frag_info.action = FragAction::ResetDelQueue; + frag_info.commit_info = &commit_info; + + commit_queue.push_back(frag_info); + + cond.notify_all(); + commit_info.commit_done.wait(lock); + + return commit_info.commit_errors == 0; +} + +int64_t SingleFileStorage::get_disk_id(const std::string & uuid) +{ + if (is_dead) + { + return false; + } + + std::unique_lock lock(mutex); + + SCommitInfo commit_info; + SFragInfo frag_info; + frag_info.fn = uuid; + frag_info.action = FragAction::GetDiskId; + frag_info.commit_info = &commit_info; + + commit_queue.push_back(frag_info); + + cond.notify_all(); + commit_info.commit_done.wait(lock); + + if (commit_info.commit_errors != 0) + return 0; + + return commit_info.new_datafile_offset; +} + +void SingleFileStorage::migrate_thread() +{ + std::unique_lock lock(data_file_copy_mutex); + + auto new_data_file_path = data_file_path.parent_path() / "new_data"; + + if (new_data_file.fd() == -1) + { + new_data_file = folly::File(new_data_file_path, O_RDWR | O_CLOEXEC); + + if (data_file_dio.fd() == -1) + { + new_data_file_dio = folly::File(new_data_file_path, O_RDWR | O_CLOEXEC | O_DIRECT); + } + } + + auto status_fn = data_file_path.parent_path() / "new_data.status"; + + std::vector buf; + buf.resize(512 * 1024); + int64_t pos = (std::max)((int64_t)0, data_file_copy_done); + int64_t data_file_size; + while (pos < (data_file_size=fileSize(data_file.fd())) + && !stop_data_file_copy) + { + data_file_copy_max = (std::min)(pos + data_file_copy_num_bytes, data_file_size); + + lock.unlock(); + + while (pos < 
data_file_copy_max + && !stop_data_file_copy) + { + int64_t tocopy = (std::min)(static_cast(buf.size()), data_file_copy_max - pos); + + unsigned int read; + if ( (read=folly::preadNoInt(data_file.fd(), buf.data(), static_cast(tocopy), pos)) != tocopy) + { + if (errno == EIO) + { + XLOG(ERR) << "Error reading " << std::to_string(tocopy) << " bytes at pos " << std::to_string(pos) << " from " << data_file_path << " for migration. " << folly::errnoStr(errno) << ". Ignoring read error, writing zeroes..."; + memset(buf.data() + read, 0, tocopy - read); + } + else + { + + XLOG(ERR) << "Error reading " << std::to_string(tocopy) << " bytes at pos " << std::to_string(pos) << " from " << data_file_path << " for migration. " << folly::errnoStr(errno); + folly::writeFileAtomic(status_fn.string(), "{\"status\": \"error\"}"); + return; + } + } + + if (folly::pwriteNoInt(new_data_file.fd(), buf.data(), static_cast(tocopy), pos)!=tocopy) + { + XLOG(ERR) << "Error writing " << std::to_string(tocopy) << " bytes at pos " << std::to_string(pos) << " to " << new_data_file_path << " for migration. 
" << folly::errnoStr(errno); + folly::writeFileAtomic(status_fn.string(), "{\"status\": \"error\"}"); + return; + } + + pos += tocopy; + } + + if (folly::fsyncNoInt(new_data_file.fd())!=0) + { + XLOG(ERR) << "Error syncing " << new_data_file_path << " (pos "<((pos * 100) / data_file_size); + folly::writeFileAtomic(status_fn.string(), "{\"status\": \"running\", \"complete\": "+std::to_string(complete_pc)+"}"); + + lock.lock(); + data_file_copy_done = pos; + } + + if (pos == fileSize(data_file.fd())) + { + data_file_copy_done = LLONG_MAX; + data_file_copy_max = LLONG_MAX; + + folly::writeFileAtomic(status_fn.string(), "{\"status\": \"done\"}"); + } +} + +bool SingleFileStorage::start_migrate() +{ + if (is_dead) + { + return false; + } + + std::scoped_lock copy_lock(data_file_copy_mutex); + + if (migrate_thread_h.joinable()) + return false; + + if (!std::filesystem::exists(data_file_path.parent_path() / "new_data")) + return false; + + auto status_fn = data_file_path.parent_path()/ "new_data.status"; + std::filesystem::remove(status_fn); + folly::writeFileAtomic(status_fn.string(), "{\"status\": \"init\"}"); + + migrate_thread_h = std::thread([this](){ + auto cb = new SingleFileStorageMigrate(this); + (*cb)(); + }); + + return true; +} + +void SingleFileStorage::reference() +{ + ++references; +} + +void SingleFileStorage::unreference() +{ + --references; +} + +SingleFileStorage::WritePrepareResult SingleFileStorage::write_prepare(const std::string& fn, size_t data_size, size_t max_data_fragments) +{ + if (is_dead) + { + return WritePrepareResult{ENODEV}; + } + + if (fn.size() > 255) + return WritePrepareResult{EINVAL}; + + assert(data_size > 0); + std::string cfn = compress_filename(fn); + + if (max_data_fragments > max_extent_num) + max_data_fragments = max_extent_num; + + std::vector extents; + size_t data_size_remaining = data_size; + { + std::lock_guard lock(datafileoffset_mutex); + + while (data_size_remaining > 0) + { + if (extents.size() >= max_data_fragments) 
+ { + XLOG(INFO) << "Too many extents (" << std::to_string(extents.size()) << " while writing " << fn << " size " << std::to_string(data_size) << ")"; + free_extents(extents); + return WritePrepareResult{ENOSPC}; + } + Ext curr_ext; + + if (data_file_offset_end > 0 + && data_file_offset >= data_file_offset_end) + { + XLOG(DBG) << "Current data offset " << std::to_string(data_file_offset) << " out of extent end (" << std::to_string(data_file_offset_end) << "). Searching for new free extent..."; + + std::unique_lock lock(mutex); + SCommitInfo commit_info; + SFragInfo frag_info; + frag_info.action = FragAction::FindFree; + frag_info.commit_info = &commit_info; + frag_info.offset = data_file_offset_end; + + if (is_dead) + return WritePrepareResult{EIO}; + + commit_queue.push_back(frag_info); + + cond.notify_all(); + commit_info.commit_done.wait(lock); + + if (commit_info.commit_errors != 0) + { + lock.unlock(); + free_extents(extents); + if (commit_info.commit_errors == LLONG_MAX) + { + return WritePrepareResult{ENOSPC}; + } + return WritePrepareResult{EIO}; + } + + data_file_offset = commit_info.new_datafile_offset; + data_file_offset_end = commit_info.new_datafile_offset_end; + + XLOG(INFO) << "Found new free extent (" << std::to_string(data_file_offset) << ", " << std::to_string(data_file_offset_end) << "). 
Size " << folly::prettyPrint(data_file_offset_end - data_file_offset, folly::PRETTY_BYTES_IEC); + } + + if (data_file_offset_end > data_file_offset) + { + int64_t remaining = data_file_offset_end - data_file_offset; + assert(remaining >= block_size); + curr_ext.data_file_offset = data_file_offset; + curr_ext.len = (std::min)(static_cast(data_size_remaining), remaining); + } + else + { + curr_ext.data_file_offset = data_file_offset; + curr_ext.len = data_size_remaining; + } + + data_file_offset += div_up(curr_ext.len, block_size)*block_size; + curr_ext.obj_offset = data_size - data_size_remaining; + data_size_remaining -= curr_ext.len; + + reserved_extents[curr_ext.data_file_offset] = curr_ext.len; + extents.push_back(curr_ext); + } + } + + return WritePrepareResult{0, extents}; +} + +int SingleFileStorage::write_ext(const Ext& ext, const void* data, size_t data_size) +{ + std::shared_lock copy_lock(data_file_copy_mutex); + + folly::File* sel_data_file = &data_file; + + while (data_file_copy_done!=-1 + && (ext.data_file_offset >= data_file_copy_done + && ext.data_file_offset <= data_file_copy_max) + || (ext.data_file_offset+ext.len >= data_file_copy_done + && ext.data_file_offset+ext.len <= data_file_copy_max) ) + { + copy_lock.unlock(); + std::this_thread::sleep_for(1s); + copy_lock.lock(); + } + + if (ext.data_file_offset + ext.len >= data_file_copy_done_sync + && ext.data_file_offset + ext.len <= data_file_copy_done) + { + if (folly::pwriteNoInt(sel_data_file->fd(), data, data_size, ext.data_file_offset) != data_size) + { + std::string fn = + sel_data_file == &data_file ? data_file_path : (data_file_path.parent_path() / "new_data"); + + XLOG(ERR) << "Error writing to data file " << fn << ". 
" << folly::errnoStr(errno); + + if (errno > 0) + return errno; + else + return EIO; + } + } + + if (ext.data_file_offset+ext.len <= data_file_copy_done) + { + sel_data_file = &new_data_file; + } + + EXT_DEBUG(XLOG(INFO) << "Writing " << fn << " to offset " << std::to_string(ext.offset) << " len " << std::to_string(ext.len) ) + + if (folly::pwriteNoInt(sel_data_file->fd(), data, data_size, ext.data_file_offset) != data_size ) + { + std::string fn = + sel_data_file == &data_file ? data_file_path : (data_file_path.parent_path() / "new_data"); + + XLOG(ERR) << "Error writing to data file " << fn + ". " + folly::errnoStr(errno); + + if (errno > 0) + return errno; + else + return EIO; + } + + return 0; +} + +int SingleFileStorage::write_finalize(const std::string& fn, const std::vector& extents, int64_t last_modified, const std::string& md5sum, + bool no_del_old, bool is_fragment) +{ + std::unique_lock lock(mutex); + wait_queue(lock, false, true); + wait_defrag(fn, lock); + + SFragInfo curr_frag(extents[0].data_file_offset, extents[0].len); + for (size_t i = 1; i < extents.size(); ++i) + { + curr_frag.extra_exts.push_back(SPunchItem(extents[i].data_file_offset, extents[i].len)); + } + curr_frag.action = no_del_old ? 
FragAction::AddNoDelOld : FragAction::Add; + curr_frag.fn = fn; + curr_frag.last_modified = last_modified; + curr_frag.md5sum = md5sum; + + ++commit_items[std::hash()(fn)]; + + commit_queue.push_back(curr_frag); + + if (is_defragging) + { + defrag_skip_items.insert(fn); + } + cond.notify_all(); + + return 0; +} + +int SingleFileStorage::write_int(const std::string & fn, const char* data, + size_t data_size, int64_t last_modified, const std::string & md5sum, bool allow_defrag_lock, + bool no_del_old, size_t max_data_fragments) +{ + assert(data_size > 0); + std::string cfn = compress_filename(fn); + + if (max_data_fragments > max_extent_num) + max_data_fragments = max_extent_num; + + std::vector extents; + size_t data_size_remaining = data_size; + { + if (!allow_defrag_lock) + { + if (!datafileoffset_mutex.try_lock()) + { + return EDEADLK; + } + } + else + { + datafileoffset_mutex.lock(); + } + + SCOPE_EXIT { datafileoffset_mutex.unlock(); }; + + while (data_size_remaining > 0) + { + if (extents.size() >= max_data_fragments) + { + XLOG(INFO) << "Too many extents (" << std::to_string(extents.size()) << " while writing " << fn << " size " << std::to_string(data_size) << ")"; + free_extents(extents); + return ENOSPC; + } + Ext curr_ext; + + if (data_file_offset_end > 0 + && data_file_offset >= data_file_offset_end) + { + XLOG(DBG) << "Current data offset " << std::to_string(data_file_offset) << " out of extent end (" << std::to_string(data_file_offset_end) << "). 
Searching for new free extent..."; + + std::unique_lock lock(mutex); + SCommitInfo commit_info; + SFragInfo frag_info; + frag_info.action = FragAction::FindFree; + frag_info.commit_info = &commit_info; + frag_info.offset = data_file_offset_end; + if (!allow_defrag_lock) + { + //write in defrag + frag_info.len = 1; + } + + if (is_dead) + return EIO; + + commit_queue.push_back(frag_info); + + cond.notify_all(); + commit_info.commit_done.wait(lock); + + if (commit_info.commit_errors != 0) + { + lock.unlock(); + free_extents(extents); + if (commit_info.commit_errors == LLONG_MAX) + { + return ENOSPC; + } + return EIO; + } + + data_file_offset = commit_info.new_datafile_offset; + data_file_offset_end = commit_info.new_datafile_offset_end; + + XLOG(INFO) << "Found new free extent (" << std::to_string(data_file_offset) << ", " << std::to_string(data_file_offset_end) << "). Size " << folly::prettyPrint(data_file_offset_end - data_file_offset, folly::PRETTY_BYTES_IEC); + } + + if (data_file_offset_end > data_file_offset) + { + int64_t remaining = data_file_offset_end - data_file_offset; + assert(remaining >= block_size); + curr_ext.data_file_offset = data_file_offset; + curr_ext.len = (std::min)(static_cast(data_size_remaining), remaining); + } + else + { + curr_ext.data_file_offset = data_file_offset; + curr_ext.len = data_size_remaining; + } + + data_file_offset += div_up(curr_ext.len, block_size)*block_size; + curr_ext.obj_offset = data_size - data_size_remaining; + data_size_remaining -= curr_ext.len; + + extents.push_back(curr_ext); + } + } + + assert(!extents.empty()); + + { + std::shared_lock copy_lock(data_file_copy_mutex); + + size_t data_offset = 0; + for (Ext& ext : extents) + { + folly::File* sel_data_file = &data_file; + + while (data_file_copy_done!=-1 + && (ext.data_file_offset >= data_file_copy_done + && ext.data_file_offset <= data_file_copy_max) + || (ext.data_file_offset+ext.len >= data_file_copy_done + && ext.data_file_offset+ext.len <= 
data_file_copy_max) ) + { + copy_lock.unlock(); + std::this_thread::sleep_for(1s); + copy_lock.lock(); + } + + if (ext.data_file_offset + ext.len >= data_file_copy_done_sync + && ext.data_file_offset + ext.len <= data_file_copy_done) + { + if (folly::pwriteNoInt(sel_data_file->fd(), data + data_offset, static_cast(ext.len), ext.data_file_offset) != static_cast(ext.len)) + { + std::string fn = + sel_data_file == &data_file ? data_file_path : (data_file_path.parent_path() / "new_data"); + + XLOG(ERR) << "Error writing to data file " << fn << ". " << folly::errnoStr(errno); + free_extents(extents); + if (errno > 0) + return errno; + else + return EIO; + } + } + + if (ext.data_file_offset+ext.len <= data_file_copy_done) + { + sel_data_file = &new_data_file; + } + + EXT_DEBUG(XLOG(INFO) << "Writing " << fn << " to offset " << std::to_string(ext.offset) << " len " << std::to_string(ext.len) ) + + if (folly::pwriteNoInt(sel_data_file->fd(), data + data_offset, static_cast(ext.len), ext.data_file_offset) != static_cast(ext.len)) + { + std::string fn = + sel_data_file == &data_file ? data_file_path : (data_file_path.parent_path() / "new_data"); + + XLOG(ERR) << "Error writing to data file " << fn + ". " + folly::errnoStr(errno); + free_extents(extents); + if (errno > 0) + return errno; + else + return EIO; + } + data_offset += ext.len; + } + assert(data_offset == data_size); + + } + + if (extents.size() > 1) + { + XLOG(INFO) << "Item " << fn << " has " << std::to_string(extents.size()) << " extents"; + } + + std::unique_lock lock(mutex); + wait_queue(lock, false, !allow_defrag_lock); + if (allow_defrag_lock) + { + wait_defrag(cfn, lock); + } + + SFragInfo curr_frag(extents[0].data_file_offset, extents[0].len); + for (size_t i = 1; i < extents.size(); ++i) + { + curr_frag.extra_exts.push_back(SPunchItem(extents[i].data_file_offset, extents[i].len)); + } + curr_frag.action = no_del_old ? 
FragAction::AddNoDelOld : FragAction::Add; + curr_frag.fn = cfn; + curr_frag.last_modified = last_modified; + curr_frag.md5sum = md5sum; + + ++commit_items[std::hash()(cfn)]; + + commit_queue.push_back(curr_frag); + + if (is_defragging) + { + defrag_skip_items.insert(cfn); + } + cond.notify_all(); + + return 0; +} + +void SingleFileStorage::add_reading_item(const SingleFileStorage::SFragInfo& fi) +{ + auto& ri = reading_items[fi.offset]; + ++ri.refs; +} + +void SingleFileStorage::remove_reading_item(const std::vector& extents) +{ + assert(!extents.empty()); + auto ri = reading_items.find(extents.begin()->data_file_offset); + assert(ri!=reading_items.end()); + --ri->second.refs; + assert(ri->second.refs>=0); + if(ri->second.refs<=0) + { + if(ri->second.free_skip) + { + for(const auto& ext : extents) + { + auto it = reading_free_skip_extents.find(ext.data_file_offset); + if(it!=reading_free_skip_extents.end()) + reading_free_skip_extents.erase(it); + } + } + + reading_items.erase(ri); + } +} + +SingleFileStorage::ReadPrepareResult SingleFileStorage::read_prepare(const std::string& fn, unsigned int flags) +{ + if (is_dead) + { + return ReadPrepareResult{ENOTRECOVERABLE}; + } + + if ((flags & ReadUnsynced) == 0) + { + std::unique_lock lock(mutex); + + auto it = commit_items.find(std::hash()(fn)); + if(it!=commit_items.end()) + { + flags |= ReadUnsynced; + } + } + + SFragInfo frag_info; + if ((flags & ReadUnsynced) == 0) + { + frag_info = get_frag_info(nullptr, fn); + + if (frag_info.offset == -1) + { + XLOG(INFO) << "Could not find metadata for fragment " << fn << " in LMDB sfs " << db_path; + return ReadPrepareResult{ENOENT}; + } + + std::lock_guard lock(mutex); + add_reading_item(frag_info); + } + else + { + SCommitInfo commit_info; + commit_info.frag_info = &frag_info; + SFragInfo curr_frag; + curr_frag.action =FragAction::ReadFragInfo; + curr_frag.fn = fn; + curr_frag.commit_info = &commit_info; + + std::unique_lock lock(mutex); + wait_startup_finished(lock); + + 
commit_queue.push_back(curr_frag); + cond.notify_all(); + commit_info.commit_done.wait(lock); + + if (frag_info.offset == -1) + { + XLOG(INFO) << "Could not find metadata for fragment " << fn << " in LMDB (read unsynced) sfs " << db_path; + return ReadPrepareResult{ENOENT}; + } + + add_reading_item(frag_info); + } + + ReadPrepareResult res = {0}; + res.extents.reserve(1 + frag_info.extra_exts.size()); + res.extents.push_back(Ext(0, frag_info.offset, frag_info.len)); + res.total_len += frag_info.len; + for (const SPunchItem& ext : frag_info.extra_exts) + { + res.extents.push_back(Ext(res.total_len, ext.offset, ext.len)); + res.total_len += ext.len; + } + + return res; +} + + +SingleFileStorage::ReadExtResult SingleFileStorage::read_ext(const Ext& ext, const unsigned int flags, const size_t bufsize, folly::IOBufQueue& buf) +{ + if (is_dead) + { + return ReadExtResult{ENOTRECOVERABLE}; + } + + std::shared_lock copy_lock(data_file_copy_mutex); + + folly::File* sel_data_file = &data_file; + folly::File* sel_data_file_dio = &data_file_dio; + + size_t toread = std::min(static_cast(ext.len), bufsize); + + if (ext.data_file_offset+static_cast(toread) <= data_file_copy_done) + { + sel_data_file = &new_data_file; + sel_data_file_dio = &new_data_file_dio; + } + + auto data = buf.preallocate(bufsize, bufsize); + auto bufptr = reinterpret_cast(data.first); + + ssize_t read; + bool dio_read = (flags & ReadWithReadahead) == 0 && (flags & ReadUnsynced) == 0 && sel_data_file_dio->fd() != -1; + if (dio_read) + { + read = folly::preadNoInt(sel_data_file_dio->fd(), bufptr, toread, ext.data_file_offset); + } + else + { + read = folly::preadNoInt(sel_data_file->fd(), bufptr, toread, ext.data_file_offset); + } + + if (readfd(), bufptr, toread, ext.data_file_offset); + } + if (read < ssize_t(toread)) + { + return ReadExtResult{errno}; + } + else + { + XLOG(WARN) << "Read succeeded with non-dio"; + } + } + + buf.postallocate(read); + + return ReadExtResult{0, buf.move()}; +} + +int 
SingleFileStorage::read_finalize(const std::string& fn, const std::vector& extents, unsigned int flags) +{ + if(!extents.empty()) + { + std::lock_guard lock(mutex); + remove_reading_item(extents); + } + return 0; +} + +bool SingleFileStorage::del(const std::string & fn, DelAction da, + bool background_queue) +{ + if (is_dead) + { + return false; + } + + std::string cfn = compress_filename(fn); + + SFragInfo curr_frag; + switch (da) + { + case DelAction::Del: + curr_frag.action = FragAction::Del; + break; + case DelAction::DelOld: + curr_frag.action = FragAction::DelOld; + break; + case DelAction::DelWithQueued: + curr_frag.action = FragAction::DelWithQueued; + break; + case DelAction::Queue: + curr_frag.action = FragAction::QueueDel; + break; + case DelAction::Unqueue: + curr_frag.action = FragAction::UnqueueDel; + break; + case DelAction::AssertQueueEmpty: + curr_frag.action = FragAction::AssertDelQueueEmpty; + break; + } + curr_frag.fn = cfn; + + std::unique_lock lock(mutex); + wait_queue(lock, background_queue, false); + wait_defrag(cfn, lock); + ++commit_items[std::hash()(cfn)]; + + if (is_defragging) + { + defrag_skip_items.insert(cfn); + } + if (background_queue) + { + commit_background_queue.push_back(curr_frag); + } + else + { + commit_queue.push_back(curr_frag); + } + cond.notify_all(); + + return true; +} + +bool SingleFileStorage::restore_old(const std::string & fn) +{ + if (is_dead) + { + return false; + } + + std::string cfn = compress_filename(fn); + + SFragInfo curr_frag; + curr_frag.action = FragAction::RestoreOld; + curr_frag.fn = cfn; + + std::unique_lock lock(mutex); + wait_queue(lock, false, false); + wait_defrag(cfn, lock); + ++commit_items[std::hash()(cfn)]; + + if (is_defragging) + { + defrag_skip_items.insert(cfn); + } + commit_queue.push_back(curr_frag); + cond.notify_all(); + + return true; +} + +bool SingleFileStorage::commit(bool background_queue, int64_t transid, int64_t disk_id) +{ + if (is_dead) + { + return false; + } + + if 
(folly::fsyncNoInt(data_file.fd())!=0) + { + XLOG(ERR) << "Failed to sync data file " << data_file_path << ". " << folly::errnoStr(errno); + write_offline = true; + do_stop_on_error(); + return false; + } + + { + std::scoped_lock lock(mutex); + mdb_curr_sync = true; + } + + if (mdb_env_sync(db_env, 0) != 0) + { + XLOG(ERR) << "mdb_env_sync on " << db_path << " failed. " << folly::errnoStr(errno); + write_offline = true; + do_stop_on_error(); + return false; + } + + if (cache_db_env != nullptr) + { + if (mdb_env_sync(cache_db_env, 0) != 0) + { + XLOG(ERR) << "mdb_env_sync on cache db failed. " << folly::errnoStr(errno); + write_offline = true; + do_stop_on_error(); + return false; + } + } + + std::unique_lock lock(mutex); + + SCommitInfo commit_info; + SFragInfo frag_info; + frag_info.offset = transid; + frag_info.len = disk_id; + frag_info.action = FragAction::Commit; + frag_info.commit_info = &commit_info; + + if (is_dead) + return false; + + if (background_queue) + { + commit_background_queue.push_back(frag_info); + } + else + { + commit_queue.push_back(frag_info); + } + + mdb_curr_sync = false; + + cond.notify_all(); + commit_info.commit_done.wait(lock); + + return commit_info.commit_errors == 0; +} + +bool SingleFileStorage::empty_queue(bool background_queue) +{ + if (is_dead) + { + return false; + } + + std::unique_lock lock(mutex); + + SCommitInfo commit_info; + SFragInfo frag_info; + frag_info.action = FragAction::EmptyQueue; + frag_info.commit_info = &commit_info; + + if (background_queue) + { + commit_background_queue.push_back(frag_info); + } + else + { + commit_queue.push_back(frag_info); + } + + cond.notify_all(); + commit_info.commit_done.wait(lock); + + return commit_info.commit_errors == 0; +} + +bool SingleFileStorage::iter_start(int64_t disk_id, bool compressed, IterData& iter_data) +{ + CWData wdata; + std::string prefix = std::to_string(disk_id) + "#"; + + if (disk_id == 0) + { + if (compressed) + wdata.addChar(0); + else + wdata.addChar(10); + } 
+ else + { + if (compressed) + wdata.addChar(1); + else + wdata.addBuffer(prefix.data(), prefix.size()); + } + + return iter_start(std::string(wdata.getDataPtr(), wdata.getDataSize()), false, iter_data); +} + +bool SingleFileStorage::iter_start(bool compressed, IterData& iter_data) +{ + return iter_start(0, compressed, iter_data); +} + +bool SingleFileStorage::iter_start(std::string fn, bool compressed, IterData& iter_data) +{ + if (is_dead) + { + return false; + } + + if (compressed) + { + fn = compress_filename(fn); + } + + mdb_madvise(db_env, 0); + + int rc = mdb_txn_begin(db_env, NULL, MDB_RDONLY, &iter_data.iter_txn); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to open transaction handle for iteration (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + + rc = mdb_cursor_open(iter_data.iter_txn, dbi_main, &iter_data.iter_cur); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to open cursor for iteration (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + + if(fn.empty()) + { + rc = mdb_cursor_get(iter_data.iter_cur, &iter_data.iter_key, &iter_data.iter_val, MDB_FIRST); + } + else + { + iter_data.iter_key.mv_data = const_cast(fn.data()); + iter_data.iter_key.mv_size = fn.size(); + iter_data.iter_val.mv_data = nullptr; + rc = mdb_cursor_get(iter_data.iter_cur, &iter_data.iter_key, &iter_data.iter_val, MDB_SET_RANGE); + } + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Failed to get cursor for iteration (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + + return true; +} + +void SingleFileStorage::start_debug() +{ + /*iter_start("39640_73585ef100", true); + + int64_t offset; + int64_t size; + std::vector extra_exts; + int64_t last_modified; + std::string fn; + std::string md5sum; + if (iter_curr_val(fn, offset, size, extra_exts, last_modified, md5sum)) + { + XLOG(WARN) << "Debug val: " << decompress_filename(fn) << " offset " << std::to_string(offset) << " size " + std::to_string(size) << " last_modified " 
<< std::to_string(last_modified) << " md5sum " << folly::hexlify(md5sum); + } + + iter_stop(); + + iter_start("39640_73585ef100", false); + + if (iter_curr_val(fn, offset, size, extra_exts, last_modified, md5sum)) + { + XLOG(WARN) << "Debug val: " << decompress_filename(fn) << " offset " << std::to_string(offset) << " size " << std::to_string(size) << " last_modified " << std::to_string(last_modified) << " md5sum " << folly::hexlify(md5sum); + } + + iter_stop();*/ +} + +void SingleFileStorage::iter_stop(IterData& iter_data) +{ + mdb_cursor_close(iter_data.iter_cur); + mdb_txn_abort(iter_data.iter_txn); + + mdb_madvise(db_env, 1); +} + +bool SingleFileStorage::iter_next(IterData& iter_data) +{ + int rc = mdb_cursor_get(iter_data.iter_cur, &iter_data.iter_key, &iter_data.iter_val, MDB_NEXT); + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Failed to get curosr next for iteration (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + else if (rc == MDB_NOTFOUND) + { + iter_data.iter_key.mv_data = nullptr; + } + + return true; +} + +bool SingleFileStorage::iter_curr_val(std::string & fn, int64_t& offset, int64_t & size, + std::vector& extra_exts, int64_t & last_modified, std::string & md5sum, + IterData& iter_data) +{ + if (iter_data.iter_key.mv_data == nullptr) + return false; + + fn = std::string(reinterpret_cast(iter_data.iter_key.mv_data), iter_data.iter_key.mv_size); + + CRData rdata(reinterpret_cast(iter_data.iter_val.mv_data), iter_data.iter_val.mv_size); + + if (!rdata.getVarInt(&offset) + || !rdata.getVarInt(&size)) + { + return false; + } + + int64_t num_exts = extract_num_exts(offset); + + for (int64_t i = 0; i < num_exts; ++i) + { + SPunchItem ext; + if (!rdata.getVarInt(&ext.offset) + || !rdata.getVarInt(&ext.len)) + { + return false; + } + + extra_exts.push_back(ext); + } + + if(!rdata.getVarInt(&last_modified) + || !rdata.getStr2(&md5sum)) + { + return false; + } + + return true; +} + +bool 
SingleFileStorage::iter_curr_val(std::string & fn, std::string & data, IterData& iter_data) +{ + if (iter_data.iter_key.mv_data == nullptr) + return false; + + fn = std::string(reinterpret_cast(iter_data.iter_key.mv_data), iter_data.iter_key.mv_size); + + data.assign(reinterpret_cast(iter_data.iter_val.mv_data), iter_data.iter_val.mv_size); + + return true; +} + +int64_t SingleFileStorage::remove_fn(const std::string & fn, MDB_txn * txn, MDB_txn* freespace_txn, + bool del_from_main, bool del_old, THREAD_ID tid) +{ + int64_t commit_errors = 0; + + MDB_cursor* mc; + + int rc = mdb_cursor_open(txn, del_old ? dbi_old : dbi_main, &mc); + + if (rc) + { + XLOG(ERR) << "LMDB: Failed mdb_cursor_open in remove_fn (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + return commit_errors; + } + + SCOPE_EXIT{ mdb_cursor_close(mc); }; + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed mdb_cursor_open in remove_fn (SIGBUS) (" + << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + return commit_errors; + } + + MDB_val tkey; + tkey.mv_data = const_cast(&fn[0]); + tkey.mv_size = fn.size(); + + MDB_val tval; + + rc = mdb_cursor_get(mc, &tkey, &tval, MDB_SET); + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to get extent info in commit for del (SIGBUS) (" + << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + return commit_errors; + } + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Failed to get extent info in commit for del (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + else if (rc != MDB_NOTFOUND) + { + if (del_old) + { + XLOG(INFO) << "Del old fn " << decompress_filename(fn) << " sfs " << db_path; + } + + CRData rdata(reinterpret_cast(tval.mv_data), tval.mv_size); + + int64_t fn_transid; + if (del_old + && !rdata.getVarInt(&fn_transid)) + { + XLOG(ERR) << "Cannot read transid in del_old"; + ++commit_errors; + } + + int64_t offset; + int64_t length; + if 
(rdata.getVarInt(&offset) + && rdata.getVarInt(&length)) + { + int64_t num_exts = extract_num_exts(offset); + + std::vector extra_exts; + extra_exts.reserve(num_exts); + for (int64_t i = 0; i < num_exts; ++i) + { + int64_t extra_offset, extra_length; + if (!rdata.getVarInt(&extra_offset) + || !rdata.getVarInt(&extra_length)) + { + XLOG(ERR) << "LMDB: Failed to read extra extent sfs " << db_path; + ++commit_errors; + break; + } + else + { + extra_exts.push_back(SPunchItem(extra_offset, extra_length)); + } + } + + if(startup_finished) + { + std::lock_guard lock(mutex); + auto it_reading = reading_items.find(offset); + if(it_reading != reading_items.end()) + { + assert(it_reading->second.refs>0); + it_reading->second.free_skip = true; + reading_free_skip_extents.insert(offset); + for(const auto& ext: extra_exts) + { + reading_free_skip_extents.insert(ext.offset); + } + } + } + + //Following can invalidate the memory of tval + rdata by spilling + + if (!add_freemap_ext(freespace_txn, offset, length, true, tid)) + { + XLOG(ERR) << "LMDB: Failed to put free extent in commit sfs " << db_path; + ++commit_errors; + } + + for (SPunchItem& eo: extra_exts) + { + if (!add_freemap_ext(freespace_txn, eo.offset, eo.len, true, tid)) + { + XLOG(ERR) << "LMDB: Failed to put free extent in commit sfs " << db_path; + ++commit_errors; + } + } + + if (del_from_main) + { + rc = mdb_cursor_del(mc, 0); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to delete extent info in commit (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB:Failed to delete extent info in commit (SIGBUS) sfs " << db_path; + ++commit_errors; + } + + if(commit_errors==0 && with_rewrite) + rewrite_npages(txn, mc, tid, n_rewrite_pages); + } + } + else + { + XLOG(ERR) << "Cannot read offset+length in remove_fn()"; + ++commit_errors; + } + } + else if (del_from_main) + { + if(del_old) + { + XLOG(INFO) << "Did not find extent info for key \"" << 
decompress_filename(fn) + << "\" sfs "<(tval.mv_data), tval.mv_size, true); + + commit_errors += remove_fn(fn, txn, freespace_txn, true, false, tid); + + int64_t fn_transid; + rdata.getVarInt(&fn_transid); + + MDB_val val_offset; + val_offset.mv_data = const_cast(rdata.getCurrDataPtr()); + val_offset.mv_size = rdata.getLeft(); + + rc = mdb_put(txn, dbi_main, &tkey, &val_offset, 0); + + if (rc) + { + XLOG(ERR) << "Error restoring del item to main (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + return commit_errors; + } + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "Error restoring del item to main (SIGBUS) sfs " << db_path; + ++commit_errors; + return commit_errors; + } + + rc = mdb_del(txn, dbi_old, &tkey, nullptr); + + if (rc) + { + XLOG(ERR) << "Error del restore item (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + return commit_errors; + } + else + { + XLOG(INFO) << "Restore fn " << decompress_filename(fn) << " sfs " << db_path; + } + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "Error del restore item (SIGBUS) sfs " << db_path; + ++commit_errors; + return commit_errors; + } + } + else + { + XLOG(DBG) << "Did not find extent info for key \"" << decompress_filename(fn) + << "\" sfs " << db_path << " in restore"; + } + + return commit_errors; +} + +int64_t SingleFileStorage::log_fn(const std::string & fn, MDB_txn * txn, THREAD_ID tid, int64_t transid) +{ + int64_t commit_errors = 0; + + MDB_val tkey; + tkey.mv_data = const_cast(&fn[0]); + tkey.mv_size = fn.size(); + + MDB_val tval; + + int rc = mdb_get(txn, dbi_main, &tkey, &tval); + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to get extent info in log for del (SIGBUS) (" + << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + return commit_errors; + } + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Failed to get extent info in log for del (" << mdb_strerror(rc) << ") sfs " << db_path; + 
++commit_errors; + } + else if (rc != MDB_NOTFOUND) + { + CWData wdata; + wdata.addVarInt(transid); + wdata.addBuffer(reinterpret_cast(tval.mv_data), tval.mv_size); + + MDB_val tval_log; + tval_log.mv_data = wdata.getDataPtr(); + tval_log.mv_size = wdata.getDataSize(); + + rc = mdb_put(txn, dbi_old, &tkey, &tval_log, 0); + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to put log for del (SIGBUS) (" + << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + return commit_errors; + } + + if (rc) + { + XLOG(ERR) << "LMDB: Failed to put in log for del (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + else + { + XLOG(INFO) << "Log fn " << decompress_filename(fn) << " sfs " << db_path; + } + } + + return commit_errors; +} + +int64_t SingleFileStorage::add_tmp(int64_t idx, MDB_txn* txn, THREAD_ID tid, int64_t offset, int64_t len) +{ + int64_t commit_errors = 0; + + CWData keydata; + keydata.addChar('t'); + keydata.addVarInt(idx); + + MDB_val tkey; + tkey.mv_data = keydata.getDataPtr(); + tkey.mv_size = keydata.getDataSize(); + + CWData valdata; + valdata.addVarInt(0); + valdata.addVarInt(offset); + valdata.addVarInt(len); + + MDB_val tval; + tval.mv_data = valdata.getDataPtr(); + tval.mv_size = valdata.getDataSize(); + + int rc = mdb_put(txn, dbi_old, &tkey, &tval, 0); + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to put in add_tmp (SIGBUS) (" + << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + return commit_errors; + } + + if (rc) + { + XLOG(ERR) << "LMDB: Failed to put in add_tmp (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + else + { + XLOG(INFO) << "Add tmp idx " << idx << " off " << offset << " len " << len << " sfs " << db_path; + } + + return commit_errors; +} + +int64_t SingleFileStorage::rm_tmp(int64_t idx, MDB_txn* txn, THREAD_ID tid) +{ + CWData keydata; + keydata.addChar('t'); + keydata.addVarInt(idx); + + MDB_val tkey; + tkey.mv_data = 
keydata.getDataPtr(); + tkey.mv_size = keydata.getDataSize(); + + int rc = mdb_del(txn, dbi_old, &tkey, nullptr); + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to del in rm_tmp (SIGBUS) (" + << mdb_strerror(rc) << ") sfs " << db_path; + return 1; + } + + if (rc) + { + XLOG(ERR) << "LMDB: Failed to del in rm_tmp (" << mdb_strerror(rc) << ") sfs " << db_path; + return 1; + } + else + { + XLOG(INFO) << "Rm tmp idx " << idx << " sfs " << db_path; + } + + return 0; +} + +void SingleFileStorage::wait_queue(std::unique_lock& lock, bool background_queue, bool defrag_check) +{ + size_t throttle_max = 1000; + auto& sel_queue = background_queue ? commit_background_queue : commit_queue; + while (sel_queue.size() > throttle_max + && !is_dead) + { + if (defrag_check + && defrag_restart == 1) + { + break; + } + + lock.unlock(); + std::this_thread::sleep_for(10ms); + lock.lock(); + } +} + +#define FREEMAP_DEBUG(x) x + +bool SingleFileStorage::add_freemap_ext(MDB_txn* txn, int64_t offset, int64_t len, + bool used_in_curr_trans, THREAD_ID tid) +{ + if (is_dead) + return false; + + len = div_up(len, block_size)*block_size; + + MDB_cursor* fmap_cur; + int rc = mdb_cursor_open(txn, dbi_free, &fmap_cur); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to open cursor for freemap ext (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to open cursor for freemap ext (SIGBUS) sfs " << db_path; + is_dead = true; + do_stop_on_error(); + return false; + } + + CWData wdata_init; + wdata_init.addVarInt(offset); + + MDB_val key; + key.mv_data = wdata_init.getDataPtr(); + key.mv_size = wdata_init.getDataSize(); + + MDB_val val; + rc = mdb_cursor_get(fmap_cur, &key, &val, MDB_SET_RANGE); + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Failed to get curosr for freemap ext (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + if (has_mmap_read_error_reset(tid)) + { + 
XLOG(ERR) << "LMDB: Failed to get curosr for freemap ext (SIGBUS) sfs " << db_path; + is_dead = true; + do_stop_on_error(); + return false; + } + + int64_t next_offset = -1; + int64_t next_len = -1; + bool merged_extent = false; + if (rc != MDB_NOTFOUND) + { + CRData rdata(reinterpret_cast(key.mv_data), key.mv_size); + CRData rdatalen(reinterpret_cast(val.mv_data), val.mv_size); + if (rdata.getVarInt(&next_offset) + && rdatalen.getVarInt(&next_len)) + { + CWData orig_data; + orig_data.addVarInt(offset); + + if (offset + len > next_offset) + { + XLOG(WARN) << "Extent (" << std::to_string(offset) << ", " << std::to_string(len) << ") already in freemap (next) as (" << std::to_string(next_offset) << ", " << std::to_string(next_len) << ") sfs " << db_path; + //already in freemap + //assert(false); + mdb_cursor_close(fmap_cur); + return true; + } + else if (offset + len == next_offset) + { + FREEMAP_DEBUG(XLOG(INFO) << "Merging new freemap extent (" << std::to_string(offset) << ", " << std::to_string(len) << ") with next freemap ext at " << std::to_string(next_offset) << " new len " << folly::prettyPrint(next_len +len, folly::PRETTY_BYTES_IEC);) + + rc = mdb_cursor_del(fmap_cur, 0); + + if (rc) + { + XLOG(ERR) << "LMDB: Failed to del cursor current for freemap ext (" << mdb_strerror(rc) << ") sfs " << db_path; + mdb_cursor_close(fmap_cur); + return false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to del cursor current for freemap ext (SIGBUS) sfs " << db_path; + return false; + } + + merged_extent = true; + } + else + { + next_offset = -1; + } + } + else + { + assert(false); + } + } + + rc = mdb_cursor_get(fmap_cur, &key, &val, MDB_PREV); + + if (rc != MDB_NOTFOUND) + { + CRData rdata(reinterpret_cast(key.mv_data), key.mv_size); + CRData rdatalen(reinterpret_cast(val.mv_data), val.mv_size); + int64_t prev_offset; + int64_t prev_length; + if (rdata.getVarInt(&prev_offset) + && rdatalen.getVarInt(&prev_length)) + { + if ( prev_offset + 
prev_length > offset + && prev_offset <= offset ) + { + XLOG(WARN) << "Extent (" << std::to_string(offset) << ", " << std::to_string(len) << ") already in freemap (prev) as (" << std::to_string(prev_offset) << ", " << std::to_string(prev_length) << ") sfs " << db_path; + //already in freemap + //assert(false); + mdb_cursor_close(fmap_cur); + return true; + } + else if (offset == prev_offset + prev_length) + { + //Delete prev + rc = mdb_del(txn, dbi_free_len, &val, &key); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to del prev for freemap ext len (" << mdb_strerror(rc) << ") sfs " << db_path; + mdb_cursor_close(fmap_cur); + return false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to del prev for freemap ext len (SIGBUS) sfs " << db_path; + return false; + } + + CWData wdataval; + if (!merged_extent) + { + FREEMAP_DEBUG(XLOG(INFO) << "Merging new freemap extent (" << std::to_string(offset) << ", " << std::to_string(len) << ") with prev freemap ext at " << std::to_string(prev_offset)<<" new len "+folly::prettyPrint(prev_length +len, folly::PRETTY_BYTES_IEC); ) + wdataval.addVarInt(prev_length + len); + } + else + { + assert(next_len > 0); + FREEMAP_DEBUG( XLOG(INFO) << "Merging new freemap extent (" << std::to_string(offset) << ", " << std::to_string(len) << ") with prev and next freemap ext at " << std::to_string(prev_offset) << " new len " << folly::prettyPrint(prev_length + len+ next_len, folly::PRETTY_BYTES_IEC); ) + wdataval.addVarInt(prev_length + len + next_len); + + //Delete next + CWData wdatanextlen; + MDB_val nextlen; + wdatanextlen.addVarInt(next_len); + nextlen.mv_data = wdatanextlen.getDataPtr(); + nextlen.mv_size = wdatanextlen.getDataSize(); + + CWData wdatanextoffset; + wdatanextoffset.addVarInt(next_offset); + MDB_val nextoffset; + nextoffset.mv_data = wdatanextoffset.getDataPtr(); + nextoffset.mv_size = wdatanextoffset.getDataSize(); + + rc = mdb_del(txn, dbi_free_len, &nextlen, &nextoffset); + if (rc) + { + XLOG(ERR) 
<< "LMDB: Failed to del next for freemap ext len (1) (" << mdb_strerror(rc) << ") sfs " << db_path; + mdb_cursor_close(fmap_cur); + return false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to del next for freemap ext len (1) (SIGBUS) sfs " << db_path; + return false; + } + } + + val.mv_data = wdataval.getDataPtr(); + val.mv_size = wdataval.getDataSize(); + + CWData wdatakey; + wdatakey.addVarInt(prev_offset); + key.mv_data = wdatakey.getDataPtr(); + key.mv_size = wdatakey.getDataSize(); + + rc = mdb_cursor_put(fmap_cur, &key, &val, MDB_CURRENT); + + if (rc) + { + XLOG(ERR) << "LMDB: Failed to put cursor current for freemap ext (2) (" << mdb_strerror(rc) << ") sfs " << db_path; + mdb_cursor_close(fmap_cur); + return false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to put cursor current for freemap ext (2) (SIGBUS) sfs " << db_path; + return false; + } + + rc = mdb_put(txn, dbi_free_len, &val, &key, 0); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to put prev for freemap ext len (" << mdb_strerror(rc) << ") sfs " << db_path; + mdb_cursor_close(fmap_cur); + return false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to put prev for freemap ext len (2) (SIGBUS) sfs " << db_path; + return false; + } + + merged_extent = true; + next_offset = -1; + } + } + else + { + assert(false); + } + } + + mdb_cursor_close(fmap_cur); + + if (!merged_extent + || next_offset!=-1) + { + int64_t plen; + if (next_offset == -1) + { + FREEMAP_DEBUG( XLOG(INFO) << "Unmerged new freemap extent (" << std::to_string(offset) << ", " << std::to_string(len) << ") new len " << folly::prettyPrint(len, folly::PRETTY_BYTES_IEC); ) + plen = len; + } + else + { + plen = len + next_len; + + //Delete next + CWData wdatanextlen; + MDB_val nextlen; + wdatanextlen.addVarInt(next_len); + nextlen.mv_data = wdatanextlen.getDataPtr(); + nextlen.mv_size = wdatanextlen.getDataSize(); + + CWData wdatanextoffset; + 
wdatanextoffset.addVarInt(next_offset); + MDB_val nextoffset; + nextoffset.mv_data = wdatanextoffset.getDataPtr(); + nextoffset.mv_size = wdatanextoffset.getDataSize(); + + rc = mdb_del(txn, dbi_free_len, &nextlen, &nextoffset); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to del next for freemap ext len (1) (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to del next for freemap ext len (1) (SIGBUS) sfs " << db_path; + return false; + } + } + + CWData wdata; + wdata.addVarInt(offset); + key.mv_data = wdata.getDataPtr(); + key.mv_size = wdata.getDataSize(); + + CWData wdataval; + wdataval.addVarInt(plen); + + val.mv_data = wdataval.getDataPtr(); + val.mv_size = wdataval.getDataSize(); + + rc = mdb_put(txn, dbi_free, &key, &val, 0); + + if (rc) + { + XLOG(ERR) << "LMDB: Failed to put freemap extent (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to put freemap extent (1) (SIGBUS) sfs " << db_path; + return false; + } + + rc = mdb_put(txn, dbi_free_len, &val, &key, 0); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to put freemap extent len (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to put freemap extent len (SIGBUS) sfs " << db_path; + return false; + } + } + + { + std::scoped_lock lock2(freespace_mutex); + data_file_free += len; + } + + if (used_in_curr_trans) + { + curr_new_free_extents.insert(offset); + } + + return true; +} + +bool SingleFileStorage::add_freemap_ext_simple(MDB_txn* txn, int64_t offset, int64_t len, THREAD_ID tid) +{ + if (is_dead) + return false; + + len = div_up(len, block_size) * block_size; + + CWData wdata; + wdata.addVarInt(offset); + MDB_val key; + key.mv_data = wdata.getDataPtr(); + key.mv_size = wdata.getDataSize(); + + CWData wdataval; + wdataval.addVarInt(len); + + MDB_val 
val; + val.mv_data = wdataval.getDataPtr(); + val.mv_size = wdataval.getDataSize(); + + int rc = mdb_put(txn, dbi_free, &key, &val, 0); + + if (rc) + { + XLOG(ERR) << "LMDB: Failed to put freemap extent in add_freemap_ext_simple(" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to put freemap extent in add_freemap_ext_simple (1) (SIGBUS) sfs " << db_path; + return false; + } + + rc = mdb_put(txn, dbi_free_len, &val, &key, 0); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to put freemap extent len in add_freemap_ext_simple (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to put freemap extent len in add_freemap_ext_simple (SIGBUS) sfs " << db_path; + return false; + } + + return true; +} + +bool SingleFileStorage::find_freemap_ext(MDB_txn* txn, + THREAD_ID tid, int64_t & start, int64_t & len) +{ + len = 0; + start = -1; + + int rc = mdb_set_compare(txn, dbi_free_len, mdb_cmp_varint_rev); + if (rc) + { + XLOG(ERR) << "Error setting free len comparison function in find_freemap_ext (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + + MDB_cursor* it_cursor; + mdb_cursor_open(txn, dbi_free_len, &it_cursor); + MDB_val tkey; + MDB_val tval; + MDB_cursor_op op = MDB_FIRST; + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error opening cursor in find_freemap_ext (SIGBUS) sfs " << db_path; + return false; + } + + std::vector local_free_skip_extents; + { + std::scoped_lock lock(mutex); + local_free_skip_extents = curr_free_skip_extents; + } + + while (rc == MDB_SUCCESS) + { + rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting cursor in find_freemap_ext (SIGBUS) sfs " << db_path; + mdb_cursor_close(it_cursor); + return false; + } + + if (rc == MDB_SUCCESS) + { + CRData 
dkey(reinterpret_cast(tkey.mv_data), tkey.mv_size); + CRData dval(reinterpret_cast(tval.mv_data), tval.mv_size); + + if (dval.getVarInt(&start) + && dkey.getVarInt(&len)) + { + bool skip_ext = false; + for (size_t i = 0; i < local_free_skip_extents.size(); ++i) + { + const SPunchItem& ext = local_free_skip_extents[i]; + if ((start > ext.offset + && start <= ext.offset+ext.len) + || (start+len > ext.offset + && start+len <= ext.offset + ext.len)) + { + skip_ext = true; + break; + } + } + + if (!skip_ext) + { + auto it = curr_new_free_extents.lower_bound(start); + if (it == curr_new_free_extents.end() + || *it >= start + len) + { + //TODO perf: Seperate RW-Lock? + std::scoped_lock lock(mutex); + auto it = reading_free_skip_extents.lower_bound(start); + if (it == reading_free_skip_extents.end() + || *it >= start + len) + { + break; + } + } + } + } + } + } + + mdb_cursor_close(it_cursor); + + if (rc == MDB_SUCCESS) + { + return true; + } + else if (rc == MDB_NOTFOUND) + { + return false; + } + + XLOG(ERR) << "Error reading freemap ext len item (" << mdb_strerror(rc) << ") sfs " << db_path; + assert(false); + return false; +} + +void SingleFileStorage::lock_defrag(const std::string & fn) +{ + std::scoped_lock lock(mutex); + defrag_items.insert(fn); +} + +bool SingleFileStorage::is_defrag_skip_item(const std::string & fn) +{ + std::scoped_lock lock(mutex); + return defrag_skip_items.find(fn)!=defrag_skip_items.end(); +} + +void SingleFileStorage::unlock_defrag(const std::string & fn) +{ + std::scoped_lock lock(mutex); + defrag_items.erase(fn); +} + +void SingleFileStorage::wait_defrag(const std::string & fn, std::unique_lock& lock) +{ + while (defrag_items.find(fn) != defrag_items.end()) + { + lock.unlock(); + std::this_thread::sleep_for(10ms); + lock.lock(); + } +} + +void SingleFileStorage::setup_mmap_read_error(THREAD_ID tid) +{ + std::scoped_lock lock(mmap_read_error_mutex); + mmap_read_error_jmp[tid] = std::pair >(); +} + +bool 
SingleFileStorage::clear_mmap_read_error(THREAD_ID tid) +{ + std::scoped_lock lock(mmap_read_error_mutex); + auto it = mmap_read_error_jmp.find(tid); + if (it == mmap_read_error_jmp.end()) + { + return true; + } + else + { + bool ret = it->second.first; + + mmap_cleanup_addrs.insert(mmap_cleanup_addrs.end(), + it->second.second.begin(), + it->second.second.end()); + + mmap_read_error_jmp.erase(it); + if (ret) + { + XLOG(WARN) << "Had mmap read error (SIGBUS)"; + } + return ret; + } +} + +bool SingleFileStorage::has_mmap_read_error_reset(THREAD_ID tid) +{ + std::scoped_lock lock(mmap_read_error_mutex); + auto it = mmap_read_error_jmp.find(tid); + if (it == mmap_read_error_jmp.end()) + { + is_dead = true; + do_stop_on_error(); + return true; + } + else + { + bool ret = it->second.first; + + mmap_cleanup_addrs.insert(mmap_cleanup_addrs.end(), + it->second.second.begin(), it->second.second.end()); + + it->second = std::pair >(); + if (ret) + { + is_dead = true; + do_stop_on_error(); + } + return ret; + } +} + +int64_t SingleFileStorage::reset_del_log_fn(MDB_txn * txn, MDB_txn* freespace_txn, THREAD_ID tid, int64_t disk_id, int64_t transid) +{ + int64_t commit_errors = 0; + MDB_cursor* it_cursor; + mdb_cursor_open(txn, dbi_old, &it_cursor); + MDB_cursor_op op = MDB_FIRST; + int rc; + bool ret = true; + do + { + MDB_val tkey; + MDB_val tval; + rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in reset del log fn(" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + break; + } + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item in reset del log fn (SIGBUS) sfs " << db_path; + ++commit_errors; + break; + } + + if (rc != MDB_NOTFOUND) + { + std::string fn = std::string(reinterpret_cast(tkey.mv_data), tkey.mv_size); + + int64_t curr_disk_id = get_fn_disk_id(fn); + + if (curr_disk_id != disk_id) + continue; + + std::vector 
data_buf(reinterpret_cast(tval.mv_data), + reinterpret_cast(tval.mv_data) + tval.mv_size); + CRData rdata(data_buf.data(), data_buf.size()); + + int64_t fn_transid = 0; + rdata.getVarInt(&fn_transid); + + if (fn_transid == transid) + { + XLOG(INFO) << "Restoring key " << decompress_filename(fn) << " transid "<< std::to_string(fn_transid)<<" to previous data sfs " << db_path<<" curr transid "<(tkey.mv_data), tkey.mv_size); + + int64_t curr_disk_id = get_fn_disk_id(fn); + + if (curr_disk_id != disk_id) + continue; + + CRData rdata(reinterpret_cast(tval.mv_data), tval.mv_size); + + int64_t fn_transid = 0; + rdata.getVarInt(&fn_transid); + + if (fn_transid == transid) + { + XLOG(INFO) << "Not deleting queued " << decompress_filename(fn) << " transid " << std::to_string(fn_transid) << " sfs " << db_path << " curr transid " << std::to_string(transid); + } + else if(fn_transid>0) + { + XLOG(INFO) << "Deleting queued " << decompress_filename(fn) << " from main transid " << std::to_string(fn_transid) << " sfs " << db_path << " curr transid " << std::to_string(transid); + + commit_errors += remove_fn(fn, txn, freespace_txn, true, false, tid); + } + else + { + XLOG(INFO) << "Not deleting queued (transid<=0) " << decompress_filename(fn) << " from main transid " << std::to_string(fn_transid) << " sfs " << db_path << " curr transid " << std::to_string(transid); + } + + rc = mdb_cursor_del(it_cursor, 0); + + if (rc) + { + XLOG(ERR) << "Error del del queue item (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + continue; + } + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "Error del del item (SIGBUS) sfs " << db_path; + ++commit_errors; + continue; + } + + } + } while (rc != MDB_NOTFOUND); + + mdb_cursor_close(it_cursor); + + return commit_errors; +} + +int64_t SingleFileStorage::reset_holes(MDB_txn* txn, MDB_txn* freespace_txn, THREAD_ID tid) +{ + int64_t commit_errors = 0; + MDB_cursor* it_cursor; + mdb_cursor_open(txn, dbi_holes, &it_cursor); + 
MDB_cursor_op op = MDB_FIRST; + int rc; + bool ret = true; + do + { + MDB_val tkey; + MDB_val tval; + rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in reset holes (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + break; + } + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item in reset holes (SIGBUS) sfs " << db_path; + ++commit_errors; + break; + } + + if (rc != MDB_NOTFOUND) + { + CRData rdata_key(reinterpret_cast(tkey.mv_data), tkey.mv_size); + CRData rdata_val(reinterpret_cast(tval.mv_data), tval.mv_size); + + int64_t start, len; + + if (rdata_key.getVarInt(&start) + && rdata_val.getVarInt(&len)) + { + if (!add_freemap_ext(freespace_txn, start, len, false, tid)) + ++commit_errors; + + rc = mdb_cursor_del(it_cursor, 0); + + if (rc) + { + XLOG(ERR) << "Error del in reset holes (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + continue; + } + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "Error del in reset holes (SIGBUS) sfs " << db_path; + ++commit_errors; + continue; + } + } + + } + } while (rc != MDB_NOTFOUND); + + mdb_cursor_close(it_cursor); + + return commit_errors; +} + +void SingleFileStorage::wait_startup_finished(std::unique_lock & lock) +{ + while (!startup_finished) + { + lock.unlock(); + std::this_thread::sleep_for(1s); + lock.lock(); + } +} + +void SingleFileStorage::free_extents(const std::vector& extents) +{ + if (extents.empty()) + return; + + std::unique_lock lock(mutex); + wait_queue(lock, false, false); + + SFragInfo curr_frag(extents[0].data_file_offset, extents[0].len); + for (size_t i = 1; i < extents.size(); ++i) + { + curr_frag.extra_exts.push_back(SPunchItem(extents[i].data_file_offset, extents[i].len)); + } + curr_frag.action = FragAction::FreeExtents; + + commit_queue.push_back(curr_frag); + + cond.notify_all(); +} + +int64_t 
SingleFileStorage::get_really_min_space(int64_t& index_file_size) +{ + index_file_size = 0; + { + int fd = open((db_path + os_file_sep() + "index.lmdb").c_str(), O_RDONLY|O_CLOEXEC); + if (fd!=-1) + { + index_file_size = fileSize(fd); + } + else + { + XLOG(ERR) << "Error opening lmdb file " << db_path << os_file_sep() << "index.lmdb to get real size " << folly::errnoStr(errno); + return false; + } + } + + if (cache_db_env != nullptr) + { + int fd = open( (freespace_cache_path + os_file_sep() + "freespace.lmdb").c_str(), O_RDONLY|O_CLOEXEC); + if (fd!=-1) + { + index_file_size += fileSize(fd); + } + else + { + XLOG(ERR) << "Error opening lmdb file " << freespace_cache_path << os_file_sep() << "freespace.lmdb to get real size " << folly::errnoStr(errno); + } + } + + int64_t really_min_space = index_file_size + 2LL * 1024 * 1024 * 1024; + + return really_min_space; +} + +int64_t SingleFileStorage::get_burn_in_data_size() +{ + int64_t dsize = 0; + int fd = open( (data_file_path.parent_path() / "burn_in.data").c_str(), O_RDONLY|O_CLOEXEC); + if (fd!=-1) + { + dsize = fileSize(fd); + } + return dsize; +} + +bool SingleFileStorage::do_free_minspace(MDB_txn* txn, MDB_txn* freespace_txn, THREAD_ID tid) +{ + int64_t free_space = os_free_space(data_file_path.string()); + free_space += get_burn_in_data_size(); + + int64_t index_file_size; + int64_t really_min_space = get_really_min_space(index_file_size); + + int64_t curr_min_space = min_free_space - index_file_size; + + int64_t skip_allowance = 512LL * 1024 * 1024; + + if (free_space >= curr_min_space - skip_allowance + && free_space >= really_min_space - skip_allowance) + { + return true; + } + + int64_t freed_space = 0; + std::string last_info; + + while (free_space>=0 + && (free_space < curr_min_space + || free_space 0) + { + XLOG(WARN) << "Freed " << folly::prettyPrint(freed_space, folly::PRETTY_BYTES_IEC) << " via punching holes. 
" << last_info; + } + + return true; +} + +int64_t SingleFileStorage::queue_del(const std::string & fn, MDB_txn * txn, THREAD_ID tid, int64_t transid) +{ + int64_t commit_errors = 0; + + MDB_val tkey; + tkey.mv_data = const_cast(&fn[0]); + tkey.mv_size = fn.size(); + + CWData wdata; + wdata.addVarInt(transid); + + MDB_val tval_log; + tval_log.mv_data = wdata.getDataPtr(); + tval_log.mv_size = wdata.getDataSize(); + + int rc = mdb_put(txn, dbi_queue_del, &tkey, &tval_log, 0); + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to put queue for del (SIGBUS) (" + << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + return commit_errors; + } + + if (rc) + { + XLOG(ERR) << "LMDB: Failed to put in queue for del (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + else + { + XLOG(INFO) << "Queue del fn " << decompress_filename(fn) << " sfs " << db_path; + } + + return commit_errors; +} + +int64_t SingleFileStorage::unqueue_del(const std::string & fn, MDB_txn * txn, THREAD_ID tid) +{ + int64_t commit_errors = 0; + + MDB_val tkey; + tkey.mv_data = const_cast(&fn[0]); + tkey.mv_size = fn.size(); + + int rc = mdb_del(txn, dbi_queue_del, &tkey, NULL); + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to del queue for del (SIGBUS) (" + << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + return commit_errors; + } + + if (rc) + { + XLOG(ERR) << "LMDB: Failed to del in unqueue for del (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + else + { + XLOG(INFO) << "Unqueue del fn " << decompress_filename(fn) << " sfs " << db_path; + } + + return commit_errors; +} + +void SingleFileStorage::add_defrag_skip_items_queue() +{ + for (SFragInfo& fi : commit_queue) + { + if (!fi.fn.empty()) + { + defrag_skip_items.insert(fi.fn); + } + } + + for (SFragInfo& fi : commit_background_queue) + { + if (!fi.fn.empty()) + { + defrag_skip_items.insert(fi.fn); + } + } +} + +bool 
SingleFileStorage::open_cache_db(int64_t current_txn_id, int64_t mapsize, bool use_other, bool del_create, MDB_txn*& freespace_txn) +{ + if (cache_db_env != nullptr) + { + if (freespace_txn != nullptr) + { + mdb_txn_abort(freespace_txn); + freespace_txn = nullptr; + } + mdb_env_close(cache_db_env); + cache_db_env = nullptr; + } + + THREAD_ID tid = gettid(); + + int rc = mdb_env_create(&cache_db_env); + + std::string cache_db_name = "freespace.lmdb"; + + if (rc) + { + throw std::runtime_error("LMDB: Failed to create cache LMDB env (" + (std::string)mdb_strerror(rc) + ")"); + } + + rc = mdb_env_set_maxreaders(cache_db_env, 4094); + + if (rc) + { + throw std::runtime_error("LMDB: Failed to set cache max readers (" + (std::string)mdb_strerror(rc) + ")"); + } + + rc = mdb_env_set_mapsize(cache_db_env, mapsize * 2); + + if (rc) + { + throw std::runtime_error("LMDB: Failed to set cache map size (" + (std::string)mdb_strerror(rc) + ")"); + } + + rc = mdb_env_set_maxdbs(cache_db_env, 3); + + if (rc) + { + throw std::runtime_error("LMDB: Failed to set cache max dbs (" + (std::string)mdb_strerror(rc) + ")"); + } + + unsigned int mdb_flags = MDB_NOSUBDIR | MDB_NOTLS | MDB_NORDAHEAD; + + if (del_create) + { + std::error_code ec; + std::filesystem::remove(freespace_cache_path + os_file_sep() + cache_db_name, ec); + std::filesystem::remove(freespace_cache_path + os_file_sep() + cache_db_name + "-lock", ec); + } + + if (use_other) + { + mdb_flags |= MDB_PREVSNAPSHOT; + } + + if (std::filesystem::exists("/var/urbackup/sfs_use_lmdb_writemap")) + { + mdb_flags |= MDB_WRITEMAP; + } + + rc = mdb_env_open(cache_db_env, (freespace_cache_path + os_file_sep() + cache_db_name).c_str(), mdb_flags, 0664); + + if (rc) + { + throw std::runtime_error("LMDB: Failed to open cache LMDB database file (" + (std::string)mdb_strerror(rc) + ")"); + } + + size_t txnid1, txnid2; + mdb_env_get_txnids(cache_db_env, &txnid1, &txnid2); + if(use_other) + { + XLOG(INFO) << "Opened other cache db txn, 
txnid1="<(tval.mv_data), tval.mv_size); + + SPunchItem first_ext; + + if (!rdata.getVarInt(&first_ext.offset) + || !rdata.getVarInt(&first_ext.len)) + { + return true; + } + + std::vector extents; + + int64_t num_extents = extract_num_exts(first_ext.offset); + + bool has_error = false; + for (int64_t i = 0; i < num_extents; ++i) + { + SPunchItem cfrag; + if (!rdata.getVarInt(&cfrag.offset) + || !rdata.getVarInt(&cfrag.len)) + { + has_error = true; + break; + } + + extents.push_back(cfrag); + } + + if (has_error) + { + std::string key_name = std::string(reinterpret_cast(tkey.mv_data), tkey.mv_size); + XLOG(ERR) << "Error getting extents of " << decompress_filename(key_name) << " in generate_freespace_cache"; + return true; + } + + bool ret = true; + for (size_t i = 0; i < extents.size() + 1; ++i) + { + SPunchItem& ext = (i == 0) ? first_ext : extents[i - 1]; + + int64_t start_bit = ext.offset / 4096; + int64_t len_bits = div_up(ext.len, 4096); + + if (bmap.get_range(start_bit, start_bit + len_bits)) + { + std::string key_name = std::string(reinterpret_cast(tkey.mv_data), tkey.mv_size); + XLOG(ERR) << "Range of " << decompress_filename(key_name) << " already set offset=" << std::to_string(ext.offset) << " len=" + std::to_string(ext.len); + if (!ignore_errors) + ret = false; + } + + bmap.set_range(start_bit, start_bit + len_bits, true); + } + + return ret; + }; + + bool ret = true; + + MDB_cursor* it_cursor; + mdb_cursor_open(source_txn, dbi_main, &it_cursor); + MDB_cursor_op op = MDB_FIRST; + + if (fast_gen) + { + XLOG(INFO) << "Reading page ids from " << db_path; + + TmpMmapedPgIds page_ids; + + if (!read_pgids(source_txn, dbi_main, tid, page_ids)) + { + XLOG(ERR) << "Reading page ids from " << db_path << " failed"; + mdb_cursor_close(it_cursor); + return false; + } + + XLOG(INFO) << "Iterating through "<< std::to_string(page_ids.size()) << " pages " << db_path; + + for (size_t* pg_it = page_ids.begin(); pg_it != page_ids.end(); ++pg_it) + { + unsigned int nkeys; + 
rc = mdb_page_get_nkeys(it_cursor, *pg_it, &nkeys); + + if (rc) + { + XLOG(ERR) << "LMDB: Error getting mdb_page_get_nkeys in generate_freespace_cache (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + break; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item in mdb_page_get_nkeys in generate_freespace_cache (SIGBUS) sfs " << db_path; + ret = false; + break; + } + + for (unsigned int idx = 0; idx < nkeys; ++idx) + { + MDB_val tval; + rc = mdb_page_get_val(it_cursor, *pg_it, idx, &tkey, &tval); + + if (rc) + { + XLOG(ERR) << "LMDB: Error getting item mdb_page_get_val in main generate_freespace_cache (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + break; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item mdb_page_get_val in main generate_freespace_cache (SIGBUS) sfs " << db_path; + ret = false; + break; + } + + if (!add_extent_bmap(tkey, tval)) + ret = false; + } + + if (!ret) + break; + } + } + else + { + do + { + MDB_val tval; + rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in main generate_freespace_cache (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + break; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item in main generate_freespace_cache (SIGBUS) sfs " << db_path; + ret = false; + break; + } + + if (rc != MDB_NOTFOUND) + { + if (!add_extent_bmap(tkey, tval)) + ret = false; + } + } while (rc != MDB_NOTFOUND); + } + + mdb_cursor_close(it_cursor); + + if (!ret) + return ret; + + mdb_cursor_open(source_txn, dbi_old, &it_cursor); + op = MDB_FIRST; + + do + { + MDB_val tval; + rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in old generate_freespace_cache (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + 
break; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item in old generate_freespace_cache (SIGBUS) sfs " << db_path; + ret = false; + break; + } + + if (rc != MDB_NOTFOUND) + { + CRData rdata(reinterpret_cast(tval.mv_data), tval.mv_size); + + SPunchItem first_ext; + + int64_t transid; + if (!rdata.getVarInt(&transid) + || !rdata.getVarInt(&first_ext.offset) + || !rdata.getVarInt(&first_ext.len)) + { + std::string key_name = std::string(reinterpret_cast(tkey.mv_data), tkey.mv_size); + XLOG(ERR) << "Error getting first old extent of " << decompress_filename(key_name) << " in generate_freespace_cache"; + continue; + } + + std::vector extents; + + int64_t num_extents = extract_num_exts(first_ext.offset); + + bool has_error = false; + for (int64_t i = 0; i < num_extents; ++i) + { + SPunchItem cfrag; + if (!rdata.getVarInt(&cfrag.offset) + || !rdata.getVarInt(&cfrag.len)) + { + has_error = true; + break; + } + + extents.push_back(cfrag); + } + + if (has_error) + { + std::string key_name = std::string(reinterpret_cast(tkey.mv_data), tkey.mv_size); + XLOG(ERR) << "Error getting old extents of " << decompress_filename(key_name) << " in generate_freespace_cache"; + continue; + } + + for (size_t i = 0; i < extents.size() + 1; ++i) + { + SPunchItem& ext = (i == 0) ? 
first_ext : extents[i - 1]; + + int64_t start_bit = ext.offset / 4096; + int64_t len_bits = div_up(ext.len, 4096); + + if (bmap.get_range(start_bit, start_bit + len_bits)) + { + std::string key_name = std::string(reinterpret_cast(tkey.mv_data), tkey.mv_size); + XLOG(ERR) << "Range of " << decompress_filename(key_name) << " already set offset=" << std::to_string(ext.offset) << " len=" + std::to_string(ext.len); + if (!ignore_errors) + ret = false; + } + + bmap.set_range(start_bit, start_bit + len_bits, true); + } + } + } while (rc != MDB_NOTFOUND); + + mdb_cursor_close(it_cursor); + + if (!ret) + return ret; + + mdb_cursor_open(source_txn, dbi_holes, &it_cursor); + op = MDB_FIRST; + + do + { + MDB_val tval; + rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in holes generate_freespace_cache (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + break; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item in holes generate_freespace_cache (SIGBUS) sfs " << db_path; + ret = false; + break; + } + + if (rc != MDB_NOTFOUND) + { + CRData rdata_val(reinterpret_cast(tval.mv_data), tval.mv_size); + CRData rdata_key(reinterpret_cast(tkey.mv_data), tkey.mv_size); + + int64_t offset, len; + if (rdata_key.getVarInt(&offset) + && rdata_val.getVarInt(&len)) + { + int64_t start_bit = offset / 4096; + int64_t len_bits = div_up(len, 4096); + + if(bmap.get_range(start_bit, start_bit + len_bits)) + { + XLOG(ERR) << "Range of hole already set offset="<0) + { + int64_t curr_end = (std::min)(data_size, data_file_offset_end); + int64_t start_bit = data_file_offset/4096; + int64_t len_bits = div_up(curr_end - data_file_offset, 4096); + + if(bmap.get_range(start_bit, start_bit + len_bits)) + { + XLOG(ERR) << "Range of data_file_offset+end already set offset=" << std::to_string(data_file_offset) << " len=" << std::to_string(curr_end - data_file_offset); + 
if (!ignore_errors) + ret=false; + } + + bmap.set_range(start_bit, start_bit + len_bits, true); + } + + int64_t free_start = -1; + for (int64_t i = 0; i < bitmap_size; ++i) + { + bool bit_set = bmap.get(i); + + if (!bit_set + && free_start == -1) + { + free_start = i; + } + else if (free_start != -1 + && bit_set) + { + int64_t free_len = i - free_start; + + if (!add_freemap_ext_simple(dst_txn, free_start * 4096, free_len * 4096 - 1, tid)) + { + ret = false; + XLOG(ERR) << "Error adding freemap ext " << std::to_string(free_start) << " len " << std::to_string(free_len); + } + free_start=-1; + } + } + + if (free_start != -1) + { + int64_t free_len = bitmap_size - free_start; + + if (!add_freemap_ext_simple(dst_txn, free_start * 4096, free_len * 4096 - 1, tid)) + { + ret = false; + XLOG(ERR) << "Error adding last freemap ext " << std::to_string(free_start) << " len " << std::to_string(free_len); + } + } + + if(source_txn == dst_txn) + { + return ret; + } + + char ch = 0; + tkey.mv_data = &ch; + tkey.mv_size = 1; + + MDB_val tval; + int64_t tdata; + + tdata = mdb_get_txnid(source_txn); + + tval.mv_data = &tdata; + tval.mv_size = sizeof(tdata); + + rc = mdb_put(dst_txn, dbi_cache_size, &tkey, &tval, 0); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to put cache size in generate_freespace_cache (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to put cache size in generate_freespace_cache (SIGBUS) sfs " << db_path; + ret = false; + } + + return ret; +} + +bool SingleFileStorage::freespace_check(MDB_txn* source_txn, MDB_txn* freespace_txn, bool fast_check) +{ + THREAD_ID tid = gettid(); + + MDB_val tkey; + int rc; + + int64_t data_size = fileSize(data_file.fd()); + int64_t bitmap_size = div_up(data_size, 4096); + + TmpMmapedFileBitmap bmap(bitmap_size, false); + + auto set_extent_bmap_val = [&bmap](MDB_val& tkey, MDB_val& tval) { + CRData rdata(reinterpret_cast(tval.mv_data), tval.mv_size); 
+ + SPunchItem first_ext; + + if (!rdata.getVarInt(&first_ext.offset) + || !rdata.getVarInt(&first_ext.len)) + { + std::string key_name = std::string(reinterpret_cast(tkey.mv_data), tkey.mv_size); + XLOG(ERR) << "Error reading first ext " << decompress_filename(key_name); + return false; + } + + std::vector extents; + + int64_t num_extents = extract_num_exts(first_ext.offset); + + bool has_error = false; + for (int64_t i = 0; i < num_extents; ++i) + { + SPunchItem cfrag; + if (!rdata.getVarInt(&cfrag.offset) + || !rdata.getVarInt(&cfrag.len)) + { + has_error = true; + break; + } + + extents.push_back(cfrag); + } + + if (has_error) + { + std::string key_name = std::string(reinterpret_cast(tkey.mv_data), tkey.mv_size); + XLOG(ERR) << "Error getting extents of " << decompress_filename(key_name) << " in freespace_check"; + return false; + } + + bool ret = true; + for (size_t i = 0; i < extents.size() + 1; ++i) + { + SPunchItem& ext = (i == 0) ? first_ext : extents[i - 1]; + + int64_t start_bit = ext.offset / 4096; + int64_t len_bits = div_up(ext.len, 4096); + + if (bmap.get_range(start_bit, start_bit + len_bits)) + { + std::string key_name = std::string(reinterpret_cast(tkey.mv_data), tkey.mv_size); + XLOG(ERR) << "Range of " << decompress_filename(key_name) << " already set offset=" << std::to_string(ext.offset) << " len=" << std::to_string(ext.len); + ret = false; + } + + bmap.set_range(start_bit, start_bit + len_bits, true); + } + return ret; + }; + + bool ret = true; + + XLOG(INFO) << "Freespace check sfs " << db_path << " setting bitmap using main..."; + + MDB_cursor* it_cursor; + mdb_cursor_open(source_txn, dbi_main, &it_cursor); + MDB_cursor_op op = MDB_FIRST; + + if (fast_check) + { + XLOG(INFO) << "Reading page ids from " << db_path; + + TmpMmapedPgIds page_ids; + + if (!read_pgids(source_txn, dbi_main, tid, page_ids)) + { + XLOG(ERR) << "Reading page ids from " << db_path << " failed"; + mdb_cursor_close(it_cursor); + return false; + } + + XLOG(INFO) << 
"Iterating through " << std::to_string(page_ids.size()) << " pages " << db_path; + + for (size_t* pg_it = page_ids.begin(); pg_it != page_ids.end(); ++pg_it) + { + unsigned int nkeys; + rc = mdb_page_get_nkeys(it_cursor, *pg_it, &nkeys); + + if (rc) + { + XLOG(ERR) << "LMDB: Error getting mdb_page_get_nkeys (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + break; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item in mdb_page_get_nkeys (SIGBUS) sfs " << db_path; + ret = false; + break; + } + + for (unsigned int idx = 0; idx < nkeys; ++idx) + { + MDB_val tval; + rc = mdb_page_get_val(it_cursor, *pg_it, idx, &tkey, &tval); + + if (rc) + { + XLOG(ERR) << "LMDB: Error getting item mdb_page_get_val in main freespace_check (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + break; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item mdb_page_get_val in main freespace_check (SIGBUS) sfs " << db_path; + ret = false; + break; + } + + if (!set_extent_bmap_val(tkey, tval)) + ret = false; + } + + if (!ret) + break; + } + } + else + { + do + { + MDB_val tval; + rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in main freespace_check (" + (std::string)mdb_strerror(rc) + ") sfs " << db_path; + ret = false; + break; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item in main freespace_check (SIGBUS) sfs " << db_path; + ret = false; + break; + } + + if (rc != MDB_NOTFOUND) + { + if (!set_extent_bmap_val(tkey, tval)) + ret = false; + } + } while (rc != MDB_NOTFOUND); + } + + mdb_cursor_close(it_cursor); + + if (!ret) + return ret; + + mdb_cursor_open(source_txn, dbi_old, &it_cursor); + op = MDB_FIRST; + + XLOG(INFO) << "Freespace check sfs " << db_path << " setting bitmap using old..."; + int64_t old_items = 0; + do + { + MDB_val tval; + rc = 
mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in old freespace_check (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + break; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item in old freespace_check (SIGBUS) sfs " << db_path; + ret = false; + break; + } + + if (rc != MDB_NOTFOUND) + { + ++old_items; + CRData rdata(reinterpret_cast(tval.mv_data), tval.mv_size); + + SPunchItem first_ext; + + int64_t transid; + + if (!rdata.getVarInt(&transid) + || !rdata.getVarInt(&first_ext.offset) + || !rdata.getVarInt(&first_ext.len)) + { + std::string key_name = std::string(reinterpret_cast(tkey.mv_data), tkey.mv_size); + XLOG(ERR) << "Error reading first old ext " << decompress_filename(key_name); + ret = false; + continue; + } + + std::vector extents; + + int64_t num_extents = extract_num_exts(first_ext.offset); + + bool has_error = false; + for (int64_t i = 0; i < num_extents; ++i) + { + SPunchItem cfrag; + if (!rdata.getVarInt(&cfrag.offset) + || !rdata.getVarInt(&cfrag.len)) + { + has_error = true; + break; + } + + extents.push_back(cfrag); + } + + if (has_error) + { + std::string key_name = std::string(reinterpret_cast(tkey.mv_data), tkey.mv_size); + XLOG(ERR) << "Error getting old extents of " << decompress_filename(key_name) << " in freespace_check"; + ret = false; + continue; + } + + for (size_t i = 0; i < extents.size() + 1; ++i) + { + SPunchItem& ext = (i == 0) ? 
first_ext : extents[i - 1]; + + int64_t start_bit = ext.offset / 4096; + int64_t len_bits = div_up(ext.len, 4096); + + if (bmap.get_range(start_bit, start_bit + len_bits)) + { + std::string key_name = std::string(reinterpret_cast(tkey.mv_data), tkey.mv_size); + XLOG(ERR) << "Range of " << decompress_filename(key_name) << " already set (old) offset=" << std::to_string(ext.offset) + " len=" << std::to_string(ext.len); + ret = false; + } + + bmap.set_range(start_bit, start_bit + len_bits, true); + } + } + } while (rc != MDB_NOTFOUND); + + mdb_cursor_close(it_cursor); + + XLOG(INFO) << "Freespace check sfs " << db_path << " setting bitmap using holes... (old_items=" << std::to_string(old_items) << ")"; + int64_t n_holes = 0; + mdb_cursor_open(source_txn, dbi_holes, &it_cursor); + op = MDB_FIRST; + + do + { + MDB_val tval; + rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in holes freespace_check (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + break; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item in holes freespace_check (SIGBUS) sfs " << db_path; + ret = false; + break; + } + + if (rc != MDB_NOTFOUND) + { + ++n_holes; + CRData rdata_val(reinterpret_cast(tval.mv_data), tval.mv_size); + CRData rdata_key(reinterpret_cast(tkey.mv_data), tkey.mv_size); + + int64_t offset, len; + if (rdata_key.getVarInt(&offset) + && rdata_val.getVarInt(&len)) + { + int64_t start_bit = offset / 4096; + int64_t len_bits = div_up(len, 4096); + + if (start_bit + len_bits >= bitmap_size) + { + XLOG(ERR) << "Freespace check main offset too large offset " << std::to_string(offset) << " len " << std::to_string(len) << " sfs " << db_path; + ret = false; + continue; + } + + if(bmap.get_range(start_bit, start_bit + len_bits)) + { + XLOG(ERR) << "Range of hole already set offset=" << std::to_string(offset) << " len=" << std::to_string(len); + 
ret=false; + } + + bmap.set_range(start_bit, start_bit + len_bits, true); + } + else + { + XLOG(ERR) << "Error parsing hole data in freespace_check"; + ret = false; + } + } + + } while (rc != MDB_NOTFOUND); + + mdb_cursor_close(it_cursor); + + XLOG(INFO) << "Freespace check sfs " << db_path << " setting bitmap using holes (alt iteration order, n_holes="<(tval.mv_data), tval.mv_size); + CRData rdata_key(reinterpret_cast(tkey.mv_data), tkey.mv_size); + + int64_t offset, len; + if (rdata_key.getVarInt(&offset) + && rdata_val.getVarInt(&len)) + { + int64_t start_bit = offset / 4096; + int64_t len_bits = div_up(len, 4096); + + if (start_bit + len_bits >= bitmap_size) + { + XLOG(ERR) << "Freespace check main offset too large offset " << std::to_string(offset) << " len " << std::to_string(len) << " sfs " << db_path; + ret = false; + continue; + } + + if (!bmap.get_range(start_bit, start_bit + len_bits)) + { + XLOG(ERR) << "Range of hole not already set offset=" << std::to_string(offset) << " len=" << std::to_string(len); + } + + bmap.set_range(start_bit, start_bit + len_bits, true); + } + else + { + XLOG(ERR) << "Error parsing hole data in freespace_check"; + ret = false; + } + } + + } while (rc != MDB_NOTFOUND); + + mdb_cursor_close(it_cursor); + + rc = mdb_set_compare(source_txn, dbi_holes, mdb_cmp_varint); + if (rc) + { + XLOG(ERR) << "Error setting holes comparison function (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + } + + if(data_file_offset_end>0) + { + int64_t curr_end = (std::min)(data_size, data_file_offset_end); + int64_t start_bit = data_file_offset/4096; + int64_t len_bits = div_up(curr_end - data_file_offset, 4096); + + if(bmap.get_range(start_bit, start_bit + len_bits)) + { + XLOG(ERR) << "Range of data_file_offset+end already set offset=" << std::to_string(data_file_offset) << " len=" << std::to_string(curr_end - data_file_offset); + ret=false; + } + + bmap.set_range(start_bit, start_bit + len_bits, true); + } + + + 
mdb_cursor_open(freespace_txn, dbi_free, &it_cursor); + op = MDB_FIRST; + + XLOG(INFO) << "Freespace check sfs " << db_path << " setting bitmap using freespace..."; + do + { + MDB_val tval; + rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in free freespace_check (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + break; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item in free freespace_check (SIGBUS) sfs " << db_path; + ret = false; + break; + } + + if (rc != MDB_NOTFOUND) + { + CRData rdata_val(reinterpret_cast(tval.mv_data), tval.mv_size); + CRData rdata_key(reinterpret_cast(tkey.mv_data), tkey.mv_size); + + int64_t offset, len; + if (rdata_key.getVarInt(&offset) + && rdata_val.getVarInt(&len)) + { + if (div_up(offset + len, 4096) > bitmap_size) + { + XLOG(ERR) << "Freespace check offset too large offset " << std::to_string(offset) << " len " << std::to_string(len) << " sfs " << db_path+" "<" << std::to_string(bitmap_size); + ret = false; + continue; + } + + bool b = bmap.get_range(offset / 4096, div_up(offset + len, 4096)); + if (b) + { + XLOG(ERR) << "Freespace check freespace is set offset " << std::to_string(offset) << " len " << std::to_string(len) << " sfs " << db_path; + ret = false; + } + bmap.set_range(offset / 4096, div_up(offset + len, 4096), true); + + MDB_cursor* it_cursor2; + mdb_cursor_open(freespace_txn, dbi_free_len, &it_cursor2); + + rc = mdb_cursor_get(it_cursor2, &tval, &tkey, MDB_GET_BOTH); + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in free len freespace_check (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + break; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item in free len freespace_check (SIGBUS) sfs " << db_path; + ret = false; + break; + } + + if (rc == MDB_NOTFOUND) + { + XLOG(ERR) << "Freespace check 
freespace item not found in len idx offset " << std::to_string(offset) << " len " << std::to_string(len) << " sfs " << db_path; + ret = false; + } + + mdb_cursor_close(it_cursor2); + } + else + { + XLOG(ERR) << "Error parsing free data in freespace_check"; + ret = false; + } + } + + } while (rc != MDB_NOTFOUND); + + mdb_cursor_close(it_cursor); + + XLOG(INFO) << "Freespace check sfs " << db_path << " checking freespace len idx..."; + + mdb_cursor_open(freespace_txn, dbi_free_len, &it_cursor); + op = MDB_FIRST; + + do + { + MDB_val tval; + rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in freelen freespace_check (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + break; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item in freelen freespace_check (SIGBUS) sfs " << db_path; + ret = false; + break; + } + + if (rc != MDB_NOTFOUND) + { + CRData rdata_val(reinterpret_cast(tval.mv_data), tval.mv_size); + CRData rdata_key(reinterpret_cast(tkey.mv_data), tkey.mv_size); + + int64_t offset, len; + if (rdata_key.getVarInt(&len) + && rdata_val.getVarInt(&offset)) + { + MDB_val tval_out; + rc = mdb_get(freespace_txn, dbi_free, &tval, &tval_out); + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in free len free freespace_check (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + break; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item in free len free freespace_check (SIGBUS) sfs " << db_path; + ret = false; + break; + } + + CRData rdata_tval_out(reinterpret_cast(tval_out.mv_data), tval_out.mv_size); + + int64_t tval_len; + if (rc == MDB_NOTFOUND + || !rdata_tval_out.getVarInt(&tval_len) + || tval_len!=len) + { + XLOG(ERR) << "Freespace check free len item not found in free offset " << std::to_string(offset) << " len " << std::to_string(len) << " sfs " << 
db_path; + ret = false; + } + } + else + { + XLOG(ERR) << "Error parsing free data in freespace_check"; + ret = false; + } + } + + } while (rc != MDB_NOTFOUND); + + XLOG(INFO) << "Freespace check sfs " << db_path << " checking bitmap..."; + + int64_t notset_size = 0; + int64_t notset_start=-1; + for (size_t i = 0; i < bitmap_size; ++i) + { + bool bset=bmap.get(i); + if (!bset + && notset_start==-1) + { + notset_start = i; + } + else if(bset + && notset_start!=-1) + { + int64_t len = i - notset_start; + XLOG(ERR) << "Freespace check sfs " << db_path << " bits not set start " << std::to_string(notset_start * 4096)<< + " len " << std::to_string(len*4096) << " ("+folly::prettyPrint(len*4096, folly::PRETTY_BYTES_IEC)+") "<< + " ("+std::to_string(i)<<"/"< disk_id_size + && fn[disk_id_size]=='#') + return fn.substr(disk_id_size + 1); + else + return fn; + } + + + if (fn[0] == 1) + { + CRData rdata(fn.data(), fn.size()); + + char ch; + int64_t e_disk_id; + int64_t transid; + std::string nameb; + + if (!rdata.getChar(&ch) + || !rdata.getVarInt(&e_disk_id) + || !rdata.getVarInt(&transid) + || !rdata.getStr2(&nameb)) + { + return fn; + } + + CWData compr; + compr.addChar(0); + compr.addVarInt(transid); + compr.addString2(nameb); + + return std::string(compr.getDataPtr(), compr.getDataSize()); + } + else if(fn[0]==3) + { + CRData rdata(fn.data(), fn.size()); + + char ch; + int64_t e_disk_id; + int64_t clouddrive_id; + int64_t transid; + std::string nameb; + + if (!rdata.getChar(&ch) + || !rdata.getVarInt(&e_disk_id) + || !rdata.getVarInt(&clouddrive_id) + || !rdata.getVarInt(&transid) + || !rdata.getStr2(&nameb)) + { + return fn; + } + + CWData compr; + compr.addChar(2); + compr.addVarInt(clouddrive_id); + compr.addVarInt(transid); + compr.addString2(nameb); + + return std::string(compr.getDataPtr(), compr.getDataSize()); + } + else + { + abort(); + } +} + +class Buckets +{ +public: + Buckets() + :curr_bucket_div(4096) {} + + void add(int64_t len) + { + if (len%curr_bucket_div 
!= 0) + { + len /= curr_bucket_div; + ++len; + } + else + { + len /= curr_bucket_div; + } + + ++buckets[len]; + + if (buckets.size() > 40) + { + rebuild(); + } + } + + std::string hist() + { + std::string ret; + + for (auto &it : buckets) + { + ret += folly::prettyPrint(it.first*curr_bucket_div, folly::PRETTY_BYTES_IEC) + ": " + std::to_string(it.second) + " exts\n"; + } + + return ret; + } + +private: + void rebuild() + { + curr_bucket_div *= 2; + + std::map new_buckets; + for (auto& it : buckets) + { + int64_t nval; + if (it.first % 2 == 0) + { + nval /= 2; + } + else + { + nval /= 2; + ++nval; + } + new_buckets[it.first / 2] += it.second; + } + + buckets = new_buckets; + } + + std::map buckets; + int64_t curr_bucket_div; +}; + + +std::string SingleFileStorage::freespace_stats() +{ + + MDB_txn* txn; + int rc = mdb_txn_begin(cache_db_env!=nullptr ? cache_db_env : db_env, NULL, MDB_RDONLY, &txn); + if (rc) + { + XLOG(ERR) << "LMDB: Error starting transaction for read (" << mdb_strerror(rc) << ") sfs " << db_path; + mdb_txn_abort(txn); + return std::string(); + } + + rc = mdb_set_compare(txn, dbi_free, mdb_cmp_varint); + if (rc) + { + XLOG(ERR) << "LMDB: Error setting compare func (" << mdb_strerror(rc) << ") sfs " << db_path; + mdb_txn_abort(txn); + return std::string(); + } + + Buckets buckets; + + int64_t prev_offset = -1; + int64_t prev_len; + int64_t total_len = 0; + int64_t total_count = 0; + + MDB_cursor* it_cursor; + mdb_cursor_open(txn, dbi_free, &it_cursor); + MDB_cursor_op op = MDB_FIRST; + do + { + MDB_val tkey; + MDB_val tval; + rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in freemap freespace_stats (" << mdb_strerror(rc) << ") sfs " << db_path; + } + + if (rc != MDB_NOTFOUND) + { + CRData dkey(reinterpret_cast(tkey.mv_data), tkey.mv_size); + CRData dval(reinterpret_cast(tval.mv_data), tval.mv_size); + + int64_t cstart; + int64_t clen; + if 
(dkey.getVarInt(&cstart) + && dval.getVarInt(&clen)) + { + assert(dkey.getStreampos() == dkey.getSize()); + assert(dval.getStreampos() == dval.getSize()); + + buckets.add(clen); + total_len += clen; + ++total_count; + + assert(prev_offset == -1 || prev_offset + prev_len < cstart); + + prev_offset = cstart; + prev_len = clen; + + } + else + { + assert(false); + } + } + } while (rc != MDB_NOTFOUND); + + mdb_cursor_close(it_cursor); + + mdb_txn_abort(txn); + + if (total_count == 0) + total_count = 1; + + std::string total_str = "Total: " + folly::prettyPrint(total_len, folly::PRETTY_BYTES_IEC) + " " + std::to_string(total_count) + " exts. Avg size " + folly::prettyPrint(static_cast(total_len) / total_count, folly::PRETTY_BYTES_IEC)+"\n"; + + return "#### "+ db_path + " #####\n"+ buckets.hist() + total_str + "\n"; +} + +void SingleFileStorage::wait_for_startup_finish() +{ + std::unique_lock lock(mutex); + + wait_startup_finished(lock); +} + +SingleFileStorage::SFragInfo SingleFileStorage::get_frag_info(MDB_txn* txn, const std::string & fn) +{ + THREAD_ID tid = gettid(); + + bool tear_down_txn = false; + if (txn == nullptr) + { + tear_down_txn = true; + + setup_mmap_read_error(tid); + + int rc = mdb_txn_begin(db_env, NULL, MDB_RDONLY, &txn); + if (rc) + { + XLOG(ERR) << "LMDB: Error starting transaction for read (" << mdb_strerror(rc) << ") txn=" << std::to_string(reinterpret_cast(txn)) << " sfs " << db_path; + clear_mmap_read_error(tid); + return SFragInfo(); + } + } + + MDB_val tkey; + tkey.mv_data = const_cast(&fn[0]); + tkey.mv_size = fn.size(); + + MDB_val tvalue; + int rc = mdb_get(txn, dbi_main, &tkey, &tvalue); + + if (tear_down_txn) + { + if (clear_mmap_read_error(tid)) + { + XLOG(ERR) << "LMDB: Error getting fragment info because of SIGBUS txn=" << std::to_string(reinterpret_cast(txn)) << " sfs " << db_path; + mdb_txn_abort(txn); + return SFragInfo(); + } + } + else + { + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting fragment info 
because of SIGBUS txn = " << std::to_string(reinterpret_cast(txn)) << " sfs " << db_path; + if (tear_down_txn) + { + mdb_txn_abort(txn); + } + return SFragInfo(); + } + } + + if (rc) + { + if (rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting fragment info (" << mdb_strerror(rc) << ") txn=" << std::to_string(reinterpret_cast(txn)) << " sfs " << db_path; + } + if (tear_down_txn) + { + mdb_txn_abort(txn); + } + return SFragInfo(); + } + + CRData rdata(reinterpret_cast(tvalue.mv_data), tvalue.mv_size); + + SFragInfo ret; + + if (!rdata.getVarInt(&ret.offset) + || !rdata.getVarInt(&ret.len)) + { + XLOG(ERR) << "LMDB: Error extracting offset and len from fragment info txn=" << std::to_string(reinterpret_cast(txn)) << " sfs " << db_path; + ret = SFragInfo(); + } + else + { + int64_t exts = extract_num_exts(ret.offset); + for (int64_t i = 0; i < exts; ++i) + { + SPunchItem ext; + if (!rdata.getVarInt(&ext.offset) + || !rdata.getVarInt(&ext.len)) + { + XLOG(ERR) << "LMDB: Error extracting ext " << std::to_string(i) << " offset and len from fragment info txn=" << std::to_string(reinterpret_cast(txn)) << " sfs " << db_path; + ret = SFragInfo(); + break; + } + ret.extra_exts.push_back(ext); + } + + if (ret.offset != -1) + { + rdata.getVarInt(&ret.last_modified); + } + } + + if (tear_down_txn) + { + mdb_txn_abort(txn); + } + + return ret; +} + +bool SingleFileStorage::generate_free_len_idx(MDB_txn * txn) +{ + THREAD_ID tid = gettid(); + MDB_cursor* it_cursor; + mdb_cursor_open(txn, dbi_free, &it_cursor); + MDB_cursor_op op = MDB_FIRST; + int rc; + bool ret = true; + do + { + MDB_val tkey; + MDB_val tval; + rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in freemap startup (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + break; + } + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item in freemap startup (SIGBUS) sfs " << 
db_path; + ret = false; + break; + } + + if (rc != MDB_NOTFOUND) + { + CRData dkey(reinterpret_cast(tkey.mv_data), tkey.mv_size); + CRData dval(reinterpret_cast(tval.mv_data), tval.mv_size); + + int64_t cstart; + int64_t clen; + if (dkey.getVarInt(&cstart) + && dval.getVarInt(&clen) + && dkey.getStreampos() == dkey.getSize() + && dval.getStreampos() == dval.getSize()) + { + rc = mdb_put(txn, dbi_free_len, &tval, &tkey, 0); + + if (rc) + { + XLOG(ERR) << "Error adding freemap idx item (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + break; + } + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "Error adding freemap idx item (SIGBUS) sfs " << db_path; + ret = false; + break; + } + } + else + { + XLOG(WARN) << "Deleting invalid freemap extent sfs " << db_path; + mdb_cursor_del(it_cursor, 0); + } + } + } while (rc != MDB_NOTFOUND); + + mdb_cursor_close(it_cursor); + + return ret; +} + +int64_t SingleFileStorage::get_disk_id(MDB_txn* txn, THREAD_ID tid, const std::string & uuid) +{ + MDB_val val; + val.mv_data = const_cast(uuid.data()); + val.mv_size = uuid.size(); + + MDB_val disk_id_out; + + int rc = mdb_get(txn, dbi_size, &val, &disk_id_out); + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting data file max size (" << mdb_strerror(rc) << ")"; + return 0; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting data file max size (SIGBUS)"; + return 0; + } + + int64_t ret = 0; + if (rc != MDB_NOTFOUND + && disk_id_out.mv_size==sizeof(int64_t)) + { + memcpy(&ret, disk_id_out.mv_data, sizeof(ret)); + } + + if (ret==0) + { + ret = next_disk_id; + ++next_disk_id; + + MDB_val tkey; + char ch = 1; + tkey.mv_data = &ch; + tkey.mv_size = 1; + + MDB_val tval; + + tval.mv_data = &next_disk_id; + tval.mv_size = sizeof(next_disk_id); + + rc = mdb_put(txn, dbi_size, &tkey, &tval, 0); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to put next_disk_id in commit (" << mdb_strerror(rc) << ") sfs " << db_path; + return 
0; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to put next_disk_id in commit (SIGBUS) sfs " << db_path; + return 0; + } + + tkey.mv_data = const_cast(uuid.data()); + tkey.mv_size = uuid.size(); + + tval.mv_data = &ret; + tval.mv_size = sizeof(ret); + + rc = mdb_put(txn, dbi_size, &tkey, &tval, 0); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to put next_disk_id in commit (" << mdb_strerror(rc) << ") sfs " << db_path; + return 0; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to put next_disk_id in commit (SIGBUS) sfs " << db_path; + return 0; + } + } + + return ret; +} + +int64_t SingleFileStorage::get_disk_trans_id(MDB_txn * txn, THREAD_ID tid, int64_t disk_id) +{ + MDB_val val; + val.mv_data = &disk_id; + val.mv_size = sizeof(disk_id); + + MDB_val disk_id_out; + + int rc = mdb_get(txn, dbi_size, &val, &disk_id_out); + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting disk trans id (" << mdb_strerror(rc) << ")"; + return -1; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting disk trans id (SIGBUS)"; + return -1; + } + + if (rc != MDB_NOTFOUND + && disk_id_out.mv_size == sizeof(int64_t)) + { + int64_t ret; + memcpy(&ret, disk_id_out.mv_data, sizeof(ret)); + return ret; + } + else if (rc == MDB_NOTFOUND) + { + return 0; + } + + return -1; +} + +bool SingleFileStorage::set_disk_trans_id(MDB_txn * txn, THREAD_ID tid, int64_t disk_id, int64_t trans_id) +{ + MDB_val tkey; + tkey.mv_data = &disk_id; + tkey.mv_size = sizeof(disk_id); + + MDB_val tval; + tval.mv_data = &trans_id; + tval.mv_size = sizeof(trans_id); + + int rc = mdb_put(txn, dbi_size, &tkey, &tval, 0); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to put disk trans id in commit (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to put disk trans id in commit (SIGBUS) sfs " << db_path; + return false; + } + + return 
true; +} + +bool SingleFileStorage::rewrite_npages(MDB_txn* txn, MDB_cursor* mc, THREAD_ID tid, size_t npages) +{ + for (size_t i = 0; i < npages; ++i) + { + size_t pgno; + int rc = mdb_cursor_next_leaf_page(mc, &pgno); + + if (rc == MDB_NOTFOUND) + return true; + + if (rc) + { + XLOG(ERR) << "LMDB: Failed mdb_cursor_next_leaf_page in rewrite_npages (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed mdb_cursor_next_leaf_page in rewrite_npages (SIGBUS) sfs " << db_path; + return false; + } + + int dirty; + if (mdb_page_is_dirty(mc, pgno, &dirty) == MDB_SUCCESS && + !dirty) + { + MDB_val tkey; + MDB_val tval; + rc = mdb_cursor_get(mc, &tkey, &tval, MDB_GET_CURRENT); + + if (rc) + { + XLOG(ERR) << "LMDB: Failed mdb_cursor_get in rewrite_npages (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed mdb_cursor_get in rewrite_npages (SIGBUS) sfs " << db_path; + return false; + } + + rc = mdb_cursor_put(mc, &tkey, &tval, MDB_CURRENT); + + if (rc) + { + XLOG(ERR) << "LMDB: Failed mdb_cursor_put in rewrite_npages (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed mdb_cursor_put in rewrite_npages (SIGBUS) sfs " << db_path; + return false; + } + } + } + + return true; +} + +int SingleFileStorage::put_with_rewrite(MDB_txn* txn, MDB_dbi dbi, MDB_val* tkey, MDB_val* tval, THREAD_ID tid, size_t npages) +{ + MDB_cursor* mc; + int rc = mdb_cursor_open(txn, dbi, &mc); + + if (rc) + { + XLOG(ERR) << "LMDB: Failed mdb_cursor_open in put_with_rewrite (" << mdb_strerror(rc) << ") sfs " << db_path; + return rc; + } + + SCOPE_EXIT { mdb_cursor_close(mc); }; + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed mdb_cursor_open in put_with_rewrite (SIGBUS) sfs " << db_path; + return rc; + } + + rc = mdb_cursor_put(mc, 
tkey, tval, 0); + + if (rc) + { + XLOG(ERR) << "LMDB: Failed mdb_cursor_open in put_with_rewrite (" << mdb_strerror(rc) << ") sfs " << db_path; + return rc; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed mdb_cursor_open in put_with_rewrite (SIGBUS) sfs " << db_path; + return rc; + } + + if (with_rewrite) + rewrite_npages(txn, mc, tid, npages); + + return MDB_SUCCESS; +} + +void SingleFileStorage::operator()() +{ + int64_t curr_write_ext_start; + int64_t curr_write_ext_end; + { + std::scoped_lock lock(datafileoffset_mutex); + curr_write_ext_start = data_file_offset; + curr_write_ext_end = data_file_offset_end; + } + + THREAD_ID tid = gettid(); + setup_mmap_read_error(tid); + + int64_t commit_errors = 0; + MDB_txn* txn; + int rc = mdb_txn_begin(db_env, NULL, 0, &txn); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to open transaction handle for commit (1) (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to open transaction handle for commit (1) (SIGBUS) sfs " << db_path; + ++commit_errors; + } + + XLOG(INFO) << "Initial txn id " << std::to_string(mdb_get_txnid(txn)); + + MDB_txn* freespace_txn; + + if (cache_db_env != nullptr) + { + rc = mdb_txn_begin(cache_db_env, NULL, 0, &freespace_txn); + + if (rc) + { + XLOG(ERR) << "LMDB: Failed to open transaction handle for cache_db commit (1) (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to open transaction handle for cache_db commit (1) (SIGBUS) sfs " << db_path; + ++commit_errors; + } + XLOG(INFO) << "Using seperate freespace cache"; + + rc = mdb_dbi_open(freespace_txn, "free", 0, &dbi_free); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to open free dbi cache_db commit (1) (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to 
open free dbi for cache_db commit (1) (SIGBUS) sfs " << db_path; + ++commit_errors; + } + + rc = mdb_dbi_open(freespace_txn, "free_len", 0, &dbi_free_len); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to open free len dbi cache_db commit (1) (" << (std::string)mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to open free len dbi for cache_db commit (1) (SIGBUS) sfs " << db_path; + ++commit_errors; + } + + rc = mdb_dbi_open(freespace_txn, "size", 0, &dbi_cache_size); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to open cache size dbi cache_db commit (1) (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to open cache size dbi for cache_db commit (1) (SIGBUS) sfs " << db_path; + ++commit_errors; + } + + rc = mdb_txn_commit(freespace_txn); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to commit freespace_txn beg (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to commit freespace_txn beg (SIGBUS) sfs " << db_path; + ++commit_errors; + } + + rc = mdb_txn_begin(cache_db_env, NULL, 0, &freespace_txn); + + if (rc) + { + XLOG(ERR) << "LMDB: Failed to open transaction handle for cache_db commit (3) (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to open transaction handle for cache_db commit (3) (SIGBUS) sfs " << db_path; + ++commit_errors; + } + } + else + { + XLOG(INFO) << "Not having seperate freespace cache"; + freespace_txn = txn; + } + + rc = mdb_set_compare(freespace_txn, dbi_free, mdb_cmp_varint); + if (rc) + { + XLOG(ERR) << "Error setting free comparison function (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + + rc = mdb_set_compare(freespace_txn, dbi_free_len, mdb_cmp_varint_rev); + if (rc) + 
{ + XLOG(ERR) << "Error setting free_len comparison function (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + + rc = mdb_set_compare(txn, dbi_holes, mdb_cmp_varint); + if (rc) + { + XLOG(ERR) << "Error setting holes comparison function (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + + if (!regen_freespace_cache + && cache_db_env == nullptr + && std::filesystem::exists("/etc/urbackup/regen_free_space_cache")) + { + regen_freespace_cache = true; + + XLOG(WARN) <<" Clearing main freespace cache (manual regen)..."; + + if (!clear_freespace_cache(txn)) + { + XLOG(ERR) << "Error clearing main freespace cache"; + ++commit_errors; + } + } + + if (regen_freespace_cache) + { + data_file_free=0; + if (!generate_freespace_cache(txn, freespace_txn, false)) + { + XLOG(ERR) << "Generating freespace cache failed"; + + if (cache_db_env == nullptr) + { + abort(); + } + + ++commit_errors; + } + } + + if (freespace_txn != txn + && std::filesystem::exists("/etc/urbackup/check_free_space_cache") ) + { + if (!freespace_check(txn, freespace_txn, true)) + { + XLOG(ERR) << "Freespace check failed"; + ++commit_errors; + } + } + + if (regen_freespace_cache + && cache_db_env!=nullptr + && commit_errors == 0) + { + XLOG(INFO) << "Clearing main freespace cache..."; + + if (!clear_freespace_cache(txn)) + { + XLOG(ERR) << "Error clearing main freespace cache"; + ++commit_errors; + } + } + + { + char ch = dbi_size_info_ext_freespace; + MDB_val key; + key.mv_data = &ch; + key.mv_size = 1; + + if (cache_db_env == nullptr) + { + XLOG(INFO) << "Ext freespace disabled"; + rc = mdb_del(txn, dbi_size, &key, NULL); + + if (rc && rc!=MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Failed to mdb_del dbi_size_info_ext_freespace (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to mdb_del dbi_size_info_ext_freespace (SIGBUS) sfs " << db_path; + ++commit_errors; + } + } + else + { + 
int64_t ival = sync_freespace_cache ? 1 : 0; + + if(sync_freespace_cache) + { + XLOG(INFO) << "Ext freespace sync"; + } + else + { + XLOG(INFO) << "Ext freespace nosync"; + } + + MDB_val val; + val.mv_data = &ival; + val.mv_size = sizeof(ival); + + rc = mdb_put(txn, dbi_size, &key, &val, 0); + + if (rc) + { + XLOG(ERR) << "LMDB: Failed to mdb_put dbi_size_info_ext_freespace (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to mdb_put dbi_size_info_ext_freespace (SIGBUS) sfs " << db_path; + ++commit_errors; + } + } + } + + if (data_file_free == 0) + { + if(!regen_datafile_free(freespace_txn)) + { + ++commit_errors; + } + } + + if(!regen_free_len_idx(freespace_txn)) + { + ++commit_errors; + } + + if (std::filesystem::exists("/etc/urbackup/sfs_delete_holes")) + { + commit_errors += reset_holes(txn, freespace_txn, tid); + } + + std::unique_lock lock(mutex); + + commit_errors += reset_del_log_fn(txn, freespace_txn, tid, 0, curr_transid); + + commit_errors += reset_del_queue(txn, freespace_txn, tid, 0, curr_transid); + + startup_finished = true; + + lock.unlock(); + + if (!do_free_minspace(txn, freespace_txn, tid)) + { + XLOG(ERR) << "Freeing minspace failed"; + ++commit_errors; + } + + lock.lock(); + + size_t mod_items = 0; + + while (!do_quit + && !is_dead) + { + if (commit_errors > 0) + { + write_offline = true; + do_stop_on_error(); + } + + int first_wait = 0; + while (commit_queue.empty() + && (commit_background_queue.empty() || mdb_curr_sync ) + && !do_quit) + { + if (first_wait == 0) + { + first_wait = 1; + cond.wait_for(lock, 10s); + } + else if (first_wait == 1) + { + lock.unlock(); + first_wait = 2; + //TODO: Server->mallocFlushTcache(); + lock.lock(); + } + else + { + cond.wait(lock); + } + } + + if (do_quit + || is_dead) + { + break; + } + + SFragInfo frag_info; + /*if (mod_items > 100000) + { + frag_info.action = FragAction::Commit; + mod_items = 0; + } + else + {*/ + 
if (commit_queue.empty()) + { + assert(!commit_background_queue.empty()); + frag_info = commit_background_queue.front(); + commit_background_queue.pop_front(); + } + else + { + frag_info = commit_queue.front(); + commit_queue.pop_front(); + } + //} + + if (frag_info.action == FragAction::Commit + && is_defragging) + { + XLOG(INFO) <<"Waiting for defrag to restart before commiting..."; + defrag_restart = 1; + bool postpone_commit = false; + while (defrag_restart==1 + && is_defragging + && !postpone_commit) + { + lock.unlock(); + std::this_thread::sleep_for(100ms); + lock.lock(); + + //Reorder getting free extent -- otherwise there might be deadlocks + for (auto it=commit_queue.begin();it!=commit_queue.end();++it) + { + if (it->action == FragAction::FindFree + && it->len == 1) + { + XLOG(WARN) << "Found getting free extent during waiting for defrag to restart for commit. Getting free exitent first..."; + postpone_commit = true; + commit_queue.push_front(frag_info); + frag_info = *it; + commit_queue.erase(it); + break; + } + } + } + } + else if (frag_info.action == FragAction::Commit) + { + defrag_restart = 0; + } + + lock.unlock(); + + if (frag_info.action == FragAction::Commit + && frag_info.offset != -1) + { + int64_t disk_id = frag_info.len; + if (disk_id == 0) + curr_transid = frag_info.offset; + else + set_disk_trans_id(txn, tid, disk_id, frag_info.offset); + } + + if (frag_info.action == FragAction::Add + || frag_info.action == FragAction::AddNoDelOld + || frag_info.action == FragAction::Del + || frag_info.action == FragAction::DelOld + || frag_info.action == FragAction::RestoreOld + || frag_info.action==FragAction::DelWithQueued) + { + ++mod_items; + --commit_items[std::hash()(frag_info.fn)]; + } + + if (frag_info.action == FragAction::FindFree + || frag_info.action == FragAction::Del + || frag_info.action == FragAction::DelWithQueued) + { + mod_items += 2; + } + + if (frag_info.action == FragAction::Commit ) + { + if (frag_info.md5sum == "reset" + && 
commit_errors>0) + { + XLOG(ERR) << "Resetting commit errors (" << std::to_string(commit_errors) << ") by resetting (abort+begin) transaction"; + + mdb_txn_abort(txn); + if (freespace_txn == txn) + { + freespace_txn = nullptr; + } + txn = nullptr; + + commit_errors = 0; + + int rc = mdb_txn_begin(db_env, NULL, 0, &txn); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to open transaction handle for commit (2) (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to open transaction handle for commit (2) (SIGBUS) sfs " << db_path; + ++commit_errors; + } + + if (cache_db_env == nullptr) + { + freespace_txn = txn; + } + else + { + mdb_txn_abort(freespace_txn); + freespace_txn = nullptr; + + int rc = mdb_txn_begin(cache_db_env, NULL, 0, &freespace_txn); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to open transaction handle for freespace commit (2) (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to open transaction handle for freespace commit (2) (SIGBUS) sfs " << db_path; + ++commit_errors; + } + } + + commit_errors += reset_del_queue(txn, freespace_txn, tid, 0, curr_transid); + } + + if (curr_new_free_extents.size() < 1000 || + std::filesystem::exists("/var/urbackup/sfs_always_run_do_free_minspace") ) + { + if (!do_free_minspace(txn, freespace_txn, tid)) + { + XLOG(ERR) << "Freeing minspace failed (2)"; + ++commit_errors; + } + } + + if (folly::fsyncNoInt(data_file.fd())!=0) + { + XLOG(ERR) << "Failed to sync data file " << data_file_path << ". " << folly::errnoStr(errno); + ++commit_errors; + } + + if (new_data_file.fd() != -1 + && folly::fsyncNoInt(new_data_file.fd())!=0) + { + XLOG(ERR) << "Failed to sync new data file " << (data_file_path.parent_path() / "new_data") << ". 
" << folly::errnoStr(errno); + ++commit_errors; +#ifndef _WIN32 + posix_fadvise64(new_data_file.fd(), 0, 0, POSIX_FADV_DONTNEED); +#endif + } + +#ifndef _WIN32 + posix_fadvise64(data_file.fd(), 0, 0, POSIX_FADV_DONTNEED); +#endif + size_t num_reserved_extents; + { + std::vector local_reserved_extents; + { + std::scoped_lock lock(datafileoffset_mutex); + local_reserved_extents.reserve(reserved_extents.size()); + for(const auto& it: reserved_extents) + { + if(it.first + it.second <= data_file_max_size) + local_reserved_extents.emplace_back(SPunchItem(it.first, it.second)); + } + } + + num_reserved_extents = local_reserved_extents.size(); + + size_t ext_idx = 0; + for(const auto& reserved_extent: local_reserved_extents) + { + commit_errors += add_tmp(ext_idx, txn, tid, reserved_extent.offset, reserved_extent.len); + ++ext_idx; + } + } + + + if (curr_write_ext_start != -1) + { + MDB_val tkey; + char ch = dbi_size_info_size; + tkey.mv_data = &ch; + tkey.mv_size = 1; + + MDB_val tval; + int64_t tdata[5]; + + tdata[0] = data_file_max_size; + tdata[1] = curr_write_ext_start; + tdata[2] = curr_write_ext_end; + tdata[3] = data_file_free; + tdata[4] = curr_transid; + + tval.mv_data = tdata; + tval.mv_size = sizeof(tdata); + + rc = mdb_put(txn, dbi_size, &tkey, &tval, 0); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to put max size in commit (" << mdb_strerror(rc) << ") sfs "< 0) + { + MDB_val tkey; + char ch = 0; + tkey.mv_data = &ch; + tkey.mv_size = dbi_size_info_migration; + + MDB_val tval; + tval.mv_data = &data_file_copy_done; + tval.mv_size = sizeof(data_file_copy_done); + + rc = mdb_put(txn, dbi_size, &tkey, &tval, 0); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to put data_file_copy_done in commit (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to put data_file_copy_done in commit (SIGBUS) sfs " << db_path; + ++commit_errors; + } + + if (commit_errors == 0) + { + 
new_data_file_copy_done = data_file_copy_done; + } + } + } + + if (commit_errors > 0) + { + write_offline = true; + do_stop_on_error(); + } + + bool commit_ok = true; + if (!write_offline) + { + bool commit_freespace = true; + if (sync_freespace_cache + && cache_db_env!=nullptr + && commit_freespace) + { + MDB_val main_txnid_key; + char ch = dbi_size_info_size; + main_txnid_key.mv_data = &ch; + main_txnid_key.mv_size = 1; + + MDB_val main_txnid_val; + int64_t main_txnid = mdb_get_txnid(txn); + main_txnid_val.mv_data = &main_txnid; + main_txnid_val.mv_size = sizeof(main_txnid); + + rc = mdb_put(freespace_txn, dbi_cache_size, + &main_txnid_key, &main_txnid_val, 0); + + if (rc) + { + XLOG(ERR) << "LMDB: Failed to put main txnid (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + commit_ok = false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to put main txnid (SIGBUS) sfs " << db_path; + ++commit_errors; + commit_ok = false; + } + + rc = mdb_txn_commit(freespace_txn); + + if (rc) + { + XLOG(ERR) << "LMDB: Failed to commit freespace (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + commit_ok = false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to commit freespace (SIGBUS) sfs " << db_path; + ++commit_errors; + commit_ok = false; + } + + freespace_txn = nullptr; + + int rc = mdb_txn_begin(cache_db_env, NULL, 0, &freespace_txn); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to open transaction handle for freespace commit (2) (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to open transaction handle for freespace commit (2) (SIGBUS) sfs " << db_path; + ++commit_errors; + } + } + + rc = mdb_txn_commit(txn); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to commit (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + commit_ok = false; + } + if (has_mmap_read_error_reset(tid)) + { + 
XLOG(ERR) << "LMDB: Failed to commit (SIGBUS) sfs " << db_path; + ++commit_errors; + commit_ok = false; + } + txn = nullptr; + + if (commit_ok) + { + auto active_fn = data_file_path.parent_path() / "active"; + if (!std::filesystem::exists(active_fn)) + { + int fd = open(active_fn.c_str(), O_WRONLY | O_CREAT | O_CLOEXEC); + if (fd !=-1) + { + folly::fsyncNoInt(fd); + os_sync(active_fn.parent_path().string()); + } + } + } + } + else + { + commit_ok = false; + } + + if(commit_ok) + { + for(size_t ext_idx =0;ext_idx 0) + commit_ok = false; + } + } + + if (commit_ok) + { + curr_new_free_extents.clear(); + + if (new_data_file_copy_done > 0) + { + std::scoped_lock copy_lock(data_file_copy_mutex); + data_file_copy_done_sync = new_data_file_copy_done; + } + } + else + { + write_offline = true; + do_stop_on_error(); + } + + bool has_commit_info = false; + { + std::scoped_lock llock(mutex); + if (frag_info.commit_info != nullptr) + { + frag_info.commit_info->commit_errors = commit_errors; + frag_info.commit_info->commit_done.notify_all(); + has_commit_info = true; + } + + if (is_defragging) + { + XLOG(INFO) << "Defrag ctr incremented. 
Defrag can continue..."; + defrag_restart=0; + } + } + + if (has_commit_info + && commit_ok) + { + commit_errors = 0; + } + + if (commit_ok) + { + int rc = mdb_txn_begin(db_env, NULL, 0, &txn); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to open transaction handle for commit (2) (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to open transaction handle for commit (2) (SIGBUS) sfs " << db_path; + ++commit_errors; + } + + if (cache_db_env == nullptr) + freespace_txn = txn; + + XLOG(INFO) << "After commit txn id " + std::to_string(mdb_get_txnid(txn)); + + rc = mdb_set_compare(freespace_txn, dbi_free, mdb_cmp_varint); + if (rc) + { + XLOG(ERR) << "Error setting free comparison function (2) (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + + rc = mdb_set_compare(freespace_txn, dbi_free_len, mdb_cmp_varint_rev); + if (rc) + { + XLOG(ERR) << "Error setting free_len comparison function (2) (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + + rc = mdb_set_compare(txn, dbi_holes, mdb_cmp_varint); + if (rc) + { + XLOG(ERR) << "Error setting holes comparison function (2) (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + + + for(auto it=commit_items.begin();it!=commit_items.end();) + { + if(it->second==0) + { + auto it_prev=it; + ++it; + commit_items.erase(it_prev); + } + else + { + ++it; + } + } + } + } + else if (frag_info.action == FragAction::ResetDelLog) + { + int64_t disk_id = frag_info.last_modified; + int64_t reset_transid = frag_info.offset; + + commit_errors+=reset_del_log_fn(txn, freespace_txn, tid, disk_id, reset_transid); + + if (frag_info.commit_info != nullptr) + { + std::scoped_lock llock(mutex); + frag_info.commit_info->commit_errors = commit_errors; + frag_info.commit_info->commit_done.notify_all(); + } + } + else if (frag_info.action == FragAction::GetDiskId) + { + int64_t disk_id = get_disk_id(txn, tid, 
frag_info.fn); + + if (disk_id == 0) + ++commit_errors; + + if (frag_info.commit_info != nullptr) + { + std::scoped_lock llock(mutex); + frag_info.commit_info->commit_errors = commit_errors; + frag_info.commit_info->new_datafile_offset = disk_id; + frag_info.commit_info->commit_done.notify_all(); + } + } + else if (frag_info.action == FragAction::EmptyQueue) + { + if (frag_info.commit_info != nullptr) + { + std::scoped_lock llock(mutex); + frag_info.commit_info->commit_errors = 0; + frag_info.commit_info->commit_done.notify_all(); + } + } + else if (frag_info.action == FragAction::ReadFragInfo) + { + if (frag_info.commit_info != nullptr) + { + frag_info.commit_info->commit_errors = 0; + + std::scoped_lock llock(mutex); + + if (frag_info.commit_info->frag_info != nullptr) + { + *frag_info.commit_info->frag_info = get_frag_info(txn, frag_info.fn); + } + + frag_info.commit_info->commit_done.notify_all(); + } + } + else if (frag_info.action == FragAction::Del + || frag_info.action == FragAction::DelOld + || frag_info.action == FragAction::DelWithQueued) + { + bool del_old = frag_info.action == FragAction::DelOld; + commit_errors += remove_fn(frag_info.fn, txn, freespace_txn, true, del_old, tid); + + if (frag_info.action == FragAction::DelWithQueued) + { + commit_errors += unqueue_del(frag_info.fn, txn, tid); + } + } + else if (frag_info.action == FragAction::AssertDelQueueEmpty) + { + MDB_cursor* it_cursor; + mdb_cursor_open(txn, dbi_queue_del, &it_cursor); + + SCOPE_EXIT { mdb_cursor_close(it_cursor); }; + MDB_val tkey; + MDB_val tval; + rc = mdb_cursor_get(it_cursor, &tkey, &tval, MDB_FIRST); + + if (rc && rc != MDB_NOTFOUND) + { + ++commit_errors; + } + else if(rc!=MDB_NOTFOUND) + { + CRData rdata(reinterpret_cast(tval.mv_data), tval.mv_size); + int64_t transid = 0; + rdata.getVarInt(&transid); + std::string fn = std::string(reinterpret_cast(tkey.mv_data), tkey.mv_size); + XLOG(ERR) << "Found item " << decompress_filename(fn) << " transid " + << 
std::to_string(transid) << " in del queue. Abort()"; + abort(); + } + else + { + XLOG(INFO) << "Assert del queue empty okay"; + } + } + else if (frag_info.action == FragAction::QueueDel) + { + int64_t log_transid = curr_transid; + + int64_t disk_id = get_fn_disk_id(frag_info.fn); + + if (disk_id != 0) + { + log_transid = get_disk_trans_id(txn, tid, disk_id); + + if (log_transid == -1) + { + ++commit_errors; + } + } + + commit_errors += queue_del(frag_info.fn, txn, tid, log_transid); + } + else if (frag_info.action == FragAction::UnqueueDel) + { + commit_errors += unqueue_del(frag_info.fn, txn, tid); + } + else if (frag_info.action == FragAction::RestoreOld) + { + commit_errors += restore_fn(frag_info.fn, txn, freespace_txn, tid); + } + else if (frag_info.action == FragAction::Add + || frag_info.action == FragAction::AddNoDelOld) + { + if (frag_info.action == FragAction::Add) + { + commit_errors += remove_fn(frag_info.fn, txn, freespace_txn, false, false, tid); + } + else + { + int64_t disk_id = get_fn_disk_id(frag_info.fn); + + int64_t log_transid = curr_transid; + + if (disk_id != 0) + { + log_transid = get_disk_trans_id(txn, tid, disk_id); + + if (log_transid==-1) + { + ++commit_errors; + } + } + + commit_errors += log_fn(frag_info.fn, txn, tid, log_transid); + } + + CWData wdata; + wdata.addVarInt(encode_num_exts(frag_info.offset, static_cast(frag_info.extra_exts.size()))); + wdata.addVarInt(frag_info.len); + int64_t size = frag_info.offset + div_up(frag_info.len, block_size)*block_size; + + if (size > curr_write_ext_start + && size <= curr_write_ext_end) + curr_write_ext_start = size; + if (size > data_file_max_size) + data_file_max_size = size; + + for (const SPunchItem& ext : frag_info.extra_exts) + { + wdata.addVarInt(ext.offset); + wdata.addVarInt(ext.len); + size = ext.offset + div_up(ext.len, block_size)*block_size; + + if (size > curr_write_ext_start + && size <= curr_write_ext_end) + curr_write_ext_start = size; + if (size > data_file_max_size) + 
data_file_max_size = size; + } + + { + std::scoped_lock lock(datafileoffset_mutex); + + auto it_r = reserved_extents.find(frag_info.offset); + if(it_r!=reserved_extents.end()) + reserved_extents.erase(it_r); + + for (const SPunchItem& ext : frag_info.extra_exts) + { + it_r = reserved_extents.find(ext.offset); + if(it_r!=reserved_extents.end()) + reserved_extents.erase(it_r); + } + } + + wdata.addVarInt(frag_info.last_modified); + wdata.addString2(frag_info.md5sum); + + MDB_val tval; + tval.mv_data = wdata.getDataPtr(); + tval.mv_size = wdata.getDataSize(); + + MDB_val tkey; + tkey.mv_data = const_cast(&frag_info.fn[0]); + tkey.mv_size = frag_info.fn.size(); + + rc = put_with_rewrite(txn, dbi_main, &tkey, &tval, tid, n_rewrite_pages); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to put extent info in commit (" << mdb_strerror(rc) << ") sfs " << db_path; + ++commit_errors; + } + } + else if (frag_info.action == FragAction::FindFree) + { + if (frag_info.offset > curr_write_ext_start) + curr_write_ext_start = frag_info.offset; + if (frag_info.offset > data_file_max_size) + data_file_max_size = frag_info.offset; + + bool search_for_free_space = true; + bool add_remaining_ext = true; + if (curr_write_ext_start == data_file_max_size) + { + add_remaining_ext = false; + int64_t free_space = os_free_space(data_file_path.parent_path().string()); + free_space += get_burn_in_data_size(); + int64_t index_file_size; + int64_t really_min_space = get_really_min_space(index_file_size); + if ( (data_file_size_limit<=0 || curr_write_ext_start + alloc_chunk_size < data_file_size_limit) && + free_space > min_free_space + && free_space > really_min_space) + { + curr_write_ext_end = curr_write_ext_start + alloc_chunk_size; + search_for_free_space = false; + } + else + { + XLOG(WARN) << "Volume full (" << folly::prettyPrint(free_space, folly::PRETTY_BYTES_IEC) << " free space). 
Switching to using free space in data file"; + } + } + else if (force_freespace_check) + { + force_freespace_check = false; + + if (curr_write_ext_end - curr_write_ext_start >= block_size + && add_remaining_ext) + { + add_freemap_ext(freespace_txn, curr_write_ext_start, + curr_write_ext_end - curr_write_ext_start, false, tid); + add_remaining_ext = false; + } + + int64_t free_space = os_free_space(data_file_path.parent_path().string()); + free_space += get_burn_in_data_size(); + int64_t index_file_size; + int64_t really_min_space = get_really_min_space(index_file_size); + if (free_space > min_free_space + alloc_chunk_size + && free_space > really_min_space + alloc_chunk_size) + { + XLOG(WARN) << "Volume has free space (" << folly::prettyPrint(free_space, folly::PRETTY_BYTES_IEC) << " free space). Switching to appending to data file"; + curr_write_ext_start = data_file_max_size; + curr_write_ext_end = curr_write_ext_start + alloc_chunk_size; + search_for_free_space = false; + } + } + + if (search_for_free_space) + { + if (curr_write_ext_end - curr_write_ext_start >= block_size + && add_remaining_ext) + { + add_freemap_ext(freespace_txn, curr_write_ext_start, + curr_write_ext_end - curr_write_ext_start, false, tid); + } + + int64_t start, len; + if (find_freemap_ext(freespace_txn, tid, start, len)) + { + XLOG(DBG) << "Writing to free extent (" << std::to_string(start) << ", " << std::to_string(len) << ")"; + + MDB_val tkey; + CWData wtkey; + wtkey.addVarInt(start); + tkey.mv_data = wtkey.getDataPtr(); + tkey.mv_size = wtkey.getDataSize(); + rc = mdb_del(freespace_txn, dbi_free, &tkey, nullptr); + + bool mmap_err = has_mmap_read_error_reset(tid); + if (mmap_err) + { + XLOG(ERR) << "LMDB: Error removing free extent (SIGBUS) sfs " << db_path; + frag_info.commit_info->commit_errors = 1; + ++commit_errors; + } + + if (rc) + { + XLOG(ERR) << "Error removing free extent (" << mdb_strerror(rc) << ") sfs " << db_path; + frag_info.commit_info->commit_errors = 1; + } + else 
if(!mmap_err) + { + MDB_val tval; + CWData wtval; + wtval.addVarInt(len); + tval.mv_data = wtval.getDataPtr(); + tval.mv_size = wtval.getDataSize(); + + rc = mdb_del(freespace_txn, dbi_free_len, &tval, &tkey); + + if (rc) + { + XLOG(ERR) << "Error removing free extent (2) (" << mdb_strerror(rc) << ") sfs " << db_path; + frag_info.commit_info->commit_errors = 1; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error removing free extent (2) (SIGBUS) sfs " << db_path; + frag_info.commit_info->commit_errors = 1; + ++commit_errors; + } + } + + { + std::scoped_lock lock2(freespace_mutex); + data_file_free -= len; + } + + curr_write_ext_start = start; + curr_write_ext_end = start + len; + } + else + { + int64_t free_space = os_free_space(data_file_path.parent_path().string()); + free_space += get_burn_in_data_size(); + int64_t index_file_size; + int64_t really_min_space = get_really_min_space(index_file_size); + + if (free_space < min_free_space + || free_space < really_min_space) + { + XLOG(INFO) << "Could not find free extent and not enough free space (" << folly::prettyPrint(free_space, folly::PRETTY_BYTES_IEC) << ") path "<commit_errors = LLONG_MAX; + } + else + { + curr_write_ext_start = data_file_max_size; + curr_write_ext_end = curr_write_ext_start + alloc_chunk_size; + } + } + } + + std::scoped_lock llock(mutex); + frag_info.commit_info->new_datafile_offset = curr_write_ext_start; + frag_info.commit_info->new_datafile_offset_end = curr_write_ext_end; + frag_info.commit_info->commit_done.notify_all(); + } + else if (frag_info.action == FragAction::FreeExtents) + { + int64_t size = frag_info.offset + div_up(frag_info.len, block_size)*block_size; + + if (size > curr_write_ext_start + && size <= curr_write_ext_end) + curr_write_ext_start = size; + if (size > data_file_max_size) + data_file_max_size = size; + + if (!add_freemap_ext(freespace_txn, frag_info.offset, frag_info.len, true, tid)) + { + XLOG(ERR) << "LMDB: Failed to put free extent (0) in 
FreeExtents sfs " << db_path; + ++commit_errors; + } + + for (SPunchItem& ext : frag_info.extra_exts) + { + size = ext.offset + div_up(ext.len, block_size)*block_size; + + if (size > curr_write_ext_start + && size <= curr_write_ext_end) + curr_write_ext_start = size; + if (size > data_file_max_size) + data_file_max_size = size; + + if (!add_freemap_ext(freespace_txn, ext.offset, ext.len, true, tid)) + { + XLOG(ERR) << "LMDB: Failed to put free extent (1) in FreeExtents sfs " << db_path; + ++commit_errors; + } + } + } + else + { + assert(false); + } + + lock.lock(); + } + + if (is_dead && !do_quit) + { + SFragInfo frag_info; + while (!commit_queue.empty() + || !commit_background_queue.empty()) + { + if (!commit_background_queue.empty()) + { + frag_info = commit_background_queue.front(); + commit_background_queue.pop_front(); + } + else + { + frag_info = commit_queue.front(); + commit_queue.pop_front(); + } + + if (frag_info.commit_info != nullptr) + { + std::scoped_lock llock(mutex); + frag_info.commit_info->commit_errors = 1; + frag_info.commit_info->commit_done.notify_all(); + } + } + + } + + while (references > 0) + { + std::this_thread::sleep_for(100ms); + } +} + +int64_t SingleFileStorage::get_free_space_in_data_file() +{ + int64_t curr_ext_add = 0; + { + std::scoped_lock lock(datafileoffset_mutex); + if (data_file_offset_end > 0 + && data_file_offset_end <= div_up(fileSize(data_file.fd()), block_size)*block_size) + { + curr_ext_add = data_file_offset_end - data_file_offset; + } + } + + std::scoped_lock lock(freespace_mutex); + return data_file_free + curr_ext_add; +} + +int64_t SingleFileStorage::get_free_space_real() +{ + int64_t free_space = os_free_space(data_file_path.parent_path().string()); + free_space += get_burn_in_data_size(); + + if (free_space < min_free_space) + free_space = 0; + else + free_space -= min_free_space; + + int64_t data_file_free_space = get_free_space_in_data_file(); + + if(data_file_free_space>0) + data_file_free_space = 
(data_file_free_space * 9) / 10; + + free_space += (std::max)((int64_t)0, data_file_free_space); + + return free_space; +} + +int64_t SingleFileStorage::get_total_space() +{ + int64_t total_space = os_total_space(data_file_path.parent_path().string()); + return total_space; +} + +int64_t SingleFileStorage::get_data_file_size() +{ + return fileSize(data_file.fd()); +} + +int64_t SingleFileStorage::max_free_extent(int64_t& len) +{ + THREAD_ID tid = gettid(); + setup_mmap_read_error(tid); + + MDB_txn* txn; + int rc = mdb_txn_begin(cache_db_env!=nullptr ? cache_db_env : db_env, NULL, MDB_RDONLY, &txn); + if (rc) + { + XLOG(ERR) << "LMDB: Error starting transaction for read (" << mdb_strerror(rc) << ") sfs " << db_path; + clear_mmap_read_error(tid); + return -1; + } + + int64_t ret; + + if (!find_freemap_ext(txn, tid, ret, len)) + { + ret = -1; + } + + mdb_txn_abort(txn); + + clear_mmap_read_error(tid); + + return ret; +} + +int64_t SingleFileStorage::get_free_space_slow(bool verbose, int64_t& freespace_extents, std::vector* items) +{ + freespace_extents = 0; + + MDB_txn* txn; + int rc = mdb_txn_begin(cache_db_env!=nullptr ? 
cache_db_env : db_env, NULL, MDB_RDONLY, &txn); + if (rc) + { + XLOG(ERR) << "LMDB: Error starting transaction for read (" << mdb_strerror(rc) << ") sfs " << db_path; + mdb_txn_abort(txn); + return -1; + } + + rc = mdb_set_compare(txn, dbi_free, mdb_cmp_varint); + if (rc) + { + XLOG(ERR) << "LMDB: Error setting compare func (" << mdb_strerror(rc) << ") sfs " << db_path; + mdb_txn_abort(txn); + return -1; + } + + int64_t ret = 0; + + int64_t prev_offset = -1; + int64_t prev_len; + + MDB_cursor* it_cursor; + mdb_cursor_open(txn, dbi_free, &it_cursor); + MDB_cursor_op op = MDB_FIRST; + do + { + MDB_val tkey; + MDB_val tval; + rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in freemap get_free_space_slow (" << mdb_strerror(rc) << ") sfs " << db_path; + } + + if (rc != MDB_NOTFOUND) + { + CRData dkey(reinterpret_cast(tkey.mv_data), tkey.mv_size); + CRData dval(reinterpret_cast(tval.mv_data), tval.mv_size); + + int64_t cstart; + int64_t clen; + if (dkey.getVarInt(&cstart) + && dval.getVarInt(&clen)) + { + assert(dkey.getStreampos() == dkey.getSize()); + assert(dval.getStreampos() == dval.getSize()); + + ret += clen; + + assert(prev_offset==-1 || prev_offset + prev_len < cstart); + + prev_offset = cstart; + prev_len = clen; + + if (verbose) + { + XLOG(INFO) << "Free extent at " << std::to_string(cstart) << " len " << std::to_string(clen); + } + + if (items != nullptr) + { + items->push_back(SPunchItem(cstart, clen)); + } + } + else + { + assert(false); + } + + ++freespace_extents; + } + } while (rc != MDB_NOTFOUND); + + mdb_cursor_close(it_cursor); + + mdb_txn_abort(txn); + + int64_t curr_ext_add = 0; + { + std::scoped_lock lock(datafileoffset_mutex); + if (data_file_offset_end > 0 + && data_file_offset_end <= div_up(fileSize(data_file.fd()), block_size)*block_size) + { + curr_ext_add = data_file_offset_end - data_file_offset; + + if (verbose) + { + XLOG(INFO) << "Curr ext 
add " << std::to_string(curr_ext_add) << " data_file_offset " << std::to_string(data_file_offset) << " data_file_offset_end " << std::to_string(data_file_offset_end); + } + } + } + + return ret + curr_ext_add; +} + +bool SingleFileStorage::check_len_idx() +{ + MDB_txn* txn; + int rc = mdb_txn_begin(cache_db_env!=nullptr ? cache_db_env : db_env, NULL, MDB_RDONLY, &txn); + if (rc) + { + XLOG(ERR) << "LMDB: Error starting transaction for read (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + + rc = mdb_set_compare(txn, dbi_free, mdb_cmp_varint); + if (rc) + { + XLOG(ERR) << "LMDB: Error setting compare func (" << mdb_strerror(rc) << ") sfs " << db_path; + mdb_txn_abort(txn); + return false; + } + + rc = mdb_set_compare(txn, dbi_free_len, mdb_cmp_varint_rev); + if (rc) + { + XLOG(ERR) << "LMDB: Error setting compare func (" << mdb_strerror(rc) << ") sfs " << db_path; + mdb_txn_abort(txn); + return false; + } + + MDB_cursor* it_cursor; + mdb_cursor_open(txn, dbi_free, &it_cursor); + MDB_cursor_op op = MDB_FIRST; + bool ret = true; + do + { + MDB_val tkey; + MDB_val tval; + rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in freemap startup (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + break; + } + + if (rc != MDB_NOTFOUND) + { + CRData dkey(reinterpret_cast(tkey.mv_data), tkey.mv_size); + CRData dval(reinterpret_cast(tval.mv_data), tval.mv_size); + + int64_t cstart; + int64_t clen; + if (dkey.getVarInt(&cstart) + && dval.getVarInt(&clen) + && dkey.getStreampos() == dkey.getSize() + && dval.getStreampos() == dval.getSize()) + { + rc = mdb_get(txn, dbi_free_len, &tval, &tkey); + + if (rc) + { + XLOG(ERR) << "Could not find free len ext (" << std::to_string(cstart) << ", "<< std::to_string(clen) << ") (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + break; + } + } + else + { + ret = false; + break; + } + } + } while (rc != 
MDB_NOTFOUND); + + mdb_cursor_close(it_cursor); + + if (!ret) + { + mdb_txn_abort(txn); + return ret; + } + + mdb_cursor_open(txn, dbi_free_len, &it_cursor); + op = MDB_FIRST; + do + { + MDB_val tkey; + MDB_val tval; + rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in freemap startup (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + break; + } + + if (rc != MDB_NOTFOUND) + { + CRData dkey(reinterpret_cast(tkey.mv_data), tkey.mv_size); + CRData dval(reinterpret_cast(tval.mv_data), tval.mv_size); + + int64_t cstart; + int64_t clen; + if (dkey.getVarInt(&clen) + && dval.getVarInt(&cstart) + && dkey.getStreampos() == dkey.getSize() + && dval.getStreampos() == dval.getSize()) + { + rc = mdb_get(txn, dbi_free, &tval, &tkey); + + if (rc) + { + XLOG(ERR) << "Could not find free ext (" << std::to_string(cstart) << ", " << std::to_string(clen) << ") (" << mdb_strerror(rc) << ") sfs " << db_path; + ret = false; + break; + } + } + else + { + ret = false; + break; + } + } + } while (rc != MDB_NOTFOUND); + + mdb_cursor_close(it_cursor); + + mdb_txn_abort(txn); + + return ret; +} + +void SingleFileStorage::defrag(str_map& params, relaxed_atomic& defrag_items) +{ + { + std::scoped_lock lock(mutex); + if (is_defragging) + { + XLOG(WARN) << "Already defragging. 
Not starting another defrag"; + return; + } + + if (!allow_defrag) + { + XLOG(WARN) << "Defrag is not allowed (1)"; + return; + } + } + + if (params.find("ratio") == params.end() + && params.find("chunksize")==params.end() ) + { + XLOG(ERR) << "Defrag setting 'ratio' or 'chunksize' not specified for defrag"; + return; + } + + float dratio = static_cast(atof(params["ratio"].c_str())); + int64_t window = std::atoll(params["window"].c_str()); + int64_t chunksize = std::atoll(params["chunksize"].c_str()); + + size_t n_chunks = 1; + if (params.find("n_chunks") != params.end()) + { + n_chunks = std::atoi(params["n_chunks"].c_str()); + } + + XLOG(INFO) << "Defrag settings: dratio=" << std::to_string(dratio) << " window=" << folly::prettyPrint(window, folly::PRETTY_BYTES_IEC) << " chunksize=" << folly::prettyPrint(chunksize, folly::PRETTY_BYTES_IEC) << " n_chunks=" << std::to_string(n_chunks); + + stop_defrag = false; + + THREAD_ID tid = gettid(); + setup_mmap_read_error(tid); + + MDB_txn* freespace_txn; + if(cache_db_env!=nullptr) + { + int rc = mdb_txn_begin(cache_db_env, NULL, MDB_RDONLY, &freespace_txn); + if (rc) + { + XLOG(ERR) << "LMDB: Error starting freepsace transaction for read (2) (" << mdb_strerror(rc) << ") sfs " << db_path; + return; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error starting freespace transaction for read (2) (SIGBUS) sfs " << db_path; + return; + } + } + else + { + int rc = mdb_txn_begin(db_env, NULL, MDB_RDONLY, &freespace_txn); + if (rc) + { + XLOG(ERR) << "LMDB: Error starting transaction for read (2) (" << mdb_strerror(rc) << ") sfs " << db_path; + return; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error starting transaction for read (2) (SIGBUS) sfs " << db_path; + return; + } + } + + + int rc = mdb_set_compare(freespace_txn, dbi_free, mdb_cmp_varint); + if (rc) + { + XLOG(ERR) << "LMDB: Error setting compare func (2) (" << mdb_strerror(rc) << ") sfs " << db_path; + 
mdb_txn_abort(freespace_txn); + return; + } + + XLOG(INFO) << "Starting defrag of " << data_file_path << " dratio=" << std::to_string(dratio); + + int64_t prev_offset = -1; + int64_t prev_len; + + std::vector defrag_extents; + + struct SDefragInfo + { + SDefragInfo() + : freespace(0), max_cont_freespace(0), frag(-1) {} + + int64_t freespace; + int64_t max_cont_freespace; + double frag; + size_t idx; + }; + + std::vector chunks_defraginfo; + + MDB_cursor* it_cursor; + mdb_cursor_open(freespace_txn, dbi_free, &it_cursor); + MDB_cursor_op op = MDB_FIRST; + do + { + MDB_val tkey; + MDB_val tval; + rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in freemap defrag (" << mdb_strerror(rc) << ") sfs " << db_path; + break; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item in freemap defrag (SIGBUS) sfs " << db_path; + break; + } + + if (rc != MDB_NOTFOUND) + { + CRData dkey(reinterpret_cast(tkey.mv_data), tkey.mv_size); + CRData dval(reinterpret_cast(tval.mv_data), tval.mv_size); + + int64_t cstart; + int64_t clen; + if (dkey.getVarInt(&cstart) + && dval.getVarInt(&clen)) + { + assert(dkey.getStreampos() == dkey.getSize()); + assert(dval.getStreampos() == dval.getSize()); + + assert(prev_offset == -1 || prev_offset + prev_len < cstart); + + if (chunksize == 0) + { + if (prev_offset != -1) + { + int64_t used_len = cstart - (prev_offset + prev_len); + int64_t potential_continuous_extent = prev_len + used_len + clen; + int64_t curr_continuous_extent = (std::max)(prev_len, clen); + int64_t more_continuous_extent = potential_continuous_extent - curr_continuous_extent; + + if (static_cast(used_len) < more_continuous_extent * dratio) + { + XLOG(INFO) << "Can create continous extent of " << folly::prettyPrint(potential_continuous_extent, folly::PRETTY_BYTES_IEC) << " (" + folly::prettyPrint(more_continuous_extent, folly::PRETTY_BYTES_IEC) << " more)" 
+ << " from (" << folly::prettyPrint(prev_len, folly::PRETTY_BYTES_IEC) << ", " << folly::prettyPrint(clen, folly::PRETTY_BYTES_IEC) << ") by defragging " << folly::prettyPrint(used_len + 2 * window, folly::PRETTY_BYTES_IEC) << " at " << std::to_string(prev_offset + prev_len - window) + << " (offset " << std::to_string(prev_offset + prev_len) << " len " << std::to_string(used_len) << ")"; + defrag_extents.push_back(SPunchItem(prev_offset + prev_len - window, used_len + window)); + + if (defrag_extents.size() > max_defrag_extents) + { + XLOG(INFO) << "Max number of defrag extents reached (" << std::to_string(max_defrag_extents) << ")"; + break; + } + } + } + } + else + { + size_t chunkidx = cstart / chunksize; + + if (chunkidx >= chunks_defraginfo.size()) + { + chunks_defraginfo.resize(chunkidx + 1); + } + + chunks_defraginfo[chunkidx].freespace += clen; + if (clen > chunks_defraginfo[chunkidx].max_cont_freespace) + chunks_defraginfo[chunkidx].max_cont_freespace = clen; + } + + prev_offset = cstart; + prev_len = clen; + } + else + { + assert(false); + } + } + + if (stop_defrag) + { + mdb_cursor_close(it_cursor); + mdb_txn_abort(freespace_txn); + XLOG(WARN) << "Defrag stopped (1)"; + return; + } + + } while (rc != MDB_NOTFOUND); + + mdb_cursor_close(it_cursor); + + mdb_txn_abort(freespace_txn); + + { + std::scoped_lock lock(mutex); + + if (!allow_defrag) + { + XLOG(WARN) << "Defrag is not allowed (2)"; + return; + } + + is_defragging = true; + + add_defrag_skip_items_queue(); + } + + if (chunksize != 0) + { + for (size_t i = 0; i < chunks_defraginfo.size(); ++i) + { + chunks_defraginfo[i].idx = i; + if (chunks_defraginfo[i].freespace <= 0) + continue; + + double frag = 1.0 - (static_cast(chunks_defraginfo[i].max_cont_freespace) / chunks_defraginfo[i].freespace); + chunks_defraginfo[i].frag = frag; + + XLOG(INFO) << "Chunk " << std::to_string(i) << " at " << folly::prettyPrint(i * chunksize, folly::PRETTY_BYTES_IEC) << " len " << folly::prettyPrint(chunksize, 
folly::PRETTY_BYTES_IEC) << " " + "Frag ratio=" << std::to_string(frag * 100) << "% free space " << folly::prettyPrint(chunks_defraginfo[i].freespace, folly::PRETTY_BYTES_IEC) + << " max continuous free space " << folly::prettyPrint(chunks_defraginfo[i].max_cont_freespace, folly::PRETTY_BYTES_IEC); + } + + + auto heap_comp = [](const SDefragInfo& a, const SDefragInfo& b) { + return a.frag < b.frag; + }; + + std::make_heap(chunks_defraginfo.begin(), chunks_defraginfo.end(), heap_comp); + + for (size_t i = 0; i < n_chunks && !chunks_defraginfo.empty(); ++i) + { + std::pop_heap(chunks_defraginfo.begin(), chunks_defraginfo.end(), heap_comp); + const SDefragInfo& curr = chunks_defraginfo.back(); + + XLOG(INFO) << "Selected chunk " << std::to_string(curr.idx) << " at " << folly::prettyPrint(curr.idx* chunksize, folly::PRETTY_BYTES_IEC) << " len " << folly::prettyPrint(chunksize, folly::PRETTY_BYTES_IEC) << " for defrag. " + "Frag ratio=" << std::to_string(curr.frag * 100) << "% free space " << folly::prettyPrint(curr.freespace, folly::PRETTY_BYTES_IEC) + << " max continuous free space " << folly::prettyPrint(curr.max_cont_freespace, folly::PRETTY_BYTES_IEC); + + SPunchItem defrag_ext(curr.idx * chunksize, chunksize); + defrag_extents.push_back(defrag_ext); + + std::scoped_lock lock(mutex); + curr_free_skip_extents.push_back(defrag_ext); + + chunks_defraginfo.pop_back(); + } + + if(defrag_extents.empty()) + { + XLOG(WARN) << "Did not find chunk to defragment"; + std::scoped_lock lock(mutex); + is_defragging = false; + defrag_skip_items.clear(); + curr_free_skip_extents.clear(); + return; + } + + std::sort(defrag_extents.begin(), defrag_extents.end()); + } + + MDB_txn* txn; + rc = mdb_txn_begin(db_env, NULL, MDB_RDONLY, &txn); + if (rc) + { + XLOG(ERR) << "LMDB: Error starting transaction for read (3) (" << mdb_strerror(rc) << ") sfs " << db_path; + std::scoped_lock lock(mutex); + is_defragging = false; + defrag_skip_items.clear(); + curr_free_skip_extents.clear(); + 
return; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error starting transaction for read (3) (SIGBUS) sfs " << db_path; + std::scoped_lock lock(mutex); + is_defragging = false; + defrag_skip_items.clear(); + curr_free_skip_extents.clear(); + return; + } + + XLOG(INFO) << "Defragging using txn id " << std::to_string(mdb_get_txnid(txn)); + + size_t defrag_write_errors = 0; + bool do_restart = true; + std::string continue_key; + auto lasttime = std::chrono::steady_clock::now(); + while (do_restart) + { + do_restart = false; + + mdb_cursor_open(txn, dbi_main, &it_cursor); + op = MDB_FIRST; + + MDB_val tkey; + + if (!continue_key.empty()) + { + op = MDB_SET_RANGE; + tkey.mv_data = &continue_key[0]; + tkey.mv_size = continue_key.size(); + } + + do + { + if (stop_defrag) + { + mdb_txn_abort(txn); + + { + std::scoped_lock lock(mutex); + is_defragging = false; + defrag_skip_items.clear(); + curr_free_skip_extents.clear(); + } + + XLOG(WARN) << "Defrag stopped (2)"; + return; + } + + MDB_val tval; + rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in main defrag (" << mdb_strerror(rc) << ") sfs " << db_path; + break; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item in main defrag (SIGBUS) sfs " << db_path; + break; + } + + if (rc != MDB_NOTFOUND) + { + std::string curr_key = std::string(reinterpret_cast(tkey.mv_data), tkey.mv_size); + + if (curr_key == continue_key) + continue; + + CRData rdata(reinterpret_cast(tval.mv_data), tval.mv_size); + + SPunchItem first_ext; + + if (!rdata.getVarInt(&first_ext.offset) + || !rdata.getVarInt(&first_ext.len)) + { + continue; + } + + std::vector extents; + + int64_t num_extents = extract_num_exts(first_ext.offset); + + bool has_error = false; + for (int64_t i = 0; i < num_extents; ++i) + { + SPunchItem cfrag; + if (!rdata.getVarInt(&cfrag.offset) + || !rdata.getVarInt(&cfrag.len)) + 
{ + has_error = true; + break; + } + + extents.push_back(cfrag); + } + + if (has_error) + { + continue; + } + + for (size_t i = 0; i < extents.size() + 1; ++i) + { + SPunchItem& ext = (i == 0) ? first_ext : extents[i - 1]; + bool ext_found = false; + auto it = std::upper_bound(defrag_extents.begin(), defrag_extents.end(), ext); + if (it != defrag_extents.begin()) + { + --it; + ext_found = true; + } + + if (ext_found + && it->offset <= ext.offset + && it->offset + it->len >= ext.offset + ext.len) + { + continue_key = curr_key; + + std::string fn = decompress_filename(continue_key); + + int64_t last_modified; + std::string md5sum; + + if (!rdata.getVarInt(&last_modified) + || !rdata.getStr2(&md5sum)) + { + break; + } + + lock_defrag(continue_key); + + if (is_defrag_skip_item(continue_key)) + { + unlock_defrag(continue_key); + break; + } + + int64_t fn_size = first_ext.len; + + for (SPunchItem& eext : extents) + fn_size += eext.len; + + XLOG(INFO) << "Rewriting item " << fn << " size " << folly::prettyPrint(fn_size, folly::PRETTY_BYTES_IEC) << " because ext " << std::to_string(ext.offset) << " len " << std::to_string(ext.len) << " (defrag) sfs " << db_path << " defrag ext (" << std::to_string(it->offset) << ", " << std::to_string(it->len) << ")"; + ++defrag_items; + + /*char* data; + size_t data_len; + if (read(first_ext.offset, first_ext.len, extents, fn, 0, data, data_len)) + { + int rc = write_int(fn, data, data_len, last_modified, md5sum, false, false, std::string::npos); + if(rc!=0) + { + if (rc == ENOSPC) + { + XLOG(WARN) << "Error defragging item " << fn << ". Out of space. 
rc " << std::to_string(rc); + stop_defrag = true; + } + else if (rc!=EDEADLK) + { + XLOG(WARN) << "Error defragging item " << fn << " rc " << std::to_string(rc); + + ++defrag_write_errors; + if (defrag_write_errors > 10) + stop_defrag = true; + } + } + else + { + defrag_write_errors = 0; + } + free(data); + }*/ + + unlock_defrag(continue_key); + + break; + } + } + + auto ctime = std::chrono::steady_clock::now(); + std::unique_lock lock(mutex); + if (ctime - lasttime > 5min + || defrag_restart==1) + { + mdb_txn_abort(txn); + + defrag_skip_items.clear(); + + if (defrag_restart != 1) + { + add_defrag_skip_items_queue(); + + lock.unlock(); + } + else + { + XLOG(INFO) << "Restarting defrag after commit. Waiting for commit..."; + defrag_restart = 2; + + while (defrag_restart == 2) + { + lock.unlock(); + std::this_thread::sleep_for(100ms); + lock.lock(); + } + + add_defrag_skip_items_queue(); + + lock.unlock(); + XLOG(INFO) << "Commit finished. Restarting transaction and continuing defrag..."; + } + + XLOG(INFO) << "Restarting defrag transaction"; + + rc = mdb_txn_begin(db_env, NULL, MDB_RDONLY, &txn); + if (rc) + { + XLOG(ERR) << "LMDB: Error starting transaction for read (4) (" << mdb_strerror(rc) << ") sfs " << db_path; + break; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error starting transaction for read (4) (SIGBUS) sfs " << db_path; + break; + } + + XLOG(INFO) << "Defragging using txn id " + std::to_string(mdb_get_txnid(txn)); + + do_restart = true; + lasttime = std::chrono::steady_clock::now(); + } + } + } while (!do_restart + && rc != MDB_NOTFOUND); + } + + mdb_txn_abort(txn); + + { + std::scoped_lock lock(mutex); + is_defragging = false; + defrag_skip_items.clear(); + curr_free_skip_extents.clear(); + } +} + +bool SingleFileStorage::start_thread(int64_t transid) +{ + curr_transid = transid; + + commit_thread_h = std::thread([this](){ + (*this)(); + }); + + return true; +} + +bool SingleFileStorage::regen_datafile_free(MDB_txn* 
freespace_txn) +{ + THREAD_ID tid = gettid(); + bool ret=true; + MDB_cursor* it_cursor; + mdb_cursor_open(freespace_txn, dbi_free, &it_cursor); + MDB_cursor_op op = MDB_FIRST; + int rc; + do + { + MDB_val tkey; + MDB_val tval; + rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + op = MDB_NEXT; + if (rc && rc != MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Error getting item in freemap startup (" << mdb_strerror(rc) << ")"; + ret=false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error getting item in freemap startup (SIGBUS)"; + ret=false; + } + + if (rc != MDB_NOTFOUND) + { + CRData dkey(reinterpret_cast(tkey.mv_data), tkey.mv_size); + CRData dval(reinterpret_cast(tval.mv_data), tval.mv_size); + + int64_t cstart; + int64_t clen; + if (dkey.getVarInt(&cstart) + && dval.getVarInt(&clen) + && dkey.getStreampos() == dkey.getSize() + && dval.getStreampos() == dval.getSize()) + { + data_file_free += clen; + } + else + { + ret=false; + XLOG(WARN) << "Deleting invalid freemap extent"; + mdb_cursor_del(it_cursor, 0); + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Error deleting freemap item in startup (SIGBUS)"; + } + } + } + } while (rc != MDB_NOTFOUND); + + mdb_cursor_close(it_cursor); + + return ret; +} + +bool SingleFileStorage::regen_free_len_idx(MDB_txn* freespace_txn) +{ + THREAD_ID tid = gettid(); + if (data_file_free > 0) + { + MDB_cursor* it_cursor; + mdb_cursor_open(freespace_txn, dbi_free_len, &it_cursor); + MDB_cursor_op op = MDB_FIRST; + MDB_val tkey; + MDB_val tval; + int rc = mdb_cursor_get(it_cursor, &tkey, &tval, op); + + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Freelen idx (SIGBUS)"; + return false; + } + + if (rc == MDB_NOTFOUND) + { + XLOG(INFO) << "Free len idx empty. 
Generating it..."; + + if (!generate_free_len_idx(freespace_txn)) + { + XLOG(ERR) << "Error generating free_len idx"; + return false; + } + } + } + return true; +} + +bool SingleFileStorage::read_pgids(MDB_txn* txn, MDB_dbi dbi, THREAD_ID tid, + TmpMmapedPgIds& mmap_pg_ids) +{ + MDB_cursor* cur; + int rc = mdb_cursor_open(txn, dbi, &cur); + if (rc) + { + XLOG(ERR) << "LMDB: Failed to open cursor for read_pgids (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed to open cursor for read_pgids (SIGBUS) sfs " << db_path; + is_dead = true; + do_stop_on_error(); + return false; + } + + size_t pgno; + + rc = mdb_cursor_first_leaf_page(cur, &pgno); + + if (rc) + { + XLOG(ERR) << "LMDB: Failed mdb_cursor_first_leaf_page in read_pgids (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed mdb_cursor_first_leaf_page for read_pgids (SIGBUS) sfs " << db_path; + is_dead = true; + do_stop_on_error(); + return false; + } + + mmap_pg_ids.add_pgid(pgno); + + while (rc == MDB_SUCCESS) + { + rc = mdb_cursor_next_leaf_page(cur, &pgno); + + if (rc && rc!=MDB_NOTFOUND) + { + XLOG(ERR) << "LMDB: Failed mdb_cursor_next_leaf_page in read_pgids (" << mdb_strerror(rc) << ") sfs " << db_path; + return false; + } + if (has_mmap_read_error_reset(tid)) + { + XLOG(ERR) << "LMDB: Failed mdb_cursor_next_leaf_page for read_pgids (SIGBUS) sfs " << db_path; + is_dead = true; + do_stop_on_error(); + return false; + } + + if(rc!=MDB_NOTFOUND) + mmap_pg_ids.add_pgid(pgno); + } + + std::sort(mmap_pg_ids.begin(), mmap_pg_ids.end()); + + return true; +} + +void SingleFileStorage::do_stop_on_error() +{ + if(is_dead || write_offline) + { + XLOG(ERR) << "Stopping on error rc=1."; + _exit(1); + } +} + +SingleFileStorage::TmpMmapedPgIds::TmpMmapedPgIds() + : mmap_ptr(nullptr), n_pgids(0) +{ + std::string tmp_fn = "/var/tmp/tmp_mmaped_pgids_"; + 
tmp_fn += random_uuid(); + backing_file = folly::File(tmp_fn, O_RDWR | O_CLOEXEC | O_CREAT | O_TMPFILE ); + + mmap_size = 10ULL * 1024 * 1024 * 1024; + ftruncate64(backing_file.fd(), mmap_size); + + mmap_ptr = reinterpret_cast(mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_SHARED, backing_file.fd(), 0)); + + if (mmap_ptr == reinterpret_cast(-1)) + { + XLOG(ERR) << "Error creating mmap of mmap-pgids. " << folly::errnoStr(errno); + abort(); + } +} + +SingleFileStorage::TmpMmapedPgIds::~TmpMmapedPgIds() +{ + if (mmap_ptr != nullptr) + { + munmap(mmap_ptr, mmap_size); + } +} \ No newline at end of file diff --git a/SingleFileStorage.h b/SingleFileStorage.h new file mode 100644 index 0000000..7896560 --- /dev/null +++ b/SingleFileStorage.h @@ -0,0 +1,590 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "lmdb/lmdb.h" +#include +#include "relaxed_atomic.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +using THREAD_ID = pid_t; + +class SingleFileStorage +{ +public: + + struct SPunchItem + { + SPunchItem() + : offset(-1), len(0) {} + + SPunchItem(int64_t offset, int64_t len) + : offset(offset), len(len) {} + + bool operator<(const SPunchItem& other) const + { + return offset < other.offset; + } + + int64_t offset; + int64_t len; + }; + + struct Ext + { + Ext() + : obj_offset(0), data_file_offset(-1), len(0) {} + + Ext(int64_t obj_offset, int64_t data_file_offset, int64_t len) + : obj_offset(obj_offset), data_file_offset(data_file_offset), len(len) {} + + bool operator<(const Ext& other) const + { + return obj_offset < other.obj_offset; + } + + int64_t obj_offset; + int64_t data_file_offset; + int64_t len; + }; + + enum class DelAction + { + Del = 0, + DelOld = 1, + DelWithQueued = 2, + Queue = 3, + Unqueue = 4, + AssertQueueEmpty = 5 + }; + + struct SFSOptions + { + std::string data_path; + std::string db_path; + std::string 
freespace_cache_path; + std::string dm_cache_path; + int64_t dm_cache_size = 0; + bool use_direct_io = false; + int64_t data_file_size_limit_mb = 0; + int64_t alloc_chunk_size = 512 * 1024 * 1024; + std::string runtime_id; + bool manual_commit = false; + bool stop_on_error = false; + bool punch_holes = true; + }; + + SingleFileStorage(SFSOptions options); + + //Start with dead SFS + SingleFileStorage(); + + void operator=(const SingleFileStorage&) = delete; + SingleFileStorage(SingleFileStorage&) = delete; + + ~SingleFileStorage(); + + static void init_mutex(); + + static void handle_mmap_read_error(void* addr); + + struct WritePrepareResult + { + int err; + std::vector extents; + }; + + WritePrepareResult write_prepare(const std::string& fn, size_t data_size, size_t max_data_fragments); + + int write_ext(const Ext& ext, const void* data, size_t data_size); + + int write_finalize(const std::string& fn, const std::vector& extents, int64_t last_modified, const std::string& md5sum, + bool no_del_old, bool is_fragment); + + int write(const std::string& fn, + const char* data, size_t data_size, int64_t last_modified, const std::string& md5sum, + bool no_del_old, bool is_fragment, size_t max_data_fragments); + + const static unsigned int ReadWithReadahead = 1; + const static unsigned int ReadUnsynced = 2; + const static unsigned int ReadMetaOnly = 4; + + struct ReadPrepareResult + { + int err; + std::vector extents; + int64_t total_len; + }; + + ReadPrepareResult read_prepare(const std::string& fn, unsigned int flags); + + struct ReadExtResult + { + int err; + std::unique_ptr buf; + }; + + ReadExtResult read_ext(const Ext& ext, const unsigned int flags, const size_t bufsize, folly::IOBufQueue& buf); + + int read_finalize(const std::string& fn, const std::vector& extents, unsigned int flags); + + bool del(const std::string& fn, DelAction da, + bool background_queue); + + bool restore_old(const std::string& fn); + + + bool commit(bool background_queue, int64_t transid) { + 
return commit(background_queue, transid, 0); + } + + bool commit(bool background_queue, int64_t transid, int64_t disk_id); + + bool empty_queue(bool background_queue); + + struct IterData + { + MDB_txn* iter_txn; + MDB_cursor* iter_cur; + MDB_val iter_key; + MDB_val iter_val; + }; + + bool iter_start(int64_t disk_id, bool compressed, IterData& iter_data); + + bool iter_start(bool compressed, IterData& iter_data); + + bool iter_start(std::string fn, bool compressed, IterData& iter_data); + + void start_debug(); + + void iter_stop(IterData& iter_data); + + bool iter_next(IterData& iter_data); + + bool iter_curr_val(std::string& fn, int64_t& offset, int64_t& size, std::vector& exta_exts, int64_t& last_modified, std::string& md5sum, IterData& iter_data); + + bool iter_curr_val(std::string& fn, std::string& data, IterData& iter_data); + + virtual void operator()(); + + int64_t get_free_space_in_data_file(); + + int64_t get_free_space_real(); + + int64_t get_total_space(); + + int64_t get_data_file_size(); + + int64_t max_free_extent(int64_t& len); + + int64_t get_free_space_slow(bool verbose, int64_t& freespace_extents, std::vector* items); + + bool check_len_idx(); + + using str_map = std::map; + + void defrag(str_map& params, relaxed_atomic& defrag_items); + + std::string get_db_path() { return db_path; } + + std::string get_cache_path() { return freespace_cache_path; } + + bool is_write_offline() { return write_offline; } + + bool start_thread(int64_t transid); + + int64_t get_transid() { + std::scoped_lock lock(mutex); + return curr_transid; + } + + int64_t get_transid(int64_t disk_id); + + std::string meminfo(); + + bool set_write_offline(bool b); + + bool get_is_dead() { return is_dead; } + + bool set_allow_defrag(bool b) { + return set_allow_defrag(b, 0); + } + + bool set_allow_defrag(bool b, int64_t disk_id); + + bool set_stop_defrag(bool b) { stop_defrag = b; return true; } + + bool reset_del_log(int64_t disk_id, int64_t reset_transid); + + bool 
reset_del_queue(int64_t disk_id, int64_t reset_transid); + + int64_t get_disk_id(const std::string& uuid); + + void migrate_thread(); + + bool start_migrate(); + + void reference(); + + void unreference(); + + static std::string decompress_filename(const std::string& fn); + + static int64_t get_fn_disk_id(const std::string& fn); + + static std::string remove_disk_id(const std::string& fn, size_t disk_id_size); + + std::string freespace_stats(); + + virtual void wait_for_startup_finish(); + + std::string get_runtime_id() { + return runtime_id; + } + + bool get_manual_commit() { + return manual_commit; + } + +private: + + int write_int(const std::string& fn, const char* data, size_t data_size, + int64_t last_modified, const std::string& md5sum, bool allow_defrag_lock, bool no_del_old, + size_t max_data_fragments); + + int64_t remove_fn(const std::string& fn, + MDB_txn* txn, MDB_txn* freespace_txn, bool del_from_main, bool del_old, THREAD_ID tid); + + int64_t restore_fn(const std::string& fn, + MDB_txn* txn, MDB_txn* freespace_txn, THREAD_ID tid); + + int64_t log_fn(const std::string& fn, + MDB_txn* txn, THREAD_ID tid, int64_t transid); + + int64_t add_tmp(int64_t idx, MDB_txn* txn, THREAD_ID tid, int64_t offset, int64_t len); + + int64_t rm_tmp(int64_t idx, MDB_txn* txn, THREAD_ID tid); + + void wait_queue(std::unique_lock& lock, bool background_queue, bool defrag_check); + + bool add_freemap_ext(MDB_txn* txn, int64_t offset, int64_t len, bool used_in_curr_trans, THREAD_ID tid); + + bool add_freemap_ext_simple(MDB_txn* txn, int64_t offset, int64_t len, THREAD_ID tid); + + bool find_freemap_ext(MDB_txn* txn, THREAD_ID tid, int64_t& start, int64_t& len); + + void lock_defrag(const std::string& fn); + + bool is_defrag_skip_item(const std::string& fn); + + void unlock_defrag(const std::string& fn); + + void wait_defrag(const std::string& fn, std::unique_lock& lock); + + void setup_mmap_read_error(THREAD_ID tid); + + bool clear_mmap_read_error(THREAD_ID tid); + + bool 
has_mmap_read_error_reset(THREAD_ID tid); + + int64_t reset_del_log_fn(MDB_txn* txn, MDB_txn* freespace_txn, THREAD_ID tid, int64_t disk_id, int64_t transid); + + int64_t reset_holes(MDB_txn* txn, MDB_txn* freespace_txn, THREAD_ID tid); + + int64_t reset_del_queue(MDB_txn* txn, MDB_txn* freespace_txn, THREAD_ID tid, int64_t disk_id, int64_t transid); + + void wait_startup_finished(std::unique_lock& lock); + + void free_extents(const std::vector& extents); + + int64_t get_really_min_space(int64_t& index_file_size); + + int64_t get_burn_in_data_size(); + + bool do_free_minspace(MDB_txn* txn, MDB_txn* freespace_txn, THREAD_ID tid); + + static std::mutex mmap_read_error_mutex; + + static std::unordered_map > > mmap_read_error_jmp; + static std::vector mmap_dbs; + + std::vector mmap_cleanup_addrs; + + int64_t queue_del(const std::string& fn, MDB_txn* txn, THREAD_ID tid, int64_t transid); + + int64_t unqueue_del(const std::string& fn, MDB_txn* txn, THREAD_ID tid); + + void add_defrag_skip_items_queue(); + + bool open_cache_db(int64_t current_txn_id, int64_t mapsize, bool use_other, bool del_create, MDB_txn*& freespace_txn); + + bool generate_freespace_cache(MDB_txn* source_txn, MDB_txn* dst_txncs, bool fast_gen); + + bool freespace_check(MDB_txn* source_txn, MDB_txn* freespace_txn, bool fast_check); + + bool clear_freespace_cache(MDB_txn* txn); + + bool regen_datafile_free(MDB_txn* freespace_txn); + + bool regen_free_len_idx(MDB_txn* freespace_txn); + + class TmpMmapedPgIds + { + public: + TmpMmapedPgIds(); + + ~TmpMmapedPgIds(); + + + void add_pgid(size_t pgid) + { + if (n_pgids * sizeof(pgid) + sizeof(pgid) >= mmap_size) + { + std::cerr << "pgid mmap too small" << std::endl; + abort(); + } + memcpy(mmap_ptr + n_pgids * sizeof(pgid), &pgid, sizeof(pgid)); + ++n_pgids; + } + + size_t size() + { + return n_pgids; + } + + size_t* begin() + { + return reinterpret_cast(mmap_ptr); + } + + size_t* end() + { + return reinterpret_cast(mmap_ptr + n_pgids * sizeof(size_t)); + } + 
+ size_t& get(size_t idx) + { + return *(begin() + idx); + } + + private: + folly::File backing_file; + size_t n_pgids; + size_t mmap_size; + char* mmap_ptr; + }; + + bool read_pgids(MDB_txn* txn, MDB_dbi dbi, THREAD_ID tid, TmpMmapedPgIds& mmap_pg_ids); + + enum class FragAction + { + Add, + Del, + Commit, + FindFree, + AddNoDelOld, + DelOld, + RestoreOld, + EmptyQueue, + ReadFragInfo, + FreeExtents, + ResetDelLog, + GetDiskId, + QueueDel, + UnqueueDel, + DelWithQueued, + ResetDelQueue, + AssertDelQueueEmpty + }; + + struct SFragInfo; + + struct SCommitInfo + { + SCommitInfo() + : commit_errors(0), + frag_info(nullptr) + {} + int64_t commit_errors; + std::condition_variable commit_done; + int64_t new_datafile_offset; + int64_t new_datafile_offset_end; + SFragInfo* frag_info; + }; + + struct SFragInfo + { + SFragInfo() : offset(-1), len(0), + last_modified(0), commit_info(nullptr) { + } + SFragInfo(int64_t offset, int64_t len) + : offset(offset), len(len), + last_modified(0), commit_info(nullptr) {} + + FragAction action; + std::string fn; + int64_t offset; + int64_t len; + int64_t last_modified; + std::string md5sum; + SCommitInfo* commit_info; + std::vector extra_exts; + }; + + std::string compress_filename(const std::string& fn); + + SFragInfo get_frag_info(MDB_txn* txn, const std::string& fn); + + bool generate_free_len_idx(MDB_txn* txn); + + int64_t get_disk_id(MDB_txn * txn, THREAD_ID tid, const std::string& uuid); + + int64_t get_disk_trans_id(MDB_txn * txn, THREAD_ID tid, int64_t disk_id); + + bool set_disk_trans_id(MDB_txn * txn, THREAD_ID tid, int64_t disk_id, int64_t trans_id); + + bool rewrite_npages(MDB_txn* txn, MDB_cursor* mc, THREAD_ID tid, size_t npages); + + int put_with_rewrite(MDB_txn* txn, MDB_dbi dbi, MDB_val* tkey, MDB_val* tval, THREAD_ID tid, size_t npages); + + void add_reading_item(const SFragInfo& fi); + + void remove_reading_item(const std::vector& extents); + + void do_stop_on_error(); + + bool with_rewrite; + + std::unordered_set 
defrag_skip_items; + bool is_defragging; + int defrag_restart; + std::atomic stop_defrag; + bool allow_defrag; + std::set disallow_defrag_disk_id; + std::deque commit_queue; + std::deque commit_background_queue; + std::thread commit_thread_h; + std::unordered_map commit_items; + + bool do_quit; + + bool startup_finished; + + int64_t data_file_max_size; + int64_t data_file_offset; + int64_t data_file_offset_end; + int64_t data_file_free; + std::map reserved_extents; + folly::File data_file; + folly::File data_file_dio; + folly::File new_data_file; + folly::File new_data_file_dio; + MDB_env* db_env; + MDB_dbi dbi_main; + MDB_dbi dbi_free; + MDB_dbi dbi_free_len; + MDB_dbi dbi_size; + MDB_dbi dbi_old; + MDB_dbi dbi_holes; + MDB_dbi dbi_queue_del; + MDB_env* cache_db_env; + MDB_dbi dbi_cache_size; + std::set curr_new_free_extents; + std::set reading_free_skip_extents; + std::unordered_set defrag_items; + std::vector curr_free_skip_extents; + + struct ReadingItem + { + size_t refs = 0; + bool free_skip = false; + }; + + std::map reading_items; + + std::mutex mutex; + std::condition_variable cond; + + std::mutex datafileoffset_mutex; + std::mutex freespace_mutex; + + int64_t min_free_space; + + std::string db_path; + + std::string freespace_cache_path; + + relaxed_atomic is_dead; + relaxed_atomic write_offline; + + int64_t curr_transid; + + bool force_freespace_check; + + bool regen_freespace_cache; + bool sync_freespace_cache; + + int64_t next_disk_id; + + int64_t data_file_copy_done; + int64_t data_file_copy_done_sync; + int64_t data_file_copy_max; + bool stop_data_file_copy; + std::shared_mutex data_file_copy_mutex; + std::thread migrate_thread_h; + relaxed_atomic references; + + bool mdb_curr_sync; + + std::filesystem::path data_file_path; + + int64_t data_file_size_limit; + int64_t alloc_chunk_size; + + std::string runtime_id; + bool manual_commit; + bool stop_on_error; + bool punch_holes; +}; + + +class ScopedSFSRef +{ + SingleFileStorage* sfs; +public: + 
ScopedSFSRef(SingleFileStorage* sfs) + : sfs(sfs) { + if(sfs!=nullptr) + sfs->reference(); + } + + void reset(SingleFileStorage* nsfs) + { + if (sfs != nullptr) sfs->unreference(); + sfs = nsfs; + if (sfs != nullptr) sfs->reference(); + } + + ~ScopedSFSRef() { + if (sfs != nullptr) + sfs->unreference(); + } +}; diff --git a/data.cpp b/data.cpp new file mode 100644 index 0000000..f7a4c2b --- /dev/null +++ b/data.cpp @@ -0,0 +1,933 @@ +/** + * Copyright Martin Raiber. All Rights Reserved. + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#include +#include "data.h" +#include +#include + +namespace +{ +bool is_big_endian(void) +{ + union { + unsigned int i; + char c[4]; + } bint = {0x01020304}; + + return bint.c[0] == 1; +} + +unsigned int endian_swap(unsigned int x) +{ + return (x>>24) | + ((x<<8) & 0x00FF0000) | + ((x>>8) & 0x0000FF00) | + (x<<24); +} + +unsigned short endian_swap(unsigned short x) +{ + return x = (x>>8) | + (x<<8); +} + +std::string endian_swap_utf16(std::string str) +{ + for(size_t i=0;i>56) | + ((x<<40) & 0x00FF000000000000) | + ((x<<24) & 0x0000FF0000000000) | + ((x<<8) & 0x000000FF00000000) | + ((x>>8) & 0x00000000FF000000) | + ((x>>24) & 0x0000000000FF0000) | + ((x>>40) & 0x000000000000FF00) | + (x<<56); +#else + return (x>>56) | + ((x<<40) & 0x00FF000000000000LLU) | + ((x<<24) & 0x0000FF0000000000LLU) | + ((x<<8) & 0x000000FF00000000LLU) | + ((x>>8) & 0x00000000FF000000LLU) | + ((x>>24) & 0x0000000000FF0000LLU) | + ((x>>40) & 0x000000000000FF00LLU) | + (x<<56); +#endif +} + +unsigned int little_endian(unsigned int x) +{ + if(is_big_endian()) + { + return endian_swap(x); + } + else + { + return x; + } +} + +unsigned short little_endian(unsigned short x) +{ + if(is_big_endian()) + { + return endian_swap(x); + } + else + { + return x; + } +} + +int little_endian(int x) +{ + if(is_big_endian()) + { + return static_cast(endian_swap(static_cast(x))); + } + else + { + return x; + } +} + +uint64_t little_endian(uint64_t x) +{ + 
if(is_big_endian()) + { + return endian_swap(x); + } + else + { + return x; + } +} + +int64_t little_endian(int64_t x) +{ + if(is_big_endian()) + { + return static_cast(endian_swap(static_cast(x))); + } + else + { + return x; + } +} + +float little_endian(float x) +{ + if(is_big_endian()) + { + unsigned int* ptr=reinterpret_cast(&x); + unsigned int ret = endian_swap(*ptr); + return *reinterpret_cast(&ret); + } + else + { + return x; + } +} + +double little_endian(double x) +{ + if (is_big_endian()) + { + uint64_t* ptr = reinterpret_cast(&x); + uint64_t ret = endian_swap(*ptr); + return *reinterpret_cast(&ret); + } + else + { + return x; + } +} + +unsigned int big_endian(unsigned int x) +{ + if(!is_big_endian()) + { + return endian_swap(x); + } + else + { + return x; + } +} + +unsigned short big_endian(unsigned short x) +{ + if(!is_big_endian()) + { + return endian_swap(x); + } + else + { + return x; + } +} + +int big_endian(int x) +{ + if(!is_big_endian()) + { + return static_cast(endian_swap(static_cast(x))); + } + else + { + return x; + } +} + +uint64_t big_endian(uint64_t x) +{ + if(!is_big_endian()) + { + return endian_swap(x); + } + else + { + return x; + } +} + +int64_t big_endian(int64_t x) +{ + if(!is_big_endian()) + { + return static_cast(endian_swap(static_cast(x))); + } + else + { + return x; + } +} + +float big_endian(float x) +{ + if(!is_big_endian()) + { + unsigned int* ptr=reinterpret_cast(&x); + unsigned int ret = endian_swap(*ptr); + return *reinterpret_cast(&ret); + } + else + { + return x; + } +} + +std::string big_endian_utf16(std::string str) +{ + if(!is_big_endian()) + { + return endian_swap_utf16(str); + } + else + { + return str; + } +} + + //FROM SQLITE + + typedef uint64_t u64; + typedef unsigned char u8; + typedef uint32_t u32; + + /* + ** Return the number of bytes that will be needed to store the given + ** 64-bit integer. 
+ */ + int sqlite3VarintLen(u64 v){ + int i; + for(i=1; (v >>= 7)!=0; i++) + { + assert( i<10 ); + } + + if (i == 10) + { + return 9; + } + + return i; + } + + /* + ** Bitmasks used by sqlite3GetVarint(). These precomputed constants + ** are defined here rather than simply putting the constant expressions + ** inline in order to work around bugs in the RVT compiler. + ** + ** SLOT_2_0 A mask for (0x7f<<14) | 0x7f + ** + ** SLOT_4_2_0 A mask for (0x7f<<28) | SLOT_2_0 + */ +#define SLOT_2_0 0x001fc07f +#define SLOT_4_2_0 0xf01fc07f + + + /* + ** Read a 64-bit variable-length integer from memory starting at p[0]. + ** Return the number of bytes read. The value is stored in *v. + */ + u8 sqlite3GetVarint(const unsigned char *p, u64 *v, size_t max_length){ + u32 a,b,s; + + if(max_length==0) + { + return 0; + } + + a = *p; + /* a: p0 (unmasked) */ + if (!(a&0x80)) + { + *v = a; + return 1; + } + + if(max_length<=1) + { + return 0; + } + + p++; + b = *p; + /* b: p1 (unmasked) */ + if (!(b&0x80)) + { + a &= 0x7f; + a = a<<7; + a |= b; + *v = a; + return 2; + } + + /* Verify that constants are precomputed correctly */ + assert( SLOT_2_0 == ((0x7f<<14) | (0x7f)) ); + assert( SLOT_4_2_0 == ((0xfU<<28) | (0x7f<<14) | (0x7f)) ); + + if(max_length<=2) + { + return 0; + } + + p++; + a = a<<14; + a |= *p; + /* a: p0<<14 | p2 (unmasked) */ + if (!(a&0x80)) + { + a &= SLOT_2_0; + b &= 0x7f; + b = b<<7; + a |= b; + *v = a; + return 3; + } + + /* CSE1 from below */ + a &= SLOT_2_0; + + if(max_length<=3) + { + return 0; + } + + p++; + b = b<<14; + b |= *p; + /* b: p1<<14 | p3 (unmasked) */ + if (!(b&0x80)) + { + b &= SLOT_2_0; + /* moved CSE1 up */ + /* a &= (0x7f<<14)|(0x7f); */ + a = a<<7; + a |= b; + *v = a; + return 4; + } + + /* a: p0<<14 | p2 (masked) */ + /* b: p1<<14 | p3 (unmasked) */ + /* 1:save off p0<<21 | p1<<14 | p2<<7 | p3 (masked) */ + /* moved CSE1 up */ + /* a &= (0x7f<<14)|(0x7f); */ + b &= SLOT_2_0; + s = a; + /* s: p0<<14 | p2 (masked) */ + + if(max_length<=4) + { 
+ return 0; + } + + p++; + a = a<<14; + a |= *p; + /* a: p0<<28 | p2<<14 | p4 (unmasked) */ + if (!(a&0x80)) + { + /* we can skip these cause they were (effectively) done above in calc'ing s */ + /* a &= (0x7f<<28)|(0x7f<<14)|(0x7f); */ + /* b &= (0x7f<<14)|(0x7f); */ + b = b<<7; + a |= b; + s = s>>18; + *v = ((u64)s)<<32 | a; + return 5; + } + + /* 2:save off p0<<21 | p1<<14 | p2<<7 | p3 (masked) */ + s = s<<7; + s |= b; + /* s: p0<<21 | p1<<14 | p2<<7 | p3 (masked) */ + + if(max_length<=5) + { + return 0; + } + + p++; + b = b<<14; + b |= *p; + /* b: p1<<28 | p3<<14 | p5 (unmasked) */ + if (!(b&0x80)) + { + /* we can skip this cause it was (effectively) done above in calc'ing s */ + /* b &= (0x7f<<28)|(0x7f<<14)|(0x7f); */ + a &= SLOT_2_0; + a = a<<7; + a |= b; + s = s>>18; + *v = ((u64)s)<<32 | a; + return 6; + } + + if(max_length<=6) + { + return 0; + } + + p++; + a = a<<14; + a |= *p; + /* a: p2<<28 | p4<<14 | p6 (unmasked) */ + if (!(a&0x80)) + { + a &= SLOT_4_2_0; + b &= SLOT_2_0; + b = b<<7; + a |= b; + s = s>>11; + *v = ((u64)s)<<32 | a; + return 7; + } + + /* CSE2 from below */ + a &= SLOT_2_0; + + if(max_length<=7) + { + return 0; + } + + p++; + b = b<<14; + b |= *p; + /* b: p3<<28 | p5<<14 | p7 (unmasked) */ + if (!(b&0x80)) + { + b &= SLOT_4_2_0; + /* moved CSE2 up */ + /* a &= (0x7f<<14)|(0x7f); */ + a = a<<7; + a |= b; + s = s>>4; + *v = ((u64)s)<<32 | a; + return 8; + } + + if(max_length<=8) + { + return 0; + } + + p++; + a = a<<15; + a |= *p; + /* a: p4<<29 | p6<<15 | p8 (unmasked) */ + + /* moved CSE2 up */ + /* a &= (0x7f<<29)|(0x7f<<15)|(0xff); */ + b &= SLOT_2_0; + b = b<<8; + a |= b; + + s = s<<4; + b = p[-4]; + b &= 0x7f; + b = b>>3; + s |= b; + + *v = ((u64)s)<<32 | a; + + return 9; + } + + /* + ** Write a 64-bit variable-length integer to memory starting at p[0]. + ** The length of data write will be between 1 and 9 bytes. The number + ** of bytes written is returned. 
+ ** + ** A variable-length integer consists of the lower 7 bits of each byte + ** for all bytes that have the 8th bit set and one byte with the 8th + ** bit clear. Except, if we get to the 9th byte, it stores the full + ** 8 bits and is the last byte. + */ + static int putVarint64(unsigned char *p, u64 v){ + int i, j, n; + u8 buf[10]; + if( v & (((u64)0xff000000)<<32) ){ + p[8] = (u8)v; + v >>= 8; + for(i=7; i>=0; i--){ + p[i] = (u8)((v & 0x7f) | 0x80); + v >>= 7; + } + return 9; + } + n = 0; + do{ + buf[n++] = (u8)((v & 0x7f) | 0x80); + v >>= 7; + }while( v!=0 ); + buf[0] &= 0x7f; + assert( n<=9 ); + for(i=0, j=n-1; j>=0; j--, i++){ + p[i] = buf[j]; + } + return n; + } + + int sqlite3PutVarint(unsigned char *p, u64 v){ + if( v<=0x7f ){ + p[0] = v&0x7f; + return 1; + } + if( v<=0x3fff ){ + p[0] = ((v>>7)&0x7f)|0x80; + p[1] = v&0x7f; + return 2; + } + return putVarint64(p,v); + } +} + + +char* CWData::getDataPtr(void) +{ + if(data.size()>0) + return &data[0]; + else + return NULL; +} + +unsigned long CWData::getDataSize(void) +{ + return (unsigned long)data.size(); +} + +void CWData::addInt(int ta) +{ + ta=little_endian(ta); + data.insert(data.end(), reinterpret_cast(&ta), reinterpret_cast(&ta)+sizeof(int)); +} + +void CWData::addUInt(unsigned int ta) +{ + ta=little_endian(ta); + data.insert(data.end(), reinterpret_cast(&ta), reinterpret_cast(&ta) + sizeof(unsigned int)); +} + +void CWData::addInt64(int64_t ta) +{ + ta=little_endian(ta); + data.insert(data.end(), reinterpret_cast(&ta), reinterpret_cast(&ta) + sizeof(int64_t)); +} + +void CWData::addUInt64(uint64_t ta) +{ + ta=little_endian(ta); + data.insert(data.end(), reinterpret_cast(&ta), reinterpret_cast(&ta) + sizeof(uint64_t)); +} + +void CWData::addFloat(float ta) +{ + ta=little_endian(ta); + data.insert(data.end(), reinterpret_cast(&ta), reinterpret_cast(&ta) + sizeof(float)); +} + +void CWData::addDouble(double ta) +{ + ta = little_endian(ta); + data.insert(data.end(), reinterpret_cast(&ta), 
reinterpret_cast(&ta) + sizeof(double)); +} + +void CWData::addUShort(unsigned short ta) +{ + ta=little_endian(ta); + data.insert(data.end(), reinterpret_cast(&ta), reinterpret_cast(&ta) + sizeof(unsigned short)); +} + +void CWData::addString(const std::string& ta) +{ + addUInt(static_cast(ta.size())); + if(!ta.empty()) + { + data.insert(data.end(), ta.begin(), ta.end()); + } +} + +void CWData::addString2(const std::string& ta) +{ + addVarInt(ta.size()); + if (!ta.empty()) + { + data.insert(data.end(), ta.begin(), ta.end()); + } +} + +void CWData::addChar(char ta) +{ + data.insert(data.end(), ta); +} + +void CWData::addUChar(unsigned char ta) +{ + data.insert(data.end(), static_cast(ta)); +} + +void CWData::addVoidPtr(void* ta) +{ + data.insert(data.end(), reinterpret_cast(&ta), reinterpret_cast(&ta) + sizeof(void*)); +} + +void CWData::addBuffer(const char* buffer, size_t bsize) +{ + data.insert(data.end(), buffer, buffer + bsize); +} + +void CWData::clear() +{ + data.clear(); +} + +void CWData::reserve(size_t count) +{ + data.reserve(count); +} + +void CWData::resize(size_t count) +{ + data.resize(count); +} + +size_t CWData::capacity() +{ + return data.capacity(); +} + +void CWData::addVarInt( int64_t ta ) +{ + size_t cpos=data.size(); + int needed_bytes = sqlite3VarintLen(static_cast(ta)); + data.resize(cpos+needed_bytes); + int p = sqlite3PutVarint(reinterpret_cast(&data[cpos]), ta); + assert(p==needed_bytes); +} + +CRData::CRData(const char* c,size_t datalength, bool pCopy) +{ + data=NULL; + set(c,datalength, pCopy); +} + +CRData::CRData(void) +{ + data=NULL; + streampos=0; + datalen=0; +} + +void CRData::set(const char* c,size_t datalength, bool pCopy) +{ + datalen = (std::min)(size_t(100*1024*1024), datalength); + + copy=pCopy; + if( copy==false ) + { + data=c; + } + else + { + if( data!=NULL ) + delete [] data; + data=new char[datalen]; + memcpy(const_cast(data), c, datalen); + } + streampos=0; +} + +CRData::CRData(const std::string *str) +{ + 
set(str->c_str(), str->size(), false); +} + +CRData::~CRData() +{ + if( copy ) + delete []data; +} + +bool CRData::getInt(int *ret) +{ + if(streampos+sizeof(int)>datalen ) + { + return false; + } + + memcpy(ret, &data[streampos], sizeof(int) ); + streampos+=sizeof(int); + *ret=little_endian(*ret); + return true; +} + +bool CRData::getInt64(int64_t *ret) +{ + if(streampos+sizeof(int64_t)>datalen ) + { + return false; + } + + memcpy(ret, &data[streampos], sizeof(int64_t) ); + streampos+=sizeof(int64_t); + *ret=little_endian(*ret); + return true; +} + +bool CRData::getUInt(unsigned int *ret) +{ + if(streampos+sizeof(unsigned int )>datalen ) + { + return false; + } + + memcpy(ret, &data[streampos], sizeof(unsigned int ) ); + streampos+=sizeof(unsigned int); + *ret=little_endian(*ret); + return true; +} + +bool CRData::getFloat(float *ret) +{ + if(streampos+sizeof(float)>datalen ) + { + return false; + } + + memcpy(ret, &data[streampos], sizeof(float) ); + streampos+=sizeof(float); + *ret=little_endian(*ret); + return true; +} + +bool CRData::getDouble(double * ret) +{ + if (streampos + sizeof(double)>datalen) + { + return false; + } + + memcpy(ret, &data[streampos], sizeof(double)); + streampos += sizeof(double); + *ret = little_endian(*ret); + return true; +} + +bool CRData::getUShort( unsigned short *ret) +{ + if(streampos+sizeof(unsigned short)>datalen ) + { + return false; + } + + memcpy(ret, &data[streampos], sizeof(unsigned short) ); + streampos+=sizeof(unsigned short); + *ret=little_endian(*ret); + return true; +} + +bool CRData::getStr(std::string *ret) +{ + unsigned int strlen; + if (!getUInt(&strlen)) + { + return false; + } + + if(strlen>10*1024*1024) + { + return false; + } + + if(streampos+strlen>datalen) + { + return false; + } + + if(strlen>0) + { + ret->assign(&data[streampos], strlen); + } + else + { + ret->clear(); + } + streampos+=strlen; + return true; +} + +bool CRData::getStr2(std::string *ret) +{ + int64_t strlen; + if (!getVarInt(&strlen)) + { + 
return false; + } + + if (strlen>10 * 1024 * 1024 + || strlen<0) + { + return false; + } + + if (streampos + strlen>datalen) + { + return false; + } + + if (strlen>0) + { + ret->assign(&data[streampos], strlen); + } + else + { + ret->clear(); + } + streampos += strlen; + return true; +} + +bool CRData::getChar(char *ret) /* Reads one char at the current read offset into *ret and advances past it; returns false without advancing when no byte is left. */ +{ + if(streampos+sizeof(char)>datalen ) + { + return false; + } + + (*ret)=data[streampos]; + streampos+=sizeof(char); + + return true; +} + +bool CRData::getUChar(unsigned char *ret) /* Same as getChar(), but stores the byte as unsigned char. */ +{ + if(streampos+sizeof(unsigned char)>datalen ) + { + return false; + } + + (*ret)=data[streampos]; + streampos+=sizeof(unsigned char); + + return true; +} + +bool CRData::getVoidPtr(void **ret) /* Copies sizeof(void*) raw bytes from the buffer into *ret with memcpy and advances by that amount; returns false when fewer bytes remain. */ +{ + if(streampos+sizeof(void*)>datalen ) + { + return false; + } + + memcpy(ret, &data[streampos], sizeof(void*) ); + streampos+=sizeof(void*); + return true; +} + +bool CRData::getVarInt( int64_t* ret ) /* Decodes a variable-length integer with sqlite3GetVarint starting at the current read offset, advances by the byte count it returns and reports failure when that count is 0. getLeft() is passed as the third argument - presumably the readable-byte limit, TODO confirm against sqlite3GetVarint. */ +{ + u8 b = sqlite3GetVarint(reinterpret_cast(&data[streampos]), reinterpret_cast(ret), getLeft()); + streampos+=b; + return b!=0; +} + +unsigned int CRData::getSize(void) /* Total buffer length (datalen), returned as unsigned int. */ +{ + return static_cast(datalen); +} + +unsigned int CRData::getLeft(void) /* Number of bytes between the current read offset and the end of the buffer (datalen - streampos). */ +{ + return static_cast(datalen - streampos); +} + +unsigned int CRData::getStreampos(void) /* Current read offset. */ +{ + return static_cast(streampos); +} + +const char *CRData::getDataPtr(void) /* Pointer to the start of the buffer. */ +{ + return data; +} + +const char *CRData::getCurrDataPtr(void) /* Pointer to the byte at the current read offset. */ +{ + return data+streampos; +} + +void CRData::setStreampos(unsigned int spos) /* Moves the read offset to spos; the call is silently ignored when spos is greater than datalen. */ +{ + if( spos <= datalen ) + { + streampos=spos; + } +} + +bool CRData::incrementPtr(unsigned int amount) /* Skips amount bytes; returns false and leaves the offset unchanged when amount exceeds getLeft(). */ +{ + if(amount>getLeft()) + return false; + + streampos+=amount; + return true; +} + + diff --git a/data.h b/data.h new file mode 100644 index 0000000..d71b0e5 --- /dev/null +++ b/data.h @@ -0,0 +1,81 @@ +#ifndef DATA_H_ +#define DATA_H_ + +#include +#include +#include + + +class CWData /* Writer counterpart of CRData: declares add* methods for typed values and holds a std::string member named data. The implementations are not in this header - presumably each add* appends to data; confirm in the .cpp. */ +{ +public: + char* getDataPtr(void); + unsigned long getDataSize(void); + + void addInt(int ta); + void addUInt(unsigned int ta); +
void addInt64(int64_t ta); + void addUInt64(uint64_t ta); + void addFloat(float ta); + void addDouble(double ta); + void addUShort(unsigned short ta); + void addString(const std::string& ta); + void addString2(const std::string& ta); + void addChar(char ta); + void addUChar(unsigned char ta); + void addVoidPtr(void *ptr); + void addBuffer(const char* buffer, size_t bsize); + void addVarInt(int64_t ta); + + void clear(); + + void reserve(size_t count); + void resize(size_t count); + + size_t capacity(); +protected: + std::string data; +}; + +class CRData /* Sequential reader over a byte buffer: keeps a read offset (streampos) within datalen bytes of data; the bool-returning get* methods report whether the read succeeded. */ +{ +public: + CRData(const char* c,size_t datalength, bool pCopy=false); + CRData(const std::string *str); + CRData(void); + ~CRData(); + + void set(const char* c,size_t datalength, bool pCopy=false); + + bool getInt(int *ret); + bool getInt64(int64_t *ret); + bool getUInt(unsigned int *ret); + bool getFloat(float *ret); + bool getDouble(double *ret); + bool getUShort( unsigned short *ret); + bool getStr(std::string *ret); + bool getStr2(std::string *ret); + bool getChar(char *ret); + bool getUChar(unsigned char *ret); + bool getVoidPtr(void **ret); + bool getVarInt(int64_t* ret); + + unsigned int getSize(void); + unsigned int getLeft(void); + unsigned int getStreampos(void); + void setStreampos(unsigned int spos); + const char *getDataPtr(void); + const char *getCurrDataPtr(void); + bool incrementPtr(unsigned int amount); + +private: + + const char* data; /* buffer being read */ + size_t streampos; /* current read offset into data */ + size_t datalen; /* total number of bytes in data */ + + bool copy; /* NOTE(review): presumably records pCopy, i.e. that data is an owned copy - confirm in the constructor/destructor. */ +}; + + +#endif //DATA_H_ diff --git a/lmdb/COPYRIGHT b/lmdb/COPYRIGHT new file mode 100644 index 0000000..f076556 --- /dev/null +++ b/lmdb/COPYRIGHT @@ -0,0 +1,20 @@ +Copyright 2011-2019 Howard Chu, Symas Corp. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted only as authorized by the OpenLDAP +Public License. + +A copy of this license is available in the file LICENSE in the +top-level directory of the distribution or, alternatively, at +.
+ +OpenLDAP is a registered trademark of the OpenLDAP Foundation. + +Individual files and/or contributed packages may be copyright by +other parties and/or subject to additional restrictions. + +This work also contains materials derived from public sources. + +Additional information about OpenLDAP can be obtained at +<http://www.openldap.org/>. diff --git a/lmdb/LICENSE b/lmdb/LICENSE new file mode 100644 index 0000000..05ad757 --- /dev/null +++ b/lmdb/LICENSE @@ -0,0 +1,47 @@ +The OpenLDAP Public License + Version 2.8, 17 August 2003 + +Redistribution and use of this software and associated documentation +("Software"), with or without modification, are permitted provided +that the following conditions are met: + +1. Redistributions in source form must retain copyright statements + and notices, + +2. Redistributions in binary form must reproduce applicable copyright + statements and notices, this list of conditions, and the following + disclaimer in the documentation and/or other materials provided + with the distribution, and + +3. Redistributions must contain a verbatim copy of this document. + +The OpenLDAP Foundation may revise this license from time to time. +Each revision is distinguished by a version number. You may use +this Software under terms of this license revision or under the +terms of any subsequent revision of the license. + +THIS SOFTWARE IS PROVIDED BY THE OPENLDAP FOUNDATION AND ITS +CONTRIBUTORS ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT +SHALL THE OPENLDAP FOUNDATION, ITS CONTRIBUTORS, OR THE AUTHOR(S) +OR OWNER(S) OF THE SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +The names of the authors and copyright holders must not be used in +advertising or otherwise to promote the sale, use or other dealing +in this Software without specific, written prior permission. Title +to copyright in this Software shall at all times remain with copyright +holders. + +OpenLDAP is a registered trademark of the OpenLDAP Foundation. + +Copyright 1999-2003 The OpenLDAP Foundation, Redwood City, +California, USA. All Rights Reserved. Permission to copy and +distribute verbatim copies of this document is granted. diff --git a/lmdb/cppmidl.cpp b/lmdb/cppmidl.cpp new file mode 100644 index 0000000..187b9c2 --- /dev/null +++ b/lmdb/cppmidl.cpp @@ -0,0 +1,71 @@ +#include "cppmidl.h" +#include + +struct _I_CPPMIDL /* C++ state behind the C handle: a std::set of IDs plus the cursor iterator used by cppmidl_begin()/cppmidl_next(). */ +{ + std::set ids; + std::set::iterator it; +}; + +extern "C" +{ + struct _CPPMIDL /* Definition of the opaque handle type; it only wraps the C++ state above. */ + { + struct _I_CPPMIDL i; + }; +} + +CPPMIDL cppmidl_alloc() /* Allocates a handle with new; release it with cppmidl_free(). */ +{ + return new _CPPMIDL; +} + +const MDB_ID* cppmidl_search(CPPMIDL ids, MDB_ID id) /* Returns a pointer to the element found by lower_bound(id), i.e. the first stored ID that does not order before id, or NULL when there is none. The result is not necessarily equal to id. */ +{ + std::set::iterator it = ids->i.ids.lower_bound(id); + if (it == ids->i.ids.end()) + { + return NULL; + } + return &(*it); +} + +void cppmidl_erase(CPPMIDL ids, MDB_ID id) /* Removes id from the set. NOTE(review): erasing the element the cursor currently points at would invalidate the cursor - confirm callers never erase it mid-iteration. */ +{ + ids->i.ids.erase(id); +} + +int cppmidl_empty(CPPMIDL ids) /* Returns 1 when the set holds no IDs, otherwise 0. */ +{ + return ids->i.ids.empty() ?
1 : 0; +} + +void cppmidl_begin(CPPMIDL ids) /* Resets the cursor to the first element of the set. */ +{ + ids->i.it = ids->i.ids.begin(); +} + +const MDB_ID* cppmidl_next(CPPMIDL ids) /* Returns a pointer to the ID under the cursor and advances the cursor, or NULL once the end of the set is reached. The cursor is only assigned in cppmidl_begin(), so call that first. */ +{ + if (ids->i.it == ids->i.ids.end()) + return NULL; + + const MDB_ID* ret = &(*ids->i.it); + ++ids->i.it; + return ret; +} + +void cppmidl_free(CPPMIDL ids) /* Deletes a handle obtained from cppmidl_alloc(). */ +{ + delete ids; +} + +void cppmidl_insert(CPPMIDL ids, MDB_ID id) /* Inserts id into the set. */ +{ + ids->i.ids.insert(id); +} + +void cppmidl_insert_list(CPPMIDL ids, CPPMIDL other) /* Inserts every ID held by other into ids; other is only read. */ +{ + ids->i.ids.insert(other->i.ids.begin(), other->i.ids.end()); +} diff --git a/lmdb/cppmidl.h b/lmdb/cppmidl.h new file mode 100644 index 0000000..7bf03ca --- /dev/null +++ b/lmdb/cppmidl.h @@ -0,0 +1,36 @@ +#pragma once +#include + +#ifdef __cplusplus +extern "C" { +#endif + + typedef size_t MDB_ID; + + struct _CPPMIDL; + typedef struct _CPPMIDL* CPPMIDL; /* Opaque handle passed to every cppmidl_* function below. */ + + struct _CPPMIDL_IT; + typedef struct __CPPMIDL_IT* CPPMIDL_IT; /* NOTE(review): this names struct __CPPMIDL_IT (two underscores) while the forward declaration above is struct _CPPMIDL_IT - looks like a typo; no function declared in this header uses CPPMIDL_IT. */ + + CPPMIDL cppmidl_alloc(); + + const MDB_ID* cppmidl_search(CPPMIDL ids, MDB_ID id); + + void cppmidl_erase(CPPMIDL ids, MDB_ID id); + + int cppmidl_empty(CPPMIDL ids); + + void cppmidl_begin(CPPMIDL ids); + + const MDB_ID* cppmidl_next(CPPMIDL ids); + + void cppmidl_free(CPPMIDL ids); + + void cppmidl_insert_list(CPPMIDL ids, CPPMIDL other); + + void cppmidl_insert(CPPMIDL ids, MDB_ID id); + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/lmdb/lmdb.h b/lmdb/lmdb.h new file mode 100644 index 0000000..faf8ed8 --- /dev/null +++ b/lmdb/lmdb.h @@ -0,0 +1,1639 @@ +/** @file lmdb.h + * @brief Lightning memory-mapped database library + * + * @mainpage Lightning Memory-Mapped Database Manager (LMDB) + * + * @section intro_sec Introduction + * LMDB is a Btree-based database management library modeled loosely on the + * BerkeleyDB API, but much simplified. The entire database is exposed + * in a memory map, and all data fetches return data directly + * from the mapped memory, so no malloc's or memcpy's occur during + * data fetches.
As such, the library is extremely simple because it + * requires no page caching layer of its own, and it is extremely high + * performance and memory-efficient. It is also fully transactional with + * full ACID semantics, and when the memory map is read-only, the + * database integrity cannot be corrupted by stray pointer writes from + * application code. + * + * The library is fully thread-aware and supports concurrent read/write + * access from multiple processes and threads. Data pages use a copy-on- + * write strategy so no active data pages are ever overwritten, which + * also provides resistance to corruption and eliminates the need of any + * special recovery procedures after a system crash. Writes are fully + * serialized; only one write transaction may be active at a time, which + * guarantees that writers can never deadlock. The database structure is + * multi-versioned so readers run with no locks; writers cannot block + * readers, and readers don't block writers. + * + * Unlike other well-known database mechanisms which use either write-ahead + * transaction logs or append-only data writes, LMDB requires no maintenance + * during operation. Both write-ahead loggers and append-only databases + * require periodic checkpointing and/or compaction of their log or database + * files otherwise they grow without bound. LMDB tracks free pages within + * the database and re-uses them for new write operations, so the database + * size does not grow without bound in normal use. + * + * The memory map can be used as a read-only or read-write map. It is + * read-only by default as this provides total immunity to corruption. + * Using read-write mode offers much higher write performance, but adds + * the possibility for stray application writes thru pointers to silently + * corrupt the database. Of course if your application code is known to + * be bug-free (...) then this is not an issue. 
+ * + * If this is your first time using a transactional embedded key/value + * store, you may find the \ref starting page to be helpful. + * + * @section caveats_sec Caveats + * Troubleshooting the lock file, plus semaphores on BSD systems: + * + * - A broken lockfile can cause sync issues. + * Stale reader transactions left behind by an aborted program + * cause further writes to grow the database quickly, and + * stale locks can block further operation. + * + * Fix: Check for stale readers periodically, using the + * #mdb_reader_check function or the \ref mdb_stat_1 "mdb_stat" tool. + * Stale writers will be cleared automatically on some systems: + * - Windows - automatic + * - Linux, systems using POSIX mutexes with Robust option - automatic + * - not on BSD, systems using POSIX semaphores. + * Otherwise just make all programs using the database close it; + * the lockfile is always reset on first open of the environment. + * + * - On BSD systems or others configured with MDB_USE_POSIX_SEM, + * startup can fail due to semaphores owned by another userid. + * + * Fix: Open and close the database as the user which owns the + * semaphores (likely last user) or as root, while no other + * process is using the database. + * + * Restrictions/caveats (in addition to those listed for some functions): + * + * - Only the database owner should normally use the database on + * BSD systems or when otherwise configured with MDB_USE_POSIX_SEM. + * Multiple users can cause startup to fail later, as noted above. + * + * - There is normally no pure read-only mode, since readers need write + * access to locks and lock file. Exceptions: On read-only filesystems + * or with the #MDB_NOLOCK flag described under #mdb_env_open(). + * + * - An LMDB configuration will often reserve considerable \b unused + * memory address space and maybe file size for future growth. 
+ * This does not use actual memory or disk space, but users may need + * to understand the difference so they won't be scared off. + * + * - By default, in versions before 0.9.10, unused portions of the data + * file might receive garbage data from memory freed by other code. + * (This does not happen when using the #MDB_WRITEMAP flag.) As of + * 0.9.10 the default behavior is to initialize such memory before + * writing to the data file. Since there may be a slight performance + * cost due to this initialization, applications may disable it using + * the #MDB_NOMEMINIT flag. Applications handling sensitive data + * which must not be written should not use this flag. This flag is + * irrelevant when using #MDB_WRITEMAP. + * + * - A thread can only use one transaction at a time, plus any child + * transactions. Each transaction belongs to one thread. See below. + * The #MDB_NOTLS flag changes this for read-only transactions. + * + * - Use an MDB_env* in the process which opened it, not after fork(). + * + * - Do not have open an LMDB database twice in the same process at + * the same time. Not even from a plain open() call - close()ing it + * breaks fcntl() advisory locking. (It is OK to reopen it after + * fork() - exec*(), since the lockfile has FD_CLOEXEC set.) + * + * - Avoid long-lived transactions. Read transactions prevent + * reuse of pages freed by newer write transactions, thus the + * database can grow quickly. Write transactions prevent + * other write transactions, since writes are serialized. + * + * - Avoid suspending a process with active transactions. These + * would then be "long-lived" as above. Also read transactions + * suspended when writers commit could sometimes see wrong data. + * + * ...when several processes can use a database concurrently: + * + * - Avoid aborting a process with an active transaction. 
+ * The transaction becomes "long-lived" as above until a check + * for stale readers is performed or the lockfile is reset, + * since the process may not remove it from the lockfile. + * + * This does not apply to write transactions if the system clears + * stale writers, see above. + * + * - If you do that anyway, do a periodic check for stale readers. Or + * close the environment once in a while, so the lockfile can get reset. + * + * - Do not use LMDB databases on remote filesystems, even between + * processes on the same host. This breaks flock() on some OSes, + * possibly memory map sync, and certainly sync between programs + * on different hosts. + * + * - Opening a database can fail if another process is opening or + * closing it at exactly the same time. + * + * @author Howard Chu, Symas Corporation. + * + * @copyright Copyright 2011-2019 Howard Chu, Symas Corp. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * <http://www.OpenLDAP.org/license.html>. + * + * @par Derived From: + * This code is derived from btree.c written by Martin Hedenfalk. + * + * Copyright (c) 2009, 2010 Martin Hedenfalk + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#ifndef _LMDB_H_ +#define _LMDB_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** Unix permissions for creating files, or dummy definition for Windows */ +#ifdef _MSC_VER +typedef int mdb_mode_t; +#else +typedef mode_t mdb_mode_t; +#endif + +/** An abstraction for a file handle. + * On POSIX systems file handles are small integers. On Windows + * they're opaque pointers. + */ +#ifdef _WIN32 +typedef void *mdb_filehandle_t; +#else +typedef int mdb_filehandle_t; +#endif + +/** @defgroup mdb LMDB API + * @{ + * @brief OpenLDAP Lightning Memory-Mapped Database Manager + */ +/** @defgroup Version Version Macros + * @{ + */ +/** Library major version */ +#define MDB_VERSION_MAJOR 0 +/** Library minor version */ +#define MDB_VERSION_MINOR 9 +/** Library patch version */ +#define MDB_VERSION_PATCH 24 + +/** Combine args a,b,c into a single integer for easy version comparisons */ +#define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c)) + +/** The full library version as a single integer */ +#define MDB_VERSION_FULL \ + MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH) + +/** The release date of this library version */ +#define MDB_VERSION_DATE "July 24, 2019" + +/** A stringifier for the version info */ +#define MDB_VERSTR(a,b,c,d) "LMDB " #a "." #b "." #c ": (" d ")" + +/** A helper for the stringifier macro */ +#define MDB_VERFOO(a,b,c,d) MDB_VERSTR(a,b,c,d) + +/** The full library version as a C string */ +#define MDB_VERSION_STRING \ + MDB_VERFOO(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH,MDB_VERSION_DATE) +/** @} */ + +/** @brief Opaque structure for a database environment. 
+ * + * A DB environment supports multiple databases, all residing in the same + * shared-memory map. + */ +typedef struct MDB_env MDB_env; + +/** @brief Opaque structure for a transaction handle. + * + * All database operations require a transaction handle. Transactions may be + * read-only or read-write. + */ +typedef struct MDB_txn MDB_txn; + +/** @brief A handle for an individual database in the DB environment. */ +typedef unsigned int MDB_dbi; + +/** @brief Opaque structure for navigating through a database */ +typedef struct MDB_cursor MDB_cursor; + +/** @brief Generic structure used for passing keys and data in and out + * of the database. + * + * Values returned from the database are valid only until a subsequent + * update operation, or the end of the transaction. Do not modify or + * free them, they commonly point into the database itself. + * + * Key sizes must be between 1 and #mdb_env_get_maxkeysize() inclusive. + * The same applies to data sizes in databases with the #MDB_DUPSORT flag. + * Other data items can in theory be from 0 to 0xffffffff bytes long. + */ +typedef struct MDB_val { + size_t mv_size; /**< size of the data item */ + void *mv_data; /**< address of the data item */ +} MDB_val; + +/** @brief A callback function used to compare two keys in a database */ +typedef int (MDB_cmp_func)(const MDB_val *a, const MDB_val *b); + +/** @brief A callback function used to relocate a position-dependent data item + * in a fixed-address database. + * + * The \b newptr gives the item's desired address in + * the memory map, and \b oldptr gives its previous address. The item's actual + * data resides at the address in \b item. This callback is expected to walk + * through the fields of the record in \b item and modify any + * values based at the \b oldptr address to be relative to the \b newptr address. + * @param[in,out] item The item that is to be relocated. + * @param[in] oldptr The previous address. 
+ * @param[in] newptr The new address to relocate to. + * @param[in] relctx An application-provided context, set by #mdb_set_relctx(). + * @todo This feature is currently unimplemented. + */ +typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *relctx); + +/** @defgroup mdb_env Environment Flags + * @{ + */ + /** mmap at a fixed address (experimental) */ +#define MDB_FIXEDMAP 0x01 + /** no environment directory */ +#define MDB_NOSUBDIR 0x4000 + /** don't fsync after commit */ +#define MDB_NOSYNC 0x10000 + /** read only */ +#define MDB_RDONLY 0x20000 + /** don't fsync metapage after commit */ +#define MDB_NOMETASYNC 0x40000 + /** use writable mmap */ +#define MDB_WRITEMAP 0x80000 + /** use asynchronous msync when #MDB_WRITEMAP is used */ +#define MDB_MAPASYNC 0x100000 + /** tie reader locktable slots to #MDB_txn objects instead of to threads */ +#define MDB_NOTLS 0x200000 + /** don't do any locking, caller must manage their own locks */ +#define MDB_NOLOCK 0x400000 + /** don't do readahead (no effect on Windows) */ +#define MDB_NORDAHEAD 0x800000 + /** don't initialize malloc'd memory before writing to datafile */ +#define MDB_NOMEMINIT 0x1000000 + /** use the previous snapshot rather than the latest one */ +#define MDB_PREVSNAPSHOT 0x2000000 +/** @} */ + +/** @defgroup mdb_dbi_open Database Flags + * @{ + */ + /** use reverse string keys */ +#define MDB_REVERSEKEY 0x02 + /** use sorted duplicates */ +#define MDB_DUPSORT 0x04 + /** numeric keys in native byte order: either unsigned int or size_t. + * The keys must all be of the same size. 
*/ +#define MDB_INTEGERKEY 0x08 + /** with #MDB_DUPSORT, sorted dup items have fixed size */ +#define MDB_DUPFIXED 0x10 + /** with #MDB_DUPSORT, dups are #MDB_INTEGERKEY-style integers */ +#define MDB_INTEGERDUP 0x20 + /** with #MDB_DUPSORT, use reverse string dups */ +#define MDB_REVERSEDUP 0x40 + /** create DB if not already existing */ +#define MDB_CREATE 0x40000 +/** @} */ + +/** @defgroup mdb_put Write Flags + * @{ + */ +/** For put: Don't write if the key already exists. */ +#define MDB_NOOVERWRITE 0x10 +/** Only for #MDB_DUPSORT
+ * For put: don't write if the key and data pair already exist.
+ * For mdb_cursor_del: remove all duplicate data items. + */ +#define MDB_NODUPDATA 0x20 +/** For mdb_cursor_put: overwrite the current key/data pair */ +#define MDB_CURRENT 0x40 +/** For put: Just reserve space for data, don't copy it. Return a + * pointer to the reserved space. + */ +#define MDB_RESERVE 0x10000 +/** Data is being appended, don't split full pages. */ +#define MDB_APPEND 0x20000 +/** Duplicate data is being appended, don't split full pages. */ +#define MDB_APPENDDUP 0x40000 +/** Store multiple data items in one call. Only for #MDB_DUPFIXED. */ +#define MDB_MULTIPLE 0x80000 +/* @} */ + +/** @defgroup mdb_copy Copy Flags + * @{ + */ +/** Compacting copy: Omit free space from copy, and renumber all + * pages sequentially. + */ +#define MDB_CP_COMPACT 0x01 +/* @} */ + +/** @brief Cursor Get operations. + * + * This is the set of all operations for retrieving data + * using a cursor. + */ +typedef enum MDB_cursor_op { + MDB_FIRST, /**< Position at first key/data item */ + MDB_FIRST_DUP, /**< Position at first data item of current key. + Only for #MDB_DUPSORT */ + MDB_GET_BOTH, /**< Position at key/data pair. Only for #MDB_DUPSORT */ + MDB_GET_BOTH_RANGE, /**< position at key, nearest data. Only for #MDB_DUPSORT */ + MDB_GET_CURRENT, /**< Return key/data at current cursor position */ + MDB_GET_MULTIPLE, /**< Return up to a page of duplicate data items + from current cursor position. Move cursor to prepare + for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */ + MDB_LAST, /**< Position at last key/data item */ + MDB_LAST_DUP, /**< Position at last data item of current key. + Only for #MDB_DUPSORT */ + MDB_NEXT, /**< Position at next data item */ + MDB_NEXT_DUP, /**< Position at next data item of current key. + Only for #MDB_DUPSORT */ + MDB_NEXT_MULTIPLE, /**< Return up to a page of duplicate data items + from next cursor position. Move cursor to prepare + for #MDB_NEXT_MULTIPLE. 
Only for #MDB_DUPFIXED */ + MDB_NEXT_NODUP, /**< Position at first data item of next key */ + MDB_PREV, /**< Position at previous data item */ + MDB_PREV_DUP, /**< Position at previous data item of current key. + Only for #MDB_DUPSORT */ + MDB_PREV_NODUP, /**< Position at last data item of previous key */ + MDB_SET, /**< Position at specified key */ + MDB_SET_KEY, /**< Position at specified key, return key + data */ + MDB_SET_RANGE, /**< Position at first key greater than or equal to specified key. */ + MDB_PREV_MULTIPLE /**< Position at previous page and return up to + a page of duplicate data items. Only for #MDB_DUPFIXED */ +} MDB_cursor_op; + +/** @defgroup errors Return Codes + * + * BerkeleyDB uses -30800 to -30999, we'll go under them + * @{ + */ + /** Successful result */ +#define MDB_SUCCESS 0 + /** key/data pair already exists */ +#define MDB_KEYEXIST (-30799) + /** key/data pair not found (EOF) */ +#define MDB_NOTFOUND (-30798) + /** Requested page not found - this usually indicates corruption */ +#define MDB_PAGE_NOTFOUND (-30797) + /** Located page was wrong type */ +#define MDB_CORRUPTED (-30796) + /** Update of meta page failed or environment had fatal error */ +#define MDB_PANIC (-30795) + /** Environment version mismatch */ +#define MDB_VERSION_MISMATCH (-30794) + /** File is not a valid LMDB file */ +#define MDB_INVALID (-30793) + /** Environment mapsize reached */ +#define MDB_MAP_FULL (-30792) + /** Environment maxdbs reached */ +#define MDB_DBS_FULL (-30791) + /** Environment maxreaders reached */ +#define MDB_READERS_FULL (-30790) + /** Too many TLS keys in use - Windows only */ +#define MDB_TLS_FULL (-30789) + /** Txn has too many dirty pages */ +#define MDB_TXN_FULL (-30788) + /** Cursor stack too deep - internal error */ +#define MDB_CURSOR_FULL (-30787) + /** Page has not enough space - internal error */ +#define MDB_PAGE_FULL (-30786) + /** Database contents grew beyond environment mapsize */ +#define MDB_MAP_RESIZED (-30785) + /** 
Operation and DB incompatible, or DB type changed. This can mean: + *
    + *
  • The operation expects an #MDB_DUPSORT / #MDB_DUPFIXED database. + *
  • Opening a named DB when the unnamed DB has #MDB_DUPSORT / #MDB_INTEGERKEY. + *
  • Accessing a data record as a database, or vice versa. + *
  • The database was dropped and recreated with different flags. + *
+ */ +#define MDB_INCOMPATIBLE (-30784) + /** Invalid reuse of reader locktable slot */ +#define MDB_BAD_RSLOT (-30783) + /** Transaction must abort, has a child, or is invalid */ +#define MDB_BAD_TXN (-30782) + /** Unsupported size of key/DB name/data, or wrong DUPFIXED size */ +#define MDB_BAD_VALSIZE (-30781) + /** The specified DBI was changed unexpectedly */ +#define MDB_BAD_DBI (-30780) + /** The last defined error code */ +#define MDB_LAST_ERRCODE MDB_BAD_DBI +/** @} */ + +/** @brief Statistics for a database in the environment */ +typedef struct MDB_stat { + unsigned int ms_psize; /**< Size of a database page. + This is currently the same for all databases. */ + unsigned int ms_depth; /**< Depth (height) of the B-tree */ + size_t ms_branch_pages; /**< Number of internal (non-leaf) pages */ + size_t ms_leaf_pages; /**< Number of leaf pages */ + size_t ms_overflow_pages; /**< Number of overflow pages */ + size_t ms_entries; /**< Number of data items */ +} MDB_stat; + +/** @brief Information about the environment */ +typedef struct MDB_envinfo { + void *me_mapaddr; /**< Address of map, if fixed */ + size_t me_mapsize; /**< Size of the data memory map */ + size_t me_last_pgno; /**< ID of the last used page */ + size_t me_last_txnid; /**< ID of the last committed transaction */ + unsigned int me_maxreaders; /**< max reader slots in the environment */ + unsigned int me_numreaders; /**< max reader slots used in the environment */ +} MDB_envinfo; + + /** @brief Return the LMDB library version information. + * + * @param[out] major if non-NULL, the library major version number is copied here + * @param[out] minor if non-NULL, the library minor version number is copied here + * @param[out] patch if non-NULL, the library patch version number is copied here + * @retval "version string" The library version as a string + */ +char *mdb_version(int *major, int *minor, int *patch); + + /** @brief Return a string describing a given error code. 
+ * + * This function is a superset of the ANSI C X3.159-1989 (ANSI C) strerror(3) + * function. If the error code is greater than or equal to 0, then the string + * returned by the system function strerror(3) is returned. If the error code + * is less than 0, an error string corresponding to the LMDB library error is + * returned. See @ref errors for a list of LMDB-specific error codes. + * @param[in] err The error code + * @retval "error message" The description of the error + */ +char *mdb_strerror(int err); + + /** @brief Create an LMDB environment handle. + * + * This function allocates memory for a #MDB_env structure. To release + * the allocated memory and discard the handle, call #mdb_env_close(). + * Before the handle may be used, it must be opened using #mdb_env_open(). + * Various other options may also need to be set before opening the handle, + * e.g. #mdb_env_set_mapsize(), #mdb_env_set_maxreaders(), #mdb_env_set_maxdbs(), + * depending on usage requirements. + * @param[out] env The address where the new handle will be stored + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_create(MDB_env **env); + + /** @brief Open an environment handle. + * + * If this function fails, #mdb_env_close() must be called to discard the #MDB_env handle. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] path The directory in which the database files reside. This + * directory must already exist and be writable. + * @param[in] flags Special options for this environment. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + * Flags set by mdb_env_set_flags() are also used. + *
    + *
  • #MDB_FIXEDMAP + * use a fixed address for the mmap region. This flag must be specified + * when creating the environment, and is stored persistently in the environment. + * If successful, the memory map will always reside at the same virtual address + * and pointers used to reference data items in the database will be constant + * across multiple invocations. This option may not always work, depending on + * how the operating system has allocated memory to shared libraries and other uses. + * The feature is highly experimental. + *
  • #MDB_NOSUBDIR + * By default, LMDB creates its environment in a directory whose + * pathname is given in \b path, and creates its data and lock files + * under that directory. With this option, \b path is used as-is for + * the database main data file. The database lock file is the \b path + * with "-lock" appended. + *
  • #MDB_RDONLY + * Open the environment in read-only mode. No write operations will be + * allowed. LMDB will still modify the lock file - except on read-only + * filesystems, where LMDB does not use locks. + *
  • #MDB_WRITEMAP + * Use a writeable memory map unless MDB_RDONLY is set. This uses + * fewer mallocs but loses protection from application bugs + * like wild pointer writes and other bad updates into the database. + * This may be slightly faster for DBs that fit entirely in RAM, but + * is slower for DBs larger than RAM. + * Incompatible with nested transactions. + * Do not mix processes with and without MDB_WRITEMAP on the same + * environment. This can defeat durability (#mdb_env_sync etc). + *
  • #MDB_NOMETASYNC + * Flush system buffers to disk only once per transaction, omit the + * metadata flush. Defer that until the system flushes files to disk, + * or next non-MDB_RDONLY commit or #mdb_env_sync(). This optimization + * maintains database integrity, but a system crash may undo the last + * committed transaction. I.e. it preserves the ACI (atomicity, + * consistency, isolation) but not D (durability) database property. + * This flag may be changed at any time using #mdb_env_set_flags(). + *
  • #MDB_NOSYNC + * Don't flush system buffers to disk when committing a transaction. + * This optimization means a system crash can corrupt the database or + * lose the last transactions if buffers are not yet flushed to disk. + * The risk is governed by how often the system flushes dirty buffers + * to disk and how often #mdb_env_sync() is called. However, if the + * filesystem preserves write order and the #MDB_WRITEMAP flag is not + * used, transactions exhibit ACI (atomicity, consistency, isolation) + * properties and only lose D (durability). I.e. database integrity + * is maintained, but a system crash may undo the final transactions. + * Note that (#MDB_NOSYNC | #MDB_WRITEMAP) leaves the system with no + * hint for when to write transactions to disk, unless #mdb_env_sync() + * is called. (#MDB_MAPASYNC | #MDB_WRITEMAP) may be preferable. + * This flag may be changed at any time using #mdb_env_set_flags(). + *
  • #MDB_MAPASYNC + * When using #MDB_WRITEMAP, use asynchronous flushes to disk. + * As with #MDB_NOSYNC, a system crash can then corrupt the + * database or lose the last transactions. Calling #mdb_env_sync() + * ensures on-disk database integrity until next commit. + * This flag may be changed at any time using #mdb_env_set_flags(). + *
  • #MDB_NOTLS + * Don't use Thread-Local Storage. Tie reader locktable slots to + * #MDB_txn objects instead of to threads. I.e. #mdb_txn_reset() keeps + * the slot reserved for the #MDB_txn object. A thread may use parallel + * read-only transactions. A read-only transaction may span threads if + * the user synchronizes its use. Applications that multiplex many + * user threads over individual OS threads need this option. Such an + * application must also serialize the write transactions in an OS + * thread, since LMDB's write locking is unaware of the user threads. + *
  • #MDB_NOLOCK + * Don't do any locking. If concurrent access is anticipated, the + * caller must manage all concurrency itself. For proper operation + * the caller must enforce single-writer semantics, and must ensure + * that no readers are using old transactions while a writer is + * active. The simplest approach is to use an exclusive lock so that + * no readers may be active at all when a writer begins. + *
  • #MDB_NORDAHEAD + * Turn off readahead. Most operating systems perform readahead on + * read requests by default. This option turns it off if the OS + * supports it. Turning it off may help random read performance + * when the DB is larger than RAM and system RAM is full. + * The option is not implemented on Windows. + *
  • #MDB_NOMEMINIT + * Don't initialize malloc'd memory before writing to unused spaces + * in the data file. By default, memory for pages written to the data + * file is obtained using malloc. While these pages may be reused in + * subsequent transactions, freshly malloc'd pages will be initialized + * to zeroes before use. This avoids persisting leftover data from other + * code (that used the heap and subsequently freed the memory) into the + * data file. Note that many other system libraries may allocate + * and free memory from the heap for arbitrary uses. E.g., stdio may + * use the heap for file I/O buffers. This initialization step has a + * modest performance cost so some applications may want to disable + * it using this flag. This option can be a problem for applications + * which handle sensitive data like passwords, and it makes memory + * checkers like Valgrind noisy. This flag is not needed with #MDB_WRITEMAP, + * which writes directly to the mmap instead of using malloc for pages. The + * initialization is also skipped if #MDB_RESERVE is used; the + * caller is expected to overwrite all of the memory that was + * reserved in that case. + * This flag may be changed at any time using #mdb_env_set_flags(). + *
  • #MDB_PREVSNAPSHOT + * Open the environment with the previous snapshot rather than the latest + * one. This loses the latest transaction, but may help work around some + * types of corruption. If opened with write access, this must be the + * only process using the environment. This flag is automatically reset + * after a write transaction is successfully committed. + *
+ * @param[in] mode The UNIX permissions to set on created files and semaphores. + * This parameter is ignored on Windows. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_VERSION_MISMATCH - the version of the LMDB library doesn't match the + * version that created the database environment. + *
  • #MDB_INVALID - the environment file headers are corrupted. + *
  • ENOENT - the directory specified by the path parameter doesn't exist. + *
  • EACCES - the user didn't have permission to access the environment files. + *
  • EAGAIN - the environment was locked by another process. + *
+ */ +int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode); + + /** @brief Copy an LMDB environment to the specified path. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdb_env_create(). It + * must have already been opened successfully. + * @param[in] path The directory in which the copy will reside. This + * directory must already exist and be writable but must otherwise be + * empty. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_copy(MDB_env *env, const char *path); + + /** @brief Copy an LMDB environment to the specified file descriptor. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdb_env_create(). It + * must have already been opened successfully. + * @param[in] fd The filedescriptor to write the copy to. It must + * have already been opened for Write access. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_copyfd(MDB_env *env, mdb_filehandle_t fd); + + /** @brief Copy an LMDB environment to the specified path, with options. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. 
+ * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdb_env_create(). It + * must have already been opened successfully. + * @param[in] path The directory in which the copy will reside. This + * directory must already exist and be writable but must otherwise be + * empty. + * @param[in] flags Special options for this operation. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_CP_COMPACT - Perform compaction while copying: omit free + * pages and sequentially renumber all pages in output. This option + * consumes more CPU and runs more slowly than the default. + * Currently it fails if the environment has suffered a page leak. + *
+ * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags); + + /** @brief Copy an LMDB environment to the specified file descriptor, + * with options. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. See + * #mdb_env_copy2() for further details. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdb_env_create(). It + * must have already been opened successfully. + * @param[in] fd The filedescriptor to write the copy to. It must + * have already been opened for Write access. + * @param[in] flags Special options for this operation. + * See #mdb_env_copy2() for options. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_copyfd2(MDB_env *env, mdb_filehandle_t fd, unsigned int flags); + + /** @brief Return statistics about the LMDB environment. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] stat The address of an #MDB_stat structure + * where the statistics will be copied + */ +int mdb_env_stat(MDB_env *env, MDB_stat *stat); + + /** @brief Return information about the LMDB environment. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] stat The address of an #MDB_envinfo structure + * where the information will be copied + */ +int mdb_env_info(MDB_env *env, MDB_envinfo *stat); + + /** @brief Flush the data buffers to disk. + * + * Data is always written to disk when #mdb_txn_commit() is called, + * but the operating system may keep it buffered. 
LMDB always flushes + * the OS buffers upon commit as well, unless the environment was + * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. This call is + * not valid if the environment was opened with #MDB_RDONLY. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] force If non-zero, force a synchronous flush. Otherwise + * if the environment has the #MDB_NOSYNC flag set the flushes + * will be omitted, and with #MDB_MAPASYNC they will be asynchronous. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EACCES - the environment is read-only. + *
  • EINVAL - an invalid parameter was specified. + *
  • EIO - an error occurred during synchronization. + *
+ */ +int mdb_env_sync(MDB_env *env, int force); + + /** @brief Close the environment and release the memory map. + * + * Only a single thread may call this function. All transactions, databases, + * and cursors must already be closed before calling this function. Attempts to + * use any such handles after calling this function will cause a SIGSEGV. + * The environment handle will be freed and must not be used again after this call. + * @param[in] env An environment handle returned by #mdb_env_create() + */ +void mdb_env_close(MDB_env *env); + + /** @brief Set environment flags. + * + * This may be used to set some flags in addition to those from + * #mdb_env_open(), or to unset these flags. If several threads + * change the flags at the same time, the result is undefined. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] flags The flags to change, bitwise OR'ed together + * @param[in] onoff A non-zero value sets the flags, zero clears them. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_env_set_flags(MDB_env *env, unsigned int flags, int onoff); + + /** @brief Get environment flags. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] flags The address of an integer to store the flags + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_env_get_flags(MDB_env *env, unsigned int *flags); + + /** @brief Return the path that was used in #mdb_env_open(). + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] path Address of a string pointer to contain the path. This + * is the actual string in the environment, not a copy. It should not be + * altered in any way. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_env_get_path(MDB_env *env, const char **path); + + /** @brief Return the filedescriptor for the given environment. + * + * This function may be called after fork(), so the descriptor can be + * closed before exec*(). Other LMDB file descriptors have FD_CLOEXEC. + * (Until LMDB 0.9.18, only the lockfile had that.) + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] fd Address of a mdb_filehandle_t to contain the descriptor. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd); + + /** @brief Set the size of the memory map to use for this environment. + * + * The size should be a multiple of the OS page size. The default is + * 10485760 bytes. The size of the memory map is also the maximum size + * of the database. The value should be chosen as large as possible, + * to accommodate future growth of the database. + * This function should be called after #mdb_env_create() and before #mdb_env_open(). + * It may be called at later times if no transactions are active in + * this process. Note that the library does not check for this condition, + * the caller must ensure it explicitly. + * + * The new size takes effect immediately for the current process but + * will not be persisted to any others until a write transaction has been + * committed by the current process. Also, only mapsize increases are + * persisted into the environment. + * + * If the mapsize is increased by another process, and data has grown + * beyond the range of the current mapsize, #mdb_txn_begin() will + * return #MDB_MAP_RESIZED. This function may be called with a size + * of zero to adopt the new size. + * + * Any attempt to set a size smaller than the space already consumed + * by the environment will be silently changed to the current size of the used space. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] size The size in bytes + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified, or the environment has + * an active write transaction. + *
+ */ +int mdb_env_set_mapsize(MDB_env *env, size_t size); + + /** @brief Set the maximum number of threads/reader slots for the environment. + * + * This defines the number of slots in the lock table that is used to track readers in the + * environment. The default is 126. + * Starting a read-only transaction normally ties a lock table slot to the + * current thread until the environment closes or the thread exits. If + * MDB_NOTLS is in use, #mdb_txn_begin() instead ties the slot to the + * MDB_txn object until it or the #MDB_env object is destroyed. + * This function may only be called after #mdb_env_create() and before #mdb_env_open(). + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] readers The maximum number of reader lock table slots + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified, or the environment is already open. + *
+ */ +int mdb_env_set_maxreaders(MDB_env *env, unsigned int readers); + + /** @brief Get the maximum number of threads/reader slots for the environment. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] readers Address of an integer to store the number of readers + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers); + + /** @brief Set the maximum number of named databases for the environment. + * + * This function is only needed if multiple databases will be used in the + * environment. Simpler applications that use the environment as a single + * unnamed database can ignore this option. + * This function may only be called after #mdb_env_create() and before #mdb_env_open(). + * + * Currently a moderate number of slots are cheap but a huge number gets + * expensive: 7-120 words per transaction, and every #mdb_dbi_open() + * does a linear search of the opened slots. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] dbs The maximum number of databases + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified, or the environment is already open. + *
+ */ +int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); + + /** @brief Get the maximum size of keys and #MDB_DUPSORT data we can write. + * + * Depends on the compile-time constant #MDB_MAXKEYSIZE. Default 511. + * See @ref MDB_val. + * @param[in] env An environment handle returned by #mdb_env_create() + * @return The maximum size of a key we can write + */ +int mdb_env_get_maxkeysize(MDB_env *env); + + /** @brief Set application information associated with the #MDB_env. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] ctx An arbitrary pointer for whatever the application needs. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_set_userctx(MDB_env *env, void *ctx); + + /** @brief Get the application information associated with the #MDB_env. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @return The pointer set by #mdb_env_set_userctx(). + */ +void *mdb_env_get_userctx(MDB_env *env); + + /** @brief A callback function for most LMDB assert() failures, + * called before printing the message and aborting. + * + * @param[in] env An environment handle returned by #mdb_env_create(). + * @param[in] msg The assertion message, not including newline. + */ +typedef void MDB_assert_func(MDB_env *env, const char *msg); + + /** Set or reset the assert() callback of the environment. + * Disabled if liblmdb is built with NDEBUG. + * @note This hack should become obsolete as lmdb's error handling matures. + * @param[in] env An environment handle returned by #mdb_env_create(). + * @param[in] func An #MDB_assert_func function, or 0. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_set_assert(MDB_env *env, MDB_assert_func *func); + + /** @brief Create a transaction for use with the environment. + * + * The transaction handle may be discarded using #mdb_txn_abort() or #mdb_txn_commit().
+ * @note A transaction and its cursors must only be used by a single + * thread, and a thread may only have a single transaction at a time. + * If #MDB_NOTLS is in use, this does not apply to read-only transactions. + * @note Cursors may not span transactions. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] parent If this parameter is non-NULL, the new transaction + * will be a nested transaction, with the transaction indicated by \b parent + * as its parent. Transactions may be nested to any level. A parent + * transaction and its cursors may not issue any other operations than + * mdb_txn_commit and mdb_txn_abort while it has active child transactions. + * @param[in] flags Special options for this transaction. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_RDONLY + * This transaction will not perform any write operations. + *
+ * @param[out] txn Address where the new #MDB_txn handle will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_PANIC - a fatal error occurred earlier and the environment + * must be shut down. + *
  • #MDB_MAP_RESIZED - another process wrote data beyond this MDB_env's + * mapsize and this environment's map must be resized as well. + * See #mdb_env_set_mapsize(). + *
  • #MDB_READERS_FULL - a read-only transaction was requested and + * the reader lock table is full. See #mdb_env_set_maxreaders(). + *
  • ENOMEM - out of memory. + *
+ */ +int mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **txn); + + /** @brief Returns the transaction's #MDB_env + * + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + */ +MDB_env *mdb_txn_env(MDB_txn *txn); + + /** @brief Return the transaction's ID. + * + * This returns the identifier associated with this transaction. For a + * read-only transaction, this corresponds to the snapshot being read; + * concurrent readers will frequently have the same transaction ID. + * + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @return A transaction ID, valid if input is an active transaction. + */ +size_t mdb_txn_id(MDB_txn *txn); + + /** @brief Commit all the operations of a transaction into the database. + * + * The transaction handle is freed. It and its cursors must not be used + * again after this call, except with #mdb_cursor_renew(). + * @note Earlier documentation incorrectly said all cursors would be freed. + * Only write-transactions free cursors. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
  • ENOSPC - no more disk space. + *
  • EIO - a low-level I/O error occurred while writing. + *
  • ENOMEM - out of memory. + *
+ */ +int mdb_txn_commit(MDB_txn *txn); + + /** @brief Abandon all the operations of the transaction instead of saving them. + * + * The transaction handle is freed. It and its cursors must not be used + * again after this call, except with #mdb_cursor_renew(). + * @note Earlier documentation incorrectly said all cursors would be freed. + * Only write-transactions free cursors. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + */ +void mdb_txn_abort(MDB_txn *txn); + + /** @brief Reset a read-only transaction. + * + * Abort the transaction like #mdb_txn_abort(), but keep the transaction + * handle. #mdb_txn_renew() may reuse the handle. This saves allocation + * overhead if the process will start a new read-only transaction soon, + * and also locking overhead if #MDB_NOTLS is in use. The reader table + * lock is released, but the table slot stays tied to its thread or + * #MDB_txn. Use mdb_txn_abort() to discard a reset handle, and to free + * its lock table slot if MDB_NOTLS is in use. + * Cursors opened within the transaction must not be used + * again after this call, except with #mdb_cursor_renew(). + * Reader locks generally don't interfere with writers, but they keep old + * versions of database pages allocated. Thus they prevent the old pages + * from being reused when writers commit new data, and so under heavy load + * the database size may grow much more rapidly than otherwise. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + */ +void mdb_txn_reset(MDB_txn *txn); + + /** @brief Renew a read-only transaction. + * + * This acquires a new reader lock for a transaction handle that had been + * released by #mdb_txn_reset(). It must be called before a reset transaction + * may be used again. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_PANIC - a fatal error occurred earlier and the environment + * must be shut down. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_txn_renew(MDB_txn *txn); + +/** Compat with version <= 0.9.4, avoid clash with libmdb from MDB Tools project */ +#define mdb_open(txn,name,flags,dbi) mdb_dbi_open(txn,name,flags,dbi) +/** Compat with version <= 0.9.4, avoid clash with libmdb from MDB Tools project */ +#define mdb_close(env,dbi) mdb_dbi_close(env,dbi) + + /** @brief Open a database in the environment. + * + * A database handle denotes the name and parameters of a database, + * independently of whether such a database exists. + * The database handle may be discarded by calling #mdb_dbi_close(). + * The old database handle is returned if the database was already open. + * The handle may only be closed once. + * + * The database handle will be private to the current transaction until + * the transaction is successfully committed. If the transaction is + * aborted the handle will be closed automatically. + * After a successful commit the handle will reside in the shared + * environment, and may be used by other transactions. + * + * This function must not be called from multiple concurrent + * transactions in the same process. A transaction that uses + * this function must finish (either commit or abort) before + * any other transaction in the process may use this function. + * + * To use named databases (with name != NULL), #mdb_env_set_maxdbs() + * must be called before opening the environment. Database names are + * keys in the unnamed database, and may be read but not written. + * + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] name The name of the database to open. If only a single + * database is needed in the environment, this value may be NULL. + * @param[in] flags Special options for this database. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_REVERSEKEY + * Keys are strings to be compared in reverse order, from the end + * of the strings to the beginning. By default, keys are treated as strings and + * compared from beginning to end. + *
  • #MDB_DUPSORT + * Duplicate keys may be used in the database. (Or, from another perspective, + * keys may have multiple data items, stored in sorted order.) By default + * keys must be unique and may have only a single data item. + *
  • #MDB_INTEGERKEY + * Keys are binary integers in native byte order, either unsigned int + * or size_t, and will be sorted as such. + * The keys must all be of the same size. + *
  • #MDB_DUPFIXED + * This flag may only be used in combination with #MDB_DUPSORT. This option + * tells the library that the data items for this database are all the same + * size, which allows further optimizations in storage and retrieval. When + * all data items are the same size, the #MDB_GET_MULTIPLE, #MDB_NEXT_MULTIPLE + * and #MDB_PREV_MULTIPLE cursor operations may be used to retrieve multiple + * items at once. + *
  • #MDB_INTEGERDUP + * This option specifies that duplicate data items are binary integers, + * similar to #MDB_INTEGERKEY keys. + *
  • #MDB_REVERSEDUP + * This option specifies that duplicate data items should be compared as + * strings in reverse order. + *
  • #MDB_CREATE + * Create the named database if it doesn't exist. This option is not + * allowed in a read-only transaction or a read-only environment. + *
+ * @param[out] dbi Address where the new #MDB_dbi handle will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_NOTFOUND - the specified database doesn't exist in the environment + * and #MDB_CREATE was not specified. + *
  • #MDB_DBS_FULL - too many databases have been opened. See #mdb_env_set_maxdbs(). + *
+ */ +int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi); + + /** @brief Retrieve statistics for a database. + * + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[out] stat The address of an #MDB_stat structure + * where the statistics will be copied + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat); + + /** @brief Retrieve the DB flags for a database handle. + * + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[out] flags Address where the flags will be returned. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags); + + /** @brief Close a database handle. Normally unnecessary. Use with care: + * + * This call is not mutex protected. Handles should only be closed by + * a single thread, and only if no other threads are going to reference + * the database handle or one of its cursors any further. Do not close + * a handle if an existing transaction has modified its database. + * Doing so can cause misbehavior from database corruption to errors + * like MDB_BAD_VALSIZE (since the DB name is gone). + * + * Closing a database handle is not necessary, but lets #mdb_dbi_open() + * reuse the handle value. Usually it's better to set a bigger + * #mdb_env_set_maxdbs(), unless that value would be large. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + */ +void mdb_dbi_close(MDB_env *env, MDB_dbi dbi); + + /** @brief Empty or delete+close a database. + * + * See #mdb_dbi_close() for restrictions about closing the DB handle. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] del 0 to empty the DB, 1 to delete it from the + * environment and close the DB handle. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del); + + /** @brief Set a custom key comparison function for a database. 
+ * + * The comparison function is called whenever it is necessary to compare a + * key specified by the application with a key currently stored in the database. + * If no comparison function is specified, and no special key flags were specified + * with #mdb_dbi_open(), the keys are compared lexically, with shorter keys collating + * before longer keys. + * @warning This function must be called before any data access functions are used, + * otherwise data corruption may occur. The same comparison function must be used by every + * program accessing the database, every time the database is used. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] cmp A #MDB_cmp_func function + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); + + /** @brief Set a custom data comparison function for a #MDB_DUPSORT database. + * + * This comparison function is called whenever it is necessary to compare a data + * item specified by the application with a data item currently stored in the database. + * This function only takes effect if the database was opened with the #MDB_DUPSORT + * flag. + * If no comparison function is specified, and no special key flags were specified + * with #mdb_dbi_open(), the data items are compared lexically, with shorter items collating + * before longer items. + * @warning This function must be called before any data access functions are used, + * otherwise data corruption may occur. The same comparison function must be used by every + * program accessing the database, every time the database is used. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] cmp A #MDB_cmp_func function + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); + + /** @brief Set a relocation function for a #MDB_FIXEDMAP database. + * + * @todo The relocation function is called whenever it is necessary to move the data + * of an item to a different position in the database (e.g. through tree + * balancing operations, shifts as a result of adds or deletes, etc.). It is + * intended to allow address/position-dependent data items to be stored in + * a database in an environment opened with the #MDB_FIXEDMAP option. + * Currently the relocation feature is unimplemented and setting + * this function has no effect. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] rel A #MDB_rel_func function + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
<ul> + *
<li>EINVAL - an invalid parameter was specified. + * </ul>
+ */ +int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel); + + /** @brief Set a context pointer for a #MDB_FIXEDMAP database's relocation function. + * + * See #mdb_set_relfunc and #MDB_rel_func for more details. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] ctx An arbitrary pointer for whatever the application needs. + * It will be passed to the callback function set by #mdb_set_relfunc + * as its \b relctx parameter whenever the callback is invoked. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
<ul> + *
<li>EINVAL - an invalid parameter was specified. + * </ul>
+ */ +int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx); + + /** @brief Get items from a database. + * + * This function retrieves key/data pairs from the database. The address + * and length of the data associated with the specified \b key are returned + * in the structure to which \b data refers. + * If the database supports duplicate keys (#MDB_DUPSORT) then the + * first data item for the key will be returned. Retrieval of other + * items requires the use of #mdb_cursor_get(). + * + * @note The memory pointed to by the returned values is owned by the + * database. The caller need not dispose of the memory, and may not + * modify it in any way. For values returned in a read-only transaction + * any modification attempts will cause a SIGSEGV. + * @note Values returned from the database are valid only until a + * subsequent update operation, or the end of the transaction. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] key The key to search for in the database + * @param[out] data The data corresponding to the key + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
<ul> + *
<li>#MDB_NOTFOUND - the key was not in the database. + *
<li>EINVAL - an invalid parameter was specified. + * </ul>
+ */ +int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); + + /** @brief Store items into a database. + * + * This function stores key/data pairs in the database. The default behavior + * is to enter the new key/data pair, replacing any previously existing key + * if duplicates are disallowed, or adding a duplicate data item if + * duplicates are allowed (#MDB_DUPSORT). + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] key The key to store in the database + * @param[in,out] data The data to store + * @param[in] flags Special options for this operation. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
<ul> + *
<li>#MDB_NODUPDATA - enter the new key/data pair only if it does not + * already appear in the database. This flag may only be specified + * if the database was opened with #MDB_DUPSORT. The function will + * return #MDB_KEYEXIST if the key/data pair already appears in the + * database. + *
<li>#MDB_NOOVERWRITE - enter the new key/data pair only if the key + * does not already appear in the database. The function will return + * #MDB_KEYEXIST if the key already appears in the database, even if + * the database supports duplicates (#MDB_DUPSORT). The \b data + * parameter will be set to point to the existing item. + *
<li>#MDB_RESERVE - reserve space for data of the given size, but + * don't copy the given data. Instead, return a pointer to the + * reserved space, which the caller can fill in later - before + * the next update operation or the transaction ends. This saves + * an extra memcpy if the data is being generated later. + * LMDB does nothing else with this memory, the caller is expected + * to modify all of the space requested. This flag must not be + * specified if the database was opened with #MDB_DUPSORT. + *
<li>#MDB_APPEND - append the given key/data pair to the end of the + * database. This option allows fast bulk loading when keys are + * already known to be in the correct order. Loading unsorted keys + * with this flag will cause a #MDB_KEYEXIST error. + *
<li>#MDB_APPENDDUP - as above, but for sorted dup data. + * </ul>
+ * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
<ul> + *
<li>#MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). + *
<li>#MDB_TXN_FULL - the transaction has too many dirty pages. + *
<li>EACCES - an attempt was made to write in a read-only transaction. + *
<li>EINVAL - an invalid parameter was specified. + * </ul>
+ */ +int mdb_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, + unsigned int flags); + + /** @brief Delete items from a database. + * + * This function removes key/data pairs from the database. + * If the database does not support sorted duplicate data items + * (#MDB_DUPSORT) the data parameter is ignored. + * If the database supports sorted duplicates and the data parameter + * is NULL, all of the duplicate data items for the key will be + * deleted. Otherwise, if the data parameter is non-NULL + * only the matching data item will be deleted. + * This function will return #MDB_NOTFOUND if the specified key/data + * pair is not in the database. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] key The key to delete from the database + * @param[in] data The data to delete + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
<ul> + *
<li>EACCES - an attempt was made to write in a read-only transaction. + *
<li>EINVAL - an invalid parameter was specified. + * </ul>
+ */ +int mdb_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); + + /** @brief Create a cursor handle. + * + * A cursor is associated with a specific transaction and database. + * A cursor cannot be used when its database handle is closed. Nor + * when its transaction has ended, except with #mdb_cursor_renew(). + * It can be discarded with #mdb_cursor_close(). + * A cursor in a write-transaction can be closed before its transaction + * ends, and will otherwise be closed when its transaction ends. + * A cursor in a read-only transaction must be closed explicitly, before + * or after its transaction ends. It can be reused with + * #mdb_cursor_renew() before finally closing it. + * @note Earlier documentation said that cursors in every transaction + * were closed when the transaction committed or aborted. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[out] cursor Address where the new #MDB_cursor handle will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
<ul> + *
<li>EINVAL - an invalid parameter was specified. + * </ul>
+ */ +int mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **cursor); + + /** @brief Close a cursor handle. + * + * The cursor handle will be freed and must not be used again after this call. + * Its transaction must still be live if it is a write-transaction. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + */ +void mdb_cursor_close(MDB_cursor *cursor); + + /** @brief Renew a cursor handle. + * + * A cursor is associated with a specific transaction and database. + * Cursors that are only used in read-only + * transactions may be re-used, to avoid unnecessary malloc/free overhead. + * The cursor may be associated with a new read-only transaction, and + * referencing the same database handle as it was created with. + * This may be done whether the previous transaction is live or dead. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
<ul> + *
<li>EINVAL - an invalid parameter was specified. + * </ul>
+ */ +int mdb_cursor_renew(MDB_txn *txn, MDB_cursor *cursor); + + /** @brief Return the cursor's transaction handle. + * + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + */ +MDB_txn *mdb_cursor_txn(MDB_cursor *cursor); + + /** @brief Return the cursor's database handle. + * + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + */ +MDB_dbi mdb_cursor_dbi(MDB_cursor *cursor); + + /** @brief Retrieve by cursor. + * + * This function retrieves key/data pairs from the database. The address and length + * of the key are returned in the object to which \b key refers (except for the + * case of the #MDB_SET option, in which the \b key object is unchanged), and + * the address and length of the data are returned in the object to which \b data + * refers. + * See #mdb_get() for restrictions on using the output values. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @param[in,out] key The key for a retrieved item + * @param[in,out] data The data of a retrieved item + * @param[in] op A cursor operation #MDB_cursor_op + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
<ul> + *
<li>#MDB_NOTFOUND - no matching key found. + *
<li>EINVAL - an invalid parameter was specified. + * </ul>
+ */ +int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, + MDB_cursor_op op); + + /** @brief Store by cursor. + * + * This function stores key/data pairs into the database. + * The cursor is positioned at the new item, or on failure usually near it. + * @note Earlier documentation incorrectly said errors would leave the + * state of the cursor unchanged. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @param[in] key The key operated on. + * @param[in] data The data operated on. + * @param[in] flags Options for this operation. This parameter + * must be set to 0 or one of the values described here. + *
<ul> + *
<li>#MDB_CURRENT - replace the item at the current cursor position. + * The \b key parameter must still be provided, and must match it. + * If using sorted duplicates (#MDB_DUPSORT) the data item must still + * sort into the same place. This is intended to be used when the + * new data is the same size as the old. Otherwise it will simply + * perform a delete of the old record followed by an insert. + *
<li>#MDB_NODUPDATA - enter the new key/data pair only if it does not + * already appear in the database. This flag may only be specified + * if the database was opened with #MDB_DUPSORT. The function will + * return #MDB_KEYEXIST if the key/data pair already appears in the + * database. + *
<li>#MDB_NOOVERWRITE - enter the new key/data pair only if the key + * does not already appear in the database. The function will return + * #MDB_KEYEXIST if the key already appears in the database, even if + * the database supports duplicates (#MDB_DUPSORT). + *
<li>#MDB_RESERVE - reserve space for data of the given size, but + * don't copy the given data. Instead, return a pointer to the + * reserved space, which the caller can fill in later - before + * the next update operation or the transaction ends. This saves + * an extra memcpy if the data is being generated later. This flag + * must not be specified if the database was opened with #MDB_DUPSORT. + *
<li>#MDB_APPEND - append the given key/data pair to the end of the + * database. No key comparisons are performed. This option allows + * fast bulk loading when keys are already known to be in the + * correct order. Loading unsorted keys with this flag will cause + * a #MDB_KEYEXIST error. + *
<li>#MDB_APPENDDUP - as above, but for sorted dup data. + *
<li>#MDB_MULTIPLE - store multiple contiguous data elements in a + * single request. This flag may only be specified if the database + * was opened with #MDB_DUPFIXED. The \b data argument must be an + * array of two MDB_vals. The mv_size of the first MDB_val must be + * the size of a single data element. The mv_data of the first MDB_val + * must point to the beginning of the array of contiguous data elements. + * The mv_size of the second MDB_val must be the count of the number + * of data elements to store. On return this field will be set to + * the count of the number of elements actually written. The mv_data + * of the second MDB_val is unused. + * </ul>
+ * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
<ul> + *
<li>#MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). + *
<li>#MDB_TXN_FULL - the transaction has too many dirty pages. + *
<li>EACCES - an attempt was made to write in a read-only transaction. + *
<li>EINVAL - an invalid parameter was specified. + * </ul>
+ */ +int mdb_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, + unsigned int flags); + + /** @brief Delete current key/data pair + * + * This function deletes the key/data pair to which the cursor refers. + * This does not invalidate the cursor, so operations such as MDB_NEXT + * can still be used on it. + * Both MDB_NEXT and MDB_GET_CURRENT will return the same record after + * this operation. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @param[in] flags Options for this operation. This parameter + * must be set to 0 or one of the values described here. + *
<ul> + *
<li>#MDB_NODUPDATA - delete all of the data items for the current key. + * This flag may only be specified if the database was opened with #MDB_DUPSORT. + * </ul>
+ * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
<ul> + *
<li>EACCES - an attempt was made to write in a read-only transaction. + *
<li>EINVAL - an invalid parameter was specified. + * </ul>
+ */ +int mdb_cursor_del(MDB_cursor *cursor, unsigned int flags); + + /** @brief Return count of duplicates for current key. + * + * This call is only valid on databases that support sorted duplicate + * data items #MDB_DUPSORT. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @param[out] countp Address where the count will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
<ul> + *
<li>EINVAL - cursor is not initialized, or an invalid parameter was specified. + * </ul>
+ */ +int mdb_cursor_count(MDB_cursor *cursor, size_t *countp); + + /** @brief Compare two data items according to a particular database. + * + * This returns a comparison as if the two data items were keys in the + * specified database. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] a The first item to compare + * @param[in] b The second item to compare + * @return < 0 if a < b, 0 if a == b, > 0 if a > b + */ +int mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); + + /** @brief Compare two data items according to a particular database. + * + * This returns a comparison as if the two items were data items of + * the specified database. The database must have the #MDB_DUPSORT flag. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] a The first item to compare + * @param[in] b The second item to compare + * @return < 0 if a < b, 0 if a == b, > 0 if a > b + */ +int mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); + + /** @brief A callback function used to print a message from the library. + * + * @param[in] msg The string to be printed. + * @param[in] ctx An arbitrary context pointer for the callback. + * @return < 0 on failure, >= 0 on success. + */ +typedef int (MDB_msg_func)(const char *msg, void *ctx); + + /** @brief Dump the entries in the reader lock table. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] func A #MDB_msg_func function + * @param[in] ctx Anything the message function needs + * @return < 0 on failure, >= 0 on success. + */ +int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); + + /** @brief Check for stale entries in the reader lock table. 
+ * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] dead Number of stale slots that were cleared + * @return 0 on success, non-zero on failure. + */ +int mdb_reader_check(MDB_env *env, int *dead); +/** @} */ + +int mdb_madvise(MDB_env *env, int random); + +void mdb_env_unmap(MDB_env *env); + +size_t mdb_get_txnid(MDB_txn* txn); + +void mdb_env_get_txnids(MDB_env* env, size_t* txnid1, size_t* txnid2); + +void mdb_get_map(MDB_env *env, char** p_mmap, size_t* p_size); + +int mdb_cursor_first_leaf_page(MDB_cursor* cursor, size_t* pgno); + +int mdb_cursor_next_leaf_page(MDB_cursor* cursor, size_t* pgno); + +int mdb_cursor_get_pageno(MDB_cursor* cursor, size_t* pgno); + +int mdb_page_get_nkeys(MDB_cursor* mc, size_t pgno, unsigned int* nkeys); + +int mdb_page_get_val(MDB_cursor* mc, size_t pgno, unsigned int idx, + MDB_val* key, MDB_val* data); + +int mdb_page_is_dirty(MDB_cursor* mc, size_t pgno, int* dirty); + +#ifdef __cplusplus +} +#endif +/** @page tools LMDB Command Line Tools + The following describes the command line tools that are available for LMDB. + \li \ref mdb_copy_1 + \li \ref mdb_dump_1 + \li \ref mdb_load_1 + \li \ref mdb_stat_1 +*/ + +#endif /* _LMDB_H_ */ diff --git a/lmdb/mdb.c b/lmdb/mdb.c new file mode 100644 index 0000000..2fe54ed --- /dev/null +++ b/lmdb/mdb.c @@ -0,0 +1,10488 @@ +/** @file mdb.c + * @brief Lightning memory-mapped database library + * + * A Btree-based database management library modeled loosely on the + * BerkeleyDB API, but much simplified. + */ +/* + * Copyright 2011-2019 Howard Chu, Symas Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * <http://www.OpenLDAP.org/license.html>. + * + * This code is derived from btree.c written by Martin Hedenfalk.
+ * + * Copyright (c) 2009, 2010 Martin Hedenfalk + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif +#if defined(__WIN64__) +#define _FILE_OFFSET_BITS 64 +#endif +#ifdef _WIN32 +#include +#include +#include /* get wcscpy() */ + +/** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it + * as int64 which is wrong. MSVC doesn't define it at all, so just + * don't use it. 
+ */ +#define MDB_PID_T int +#define MDB_THR_T DWORD +#include +#include +#ifdef __GNUC__ +# include +#else +# define LITTLE_ENDIAN 1234 +# define BIG_ENDIAN 4321 +# define BYTE_ORDER LITTLE_ENDIAN +# ifndef SSIZE_MAX +# define SSIZE_MAX INT_MAX +# endif +#endif +#else +#include +#include +#define MDB_PID_T pid_t +#define MDB_THR_T pthread_t +#include +#include +#include +#ifdef HAVE_SYS_FILE_H +#include +#endif +#include +#endif + +#if defined(__mips) && defined(__linux) +/* MIPS has cache coherency issues, requires explicit cache control */ +#include +extern int cacheflush(char *addr, int nbytes, int cache); +#define CACHEFLUSH(addr, bytes, cache) cacheflush(addr, bytes, cache) +#else +#define CACHEFLUSH(addr, bytes, cache) +#endif + +#if defined(__linux) && !defined(MDB_FDATASYNC_WORKS) +/** fdatasync is broken on ext3/ext4fs on older kernels, see + * description in #mdb_env_open2 comments. You can safely + * define MDB_FDATASYNC_WORKS if this code will only be run + * on kernels 3.6 and newer. 
+ */ +#define BROKEN_FDATASYNC +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#include +typedef SSIZE_T ssize_t; +#else +#include +#endif + +#if defined(__sun) || defined(ANDROID) +/* Most platforms have posix_memalign, older may only have memalign */ +#define HAVE_MEMALIGN 1 +#include +/* On Solaris, we need the POSIX sigwait function */ +#if defined (__sun) +# define _POSIX_PTHREAD_SEMANTICS 1 +#endif +#endif + +#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) +#include +#include /* defines BYTE_ORDER on HPUX and Solaris */ +#endif + +#if defined(__APPLE__) || defined (BSD) || defined(__FreeBSD_kernel__) +# define MDB_USE_POSIX_SEM 1 +# define MDB_FDATASYNC fsync +#elif defined(ANDROID) +# define MDB_FDATASYNC fsync +#endif + +#ifndef _WIN32 +#include +#include +#ifdef MDB_USE_POSIX_SEM +# define MDB_USE_HASH 1 +#include +#else +#define MDB_USE_POSIX_MUTEX 1 +#endif +#endif + +#if defined(_WIN32) + defined(MDB_USE_POSIX_SEM) \ + + defined(MDB_USE_POSIX_MUTEX) != 1 +# error "Ambiguous shared-lock implementation" +#endif + +#ifdef USE_VALGRIND +#include +#define VGMEMP_CREATE(h,r,z) VALGRIND_CREATE_MEMPOOL(h,r,z) +#define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s) +#define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a) +#define VGMEMP_DESTROY(h) VALGRIND_DESTROY_MEMPOOL(h) +#define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s) +#else +#define VGMEMP_CREATE(h,r,z) +#define VGMEMP_ALLOC(h,a,s) +#define VGMEMP_FREE(h,a) +#define VGMEMP_DESTROY(h) +#define VGMEMP_DEFINED(a,s) +#endif + +#ifndef BYTE_ORDER +# if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)) +/* Solaris just defines one or the other */ +# define LITTLE_ENDIAN 1234 +# define BIG_ENDIAN 4321 +# ifdef _LITTLE_ENDIAN +# define BYTE_ORDER LITTLE_ENDIAN +# else +# define BYTE_ORDER BIG_ENDIAN +# endif +# else +# define BYTE_ORDER __BYTE_ORDER +# endif +#endif + +#ifndef 
LITTLE_ENDIAN +#define LITTLE_ENDIAN __LITTLE_ENDIAN +#endif +#ifndef BIG_ENDIAN +#define BIG_ENDIAN __BIG_ENDIAN +#endif + +#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) +#define MISALIGNED_OK 1 +#endif + +#include "lmdb.h" +#include "midl.h" +#include "cppmidl.h" + +#if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN) +# error "Unknown or unsupported endianness (BYTE_ORDER)" +#elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF +# error "Two's complement, reasonably sized integer types, please" +#endif + +#ifdef __GNUC__ +/** Put infrequently used env functions in separate section */ +# ifdef __APPLE__ +# define ESECT __attribute__ ((section("__TEXT,text_env"))) +# else +# define ESECT __attribute__ ((section("text_env"))) +# endif +#else +#define ESECT +#endif + +#ifdef _WIN32 +#define CALL_CONV WINAPI +#else +#define CALL_CONV +#endif + +/** @defgroup internal LMDB Internals + * @{ + */ +/** @defgroup compat Compatibility Macros + * A bunch of macros to minimize the amount of platform-specific ifdefs + * needed throughout the rest of the code. When the features this library + * needs are similar enough to POSIX to be hidden in a one-or-two line + * replacement, this macro approach is used. 
+ * @{ + */ + + /** Features under development */ +#ifndef MDB_DEVEL +#define MDB_DEVEL 0 +#endif + + /** Wrapper around __func__, which is a C99 feature */ +#if __STDC_VERSION__ >= 199901L +# define mdb_func_ __func__ +#elif __GNUC__ >= 2 || _MSC_VER >= 1300 +# define mdb_func_ __FUNCTION__ +#else +/* If a debug message says (), update the #if statements above */ +# define mdb_func_ "" +#endif + +/* Internal error codes, not exposed outside liblmdb */ +#define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10) +#ifdef _WIN32 +#define MDB_OWNERDEAD ((int) WAIT_ABANDONED) +#elif defined(MDB_USE_POSIX_MUTEX) && defined(EOWNERDEAD) +#define MDB_OWNERDEAD EOWNERDEAD /**< #LOCK_MUTEX0() result if dead owner */ +#endif + +#ifdef __GLIBC__ +#define GLIBC_VER ((__GLIBC__ << 16 )| __GLIBC_MINOR__) +#endif +/** Some platforms define the EOWNERDEAD error code + * even though they don't support Robust Mutexes. + * Compile with -DMDB_USE_ROBUST=0, or use some other + * mechanism like -DMDB_USE_POSIX_SEM instead of + * -DMDB_USE_POSIX_MUTEX. + * (Posix semaphores are not robust.) + */ +#ifndef MDB_USE_ROBUST +/* Android currently lacks Robust Mutex support. So does glibc < 2.4. 
*/ +# if defined(MDB_USE_POSIX_MUTEX) && (defined(ANDROID) || \ + (defined(__GLIBC__) && GLIBC_VER < 0x020004)) +# define MDB_USE_ROBUST 0 +# else +# define MDB_USE_ROBUST 1 +# endif +#endif /* !MDB_USE_ROBUST */ + +#if defined(MDB_USE_POSIX_MUTEX) && (MDB_USE_ROBUST) +/* glibc < 2.12 only provided _np API */ +# if (defined(__GLIBC__) && GLIBC_VER < 0x02000c) || \ + (defined(PTHREAD_MUTEX_ROBUST_NP) && !defined(PTHREAD_MUTEX_ROBUST)) +# define PTHREAD_MUTEX_ROBUST PTHREAD_MUTEX_ROBUST_NP +# define pthread_mutexattr_setrobust(attr, flag) pthread_mutexattr_setrobust_np(attr, flag) +# define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) +# endif +#endif /* MDB_USE_POSIX_MUTEX && MDB_USE_ROBUST */ + +#if defined(MDB_OWNERDEAD) && (MDB_USE_ROBUST) +#define MDB_ROBUST_SUPPORTED 1 +#endif + +#ifdef _WIN32 +#define MDB_USE_HASH 1 +#define MDB_PIDLOCK 0 +#define THREAD_RET DWORD +#define pthread_t HANDLE +#define pthread_mutex_t HANDLE +#define pthread_cond_t HANDLE +typedef HANDLE mdb_mutex_t, mdb_mutexref_t; +#define pthread_key_t DWORD +#define pthread_self() GetCurrentThreadId() +#define pthread_key_create(x,y) \ + ((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0) +#define pthread_key_delete(x) TlsFree(x) +#define pthread_getspecific(x) TlsGetValue(x) +#define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode()) +#define pthread_mutex_unlock(x) ReleaseMutex(*x) +#define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE) +#define pthread_cond_signal(x) SetEvent(*x) +#define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0) +#define THREAD_CREATE(thr,start,arg) \ + (((thr) = CreateThread(NULL, 0, start, arg, 0, NULL)) ? 0 : ErrCode()) +#define THREAD_FINISH(thr) \ + (WaitForSingleObject(thr, INFINITE) ? 
ErrCode() : 0) +#define LOCK_MUTEX0(mutex) WaitForSingleObject(mutex, INFINITE) +#define UNLOCK_MUTEX(mutex) ReleaseMutex(mutex) +#define mdb_mutex_consistent(mutex) 0 +#define getpid() GetCurrentProcessId() +#define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd)) +#define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len)) +#define ErrCode() GetLastError() +#define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;} +#define close(fd) (CloseHandle(fd) ? 0 : -1) +#define munmap(ptr,len) UnmapViewOfFile(ptr) +#ifdef PROCESS_QUERY_LIMITED_INFORMATION +#define MDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION +#else +#define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000 +#endif +#define Z "I" +#else +#define THREAD_RET void * +#define THREAD_CREATE(thr,start,arg) pthread_create(&thr,NULL,start,arg) +#define THREAD_FINISH(thr) pthread_join(thr,NULL) +#define Z "z" /**< printf format modifier for size_t */ + + /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */ +#define MDB_PIDLOCK 1 + +#ifdef MDB_USE_POSIX_SEM + +typedef sem_t *mdb_mutex_t, *mdb_mutexref_t; +#define LOCK_MUTEX0(mutex) mdb_sem_wait(mutex) +#define UNLOCK_MUTEX(mutex) sem_post(mutex) + +static int +mdb_sem_wait(sem_t *sem) +{ + int rc; + while ((rc = sem_wait(sem)) && (rc = errno) == EINTR) ; + return rc; +} + +#else /* MDB_USE_POSIX_MUTEX: */ + /** Shared mutex/semaphore as the original is stored. + * + * Not for copies. Instead it can be assigned to an #mdb_mutexref_t. + * When mdb_mutexref_t is a pointer and mdb_mutex_t is not, then it + * is array[size 1] so it can be assigned to the pointer. + */ +typedef pthread_mutex_t mdb_mutex_t[1]; + /** Reference to an #mdb_mutex_t */ +typedef pthread_mutex_t *mdb_mutexref_t; + /** Lock the reader or writer mutex. + * Returns 0 or a code to give #mdb_mutex_failed(), as in #LOCK_MUTEX(). + */ +#define LOCK_MUTEX0(mutex) pthread_mutex_lock(mutex) + /** Unlock the reader or writer mutex. 
+ */ +#define UNLOCK_MUTEX(mutex) pthread_mutex_unlock(mutex) + /** Mark mutex-protected data as repaired, after death of previous owner. + */ +#define mdb_mutex_consistent(mutex) pthread_mutex_consistent(mutex) +#endif /* MDB_USE_POSIX_SEM */ + + /** Get the error code for the last failed system function. + */ +#define ErrCode() errno + + /** An abstraction for a file handle. + * On POSIX systems file handles are small integers. On Windows + * they're opaque pointers. + */ +#define HANDLE int + + /** A value for an invalid file handle. + * Mainly used to initialize file variables and signify that they are + * unused. + */ +#define INVALID_HANDLE_VALUE (-1) + + /** Get the size of a memory page for the system. + * This is the basic size that the platform's memory manager uses, and is + * fundamental to the use of memory-mapped files. + */ +#define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE)) +#endif + +#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) +#define MNAME_LEN 32 +#else +#define MNAME_LEN (sizeof(pthread_mutex_t)) +#endif + +#ifdef __linux__ +#include +#include +#endif + +#if !defined(BLKGETSIZE64) && defined(__i386__) && defined(__x86_64__) +#define BLKGETSIZE64 _IOR(0x12,114,size_t) +#endif + +/** @} */ + +#ifdef MDB_ROBUST_SUPPORTED + /** Lock mutex, handle any error, set rc = result. + * Return 0 on success, nonzero (not rc) on error. + */ +#define LOCK_MUTEX(rc, env, mutex) \ + (((rc) = LOCK_MUTEX0(mutex)) && \ + ((rc) = mdb_mutex_failed(env, mutex, rc))) +static int mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc); +#else +#define LOCK_MUTEX(rc, env, mutex) ((rc) = LOCK_MUTEX0(mutex)) +#define mdb_mutex_failed(env, mutex, rc) (rc) +#endif + +#ifndef _WIN32 +/** A flag for opening a file and requesting synchronous data writes. + * This is only used when writing a meta page. It's not strictly needed; + * we could just do a normal write and then immediately perform a flush. 
+ * But if this flag is available it saves us an extra system call. + * + * @note If O_DSYNC is undefined but exists in /usr/include, + * preferably set some compiler flag to get the definition. + */ +#ifndef MDB_DSYNC +# ifdef O_DSYNC +# define MDB_DSYNC O_DSYNC +# else +# define MDB_DSYNC O_SYNC +# endif +#endif +#endif + +/** Function for flushing the data of a file. Define this to fsync + * if fdatasync() is not supported. + */ +#ifndef MDB_FDATASYNC +# define MDB_FDATASYNC fdatasync +#endif + +#ifndef MDB_MSYNC +# define MDB_MSYNC(addr,len,flags) msync(addr,len,flags) +#endif + +#ifndef MS_SYNC +#define MS_SYNC 1 +#endif + +#ifndef MS_ASYNC +#define MS_ASYNC 0 +#endif + + /** A page number in the database. + * Note that 64 bit page numbers are overkill, since pages themselves + * already represent 12-13 bits of addressable memory, and the OS will + * always limit applications to a maximum of 63 bits of address space. + * + * @note In the #MDB_node structure, we only store 48 bits of this value, + * which thus limits us to only 60 bits of addressable data. + */ +typedef MDB_ID pgno_t; + + /** A transaction ID. + * See struct MDB_txn.mt_txnid for details. + */ +typedef MDB_ID txnid_t; + +/** @defgroup debug Debug Macros + * @{ + */ +#ifndef MDB_DEBUG + /** Enable debug output. Needs variable argument macros (a C99 feature). + * Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs + * read from and written to the database (used for free space management). + */ +#define MDB_DEBUG 0 +#endif + +#if MDB_DEBUG +static int mdb_debug; +static txnid_t mdb_debug_start; + + /** Print a debug message with printf formatting. + * Requires double parenthesis around 2 or more args. + */ +# define DPRINTF(args) ((void) ((mdb_debug) && DPRINTF0 args)) +# define DPRINTF0(fmt, ...) \ + fprintf(stderr, "%s:%d " fmt "\n", mdb_func_, __LINE__, __VA_ARGS__) +#else +# define DPRINTF(args) ((void) 0) +#endif + /** Print a debug string. 
+ * The string is printed literally, with no format processing. + */ +#define DPUTS(arg) DPRINTF(("%s", arg)) + /** Debugging output value of a cursor DBI: Negative in a sub-cursor. */ +#define DDBI(mc) \ + (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) +/** @} */ + + /** @brief The maximum size of a database page. + * + * It is 32k or 64k, since value-PAGEBASE must fit in + * #MDB_page.%mp_upper. + * + * LMDB will use database pages < OS pages if needed. + * That causes more I/O in write transactions: The OS must + * know (read) the whole page before writing a partial page. + * + * Note that we don't currently support Huge pages. On Linux, + * regular data files cannot use Huge pages, and in general + * Huge pages aren't actually pageable. We rely on the OS + * demand-pager to read our data and page it out when memory + * pressure from other processes is high. So until OSs have + * actual paging support for Huge pages, they're not viable. + */ +#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) + + /** The minimum number of keys required in a database page. + * Setting this to a larger value will place a smaller bound on the + * maximum size of a data item. Data items larger than this size will + * be pushed into overflow pages instead of being stored directly in + * the B-tree node. This value used to default to 4. With a page size + * of 4096 bytes that meant that any item larger than 1024 bytes would + * go into an overflow page. That also meant that on average 2-3KB of + * each overflow page was wasted space. The value cannot be lower than + * 2 because then there would no longer be a tree structure. With this + * value, items larger than 2KB will go into overflow pages, and on + * average only 1KB will be wasted. + */ +#define MDB_MINKEYS 2 + + /** A stamp that identifies a file as an LMDB file. + * There's nothing special about this value other than that it is easily + * recognizable, and it will reflect any byte order mismatches.
+ */ +#define MDB_MAGIC 0xBEEFC0DE + + /** The version number for a database's datafile format. */ +#define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) + /** The version number for a database's lockfile format. */ +#define MDB_LOCK_VERSION 1 + + /** @brief The max size of a key we can write, or 0 for computed max. + * + * This macro should normally be left alone or set to 0. + * Note that a database with big keys or dupsort data cannot be + * reliably modified by a liblmdb which uses a smaller max. + * The default is 511 for backwards compat, or 0 when #MDB_DEVEL. + * + * Other values are allowed, for backwards compat. However: + * A value bigger than the computed max can break if you do not + * know what you are doing, and liblmdb <= 0.9.10 can break when + * modifying a DB with keys/dupsort data bigger than its max. + * + * Data items in an #MDB_DUPSORT database are also limited to + * this size, since they're actually keys of a sub-DB. Keys and + * #MDB_DUPSORT data items must fit on a node in a regular page. + */ +#ifndef MDB_MAXKEYSIZE +#define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511) +#endif + + /** The maximum size of a key we can write to the environment. */ +#if MDB_MAXKEYSIZE +#define ENV_MAXKEY(env) (MDB_MAXKEYSIZE) +#else +#define ENV_MAXKEY(env) ((env)->me_maxkey) +#endif + + /** @brief The maximum size of a data item. + * + * We only store a 32 bit value for node sizes. + */ +#define MAXDATASIZE 0xffffffffUL + +#if MDB_DEBUG + /** Key size which fits in a #DKBUF. + * @ingroup debug + */ +#define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511) + /** A key buffer. + * @ingroup debug + * This is used for printing a hex dump of a key's contents. + */ +#define DKBUF char kbuf[DKBUF_MAXKEYSIZE*2+1] + /** Display a key in hex. + * @ingroup debug + * Invoke a function to display a key in hex. + */ +#define DKEY(x) mdb_dkey(x, kbuf) +#else +#define DKBUF +#define DKEY(x) 0 +#endif + + /** An invalid page number. 
+ * Mainly used to denote an empty tree. + */ +#define P_INVALID (~(pgno_t)0) + + /** Test if the flags \b f are set in a flag word \b w. */ +#define F_ISSET(w, f) (((w) & (f)) == (f)) + + /** Round \b n up to an even number. */ +#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ + + /** Used for offsets within a single page. + * Since memory pages are typically 4 or 8KB in size, 12-13 bits, + * this is plenty. + */ +typedef uint16_t indx_t; + + /** Default size of memory map. + * This is certainly too small for any actual applications. Apps should always set + * the size explicitly using #mdb_env_set_mapsize(). + */ +#define DEFAULT_MAPSIZE 1048576 + +/** @defgroup readers Reader Lock Table + * Readers don't acquire any locks for their data access. Instead, they + * simply record their transaction ID in the reader table. The reader + * mutex is needed just to find an empty slot in the reader table. The + * slot's address is saved in thread-specific data so that subsequent read + * transactions started by the same thread need no further locking to proceed. + * + * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data. + * + * No reader table is used if the database is on a read-only filesystem, or + * if #MDB_NOLOCK is set. + * + * Since the database uses multi-version concurrency control, readers don't + * actually need any locking. This table is used to keep track of which + * readers are using data from which old transactions, so that we'll know + * when a particular old transaction is no longer in use. Old transactions + * that have discarded any data pages can then have those pages reclaimed + * for use by a later write transaction. + * + * The lock table is constructed such that reader slots are aligned with the + * processor's cache line size. Any slot is only ever used by one thread. 
+ * This alignment guarantees that there will be no contention or cache + * thrashing as threads update their own slot info, and also eliminates + * any need for locking when accessing a slot. + * + * A writer thread will scan every slot in the table to determine the oldest + * outstanding reader transaction. Any freed pages older than this will be + * reclaimed by the writer. The writer doesn't use any locks when scanning + * this table. This means that there's no guarantee that the writer will + * see the most up-to-date reader info, but that's not required for correct + * operation - all we need is to know the upper bound on the oldest reader, + * we don't care at all about the newest reader. So the only consequence of + * reading stale information here is that old pages might hang around a + * while longer before being reclaimed. That's actually good anyway, because + * the longer we delay reclaiming old pages, the more likely it is that a + * string of contiguous pages can be found after coalescing old pages from + * many old transactions together. + * @{ + */ + /** Number of slots in the reader table. + * This value was chosen somewhat arbitrarily. 126 readers plus a + * couple mutexes fit exactly into 8KB on my development machine. + * Applications should set the table size using #mdb_env_set_maxreaders(). + */ +#define DEFAULT_READERS 126 + + /** The size of a CPU cache line in bytes. We want our lock structures + * aligned to this size to avoid false cache line sharing in the + * lock table. + * This value works for most CPUs. For Itanium this should be 128. + */ +#ifndef CACHELINE +#define CACHELINE 64 +#endif + + /** The information we store in a single slot of the reader table. + * In addition to a transaction ID, we also record the process and + * thread ID that owns a slot, so that we can detect stale information, + * e.g. threads or processes that went away without cleaning up. + * @note We currently don't check for stale records. 
We simply re-init + * the table when we know that we're the only process opening the + * lock file. + */ +typedef struct MDB_rxbody { + /** Current Transaction ID when this transaction began, or (txnid_t)-1. + * Multiple readers that start at the same time will probably have the + * same ID here. Again, it's not important to exclude them from + * anything; all we need to know is which version of the DB they + * started from so we can avoid overwriting any data used in that + * particular version. + */ + volatile txnid_t mrb_txnid; + /** The process ID of the process owning this reader txn. */ + volatile MDB_PID_T mrb_pid; + /** The thread ID of the thread owning this txn. */ + volatile MDB_THR_T mrb_tid; +} MDB_rxbody; + + /** The actual reader record, with cacheline padding. */ +typedef struct MDB_reader { + union { + MDB_rxbody mrx; + /** shorthand for mrb_txnid */ +#define mr_txnid mru.mrx.mrb_txnid +#define mr_pid mru.mrx.mrb_pid +#define mr_tid mru.mrx.mrb_tid + /** cache line alignment */ + char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)]; + } mru; +} MDB_reader; + + /** The header for the reader table. + * The table resides in a memory-mapped file. (This is a different file + * than is used for the main database.) + * + * For POSIX the actual mutexes reside in the shared memory of this + * mapped file. On Windows, mutexes are named objects allocated by the + * kernel; we store the mutex names in this mapped file so that other + * processes can grab them. This same approach is also used on + * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support + * process-shared POSIX mutexes. For these cases where a named object + * is used, the object name is derived from a 64 bit FNV hash of the + * environment pathname. As such, naming collisions are extremely + * unlikely. If a collision occurs, the results are unpredictable. + */ +typedef struct MDB_txbody { + /** Stamp identifying this as an LMDB file. It must be set + * to #MDB_MAGIC. 
*/ + uint32_t mtb_magic; + /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */ + uint32_t mtb_format; +#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) + char mtb_rmname[MNAME_LEN]; +#else + /** Mutex protecting access to this table. + * This is the reader table lock used with LOCK_MUTEX(). + */ + mdb_mutex_t mtb_rmutex; +#endif + /** The ID of the last transaction committed to the database. + * This is recorded here only for convenience; the value can always + * be determined by reading the main database meta pages. + */ + volatile txnid_t mtb_txnid; + /** The number of slots that have been used in the reader table. + * This always records the maximum count, it is not decremented + * when readers release their slots. + */ + volatile unsigned mtb_numreaders; +} MDB_txbody; + + /** The actual reader table definition. */ +typedef struct MDB_txninfo { + union { + MDB_txbody mtb; +#define mti_magic mt1.mtb.mtb_magic +#define mti_format mt1.mtb.mtb_format +#define mti_rmutex mt1.mtb.mtb_rmutex +#define mti_rmname mt1.mtb.mtb_rmname +#define mti_txnid mt1.mtb.mtb_txnid +#define mti_numreaders mt1.mtb.mtb_numreaders + char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; + } mt1; + union { +#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) + char mt2_wmname[MNAME_LEN]; +#define mti_wmname mt2.mt2_wmname +#else + mdb_mutex_t mt2_wmutex; +#define mti_wmutex mt2.mt2_wmutex +#endif + char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)]; + } mt2; + MDB_reader mti_readers[1]; +} MDB_txninfo; + + /** Lockfile format signature: version, features and field layout */ +#define MDB_LOCK_FORMAT \ + ((uint32_t) \ + ((MDB_LOCK_VERSION) \ + /* Flags which describe functionality */ \ + + (((MDB_PIDLOCK) != 0) << 16))) +/** @} */ + +/** Common header for all page types. The page type depends on #mp_flags. + * + * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with + * sorted #mp_ptrs[] entries referring to them. 
Exception: #P_LEAF2 pages + * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. + * + * #P_OVERFLOW records occupy one or more contiguous pages where only the + * first has a page header. They hold the real data of #F_BIGDATA nodes. + * + * #P_SUBP sub-pages are small leaf "pages" with duplicate data. + * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. + * (Duplicate data can also go in sub-databases, which use normal pages.) + * + * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot. + * + * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once + * in the snapshot: Either used by a database or listed in a freeDB record. + */ +typedef struct MDB_page { +#define mp_pgno mp_p.p_pgno +#define mp_next mp_p.p_next + union { + pgno_t p_pgno; /**< page number */ + struct MDB_page *p_next; /**< for in-memory list of freed pages */ + } mp_p; + uint16_t mp_pad; /**< key size if this is a LEAF2 page */ +/** @defgroup mdb_page Page Flags + * @ingroup internal + * Flags for the page headers. 
+ * @{ + */ +#define P_BRANCH 0x01 /**< branch page */ +#define P_LEAF 0x02 /**< leaf page */ +#define P_OVERFLOW 0x04 /**< overflow page */ +#define P_META 0x08 /**< meta page */ +#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ +#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */ +#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ +#define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */ +#define P_KEEP 0x8000 /**< leave this page alone during spill */ +/** @} */ + uint16_t mp_flags; /**< @ref mdb_page */ +#define mp_lower mp_pb.pb.pb_lower +#define mp_upper mp_pb.pb.pb_upper +#define mp_pages mp_pb.pb_pages + union { + struct { + indx_t pb_lower; /**< lower bound of free space */ + indx_t pb_upper; /**< upper bound of free space */ + } pb; + uint32_t pb_pages; /**< number of overflow pages */ + } mp_pb; + indx_t mp_ptrs[1]; /**< dynamic size */ +} MDB_page; + + /** Size of the page header, excluding dynamic data at the end */ +#define PAGEHDRSZ ((unsigned) offsetof(MDB_page, mp_ptrs)) + + /** Address of first usable data byte in a page, after the header */ +#define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) + + /** ITS#7713, change PAGEBASE to handle 65536 byte pages */ +#define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0) + + /** Number of nodes on a page */ +#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ-PAGEBASE)) >> 1) + + /** The amount of space remaining in the page */ +#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) + + /** The percentage of space used in the page, in tenths of a percent. */ +#define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ + ((env)->me_psize - PAGEHDRSZ)) + /** The minimum page fill factor, in tenths of a percent. + * Pages emptier than this are candidates for merging. 
+ */ +#define FILL_THRESHOLD 250 + + /** Test if a page is a leaf page */ +#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF) + /** Test if a page is a LEAF2 page */ +#define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2) + /** Test if a page is a branch page */ +#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) + /** Test if a page is an overflow page */ +#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW) + /** Test if a page is a sub page */ +#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) + + /** The number of overflow pages needed to store the given size. */ +#define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) + + /** Link in #MDB_txn.%mt_loose_pgs list. + * Kept outside the page header, which is needed when reusing the page. + */ +#define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) + + /** Header for a single key/data pair within a page. + * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. + * We guarantee 2-byte alignment for 'MDB_node's. + * + * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child + * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used + * for pgno. (Branch nodes have no flags). Lo and hi are in host byte + * order in case some accesses can be optimized to 32-bit word access. + * + * Leaf node flags describe node contents. #F_BIGDATA says the node's + * data part is the page number of an overflow page with actual data. + * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in + * a sub-page/sub-database, and named databases (just #F_SUBDATA). + */ +typedef struct MDB_node { + /** part of data size or pgno + * @{ */ +#if BYTE_ORDER == LITTLE_ENDIAN + unsigned short mn_lo, mn_hi; +#else + unsigned short mn_hi, mn_lo; +#endif + /** @} */ +/** @defgroup mdb_node Node Flags + * @ingroup internal + * Flags for node headers. 
+ * @{ + */ +#define F_BIGDATA 0x01 /**< data put on overflow page */ +#define F_SUBDATA 0x02 /**< data is a sub-database */ +#define F_DUPDATA 0x04 /**< data has duplicates */ + +/** valid flags for #mdb_node_add() */ +#define NODE_ADD_FLAGS (F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND) + +/** @} */ + unsigned short mn_flags; /**< @ref mdb_node */ + unsigned short mn_ksize; /**< key size */ + char mn_data[1]; /**< key and data are appended here */ +} MDB_node; + + /** Size of the node header, excluding dynamic data at the end */ +#define NODESIZE offsetof(MDB_node, mn_data) + + /** Bit position of top word in page number, for shifting mn_flags */ +#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) + + /** Size of a node in a branch page with a given key. + * This is just the node header plus the key, there is no data. + */ +#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size)) + + /** Size of a node in a leaf page with a given key and data. + * This is node header plus key plus data size. + */ +#define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) + + /** Address of node \b i in page \b p */ +#define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE)) + + /** Address of the key for the node */ +#define NODEKEY(node) (void *)((node)->mn_data) + + /** Address of the data for a node */ +#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) + + /** Get the page number pointed to by a branch node */ +#define NODEPGNO(node) \ + ((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \ + (PGNO_TOPWORD ? 
((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0)) + /** Set the page number in a branch node */ +#define SETPGNO(node,pgno) do { \ + (node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \ + if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0) + + /** Get the size of the data in a leaf node */ +#define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16)) + /** Set the size of the data for a leaf node */ +#define SETDSZ(node,size) do { \ + (node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0) + /** The size of a key in a node */ +#define NODEKSZ(node) ((node)->mn_ksize) + + /** Copy a page number from src to dst */ +#ifdef MISALIGNED_OK +#define COPY_PGNO(dst,src) dst = src +#else +#if SIZE_MAX > 4294967295UL +#define COPY_PGNO(dst,src) do { \ + unsigned short *s, *d; \ + s = (unsigned short *)&(src); \ + d = (unsigned short *)&(dst); \ + *d++ = *s++; \ + *d++ = *s++; \ + *d++ = *s++; \ + *d = *s; \ +} while (0) +#else +#define COPY_PGNO(dst,src) do { \ + unsigned short *s, *d; \ + s = (unsigned short *)&(src); \ + d = (unsigned short *)&(dst); \ + *d++ = *s++; \ + *d = *s; \ +} while (0) +#endif +#endif + /** The address of a key in a LEAF2 page. + * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs. + * There are no node headers, keys are stored contiguously. + */ +#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks))) + + /** Set the \b node's key into \b keyptr, if requested. */ +#define MDB_GET_KEY(node, keyptr) { if ((keyptr) != NULL) { \ + (keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } } + + /** Set the \b node's key into \b key. */ +#define MDB_GET_KEY2(node, key) { key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); } + + /** Information about a single database in the environment. 
*/ +typedef struct MDB_db { + uint32_t md_pad; /**< also ksize for LEAF2 pages */ + uint16_t md_flags; /**< @ref mdb_dbi_open */ + uint16_t md_depth; /**< depth of this tree */ + pgno_t md_branch_pages; /**< number of internal pages */ + pgno_t md_leaf_pages; /**< number of leaf pages */ + pgno_t md_overflow_pages; /**< number of overflow pages */ + size_t md_entries; /**< number of data items */ + pgno_t md_root; /**< the root page of this tree */ +} MDB_db; + +#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ +#define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) + /** #mdb_dbi_open() flags */ +#define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\ + MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) + + /** Handle for the DB used to track free pages. */ +#define FREE_DBI 0 + /** Handle for the default DB. */ +#define MAIN_DBI 1 + /** Number of DBs in metapage (free and main) - also hardcoded elsewhere */ +#define CORE_DBS 2 + + /** Number of meta pages - also hardcoded elsewhere */ +#define NUM_METAS 2 + + /** Meta page content. + * A meta page is the start point for accessing a database snapshot. + * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2). + */ +typedef struct MDB_meta { + /** Stamp identifying this as an LMDB file. It must be set + * to #MDB_MAGIC. */ + uint32_t mm_magic; + /** Version number of this file. Must be set to #MDB_DATA_VERSION. */ + uint32_t mm_version; + void *mm_address; /**< address for fixed mapping */ + size_t mm_mapsize; /**< size of mmap region */ + MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ + /** The size of pages used in this DB */ +#define mm_psize mm_dbs[FREE_DBI].md_pad + /** Any persistent environment flags. @ref mdb_env */ +#define mm_flags mm_dbs[FREE_DBI].md_flags + /** Last used page in the datafile. + * Actually the file may be shorter if the freeDB lists the final pages. 
+ */ + pgno_t mm_last_pg; + volatile txnid_t mm_txnid; /**< txnid that committed this page */ +} MDB_meta; + + /** Buffer for a stack-allocated meta page. + * The members define size and alignment, and silence type + * aliasing warnings. They are not used directly; that could + * mean incorrectly using several union members in parallel. + */ +typedef union MDB_metabuf { + MDB_page mb_page; + struct { + char mm_pad[PAGEHDRSZ]; + MDB_meta mm_meta; + } mb_metabuf; +} MDB_metabuf; + + /** Auxiliary DB info. + * The information here is mostly static/read-only. There is + * only a single copy of this record in the environment. + */ +typedef struct MDB_dbx { + MDB_val md_name; /**< name of the database */ + MDB_cmp_func *md_cmp; /**< function for comparing keys */ + MDB_cmp_func *md_dcmp; /**< function for comparing data items */ + MDB_rel_func *md_rel; /**< user relocate function */ + void *md_relctx; /**< user-provided context for md_rel */ +} MDB_dbx; + + /** A database transaction. + * Every operation requires a transaction handle. + */ +struct MDB_txn { + MDB_txn *mt_parent; /**< parent of a nested txn */ + /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ + MDB_txn *mt_child; + pgno_t mt_next_pgno; /**< next unallocated page */ + /** The ID of this transaction. IDs are integers incrementing from 1. + * Only committed write transactions increment the ID. If a transaction + * aborts, the ID may be re-used by the next writer. + */ + txnid_t mt_txnid; + MDB_env *mt_env; /**< the DB environment */ + /** The list of pages that became unused during this transaction. + */ + MDB_IDL mt_free_pgs; + /** The list of loose pages that became unused and may be reused + * in this transaction, linked through #NEXT_LOOSE_PAGE(page). + */ + MDB_page *mt_loose_pgs; + /** Number of loose pages (#mt_loose_pgs) */ + int mt_loose_count; + /** The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. 
page numbers in here are + * shifted left by 1, deleted slots have the LSB set. + */ + CPPMIDL mt_spill_pgs; + union { + /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ + MDB_ID2L dirty_list; + /** For read txns: This thread/txn's reader table slot, or NULL. */ + MDB_reader *reader; + } mt_u; + /** Array of records for each DB known in the environment. */ + MDB_dbx *mt_dbxs; + /** Array of MDB_db records for each known DB */ + MDB_db *mt_dbs; + /** Array of sequence numbers for each DB handle */ + unsigned int *mt_dbiseqs; +/** @defgroup mt_dbflag Transaction DB Flags + * @ingroup internal + * @{ + */ +#define DB_DIRTY 0x01 /**< DB was written in this txn */ +#define DB_STALE 0x02 /**< Named-DB record is older than txnID */ +#define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ +#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ +#define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ +#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */ +/** @} */ + /** In write txns, array of cursors for each DB */ + MDB_cursor **mt_cursors; + /** Array of flags for each DB */ + unsigned char *mt_dbflags; + /** Number of DB records in use, or 0 when the txn is finished. + * This number only ever increments until the txn finishes; we + * don't decrement it when individual DB handles are closed. 
+ */ + MDB_dbi mt_numdbs; + +/** @defgroup mdb_txn Transaction Flags + * @ingroup internal + * @{ + */ + /** #mdb_txn_begin() flags */ +#define MDB_TXN_BEGIN_FLAGS MDB_RDONLY +#define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */ + /* internal txn flags */ +#define MDB_TXN_WRITEMAP MDB_WRITEMAP /**< copy of #MDB_env flag in writers */ +#define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */ +#define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */ +#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ +#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ +#define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */ + /** most operations on the txn are currently illegal */ +#define MDB_TXN_BLOCKED (MDB_TXN_FINISHED|MDB_TXN_ERROR|MDB_TXN_HAS_CHILD) +/** @} */ + unsigned int mt_flags; /**< @ref mdb_txn */ + /** #dirty_list room: Array size - \#dirty pages visible to this txn. + * Includes ancestor txns' dirty pages not hidden by other txns' + * dirty/spilled pages. Thus commit(nested txn) has room to merge + * dirty_list into mt_parent after freeing hidden mt_parent pages. + */ + unsigned int mt_dirty_room; +}; + +/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. + * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to + * raise this on a 64 bit machine. + */ +#define CURSOR_STACK 32 + +struct MDB_xcursor; + + /** Cursors are used for all DB operations. + * A cursor holds a path of (page pointer, key index) from the DB + * root to a position in the DB, plus other state. #MDB_DUPSORT + * cursors include an xcursor to the current data item. Write txns + * track their cursors and keep them up to date when data moves. + * Exception: An xcursor's pointer to a #P_SUBP page can be stale. + * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage). 
+ */ +struct MDB_cursor { + /** Next cursor on this DB in this txn */ + MDB_cursor *mc_next; + /** Backup of the original cursor if this cursor is a shadow */ + MDB_cursor *mc_backup; + /** Context used for databases with #MDB_DUPSORT, otherwise NULL */ + struct MDB_xcursor *mc_xcursor; + /** The transaction that owns this cursor */ + MDB_txn *mc_txn; + /** The database handle this cursor operates on */ + MDB_dbi mc_dbi; + /** The database record for this cursor */ + MDB_db *mc_db; + /** The database auxiliary record for this cursor */ + MDB_dbx *mc_dbx; + /** The @ref mt_dbflag for this database */ + unsigned char *mc_dbflag; + unsigned short mc_snum; /**< number of pushed pages */ + unsigned short mc_top; /**< index of top page, normally mc_snum-1 */ +/** @defgroup mdb_cursor Cursor Flags + * @ingroup internal + * Cursor state flags. + * @{ + */ +#define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */ +#define C_EOF 0x02 /**< No more data */ +#define C_SUB 0x04 /**< Cursor is a sub-cursor */ +#define C_DEL 0x08 /**< last op was a cursor_del */ +#define C_UNTRACK 0x40 /**< Un-track cursor when closing */ +/** @} */ + unsigned int mc_flags; /**< @ref mdb_cursor */ + MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ + indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ +}; + + /** Context for sorted-dup records. + * We could have gone to a fully recursive design, with arbitrarily + * deep nesting of sub-databases. But for now we only handle these + * levels - main DB, optional sub-DB, sorted-duplicate DB. 
+ */ +typedef struct MDB_xcursor { + /** A sub-cursor for traversing the Dup DB */ + MDB_cursor mx_cursor; + /** The database record for this Dup DB */ + MDB_db mx_db; + /** The auxiliary DB record for this Dup DB */ + MDB_dbx mx_dbx; + /** The @ref mt_dbflag for this Dup DB */ + unsigned char mx_dbflag; +} MDB_xcursor; + + /** Check if there is an inited xcursor */ +#define XCURSOR_INITED(mc) \ + ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + + /** Update the xcursor's sub-page pointer, if any, in \b mc. Needed + * when the node which contains the sub-page may have moved. Called + * with leaf page \b mp = mc->mc_pg[\b top]. + */ +#define XCURSOR_REFRESH(mc, top, mp) do { \ + MDB_page *xr_pg = (mp); \ + MDB_node *xr_node; \ + if (!XCURSOR_INITED(mc) || (mc)->mc_ki[top] >= NUMKEYS(xr_pg)) break; \ + xr_node = NODEPTR(xr_pg, (mc)->mc_ki[top]); \ + if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \ + (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ +} while (0) + + /** State of FreeDB old pages, stored in the MDB_env */ +typedef struct MDB_pgstate { + pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ + txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ +} MDB_pgstate; + + /** The database environment. */ +struct MDB_env { + HANDLE me_fd; /**< The main data file */ + HANDLE me_lfd; /**< The lock file */ + HANDLE me_mfd; /**< For writing and syncing the meta pages */ + /** Failed to update the meta page. Probably an I/O error. */ +#define MDB_FATAL_ERROR 0x80000000U + /** Some fields are initialized. 
*/ +#define MDB_ENV_ACTIVE 0x20000000U + /** me_txkey is set */ +#define MDB_ENV_TXKEY 0x10000000U + /** fdatasync is unreliable */ +#define MDB_FSYNCONLY 0x08000000U + uint32_t me_flags; /**< @ref mdb_env */ + unsigned int me_psize; /**< DB page size, inited from me_os_psize */ + unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */ + unsigned int me_maxreaders; /**< size of the reader table */ + /** Max #MDB_txninfo.%mti_numreaders of interest to #mdb_env_close() */ + volatile int me_close_readers; + MDB_dbi me_numdbs; /**< number of DBs opened */ + MDB_dbi me_maxdbs; /**< size of the DB table */ + MDB_PID_T me_pid; /**< process ID of this env */ + char *me_path; /**< path to the DB files */ + char *me_map; /**< the memory map of the data file */ + MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ + MDB_meta *me_metas[NUM_METAS]; /**< pointers to the two meta pages */ + void *me_pbuf; /**< scratch area for DUPSORT put() */ + MDB_txn *me_txn; /**< current write transaction */ + MDB_txn *me_txn0; /**< prealloc'd write transaction */ + size_t me_mapsize; /**< size of the data memory map */ + off_t me_size; /**< current file size */ + pgno_t me_maxpg; /**< me_mapsize / me_psize */ + MDB_dbx *me_dbxs; /**< array of static DB info */ + uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ + unsigned int *me_dbiseqs; /**< array of dbi sequence numbers */ + pthread_key_t me_txkey; /**< thread-key for readers */ + txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */ + MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ +# define me_pglast me_pgstate.mf_pglast +# define me_pghead me_pgstate.mf_pghead +// MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ + /** IDL of pages that became unused in a write txn */ + MDB_IDL me_free_pgs; + /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. 
*/ + MDB_ID2L me_dirty_list; + /** Max number of freelist items that can fit in a single overflow page */ + int me_maxfree_1pg; + /** Max size of a node on a page */ + unsigned int me_nodemax; +#if !(MDB_MAXKEYSIZE) + unsigned int me_maxkey; /**< max size of a key */ +#endif + int me_live_reader; /**< have liveness lock in reader table */ +#ifdef _WIN32 + int me_pidquery; /**< Used in OpenProcess */ +#endif +#ifdef MDB_USE_POSIX_MUTEX /* Posix mutexes reside in shared mem */ +# define me_rmutex me_txns->mti_rmutex /**< Shared reader lock */ +# define me_wmutex me_txns->mti_wmutex /**< Shared writer lock */ +#else + mdb_mutex_t me_rmutex; + mdb_mutex_t me_wmutex; +#endif + void *me_userctx; /**< User-settable context */ + MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ +}; + + /** Nested transaction */ +typedef struct MDB_ntxn { + MDB_txn mnt_txn; /**< the transaction */ + MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */ +} MDB_ntxn; + + /** max number of pages to commit in one writev() call */ +#define MDB_COMMIT_PAGES 64 +#if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES +#undef MDB_COMMIT_PAGES +#define MDB_COMMIT_PAGES IOV_MAX +#endif + + /** max bytes to write in one call */ +#define MAX_WRITE (0x40000000U >> (sizeof(ssize_t) == 4)) + + /** Check \b txn and \b dbi arguments to a function */ +#define TXN_DBI_EXIST(txn, dbi, validity) \ + ((txn) && (dbi)<(txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) + + /** Check for misused \b dbi handles */ +#define TXN_DBI_CHANGED(txn, dbi) \ + ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) + +static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp); +static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); +static int mdb_page_touch(MDB_cursor *mc); + +#define MDB_END_NAMES {"committed", "empty-commit", "abort", "reset", \ + "reset-tmp", "fail-begin", "fail-beginchild"} +enum { + /* mdb_txn_end operation number, for 
logging */ + MDB_END_COMMITTED, MDB_END_EMPTY_COMMIT, MDB_END_ABORT, MDB_END_RESET, + MDB_END_RESET_TMP, MDB_END_FAIL_BEGIN, MDB_END_FAIL_BEGINCHILD +}; +#define MDB_END_OPMASK 0x0F /**< mask for #mdb_txn_end() operation number */ +#define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ +#define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ +#define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ +static void mdb_txn_end(MDB_txn *txn, unsigned mode); + +static int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl); +static int mdb_page_search_root(MDB_cursor *mc, + MDB_val *key, int modify); +#define MDB_PS_MODIFY 1 +#define MDB_PS_ROOTONLY 2 +#define MDB_PS_FIRST 4 +#define MDB_PS_LAST 8 +static int mdb_page_search(MDB_cursor *mc, + MDB_val *key, int flags); +static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); + +#define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */ +static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, + pgno_t newpgno, unsigned int nflags); + +static int mdb_env_read_header(MDB_env *env, int prev, MDB_meta *meta); +static MDB_meta *mdb_env_pick_meta(const MDB_env *env); +static int mdb_env_write_meta(MDB_txn *txn); +#ifdef MDB_USE_POSIX_MUTEX /* Drop unused excl arg */ +# define mdb_env_close0(env, excl) mdb_env_close1(env) +#endif +static void mdb_env_close0(MDB_env *env, int excl); + +static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); +static int mdb_node_add(MDB_cursor *mc, indx_t indx, + MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags); +static void mdb_node_del(MDB_cursor *mc, int ksize); +static void mdb_node_shrink(MDB_page *mp, indx_t indx); +static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); +static int mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data); +static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); +static size_t 
mdb_branch_size(MDB_env *env, MDB_val *key); + +static int mdb_rebalance(MDB_cursor *mc); +static int mdb_update_key(MDB_cursor *mc, MDB_val *key); + +static void mdb_cursor_pop(MDB_cursor *mc); +static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp); + +static int mdb_cursor_del0(MDB_cursor *mc); +static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags); +static int mdb_cursor_sibling(MDB_cursor *mc, int move_right); +static int mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); +static int mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); +static int mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op, + int *exactp); +static int mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data); +static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data); + +static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx); +static void mdb_xcursor_init0(MDB_cursor *mc); +static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node); +static void mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force); + +static int mdb_drop0(MDB_cursor *mc, int subs); +static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi); +static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead); + +/** @cond */ +static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long; +/** @endcond */ + +/** Compare two items pointing at size_t's of unknown alignment. */ +#ifdef MISALIGNED_OK +# define mdb_cmp_clong mdb_cmp_long +#else +# define mdb_cmp_clong mdb_cmp_cint +#endif + +#ifdef _WIN32 +static SECURITY_DESCRIPTOR mdb_null_sd; +static SECURITY_ATTRIBUTES mdb_all_sa; +static int mdb_sec_inited; + +struct MDB_name; +static int utf8_to_utf16(const char *src, struct MDB_name *dst, int xtra); +#endif + +/** Return the library version info. 
*/ +char * ESECT +mdb_version(int *major, int *minor, int *patch) +{ + if (major) *major = MDB_VERSION_MAJOR; + if (minor) *minor = MDB_VERSION_MINOR; + if (patch) *patch = MDB_VERSION_PATCH; + return MDB_VERSION_STRING; +} + +/** Table of descriptions for LMDB @ref errors */ +static char *const mdb_errstr[] = { + "MDB_KEYEXIST: Key/data pair already exists", + "MDB_NOTFOUND: No matching key/data pair found", + "MDB_PAGE_NOTFOUND: Requested page not found", + "MDB_CORRUPTED: Located page was wrong type", + "MDB_PANIC: Update of meta page failed or environment had fatal error", + "MDB_VERSION_MISMATCH: Database environment version mismatch", + "MDB_INVALID: File is not an LMDB file", + "MDB_MAP_FULL: Environment mapsize limit reached", + "MDB_DBS_FULL: Environment maxdbs limit reached", + "MDB_READERS_FULL: Environment maxreaders limit reached", + "MDB_TLS_FULL: Thread-local storage keys full - too many environments open", + "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big", + "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", + "MDB_PAGE_FULL: Internal error - page has no more space", + "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", + "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed", + "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", + "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid", + "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size", + "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", +}; + +char * +mdb_strerror(int err) +{ +#ifdef _WIN32 + /** HACK: pad 4KB on stack over the buf. Return system msgs in buf. + * This works as long as no function between the call to mdb_strerror + * and the actual use of the message uses more than 4K of stack. 
+ */ +#define MSGSIZE 1024 +#define PADSIZE 4096 + char buf[MSGSIZE+PADSIZE], *ptr = buf; +#endif + int i; + if (!err) + return ("Successful return: 0"); + + if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) { + i = err - MDB_KEYEXIST; + return mdb_errstr[i]; + } + +#ifdef _WIN32 + /* These are the C-runtime error codes we use. The comment indicates + * their numeric value, and the Win32 error they would correspond to + * if the error actually came from a Win32 API. A major mess, we should + * have used LMDB-specific error codes for everything. + */ + switch(err) { + case ENOENT: /* 2, FILE_NOT_FOUND */ + case EIO: /* 5, ACCESS_DENIED */ + case ENOMEM: /* 12, INVALID_ACCESS */ + case EACCES: /* 13, INVALID_DATA */ + case EBUSY: /* 16, CURRENT_DIRECTORY */ + case EINVAL: /* 22, BAD_COMMAND */ + case ENOSPC: /* 28, OUT_OF_PAPER */ + return strerror(err); + default: + ; + } + buf[0] = 0; + FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, 0, ptr, MSGSIZE, (va_list *)buf+MSGSIZE); + return ptr; +#else + return strerror(err); +#endif +} + +/** assert(3) variant in cursor context */ +#define mdb_cassert(mc, expr) mdb_assert0((mc)->mc_txn->mt_env, expr, #expr) +/** assert(3) variant in transaction context */ +#define mdb_tassert(txn, expr) mdb_assert0((txn)->mt_env, expr, #expr) +/** assert(3) variant in environment context */ +#define mdb_eassert(env, expr) mdb_assert0(env, expr, #expr) + +#ifndef NDEBUG +# define mdb_assert0(env, expr, expr_txt) ((expr) ? 
(void)0 : \ + mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__)) + +static void ESECT +mdb_assert_fail(MDB_env *env, const char *expr_txt, + const char *func, const char *file, int line) +{ + char buf[400]; + sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()", + file, line, expr_txt, func); + if (env->me_assert_func) + env->me_assert_func(env, buf); + fprintf(stderr, "%s\n", buf); + abort(); +} +#else +# define mdb_assert0(env, expr, expr_txt) ((void) 0) +#endif /* NDEBUG */ + +#if MDB_DEBUG +/** Return the page number of \b mp which may be sub-page, for debug output */ +static pgno_t +mdb_dbg_pgno(MDB_page *mp) +{ + pgno_t ret; + COPY_PGNO(ret, mp->mp_pgno); + return ret; +} + +/** Display a key in hexadecimal and return the address of the result. + * @param[in] key the key to display + * @param[in] buf the buffer to write into. Should always be #DKBUF. + * @return The key in hexadecimal form. + */ +char * +mdb_dkey(MDB_val *key, char *buf) +{ + char *ptr = buf; + unsigned char *c = key->mv_data; + unsigned int i; + + if (!key) + return ""; + + if (key->mv_size > DKBUF_MAXKEYSIZE) + return "MDB_MAXKEYSIZE"; + /* may want to make this a dynamic check: if the key is mostly + * printable characters, print it as-is instead of converting to hex. + */ +#if 1 + buf[0] = '\0'; + for (i=0; imv_size; i++) + ptr += sprintf(ptr, "%02x", *c++); +#else + sprintf(buf, "%.*s", key->mv_size, key->mv_data); +#endif + return buf; +} + +static const char * +mdb_leafnode_type(MDB_node *n) +{ + static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; + return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" : + tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)]; +} + +/** Display all the keys in the page. */ +void +mdb_page_list(MDB_page *mp) +{ + pgno_t pgno = mdb_dbg_pgno(mp); + const char *type, *state = (mp->mp_flags & P_DIRTY) ? 
", dirty" : ""; + MDB_node *node; + unsigned int i, nkeys, nsize, total = 0; + MDB_val key; + DKBUF; + + switch (mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) { + case P_BRANCH: type = "Branch page"; break; + case P_LEAF: type = "Leaf page"; break; + case P_LEAF|P_SUBP: type = "Sub-page"; break; + case P_LEAF|P_LEAF2: type = "LEAF2 page"; break; + case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break; + case P_OVERFLOW: + fprintf(stderr, "Overflow page %"Z"u pages %u%s\n", + pgno, mp->mp_pages, state); + return; + case P_META: + fprintf(stderr, "Meta-page %"Z"u txnid %"Z"u\n", + pgno, ((MDB_meta *)METADATA(mp))->mm_txnid); + return; + default: + fprintf(stderr, "Bad page %"Z"u flags 0x%X\n", pgno, mp->mp_flags); + return; + } + + nkeys = NUMKEYS(mp); + fprintf(stderr, "%s %"Z"u numkeys %d%s\n", type, pgno, nkeys, state); + + for (i=0; imp_pad; + key.mv_data = LEAF2KEY(mp, i, nsize); + total += nsize; + fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key)); + continue; + } + node = NODEPTR(mp, i); + key.mv_size = node->mn_ksize; + key.mv_data = node->mn_data; + nsize = NODESIZE + key.mv_size; + if (IS_BRANCH(mp)) { + fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node), + DKEY(&key)); + total += nsize; + } else { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + nsize += sizeof(pgno_t); + else + nsize += NODEDSZ(node); + total += nsize; + nsize += sizeof(indx_t); + fprintf(stderr, "key %d: nsize %d, %s%s\n", + i, nsize, DKEY(&key), mdb_leafnode_type(node)); + } + total = EVEN(total); + } + fprintf(stderr, "Total: header %d + contents %d + unused %d\n", + IS_LEAF2(mp) ? 
PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp)); +} + +void +mdb_cursor_chk(MDB_cursor *mc) +{ + unsigned int i; + MDB_node *node; + MDB_page *mp; + + if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return; + for (i=0; imc_top; i++) { + mp = mc->mc_pg[i]; + node = NODEPTR(mp, mc->mc_ki[i]); + if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno) + printf("oops!\n"); + } + if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i])) + printf("ack!\n"); + if (XCURSOR_INITED(mc)) { + node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && + mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { + printf("blah!\n"); + } + } +} +#endif + +#if (MDB_DEBUG) > 2 +/** Count all the pages in each DB and in the freelist + * and make sure it matches the actual number of pages + * being used. + * All named DBs must be open for a correct count. + */ +static void mdb_audit(MDB_txn *txn) +{ + MDB_cursor mc; + MDB_val key, data; + MDB_ID freecount, count; + MDB_dbi i; + int rc; + + freecount = 0; + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) + freecount += *(MDB_ID *)data.mv_data; + mdb_tassert(txn, rc == MDB_NOTFOUND); + + count = 0; + for (i = 0; imt_numdbs; i++) { + MDB_xcursor mx; + if (!(txn->mt_dbflags[i] & DB_VALID)) + continue; + mdb_cursor_init(&mc, txn, i, &mx); + if (txn->mt_dbs[i].md_root == P_INVALID) + continue; + count += txn->mt_dbs[i].md_branch_pages + + txn->mt_dbs[i].md_leaf_pages + + txn->mt_dbs[i].md_overflow_pages; + if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { + rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST); + for (; rc == MDB_SUCCESS; rc = mdb_cursor_sibling(&mc, 1)) { + unsigned j; + MDB_page *mp; + mp = mc.mc_pg[mc.mc_top]; + for (j=0; jmn_flags & F_SUBDATA) { + MDB_db db; + memcpy(&db, NODEDATA(leaf), sizeof(db)); + count += db.md_branch_pages + db.md_leaf_pages + + db.md_overflow_pages; + } + } + } + mdb_tassert(txn, rc == 
MDB_NOTFOUND); + } + } + if (freecount + count + NUM_METAS != txn->mt_next_pgno) { + fprintf(stderr, "audit: %"Z"u freecount: %"Z"u count: %"Z"u total: %"Z"u next_pgno: %"Z"u\n", + txn->mt_txnid, freecount, count+NUM_METAS, + freecount+count+NUM_METAS, txn->mt_next_pgno); + } +} +#endif + +int +mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) +{ + return txn->mt_dbxs[dbi].md_cmp(a, b); +} + +int +mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) +{ + MDB_cmp_func *dcmp = txn->mt_dbxs[dbi].md_dcmp; +#if UINT_MAX < SIZE_MAX + if (dcmp == mdb_cmp_int && a->mv_size == sizeof(size_t)) + dcmp = mdb_cmp_clong; +#endif + return dcmp(a, b); +} + +/** Allocate memory for a page. + * Re-use old malloc'd pages first for singletons, otherwise just malloc. + * Set #MDB_TXN_ERROR on failure. + */ +static MDB_page * +mdb_page_malloc(MDB_txn *txn, unsigned num) +{ + MDB_env *env = txn->mt_env; + MDB_page *ret = NULL; // env->me_dpages; + size_t psize = env->me_psize, sz = psize, off; + /* For ! #MDB_NOMEMINIT, psize counts how much to init. + * For a single page alloc, we init everything after the page header. + * For multi-page, we init the final page; if the caller needed that + * many pages they will be filling in at least up to the last page. + */ + if (num == 1) { + /*if (ret) { + VGMEMP_ALLOC(env, ret, sz); + VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); + env->me_dpages = ret->mp_next; + return ret; + }*/ + psize -= off = PAGEHDRSZ; + } else { + sz *= num; + off = sz - psize; + } + if ((ret = malloc(sz)) != NULL) { + VGMEMP_ALLOC(env, ret, sz); + if (!(env->me_flags & MDB_NOMEMINIT)) { + memset((char *)ret + off, 0, psize); + ret->mp_pad = 0; + } + } else { + txn->mt_flags |= MDB_TXN_ERROR; + } + return ret; +} +/** Free a single page. + * Saves single pages to a list, for future reuse. + * (This is not used for multi-page overflow pages.) 
+ */ +static void +mdb_page_free(MDB_env *env, MDB_page *mp) +{ + VGMEMP_FREE(env, mp); + free(mp); + /*mp->mp_next = env->me_dpages; + VGMEMP_FREE(env, mp); + env->me_dpages = mp;*/ +} + +/** Free a dirty page */ +static void +mdb_dpage_free(MDB_env *env, MDB_page *dp) +{ + if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { + mdb_page_free(env, dp); + } else { + /* large pages just get freed directly */ + VGMEMP_FREE(env, dp); + free(dp); + } +} + +/** Return all dirty pages to dpage list */ +static void +mdb_dlist_free(MDB_txn *txn) +{ + MDB_env *env = txn->mt_env; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned i, n = dl[0].mid; + + for (i = 1; i <= n; i++) { + mdb_dpage_free(env, dl[i].mptr); + } + dl[0].mid = 0; +} + +/** Loosen or free a single page. + * Saves single pages to a list for future reuse + * in this same txn. It has been pulled from the freeDB + * and already resides on the dirty list, but has been + * deleted. Use these pages first before pulling again + * from the freeDB. + * + * If the page wasn't dirtied in this txn, just add it + * to this txn's free list. + */ +static int +mdb_page_loose(MDB_cursor *mc, MDB_page *mp) +{ + int loose = 0; + pgno_t pgno = mp->mp_pgno; + MDB_txn *txn = mc->mc_txn; + + if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) { + if (txn->mt_parent) { + MDB_ID2 *dl = txn->mt_u.dirty_list; + /* If txn has a parent, make sure the page is in our + * dirty list. + */ + if (dl[0].mid) { + unsigned x = mdb_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + if (mp != dl[x].mptr) { /* bad cursor? 
*/ + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + /* ok, it's ours */ + loose = 1; + } + } + } else { + /* no parent txn, so it's just ours */ + loose = 1; + } + } + if (loose) { + DPRINTF(("loosen db %d page %"Z"u", DDBI(mc), + mp->mp_pgno)); + NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs; + txn->mt_loose_pgs = mp; + txn->mt_loose_count++; + mp->mp_flags |= P_LOOSE; + } else { + int rc = mdb_midl_append(&txn->mt_free_pgs, pgno); + if (rc) + return rc; + } + + return MDB_SUCCESS; +} + +/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. + * @param[in] mc A cursor handle for the current operation. + * @param[in] pflags Flags of the pages to update: + * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. + * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush(). + * @return 0 on success, non-zero on failure. + */ +static int +mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) +{ + enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP }; + MDB_txn *txn = mc->mc_txn; + MDB_cursor *m3, *m0 = mc; + MDB_xcursor *mx; + MDB_page *dp, *mp; + MDB_node *leaf; + unsigned i, j; + int rc = MDB_SUCCESS, level; + + /* Mark pages seen by cursors */ + if (mc->mc_flags & C_UNTRACK) + mc = NULL; /* will find mc in mt_cursors */ + for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { + for (; mc; mc=mc->mc_next) { + if (!(mc->mc_flags & C_INITIALIZED)) + continue; + for (m3 = mc;; m3 = &mx->mx_cursor) { + mp = NULL; + for (j=0; jmc_snum; j++) { + mp = m3->mc_pg[j]; + if ((mp->mp_flags & Mask) == pflags) + mp->mp_flags ^= P_KEEP; + + if (pflags == P_DIRTY + && (mp->mp_flags & P_DIRTY) + && !(mp->mp_flags & P_KEEP) + && !(mp->mp_flags & P_LOOSE)) + abort(); + } + mx = m3->mc_xcursor; + /* Proceed to mx if it is at a sub-database */ + if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) + break; + if (! 
(mp && (mp->mp_flags & P_LEAF))) + break; + leaf = NODEPTR(mp, m3->mc_ki[j-1]); + if (!(leaf->mn_flags & F_SUBDATA)) + break; + } + } + if (i == 0) + break; + } + + if (all) { + /* Mark dirty root pages */ + for (i=0; imt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + pgno_t pgno = txn->mt_dbs[i].md_root; + if (pgno == P_INVALID) + continue; + if ((rc = mdb_page_get(m0, pgno, &dp, &level)) != MDB_SUCCESS) + break; + if ((dp->mp_flags & Mask) == pflags && level <= 1) + dp->mp_flags ^= P_KEEP; + } + } + } + + return rc; +} + +static int mdb_page_flush(MDB_txn *txn, int keep); + +/** Spill pages from the dirty list back to disk. + * This is intended to prevent running into #MDB_TXN_FULL situations, + * but note that they may still occur in a few cases: + * 1) our estimate of the txn size could be too small. Currently this + * seems unlikely, except with a large number of #MDB_MULTIPLE items. + * 2) child txns may run out of space if their parents dirtied a + * lot of pages and never spilled them. TODO: we probably should do + * a preemptive spill during #mdb_txn_begin() of a child txn, if + * the parent's dirty_room is below a given threshold. + * + * Otherwise, if not using nested txns, it is expected that apps will + * not run into #MDB_TXN_FULL any more. The pages are flushed to disk + * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. + * If the txn never references them again, they can be left alone. + * If the txn only reads them, they can be used without any fuss. + * If the txn writes them again, they can be dirtied immediately without + * going thru all of the work of #mdb_page_touch(). Such references are + * handled by #mdb_page_unspill(). + * + * Also note, we never spill DB root pages, nor pages of active cursors, + * because we'll need these back again soon anyway. And in nested txns, + * we can't spill a page in a child txn if it was already spilled in a + * parent txn. 
That would alter the parent txns' data even though + * the child hasn't committed yet, and we'd have no way to undo it if + * the child aborted. + * + * @param[in] m0 cursor A cursor handle identifying the transaction and + * database for which we are checking space. + * @param[in] key For a put operation, the key being stored. + * @param[in] data For a put operation, the data being stored. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) +{ + MDB_txn *txn = m0->mc_txn; + MDB_page *dp; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned int i, j, need; + const MDB_ID* sid; + int rc; + + if (m0->mc_flags & C_SUB) + return MDB_SUCCESS; + + /* Estimate how much space this op will take */ + i = m0->mc_db->md_depth; + /* Named DBs also dirty the main DB */ + if (m0->mc_dbi >= CORE_DBS) + i += txn->mt_dbs[MAIN_DBI].md_depth; + /* For puts, roughly factor in the key+data size */ + if (key) + i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; + i += i; /* double it for good measure */ + need = i; + + if (txn->mt_dirty_room > i) + return MDB_SUCCESS; + + if (!txn->mt_spill_pgs) { + txn->mt_spill_pgs = cppmidl_alloc(); + if (!txn->mt_spill_pgs) + return ENOMEM; + } + + /* Preserve pages which may soon be dirtied again */ + if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS) + goto done; + + /* Less aggressive spill - we originally spilled the entire dirty list, + * with a few exceptions for cursor pages and DB root pages. But this + * turns out to be a lot of wasted effort because in a large txn many + * of those pages will need to be used again. So now we spill only 1/8th + * of the dirty pages. Testing revealed this to be a good tradeoff, + * better than 1/2, 1/4, or 1/10. 
+ */ + if (need < MDB_IDL_DIRTY_MAX / 8) + need = MDB_IDL_DIRTY_MAX / 8; + + /* Save the page IDs of all the pages we're flushing */ + /* flush from the tail forward, this saves a lot of shifting later on. */ + for (i=dl[0].mid; i && need; i--) { + MDB_ID pn = dl[i].mid << 1; + dp = dl[i].mptr; + if (dp->mp_flags & (P_LOOSE|P_KEEP)) + continue; + /* Can't spill twice, make sure it's not already in a parent's + * spill list. + */ + if (txn->mt_parent) { + MDB_txn *tx2; + for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { + if (tx2->mt_spill_pgs) { + sid = cppmidl_search(tx2->mt_spill_pgs, pn); + if (sid!=NULL && *sid == pn) { + dp->mp_flags |= P_KEEP; + break; + } + } + } + if (tx2) + continue; + } + + for (j = 0; j < m0->mc_snum; ++j) { + if (m0->mc_pg[j] == dp) + abort(); + } + + cppmidl_insert(txn->mt_spill_pgs, pn); + need--; + } + + /* Flush the spilled part of dirty list */ + if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS) + goto done; + + /* Reset any dirty pages we kept that page_flush didn't see */ + rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i); + +done: + txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; + return rc; +} + +/** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. 
*/ +static txnid_t +mdb_find_oldest(MDB_txn *txn) +{ + int i; + txnid_t mr, oldest = txn->mt_txnid - 1; + if (txn->mt_env->me_txns) { + MDB_reader *r = txn->mt_env->me_txns->mti_readers; + for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { + if (r[i].mr_pid) { + mr = r[i].mr_txnid; + if (oldest > mr) + oldest = mr; + } + } + } + return oldest; +} + +/** Add a page to the txn's dirty list */ +static void +mdb_page_dirty(MDB_txn *txn, MDB_page *mp) +{ + MDB_ID2 mid; + int rc, (*insert)(MDB_ID2L, MDB_ID2 *); + + if (txn->mt_flags & MDB_TXN_WRITEMAP) { + insert = mdb_mid2l_append; + } else { + insert = mdb_mid2l_insert; + } + mid.mid = mp->mp_pgno; + mid.mptr = mp; + rc = insert(txn->mt_u.dirty_list, &mid); + mdb_tassert(txn, rc == 0); + txn->mt_dirty_room--; +} + +/** Allocate page numbers and memory for writing. Maintain me_pglast, + * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure. + * + * If there are free pages available from older transactions, they + * are re-used first. Otherwise allocate a new page at mt_next_pgno. + * Do not modify the freedB, just merge freeDB records into me_pghead[] + * and move me_pglast to say which records were consumed. Only this + * function can create me_pghead and move me_pglast/mt_next_pgno. + * @param[in] mc cursor A cursor handle identifying the transaction and + * database for which we are allocating. + * @param[in] num the number of pages to allocate. + * @param[out] mp Address of the allocated page(s). Requests for multiple pages + * will always be satisfied by a single contiguous chunk of memory. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) +{ +#ifdef MDB_PARANOID /* Seems like we can ignore this now */ + /* Get at most more freeDB records once me_pghead + * has enough pages. If not enough, use new pages from the map. + * If and mc is updating the freeDB, only get new + * records if me_pghead is empty. 
Then the freelist cannot play + * catch-up with itself by growing while trying to save it. + */ + enum { Paranoid = 1, Max_retries = 500 }; +#else + enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ }; +#endif + int rc, retry = num * 60; + MDB_txn *txn = mc->mc_txn; + MDB_env *env = txn->mt_env; + pgno_t pgno, *mop = env->me_pghead; + unsigned i, j, mop_len = mop ? mop[0] : 0, n2 = num-1; + MDB_page *np; + txnid_t oldest = 0, last; + MDB_cursor_op op; + MDB_cursor m2; + int found_old = 0; + + /* If there are any loose pages, just use them */ + if (num == 1 && txn->mt_loose_pgs) { + np = txn->mt_loose_pgs; + txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); + txn->mt_loose_count--; + DPRINTF(("db %d use loose page %"Z"u", DDBI(mc), + np->mp_pgno)); + *mp = np; + return MDB_SUCCESS; + } + + *mp = NULL; + + /* If our dirty list is already full, we can't do anything */ + if (txn->mt_dirty_room == 0) { + rc = MDB_TXN_FULL; + goto fail; + } + + for (op = MDB_FIRST;; op = MDB_NEXT) { + MDB_val key, data; + MDB_node *leaf; + pgno_t *idl; + + /* Seek a big enough contiguous page range. Prefer + * pages at the tail, just truncating the list. 
+ */ + if (mop_len > n2) { + i = mop_len; + do { + pgno = mop[i]; + if (mop[i-n2] == pgno+n2) + goto search_done; + } while (--i > n2); + if (--retry < 0) + break; + } + + if (op == MDB_FIRST) { /* 1st iteration */ + /* Prepare to fetch more and coalesce */ + last = env->me_pglast; + oldest = env->me_pgoldest; + mdb_cursor_init(&m2, txn, FREE_DBI, NULL); + if (last) { + op = MDB_SET_RANGE; + key.mv_data = &last; /* will look up last+1 */ + key.mv_size = sizeof(last); + } + if (Paranoid && mc->mc_dbi == FREE_DBI) + retry = -1; + } + if (Paranoid && retry < 0 && mop_len) + break; + + last++; + /* Do not fetch more if the record will be too recent */ + if (oldest <= last) { + if (!found_old) { + oldest = mdb_find_oldest(txn); + env->me_pgoldest = oldest; + found_old = 1; + } + if (oldest <= last) + break; + } + rc = mdb_cursor_get(&m2, &key, NULL, op); + if (rc) { + if (rc == MDB_NOTFOUND) + break; + goto fail; + } + last = *(txnid_t*)key.mv_data; + if (oldest <= last) { + if (!found_old) { + oldest = mdb_find_oldest(txn); + env->me_pgoldest = oldest; + found_old = 1; + } + if (oldest <= last) + break; + } + np = m2.mc_pg[m2.mc_top]; + leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); + if ((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS) + goto fail; + + idl = (MDB_ID *) data.mv_data; + i = idl[0]; + if (!mop) { + if (!(env->me_pghead = mop = mdb_midl_alloc(i))) { + rc = ENOMEM; + goto fail; + } + } else { + if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0) + goto fail; + mop = env->me_pghead; + } + env->me_pglast = last; +#if (MDB_DEBUG) > 1 + DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u", + last, txn->mt_dbs[FREE_DBI].md_root, i)); + for (j = i; j; j--) + DPRINTF(("IDL %"Z"u", idl[j])); +#endif + /* Merge in descending sorted order */ + mdb_midl_xmerge(mop, idl); + mop_len = mop[0]; + } + + /* Use new pages from the map when nothing suitable in the freeDB */ + i = 0; + pgno = txn->mt_next_pgno; + if (pgno + num >= env->me_maxpg) { + DPUTS("DB size maxed out"); + rc 
= MDB_MAP_FULL; + goto fail; + } + +search_done: + if (env->me_flags & MDB_WRITEMAP) { + np = (MDB_page *)(env->me_map + env->me_psize * pgno); + } else { + if (!(np = mdb_page_malloc(txn, num))) { + rc = ENOMEM; + goto fail; + } + } + if (i) { + mop[0] = mop_len -= num; + /* Move any stragglers down */ + for (j = i-num; j < mop_len; ) + mop[++j] = mop[++i]; + } else { + txn->mt_next_pgno = pgno + num; + } + np->mp_pgno = pgno; + mdb_page_dirty(txn, np); + *mp = np; + + return MDB_SUCCESS; + +fail: + txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +/** Copy the used portions of a non-overflow page. + * @param[in] dst page to copy into + * @param[in] src page to copy from + * @param[in] psize size of a page + */ +static void +mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize) +{ + enum { Align = sizeof(pgno_t) }; + indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower; + + /* If page isn't full, just copy the used portion. Adjust + * alignment so memcpy may copy words instead of bytes. + */ + if ((unused &= -Align) && !IS_LEAF2(src)) { + upper = (upper + PAGEBASE) & -Align; + memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align); + memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper), + psize - upper); + } else { + memcpy(dst, src, psize - unused); + } +} + +/** Pull a page off the txn's spill list, if present. + * If a page being referenced was spilled to disk in this txn, bring + * it back and make it dirty/writable again. + * @param[in] txn the transaction handle. + * @param[in] mp the page being referenced. It must not be dirty. + * @param[out] ret the writable page, if any. ret is unchanged if + * mp wasn't spilled. 
+ */ +static int +mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) +{ + MDB_env *env = txn->mt_env; + const MDB_txn *tx2; + const MDB_ID* x; + pgno_t pgno = mp->mp_pgno, pn = pgno << 1; + + for (tx2 = txn; tx2; tx2=tx2->mt_parent) { + if (!tx2->mt_spill_pgs) + continue; + x = cppmidl_search(tx2->mt_spill_pgs, pn); + if (x!=NULL && *x == pn) { + MDB_page *np; + int num; + if (txn->mt_dirty_room == 0) + return MDB_TXN_FULL; + if (IS_OVERFLOW(mp)) + num = mp->mp_pages; + else + num = 1; + if (env->me_flags & MDB_WRITEMAP) { + np = mp; + } else { + np = mdb_page_malloc(txn, num); + if (!np) + return ENOMEM; + if (num > 1) + memcpy(np, mp, num * env->me_psize); + else + mdb_page_copy(np, mp, env->me_psize); + } + if (tx2 == txn) { + /* If in current txn, this page is no longer spilled. + * If it happens to be the last page, truncate the spill list. + * Otherwise mark it as deleted by setting the LSB. + */ + cppmidl_erase(tx2->mt_spill_pgs, pn); + } /* otherwise, if belonging to a parent txn, the + * page remains spilled until child commits + */ + + mdb_page_dirty(txn, np); + np->mp_flags |= P_DIRTY; + *ret = np; + break; + } + } + return MDB_SUCCESS; +} + +/** Touch a page: make it dirty and re-insert into tree with updated pgno. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc cursor pointing to the page to be touched + * @return 0 on success, non-zero on failure. 
+ */
+static int
+mdb_page_touch(MDB_cursor *mc)
+{
+	MDB_page *mp = mc->mc_pg[mc->mc_top], *np;
+	MDB_txn *txn = mc->mc_txn;
+	MDB_cursor *m2, *m3;
+	pgno_t	pgno;
+	int rc;
+
+	if (F_ISSET(mp->mp_flags, P_DIRTY)
+		&& !(txn->mt_env->me_flags & MDB_WRITEMAP)
+		&& (char*)mp > txn->mt_env->me_map
+		&& (char*)mp < txn->mt_env->me_map + txn->mt_env->me_mapsize) {
+		return MDB_CORRUPTED;	/* dirty page lying inside the map although WRITEMAP is off */
+	}
+
+	if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
+		if (txn->mt_flags & MDB_TXN_SPILLS) {
+			np = NULL;
+			rc = mdb_page_unspill(txn, mp, &np);
+			if (rc)
+				goto fail;
+			if (np)
+				goto done;	/* unspill produced a writable page; only cursors need fixing */
+		}
+		if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) ||
+			(rc = mdb_page_alloc(mc, 1, &np)))
+			goto fail;
+		pgno = np->mp_pgno;
+		DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc),
+			mp->mp_pgno, pgno));
+		mdb_cassert(mc, mp->mp_pgno != pgno);
+		mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);	/* old page number goes on this txn's free list */
+		/* Update the parent page, if any, to point to the new page */
+		if (mc->mc_top) {
+			MDB_page *parent = mc->mc_pg[mc->mc_top-1];
+			MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]);
+			SETPGNO(node, pgno);
+		} else {
+			mc->mc_db->md_root = pgno;	/* no parent page: the touched page is the root */
+		}
+	} else if (txn->mt_parent && !IS_SUBP(mp)) {
+		MDB_ID2 mid, *dl = txn->mt_u.dirty_list;
+		pgno = mp->mp_pgno;
+		/* If txn has a parent, make sure the page is in our
+		 * dirty list.
+		 */
+		if (dl[0].mid) {
+			unsigned x = mdb_mid2l_search(dl, pgno);
+			if (x <= dl[0].mid && dl[x].mid == pgno) {
+				if (mp != dl[x].mptr) { /* bad cursor? */
+					mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
+					txn->mt_flags |= MDB_TXN_ERROR;
+					return MDB_CORRUPTED;
+				}
+				return 0;	/* already dirty in this txn: nothing to do */
+			}
+		}
+		mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX);
+		/* No - copy it */
+		np = mdb_page_malloc(txn, 1);
+		if (!np)
+			return ENOMEM;	/* NOTE(review): unlike the fail: label, this path does not set MDB_TXN_ERROR */
+		mid.mid = pgno;
+		mid.mptr = np;
+		rc = mdb_mid2l_insert(dl, &mid);
+		mdb_cassert(mc, rc == 0);
+	} else {
+		return 0;
+	}
+
+	mdb_page_copy(np, mp, txn->mt_env->me_psize);
+	np->mp_pgno = pgno;	/* the copy brought over mp's page number; restore the new one */
+	np->mp_flags |= P_DIRTY;
+
+done:
+	/* Adjust cursors pointing to mp */
+	mc->mc_pg[mc->mc_top] = np;
+	m2 = txn->mt_cursors[mc->mc_dbi];
+	if (mc->mc_flags & C_SUB) {
+		for (; m2; m2=m2->mc_next) {
+			m3 = &m2->mc_xcursor->mx_cursor;	/* for a sub-cursor, compare against each tracked cursor's xcursor */
+			if (m3->mc_snum < mc->mc_snum) continue;
+			if (m3->mc_pg[mc->mc_top] == mp)
+				m3->mc_pg[mc->mc_top] = np;
+		}
+	} else {
+		for (; m2; m2=m2->mc_next) {
+			if (m2->mc_snum < mc->mc_snum) continue;
+			if (m2 == mc) continue;
+			if (m2->mc_pg[mc->mc_top] == mp) {
+				m2->mc_pg[mc->mc_top] = np;
+				if (IS_LEAF(np))
+					XCURSOR_REFRESH(m2, mc->mc_top, np);
+			}
+		}
+	}
+	return 0;
+
+fail:
+	txn->mt_flags |= MDB_TXN_ERROR;
+	return rc;
+}
+/** Flush the environment's data to disk.
+ * Does nothing unless force is non-zero or MDB_NOSYNC is not set.
+ * @param[in] env the environment handle
+ * @param[in] force non-zero to sync even with MDB_NOSYNC set, and to
+ * use MS_SYNC even with MDB_MAPASYNC set
+ * @return 0 on success, EACCES for an MDB_RDONLY environment, otherwise
+ * the ErrCode() value of the failed sync call.
+ */
+int
+mdb_env_sync(MDB_env *env, int force)
+{
+	int rc = 0;
+	if (env->me_flags & MDB_RDONLY)
+		return EACCES;
+	if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
+		if (env->me_flags & MDB_WRITEMAP) {
+			int flags = ((env->me_flags & MDB_MAPASYNC) && !force)
+				?
MS_ASYNC : MS_SYNC;
+			if (MDB_MSYNC(env->me_map, env->me_mapsize, flags))
+				rc = ErrCode();
+#ifdef _WIN32
+			else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd))
+				rc = ErrCode();
+#endif
+		} else {
+#ifdef BROKEN_FDATASYNC
+			if (env->me_flags & MDB_FSYNCONLY) {
+				if (fsync(env->me_fd))
+					rc = ErrCode();
+			} else
+#endif
+			if (MDB_FDATASYNC(env->me_fd))
+				rc = ErrCode();
+		}
+	}
+	return rc;
+}
+
+/** Back up the parent txn's cursors and hand the originals to the child.
+ * Every tracked cursor of src gets a malloc'ed copy (with its xcursor,
+ * if any, stored right behind it) hung on mc_backup. The original is
+ * then re-pointed at dst and pushed onto dst's tracking list.
+ * @param[in] src the parent transaction
+ * @param[in] dst the child transaction
+ * @return MDB_SUCCESS, or ENOMEM if a backup could not be allocated.
+ */
+static int
+mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst)
+{
+	int dbi = src->mt_numdbs;
+
+	while (--dbi >= 0) {
+		MDB_cursor *cur = src->mt_cursors[dbi];
+		MDB_cursor *saved;
+		size_t bytes;
+
+		if (cur == NULL)
+			continue;
+
+		/* Backup size is decided by the first cursor of the list */
+		bytes = sizeof(MDB_cursor);
+		if (cur->mc_xcursor)
+			bytes += sizeof(MDB_xcursor);
+
+		do {
+			MDB_xcursor *xc;
+
+			saved = malloc(bytes);
+			if (!saved)
+				return ENOMEM;
+			*saved = *cur;
+			cur->mc_backup = saved;
+			cur->mc_db = &dst->mt_dbs[dbi];
+			/* Kill pointers into src to reduce abuse: The
+			 * user may not use mc until dst ends. But we need a valid
+			 * txn pointer here for cursor fixups to keep working.
+			 */
+			cur->mc_txn = dst;
+			cur->mc_dbflag = &dst->mt_dbflags[dbi];
+			xc = cur->mc_xcursor;
+			if (xc != NULL) {
+				*(MDB_xcursor *)(saved+1) = *xc;
+				xc->mx_cursor.mc_txn = dst;
+			}
+			cur->mc_next = dst->mt_cursors[dbi];
+			dst->mt_cursors[dbi] = cur;
+			/* the saved copy still holds src's original link */
+			cur = saved->mc_next;
+		} while (cur != NULL);
+	}
+	return MDB_SUCCESS;
+}
+
+/** Close this write txn's cursors, give parent txn's cursors back to parent.
+ * Cursors without a backup are freed. For cursors with a backup the
+ * backup is freed after its contents were used to restore the cursor.
+ * @param[in] txn the transaction handle.
+ * @param[in] merge true to keep changes to parent cursors, false to revert.
+ */
+static void
+mdb_cursors_close(MDB_txn *txn, unsigned merge)
+{
+	MDB_cursor **heads = txn->mt_cursors;
+	int dbi = txn->mt_numdbs;
+
+	while (--dbi >= 0) {
+		MDB_cursor *cur = heads[dbi];
+
+		while (cur != NULL) {
+			MDB_cursor *following = cur->mc_next;
+			MDB_cursor *saved = cur->mc_backup;
+			MDB_cursor *doomed = cur;
+
+			if (saved != NULL) {
+				MDB_xcursor *xc;
+
+				if (!merge) {
+					/* Abort nested txn */
+					*cur = *saved;
+					xc = cur->mc_xcursor;
+					if (xc != NULL)
+						*xc = *(MDB_xcursor *)(saved+1);
+				} else {
+					/* Commit changes to parent txn */
+					cur->mc_next = saved->mc_next;
+					cur->mc_backup = saved->mc_backup;
+					cur->mc_txn = saved->mc_txn;
+					cur->mc_db = saved->mc_db;
+					cur->mc_dbflag = saved->mc_dbflag;
+					xc = cur->mc_xcursor;
+					if (xc != NULL)
+						xc->mx_cursor.mc_txn = saved->mc_txn;
+				}
+				doomed = saved;
+			}
+			/* Only malloced cursors are permanently tracked. */
+			free(doomed);
+			cur = following;
+		}
+		heads[dbi] = NULL;
+	}
+}
+
+#if !(MDB_PIDLOCK)			/* Currently the same as defined(_WIN32) */
+enum Pidlock_op {
+	Pidset, Pidcheck
+};
+#else
+enum Pidlock_op {
+	Pidset = F_SETLK, Pidcheck = F_GETLK
+};
+#endif
+
+/** Set or check a pid lock. Set returns 0 on success.
+ * Check returns 0 if the process is certainly dead, nonzero if it may
+ * be alive (the lock exists or an error happened so we do not know).
+ *
+ * On Windows Pidset is a no-op, we merely check for the existence
+ * of the process with the given pid. On POSIX we use a single byte
+ * lock on the lockfile, set at an offset equal to the pid.
+ */
+static int
+mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid)
+{
+#if !(MDB_PIDLOCK)		/* Currently the same as defined(_WIN32) */
+	int ret = 0;
+	HANDLE h;
+	if (op == Pidcheck) {
+		h = OpenProcess(env->me_pidquery, FALSE, pid);
+		/* No documented "no such process" code, but other program use this: */
+		if (!h)
+			return ErrCode() != ERROR_INVALID_PARAMETER;
+		/* A process exists until all handles to it close. Has it exited?
*/
+		ret = WaitForSingleObject(h, 0) != 0;
+		CloseHandle(h);
+	}
+	return ret;
+#else
+	for (;;) {
+		int rc;
+		struct flock lock_info;
+		memset(&lock_info, 0, sizeof(lock_info));
+		lock_info.l_type = F_WRLCK;
+		lock_info.l_whence = SEEK_SET;
+		lock_info.l_start = pid;
+		lock_info.l_len = 1;
+		if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) {
+			if (op == F_GETLK && lock_info.l_type != F_UNLCK)
+				rc = -1;
+		} else if ((rc = ErrCode()) == EINTR) {
+			continue;
+		}
+		return rc;
+	}
+#endif
+}
+
+/** Common code for #mdb_txn_begin() and #mdb_txn_renew().
+ * For a read-only txn this finds or claims a reader slot and records the
+ * current txnid in it. For a write txn it takes the writer mutex (when a
+ * lock table exists), bumps the txnid and resets the per-txn page state.
+ * Both paths then copy the DB records from the chosen meta page.
+ * On MDB_PANIC / MDB_MAP_RESIZED the txn is ended again via mdb_txn_end().
+ * @param[in] txn the transaction handle to initialize
+ * @return 0 on success, non-zero on failure.
+ */
+static int
+mdb_txn_renew0(MDB_txn *txn)
+{
+	MDB_env *env = txn->mt_env;
+	MDB_txninfo *ti = env->me_txns;
+	MDB_meta *meta;
+	unsigned int i, nr, flags = txn->mt_flags;
+	uint16_t x;
+	int rc, new_notls = 0;
+
+	if ((flags &= MDB_TXN_RDONLY) != 0) {
+		if (!ti) {
+			meta = mdb_env_pick_meta(env);
+			txn->mt_txnid = meta->mm_txnid;
+			txn->mt_u.reader = NULL;
+		} else {
+			MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader :
+				pthread_getspecific(env->me_txkey);
+			if (r) {
+				if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
+					return MDB_BAD_RSLOT;
+			} else {
+				MDB_PID_T pid = env->me_pid;
+				MDB_THR_T tid = pthread_self();
+				mdb_mutexref_t rmutex = env->me_rmutex;
+
+				if (!env->me_live_reader) {
+					rc = mdb_reader_pid(env, Pidset, pid);
+					if (rc)
+						return rc;
+					env->me_live_reader = 1;
+				}
+
+				if (LOCK_MUTEX(rc, env, rmutex))
+					return rc;
+				/* Look for the first slot whose mr_pid is 0. If none
+				 * is found, i ends up equal to nr (one past the last
+				 * published slot).
+				 */
+				nr = ti->mti_numreaders;
+				for (i=0; i<nr; i++)
+					if (ti->mti_readers[i].mr_pid == 0)
+						break;
+				if (i == env->me_maxreaders) {
+					UNLOCK_MUTEX(rmutex);
+					return MDB_READERS_FULL;
+				}
+				r = &ti->mti_readers[i];
+				/* Claim the reader slot, carefully since other code
+				 * uses the reader table un-mutexed: First reset the
+				 * slot, next publish it in mti_numreaders. After
+				 * that, it is safe for mdb_env_close() to touch it.
+				 * When it will be closed, we can finally claim it.
+				 */
+				r->mr_pid = 0;
+				r->mr_txnid = (txnid_t)-1;
+				r->mr_tid = tid;
+				if (i == nr)
+					ti->mti_numreaders = ++nr;
+				env->me_close_readers = nr;
+				r->mr_pid = pid;
+				UNLOCK_MUTEX(rmutex);
+
+				new_notls = (env->me_flags & MDB_NOTLS);
+				if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) {
+					r->mr_pid = 0;
+					return rc;
+				}
+			}
+			do /* LY: Retry on a race, ITS#7970. */
+				r->mr_txnid = ti->mti_txnid;
+			while(r->mr_txnid != ti->mti_txnid);
+			txn->mt_txnid = r->mr_txnid;
+			txn->mt_u.reader = r;
+			meta = env->me_metas[txn->mt_txnid & 1];
+		}
+
+	} else {
+		/* Not yet touching txn == env->me_txn0, it may be active */
+		if (ti) {
+			if (LOCK_MUTEX(rc, env, env->me_wmutex))
+				return rc;
+			txn->mt_txnid = ti->mti_txnid;
+			meta = env->me_metas[txn->mt_txnid & 1];
+		} else {
+			meta = mdb_env_pick_meta(env);
+			txn->mt_txnid = meta->mm_txnid;
+		}
+		txn->mt_txnid++;
+#if MDB_DEBUG
+		if (txn->mt_txnid == mdb_debug_start)
+			mdb_debug = 1;
+#endif
+		txn->mt_child = NULL;
+		txn->mt_loose_pgs = NULL;
+		txn->mt_loose_count = 0;
+		txn->mt_dirty_room = MDB_IDL_DIRTY_MAX;
+		txn->mt_u.dirty_list = env->me_dirty_list;
+		txn->mt_u.dirty_list[0].mid = 0;
+		txn->mt_free_pgs = env->me_free_pgs;
+		txn->mt_free_pgs[0] = 0;
+		txn->mt_spill_pgs = NULL;
+		env->me_txn = txn;
+		memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int));
+	}
+
+	/* Copy the DB info and flags */
+	memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db));
+
+	/* Moved to here to avoid a data race in read TXNs */
+	txn->mt_next_pgno = meta->mm_last_pg+1;
+
+	txn->mt_flags = flags;
+
+	/* Setup db info: named DBs start out stale, with only their
+	 * persistent flags taken from the environment.
+	 */
+	txn->mt_numdbs = env->me_numdbs;
+	for (i=CORE_DBS; i<txn->mt_numdbs; i++) {
+		x = env->me_dbflags[i];
+		txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS;
+		txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_USRVALID|DB_STALE : 0;
+	}
+	txn->mt_dbflags[MAIN_DBI] = DB_VALID|DB_USRVALID;
+	txn->mt_dbflags[FREE_DBI] = DB_VALID;
+
+	if (env->me_flags & MDB_FATAL_ERROR) {
+		DPUTS("environment had fatal error, must shutdown!");
+		rc = MDB_PANIC;
+	} else if (env->me_maxpg < txn->mt_next_pgno) {
+		rc = MDB_MAP_RESIZED;
+	} else {
+		return MDB_SUCCESS;
+	}
+	mdb_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN);
+	return rc;
+}
+
+int
+mdb_txn_renew(MDB_txn *txn)
+{
+	int rc;
+
+	/* Only a read-only txn that has been finished can be renewed */
+	if (!txn || !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY|MDB_TXN_FINISHED))
+		return EINVAL;
+
+	rc = mdb_txn_renew0(txn);
+	if (rc == MDB_SUCCESS) {
+		DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
+			txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
+			(void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root));
+	}
+	return rc;
+}
+
+int
+mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
+{
+	MDB_txn *txn;
+	MDB_ntxn *ntxn;
+	int rc, size, tsize;
+
+	flags &= MDB_TXN_BEGIN_FLAGS;
+	flags |= env->me_flags & MDB_WRITEMAP;
+
+	if (env->me_flags & MDB_RDONLY & ~flags) /* write txn in RDONLY env */
+		return EACCES;
+
+	if (parent) {
+		/* Nested transactions: Max 1 child, write txns only, no writemap */
+		flags |= parent->mt_flags;
+		if (flags & (MDB_RDONLY|MDB_WRITEMAP|MDB_TXN_BLOCKED)) {
+			return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN;
+		}
+		/* Child txns save MDB_pgstate and use own copy of cursors */
+		size = env->me_maxdbs * (sizeof(MDB_db)+sizeof(MDB_cursor *)+1);
+		size += tsize = sizeof(MDB_ntxn);
+	} else if (flags & MDB_RDONLY) {
+		size = env->me_maxdbs * (sizeof(MDB_db)+1);
+		size += tsize = sizeof(MDB_txn);
+	} else {
+		/* Reuse preallocated write txn. However, do not touch it until
+		 * mdb_txn_renew0() succeeds, since it currently may be active.
+		 */
+		txn = env->me_txn0;
+		goto renew;
+	}
+	if ((txn = calloc(1, size)) == NULL) {
+		DPRINTF(("calloc: %s", strerror(errno)));
+		return ENOMEM;
+	}
+	/* One allocation holds the txn struct, then mt_dbs[], then (child
+	 * txns only) mt_cursors[], with mt_dbflags[] in the last
+	 * me_maxdbs bytes.
+	 */
+	txn->mt_dbxs = env->me_dbxs;	/* static */
+	txn->mt_dbs = (MDB_db *) ((char *)txn + tsize);
+	txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs;
+	txn->mt_flags = flags;
+	txn->mt_env = env;
+
+	if (parent) {
+		unsigned int i;
+		txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
+		txn->mt_dbiseqs = parent->mt_dbiseqs;
+		txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE);
+		if (!txn->mt_u.dirty_list ||
+			!(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)))
+		{
+			free(txn->mt_u.dirty_list);
+			free(txn);
+			return ENOMEM;
+		}
+		txn->mt_txnid = parent->mt_txnid;
+		txn->mt_dirty_room = parent->mt_dirty_room;
+		txn->mt_u.dirty_list[0].mid = 0;
+		txn->mt_spill_pgs = NULL;
+		txn->mt_next_pgno = parent->mt_next_pgno;
+		parent->mt_flags |= MDB_TXN_HAS_CHILD;
+		parent->mt_child = txn;
+		txn->mt_parent = parent;
+		txn->mt_numdbs = parent->mt_numdbs;
+		memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
+		/* Copy parent's mt_dbflags, but clear DB_NEW */
+		for (i=0; i<txn->mt_numdbs; i++)
+			txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW;
+		rc = 0;
+		ntxn = (MDB_ntxn *)txn;
+		ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */
+		if (env->me_pghead) {
+			/* The child works on a private copy of me_pghead */
+			size = MDB_IDL_SIZEOF(env->me_pghead);
+			env->me_pghead = mdb_midl_alloc(env->me_pghead[0]);
+			if (env->me_pghead)
+				memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size);
+			else
+				rc = ENOMEM;
+		}
+		if (!rc)
+			rc = mdb_cursor_shadow(parent, txn);
+		if (rc)
+			mdb_txn_end(txn, MDB_END_FAIL_BEGINCHILD);
+	} else { /* MDB_RDONLY */
+		txn->mt_dbiseqs = env->me_dbiseqs;
+renew:
+		rc = mdb_txn_renew0(txn);
+	}
+	if (rc) {
+		/* The preallocated write txn belongs to the env: never free it */
+		if (txn != env->me_txn0)
+			free(txn);
+	} else {
+		txn->mt_flags |= flags;	/* could not change txn=me_txn0 earlier */
+		*ret = txn;
+		DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
+			txn->mt_txnid, (flags & MDB_RDONLY) ? 'r' : 'w',
+			(void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root));
+	}
+
+	return rc;
+}
+
+MDB_env *
+mdb_txn_env(MDB_txn *txn)
+{
+	if(!txn) return NULL;
+	return txn->mt_env;
+}
+
+size_t
+mdb_txn_id(MDB_txn *txn)
+{
+	if(!txn) return 0;
+	return txn->mt_txnid;
+}
+
+/** Export or close DBI handles opened in this txn.
+ * Only slots flagged DB_NEW in the txn are considered. With keep set
+ * their flags are published to the environment (marked MDB_VALID) and
+ * me_numdbs is raised if needed. Otherwise the slot's name is freed,
+ * its env flags cleared and its sequence number bumped.
+ * @param[in] txn the transaction being ended
+ * @param[in] keep non-zero to publish the new handles, zero to drop them
+ */
+static void
+mdb_dbis_update(MDB_txn *txn, int keep)
+{
+	int i;
+	MDB_dbi n = txn->mt_numdbs;
+	MDB_env *env = txn->mt_env;
+	unsigned char *tdbflags = txn->mt_dbflags;
+
+	for (i = n; --i >= CORE_DBS;) {
+		if (tdbflags[i] & DB_NEW) {
+			if (keep) {
+				env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID;
+			} else {
+				char *ptr = env->me_dbxs[i].md_name.mv_data;
+				if (ptr) {
+					env->me_dbxs[i].md_name.mv_data = NULL;
+					env->me_dbxs[i].md_name.mv_size = 0;
+					env->me_dbflags[i] = 0;
+					env->me_dbiseqs[i]++;
+					free(ptr);
+				}
+			}
+		}
+	}
+	if (keep && env->me_numdbs < n)
+		env->me_numdbs = n;
+}
+
+/** End a transaction, except successful commit of a nested transaction.
+ * May be called twice for readonly txns: First reset it, then abort.
+ * @param[in] txn the transaction handle to end
+ * @param[in] mode why and how to end the transaction
+ */
+static void
+mdb_txn_end(MDB_txn *txn, unsigned mode)
+{
+	MDB_env *env = txn->mt_env;
+#if MDB_DEBUG
+	static const char *const names[] = MDB_END_NAMES;
+#endif
+
+	/* Export or close DBI handles opened in this txn */
+	mdb_dbis_update(txn, mode & MDB_END_UPDATE);
+
+	DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
+		names[mode & MDB_END_OPMASK],
+		txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ?
'r' : 'w',
+		(void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root));
+
+	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
+		if (txn->mt_u.reader) {
+			txn->mt_u.reader->mr_txnid = (txnid_t)-1;	/* (txnid_t)-1 is the value mdb_txn_renew0 expects in a reusable slot */
+			if (!(env->me_flags & MDB_NOTLS)) {
+				txn->mt_u.reader = NULL; /* txn does not own reader */
+			} else if (mode & MDB_END_SLOT) {
+				txn->mt_u.reader->mr_pid = 0;
+				txn->mt_u.reader = NULL;
+			} /* else txn owns the slot until it does MDB_END_SLOT */
+		}
+		txn->mt_numdbs = 0;		/* prevent further DBI activity */
+		txn->mt_flags |= MDB_TXN_FINISHED;
+
+	} else if (!F_ISSET(txn->mt_flags, MDB_TXN_FINISHED)) {
+		pgno_t *pghead = env->me_pghead;	/* captured before me_pghead is reset or restored below */
+
+		if (!(mode & MDB_END_UPDATE)) /* !(already closed cursors) */
+			mdb_cursors_close(txn, 0);
+		if (!(env->me_flags & MDB_WRITEMAP)) {
+			mdb_dlist_free(txn);
+		}
+
+		txn->mt_numdbs = 0;
+		txn->mt_flags = MDB_TXN_FINISHED;
+
+		if (!txn->mt_parent) {
+			mdb_midl_shrink(&txn->mt_free_pgs);
+			env->me_free_pgs = txn->mt_free_pgs;	/* hand the list back to the env for reuse */
+			/* me_pgstate: */
+			env->me_pghead = NULL;
+			env->me_pglast = 0;
+
+			env->me_txn = NULL;
+			mode = 0;	/* txn == env->me_txn0, do not free() it */
+
+			/* The writer mutex was locked in mdb_txn_begin. */
+			if (env->me_txns)
+				UNLOCK_MUTEX(env->me_wmutex);
+		} else {
+			txn->mt_parent->mt_child = NULL;
+			txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD;
+			env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate;	/* restore the page state saved when the child began */
+			mdb_midl_free(txn->mt_free_pgs);
+			free(txn->mt_u.dirty_list);
+		}
+		cppmidl_free(txn->mt_spill_pgs);	/* NOTE(review): mt_spill_pgs is not cleared here; mdb_txn_renew0 sets it to NULL when the write txn is reused */
+		mdb_midl_free(pghead);
+	}
+
+	if (mode & MDB_END_FREE)
+		free(txn);
+}
+
+void
+mdb_txn_reset(MDB_txn *txn)
+{
+	if (txn == NULL)
+		return;
+
+	/* This call is only valid for read-only txns */
+	if (!(txn->mt_flags & MDB_TXN_RDONLY))
+		return;
+
+	mdb_txn_end(txn, MDB_END_RESET);	/* no MDB_END_FREE: the handle stays allocated */
+}
+
+void
+mdb_txn_abort(MDB_txn *txn)
+{
+	if (txn == NULL)
+		return;
+
+	if (txn->mt_child)
+		mdb_txn_abort(txn->mt_child);	/* abort the nested child first */
+
+	mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE);
+}
+
+/** Save the freelist as of this transaction to the freeDB.
+ * This changes the freelist. Keep trying until it stabilizes.
+ * @param[in] txn the write transaction being committed
+ * @return 0 on success, otherwise the error of the failed cursor,
+ * page-search or IDL operation.
+ */
+static int
+mdb_freelist_save(MDB_txn *txn)
+{
+	/* env->me_pghead[] can grow and shrink during this call.
+	 * env->me_pglast and txn->mt_free_pgs[] can only grow.
+	 * Page numbers cannot disappear from txn->mt_free_pgs[].
+	 */
+	MDB_cursor mc;
+	MDB_env *env = txn->mt_env;
+	int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1;
+	txnid_t	pglast = 0, head_id = 0;
+	pgno_t	freecnt = 0, *free_pgs, *mop;
+	ssize_t	head_room = 0, total_room = 0, mop_len, clean_limit;
+
+	mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
+
+	if (env->me_pghead) {
+		/* Make sure first page of freeDB is touched and on freelist */
+		rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY);
+		if (rc && rc != MDB_NOTFOUND)
+			return rc;
+	}
+
+	if (!env->me_pghead && txn->mt_loose_pgs) {
+		/* Put loose page numbers in mt_free_pgs, since
+		 * we may be unable to return them to me_pghead.
+		 */
+		MDB_page *mp = txn->mt_loose_pgs;
+		MDB_ID2 *dl = txn->mt_u.dirty_list;
+		unsigned x;
+		if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0)
+			return rc;
+		for (; mp; mp = NEXT_LOOSE_PAGE(mp)) {
+			mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
+			/* must also remove from dirty list */
+			if (txn->mt_flags & MDB_TXN_WRITEMAP) {
+				for (x=1; x<=dl[0].mid; x++)
+					if (dl[x].mid == mp->mp_pgno)
+						break;
+				mdb_tassert(txn, x <= dl[0].mid);
+			} else {
+				x = mdb_mid2l_search(dl, mp->mp_pgno);
+				mdb_tassert(txn, dl[x].mid == mp->mp_pgno);
+				mdb_dpage_free(env, mp);	/* NOTE(review): mp is still read by NEXT_LOOSE_PAGE in the loop step; confirm mdb_dpage_free keeps that field readable */
+			}
+			dl[x].mptr = NULL;	/* NULL mptr marks the slot for the squash pass below */
+		}
+		{
+			/* squash freed slots out of the dirty list */
+			unsigned y;
+			for (y=1; dl[y].mptr && y <= dl[0].mid; y++);
+			if (y <= dl[0].mid) {
+				for(x=y, y++;;) {
+					while (!dl[y].mptr && y <= dl[0].mid) y++;
+					if (y > dl[0].mid) break;
+					dl[x++] = dl[y++];
+				}
+				dl[0].mid = x-1;
+			} else {
+				/* all slots freed */
+				dl[0].mid = 0;
+			}
+		}
+		txn->mt_loose_pgs = NULL;
+		txn->mt_loose_count = 0;
+	}
+
+	/* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */
+	clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP))
+		? SSIZE_MAX : maxfree_1pg;
+
+	for (;;) {
+		/* Come back here after each Put() in case freelist changed */
+		MDB_val key, data;
+		pgno_t *pgs;
+		ssize_t j;
+
+		/* If using records from freeDB which we have not yet
+		 * deleted, delete them and any we reserved for me_pghead.
+		 */
+		while (pglast < env->me_pglast) {
+			rc = mdb_cursor_first(&mc, &key, NULL);
+			if (rc)
+				return rc;
+			pglast = head_id = *(txnid_t *)key.mv_data;
+			total_room = head_room = 0;	/* reservations made so far are void after a delete */
+			mdb_tassert(txn, pglast <= env->me_pglast);
+			rc = mdb_cursor_del(&mc, 0);
+			if (rc)
+				return rc;
+		}
+
+		/* Save the IDL of pages freed by this txn, to a single record */
+		if (freecnt < txn->mt_free_pgs[0]) {
+			if (!freecnt) {
+				/* Make sure last page of freeDB is touched and on freelist */
+				rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY);
+				if (rc && rc != MDB_NOTFOUND)
+					return rc;
+			}
+			free_pgs = txn->mt_free_pgs;
+			/* Write to last page of freeDB */
+			key.mv_size = sizeof(txn->mt_txnid);
+			key.mv_data = &txn->mt_txnid;
+			do {
+				freecnt = free_pgs[0];
+				data.mv_size = MDB_IDL_SIZEOF(free_pgs);
+				rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
+				if (rc)
+					return rc;
+				/* Retry if mt_free_pgs[] grew during the Put() */
+				free_pgs = txn->mt_free_pgs;
+			} while (freecnt < free_pgs[0]);
+			mdb_midl_sort(free_pgs);
+			memcpy(data.mv_data, free_pgs, data.mv_size);	/* fill the space reserved by the last Put() */
+#if (MDB_DEBUG) > 1
+			{
+				unsigned int i = free_pgs[0];
+				DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u",
+					txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i));
+				for (; i; i--)
+					DPRINTF(("IDL %"Z"u", free_pgs[i]));
+			}
+#endif
+			continue;
+		}
+
+		mop = env->me_pghead;
+		mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count;	/* entries that still need a home: me_pghead plus loose pages */
+
+		/* Reserve records for me_pghead[]. Split it if multi-page,
+		 * to avoid searching freeDB for a page range. Use keys in
+		 * range [1,me_pglast]: Smaller than txnid of oldest reader.
+		 */
+		if (total_room >= mop_len) {
+			if (total_room == mop_len || --more < 0)
+				break;
+		} else if (head_room >= maxfree_1pg && head_id > 1) {
+			/* Keep current record (overflow page), add a new one */
+			head_id--;
+			head_room = 0;
+		}
+		/* (Re)write {key = head_id, IDL length = head_room} */
+		total_room -= head_room;
+		head_room = mop_len - total_room;
+		if (head_room > maxfree_1pg && head_id > 1) {
+			/* Overflow multi-page for part of me_pghead */
+			head_room /= head_id; /* amortize page sizes */
+			head_room += maxfree_1pg - head_room % (maxfree_1pg + 1);
+		} else if (head_room < 0) {
+			/* Rare case, not bothering to delete this record */
+			head_room = 0;
+		}
+		key.mv_size = sizeof(head_id);
+		key.mv_data = &head_id;
+		data.mv_size = (head_room + 1) * sizeof(pgno_t);	/* +1 for the length word in front of the IDL */
+		rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
+		if (rc)
+			return rc;
+		/* IDL is initially empty, zero out at least the length */
+		pgs = (pgno_t *)data.mv_data;
+		j = head_room > clean_limit ? head_room : 0;	/* clear every entry only above clean_limit, else just pgs[0] */
+		do {
+			pgs[j] = 0;
+		} while (--j >= 0);
+		total_room += head_room;
+	}
+
+	/* Return loose page numbers to me_pghead, though usually none are
+	 * left at this point. The pages themselves remain in dirty_list.
+	 */
+	if (txn->mt_loose_pgs) {
+		MDB_page *mp = txn->mt_loose_pgs;
+		unsigned count = txn->mt_loose_count;
+		MDB_IDL loose;
+		/* Room for loose pages + temp IDL with same */
+		if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0)
+			return rc;
+		mop = env->me_pghead;
+		loose = mop + MDB_IDL_ALLOCLEN(mop) - count;	/* temp IDL is placed in the tail of me_pghead's allocation */
+		for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp))
+			loose[ ++count ] = mp->mp_pgno;
+		loose[0] = count;
+		mdb_midl_sort(loose);
+		mdb_midl_xmerge(mop, loose);
+		txn->mt_loose_pgs = NULL;
+		txn->mt_loose_count = 0;
+		mop_len = mop[0];
+	}
+
+	/* Fill in the reserved me_pghead records */
+	rc = MDB_SUCCESS;
+	if (mop_len) {
+		MDB_val key, data;
+
+		mop += mop_len;	/* start at the end of the list and hand out chunks backwards */
+		rc = mdb_cursor_first(&mc, &key, &data);
+		for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) {
+			txnid_t id = *(txnid_t *)key.mv_data;
+			ssize_t	len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1;
+			MDB_ID save;
+
+			mdb_tassert(txn, len >= 0 && id <= env->me_pglast);
+			key.mv_data = &id;
+			if (len > mop_len) {
+				len = mop_len;
+				data.mv_size = (len + 1) * sizeof(MDB_ID);
+			}
+			data.mv_data = mop -= len;
+			save = mop[0];
+			mop[0] = len;	/* temporarily write the chunk length in front of the chunk */
+			rc = mdb_cursor_put(&mc, &key, &data, MDB_CURRENT);
+			mop[0] = save;	/* put back the entry that was overwritten */
+			if (rc || !(mop_len -= len))
+				break;
+		}
+	}
+	return rc;
+}
+
+/** Flush (some) dirty pages to the map, after clearing their dirty flag.
+ * @param[in] txn the transaction that's being committed
+ * @param[in] keep number of initial pages in dirty_list to keep dirty.
+ * @return 0 on success, non-zero on failure.
+ */
+static int
+mdb_page_flush(MDB_txn *txn, int keep)
+{
+	MDB_env		*env = txn->mt_env;
+	MDB_ID2L	dl = txn->mt_u.dirty_list;
+	unsigned	psize = env->me_psize, j;
+	int			i, pagecount = dl[0].mid, rc;
+	size_t		size = 0, pos = 0;
+	pgno_t		pgno = 0;
+	MDB_page	*dp = NULL;
+#ifdef _WIN32
+	OVERLAPPED	ov;
+#else
+	struct iovec iov[MDB_COMMIT_PAGES];
+	ssize_t		wpos = 0, wsize = 0, wres;
+	size_t		next_pos = 1; /* impossible pos, so pos != next_pos */
+	int			n = 0;
+#endif
+
+	j = i = keep;	/* i scans the dirty list, j is the last slot of the part that stays */
+
+	if (env->me_flags & MDB_WRITEMAP) {
+		/* Clear dirty flags */
+		while (++i <= pagecount) {
+			dp = dl[i].mptr;
+			/* Don't flush this page yet */
+			if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
+				dp->mp_flags &= ~P_KEEP;
+				dl[++j] = dl[i];
+				continue;
+			}
+			dp->mp_flags &= ~P_DIRTY;
+		}
+		goto done;	/* WRITEMAP: no write calls are issued from here */
+	}
+
+	/* Write the pages */
+	for (;;) {
+		if (++i <= pagecount) {
+			dp = dl[i].mptr;
+			/* Don't flush this page yet */
+			if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
+				dp->mp_flags &= ~P_KEEP;
+				dl[i].mid = 0;	/* mid == 0 tags the entry as skipped for the compaction loop after the writes */
+				continue;
+			}
+			pgno = dl[i].mid;
+			/* clear dirty flag */
+			dp->mp_flags &= ~P_DIRTY;
+			pos = pgno * psize;
+			size = psize;
+			if (IS_OVERFLOW(dp)) size *= dp->mp_pages;
+		}
+#ifdef _WIN32
+		else break;
+
+		/* Windows actually supports scatter/gather I/O, but only on
+		 * unbuffered file handles. Since we're relying on the OS page
+		 * cache for all our data, that's self-defeating. So we just
+		 * write pages one at a time. We use the ov structure to set
+		 * the write offset, to at least save the overhead of a Seek
+		 * system call.
+		 */
+		DPRINTF(("committing page %"Z"u", pgno));
+		memset(&ov, 0, sizeof(ov));
+		ov.Offset = pos & 0xffffffff;
+		ov.OffsetHigh = pos >> 16 >> 16;
+		if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) {
+			rc = ErrCode();
+			DPRINTF(("WriteFile: %d", rc));
+			return rc;
+		}
+#else
+		/* Write up to MDB_COMMIT_PAGES dirty pages at a time. */
+		if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) {
+			if (n) {
+retry_write:
+				/* Write previous page(s) */
+#ifdef MDB_USE_PWRITEV
+				wres = pwritev(env->me_fd, iov, n, wpos);
+#else
+				if (n == 1) {
+					wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos);
+				} else {
+retry_seek:
+					if (lseek(env->me_fd, wpos, SEEK_SET) == -1) {
+						rc = ErrCode();
+						if (rc == EINTR)
+							goto retry_seek;
+						DPRINTF(("lseek: %s", strerror(rc)));
+						return rc;
+					}
+					wres = writev(env->me_fd, iov, n);
+				}
+#endif
+				if (wres != wsize) {
+					if (wres < 0) {
+						rc = ErrCode();
+						if (rc == EINTR)
+							goto retry_write;
+						DPRINTF(("Write error: %s", strerror(rc)));
+					} else {
+						rc = EIO; /* TODO: Use which error code? */
+						DPUTS("short write, filesystem full?");
+					}
+					return rc;
+				}
+				n = 0;
+			}
+			if (i > pagecount)
+				break;
+			wpos = pos;	/* start a new batch at this page's file offset */
+			wsize = 0;
+		}
+		DPRINTF(("committing page %"Z"u", pgno));
+		next_pos = pos + size;	/* a page starting exactly here can join the current batch */
+		iov[n].iov_len = size;
+		iov[n].iov_base = (char *)dp;
+		wsize += size;
+		n++;
+#endif	/* _WIN32 */
+	}
+
+	/* MIPS has cache coherency issues, this is a no-op everywhere else
+	 * Note: for any size >= on-chip cache size, entire on-chip cache is
+	 * flushed.
+	 */
+	CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE);
+
+	for (i = keep; ++i <= pagecount; ) {
+		dp = dl[i].mptr;
+		/* This is a page we skipped above */
+		if (!dl[i].mid) {
+			dl[++j] = dl[i];
+			dl[j].mid = dp->mp_pgno;	/* restore the page number that was zeroed as the skip tag */
+			continue;
+		}
+		mdb_dpage_free(env, dp);
+	}
+
+done:
+	i--;	/* both loops above leave i at pagecount + 1 */
+	txn->mt_dirty_room += i - j;	/* entries that left the dirty list become free room */
+	dl[0].mid = j;
+	return MDB_SUCCESS;
+}
+
+static int ESECT mdb_env_share_locks(MDB_env *env, int *excl);
+
+int
+mdb_txn_commit(MDB_txn *txn)
+{
+	int		rc;
+	unsigned int i, end_mode;
+	MDB_env	*env;
+
+	if (txn == NULL)
+		return EINVAL;
+
+	/* mdb_txn_end() mode for a commit which writes nothing */
+	end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE;
+
+	if (txn->mt_child) {
+		rc = mdb_txn_commit(txn->mt_child);	/* a pending child is committed first */
+		if (rc)
+			goto fail;
+	}
+
+	env = txn->mt_env;
+
+	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
+		goto done;	/* read-only txn: ended with the empty-commit mode */
+	}
+
+	if (txn->mt_flags & (MDB_TXN_FINISHED|MDB_TXN_ERROR)) {
+		DPUTS("txn has failed/finished, can't commit");
+		if (txn->mt_parent)
+			txn->mt_parent->mt_flags |= MDB_TXN_ERROR;
+		rc = MDB_BAD_TXN;
+		goto fail;
+	}
+
+	if (txn->mt_parent) {
+		MDB_txn *parent = txn->mt_parent;
+		MDB_page **lp;
+		MDB_ID2L dst, src;
+		const MDB_ID* spillp;
+		unsigned x, y, len, ps_len;	/* NOTE(review): ps_len looks unused in the visible code */
+
+		/* Append our free list to parent's */
+		rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs);
+		if (rc)
+			goto fail;
+		mdb_midl_free(txn->mt_free_pgs);
+		/* Failures after this must either undo the changes
+		 * to the parent or set MDB_TXN_ERROR in the parent.
+		 */
+
+		parent->mt_next_pgno = txn->mt_next_pgno;
+		parent->mt_flags = txn->mt_flags;
+
+		/* Merge our cursors into parent's and close them */
+		mdb_cursors_close(txn, 1);
+
+		/* Update parent's DB table.
*/
+		memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
+		parent->mt_numdbs = txn->mt_numdbs;
+		parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI];
+		parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI];
+		for (i=CORE_DBS; i<txn->mt_numdbs; i++) {
+			/* preserve parent's DB_NEW status */
+			x = parent->mt_dbflags[i] & DB_NEW;
+			parent->mt_dbflags[i] = txn->mt_dbflags[i] | x;
+		}
+
+		dst = parent->mt_u.dirty_list;
+		src = txn->mt_u.dirty_list;
+		/* Remove anything in our dirty list from parent's spill list */
+		if (parent->mt_spill_pgs && !cppmidl_empty(parent->mt_spill_pgs)) {
+			/* Erase each of our dirty pages from the parent's spill
+			 * list, using the page number shifted left one bit as key.
+			 */
+			for (i=0, len=src[0].mid; ++i <= len; ) {
+				MDB_ID pn = src[i].mid << 1;
+				cppmidl_erase(parent->mt_spill_pgs, pn);
+			}
+		}
+
+		/* Remove anything in our spill list from parent's dirty list */
+		if (txn->mt_spill_pgs && !cppmidl_empty(txn->mt_spill_pgs)) {
+			cppmidl_begin(txn->mt_spill_pgs);
+			while ((spillp = cppmidl_next(txn->mt_spill_pgs)) != NULL) {
+				MDB_ID pn = *spillp;
+				pn >>= 1;	/* undo the shift to get the page number */
+				y = mdb_mid2l_search(dst, pn);
+				if (y <= dst[0].mid && dst[y].mid == pn) {
+					free(dst[y].mptr);
+					/* close the gap left by the removed entry */
+					while (y < dst[0].mid) {
+						dst[y] = dst[y+1];
+						y++;
+					}
+					dst[0].mid--;
+				}
+			}
+		}
+
+		/* Find len = length of merging our dirty list with parent's */
+		x = dst[0].mid;
+		dst[0].mid = 0;		/* simplify loops */
+		if (parent->mt_parent) {
+			len = x + src[0].mid;
+			y = mdb_mid2l_search(src, dst[x].mid + 1) - 1;
+			for (i = x; y && i; y--) {
+				pgno_t yp = src[y].mid;
+				while (yp < dst[i].mid)
+					i--;
+				if (yp == dst[i].mid) {
+					i--;
+					len--;
+				}
+			}
+		} else { /* Simplify the above for single-ancestor case */
+			len = MDB_IDL_DIRTY_MAX - txn->mt_dirty_room;
+		}
+		/* Merge our dirty list with parent's, back to front. Where
+		 * both lists hold the same page, the parent's copy is freed
+		 * and ours takes its place.
+		 */
+		y = src[0].mid;
+		for (i = len; y; dst[i--] = src[y--]) {
+			pgno_t yp = src[y].mid;
+			while (yp < dst[x].mid)
+				dst[i--] = dst[x--];
+			if (yp == dst[x].mid)
+				free(dst[x--].mptr);
+		}
+		mdb_tassert(txn, i == x);
+		dst[0].mid = len;
+		free(txn->mt_u.dirty_list);
+		parent->mt_dirty_room = txn->mt_dirty_room;
+		if (txn->mt_spill_pgs) {
+			if (parent->mt_spill_pgs) {
+				cppmidl_insert_list(parent->mt_spill_pgs, txn->mt_spill_pgs);
+				cppmidl_free(txn->mt_spill_pgs);
+			} else {
+				/* parent had none: it simply takes over our list */
+				parent->mt_spill_pgs = txn->mt_spill_pgs;
+			}
+		}
+
+		/* Append our loose page list to parent's */
+		for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp))
+			;
+		*lp = txn->mt_loose_pgs;
+		parent->mt_loose_count += txn->mt_loose_count;
+
+		parent->mt_child = NULL;
+		mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead);
+		free(txn);
+		return rc;
+	}
+
+	if (txn != env->me_txn) {
+		DPUTS("attempt to commit unknown transaction");
+		rc = EINVAL;
+		goto fail;
+	}
+
+	mdb_cursors_close(txn, 0);
+
+	/* Nothing dirty and nothing spilled: end as an empty commit */
+	if (!txn->mt_u.dirty_list[0].mid &&
+		!(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS)))
+		goto done;
+
+	DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u",
+	    txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root));
+
+	/* Update DB root pointers */
+	if (txn->mt_numdbs > CORE_DBS) {
+		MDB_cursor mc;
+		MDB_dbi i;
+		MDB_val data;
+		data.mv_size = sizeof(MDB_db);
+
+		mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
+		for (i = CORE_DBS; i < txn->mt_numdbs; i++) {
+			if (txn->mt_dbflags[i] & DB_DIRTY) {
+				if (TXN_DBI_CHANGED(txn, i)) {
+					rc = MDB_BAD_DBI;
+					goto fail;
+				}
+				data.mv_data = &txn->mt_dbs[i];
+				rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data,
+					F_SUBDATA);
+				if (rc)
+					goto fail;
+			}
+		}
+	}
+
+	rc = mdb_freelist_save(txn);
+	if (rc)
+		goto fail;
+
+	mdb_midl_free(env->me_pghead);
+	env->me_pghead = NULL;
+	mdb_midl_shrink(&txn->mt_free_pgs);
+
+#if (MDB_DEBUG) > 2
+	mdb_audit(txn);
+#endif
+
+	/* Order matters: data pages, then sync, then the meta page */
+	if ((rc = mdb_page_flush(txn, 0)) ||
+		(rc = mdb_env_sync(env, 0)) ||
+		(rc = mdb_env_write_meta(txn)))
+		goto fail;
+	end_mode = MDB_END_COMMITTED|MDB_END_UPDATE;
+	if (env->me_flags & MDB_PREVSNAPSHOT) {
+		if (!(env->me_flags & MDB_NOLOCK)) {
+			int excl;
+			rc = mdb_env_share_locks(env, &excl);
+ if (rc) + goto fail; + } + env->me_flags ^= MDB_PREVSNAPSHOT; + } + +done: + mdb_txn_end(txn, end_mode); + return MDB_SUCCESS; + +fail: + mdb_txn_abort(txn); + return rc; +} + +/** Read the environment parameters of a DB environment before + * mapping it into memory. + * @param[in] env the environment handle + * @param[in] prev whether to read the backup meta page + * @param[out] meta address of where to store the meta information + * @return 0 on success, non-zero on failure. + */ +static int ESECT +mdb_env_read_header(MDB_env *env, int prev, MDB_meta *meta) +{ + MDB_metabuf pbuf; + MDB_page *p; + MDB_meta *m; + int i, rc, off; + enum { Size = sizeof(pbuf) }; + + /* We don't know the page size yet, so use a minimum value. + * Read both meta pages so we can use the latest one. + */ + + for (i=off=0; imm_psize) { +#ifdef _WIN32 + DWORD len; + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1; + if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF) + rc = 0; +#else + rc = pread(env->me_fd, &pbuf, Size, off); +#endif + if (rc != Size) { + if (rc == 0 && off == 0) + return ENOENT; + rc = rc < 0 ? (int) ErrCode() : MDB_INVALID; + DPRINTF(("read: %s", mdb_strerror(rc))); + return rc; + } + + p = (MDB_page *)&pbuf; + + if (!F_ISSET(p->mp_flags, P_META)) { + DPRINTF(("page %"Z"u not a meta page", p->mp_pgno)); + return MDB_INVALID; + } + + m = METADATA(p); + if (m->mm_magic != MDB_MAGIC) { + DPUTS("meta has invalid magic"); + return MDB_INVALID; + } + + if (m->mm_version != MDB_DATA_VERSION) { + DPRINTF(("database is version %u, expected version %u", + m->mm_version, MDB_DATA_VERSION)); + return MDB_VERSION_MISMATCH; + } + + if (off == 0 || (prev ? 
m->mm_txnid < meta->mm_txnid : m->mm_txnid > meta->mm_txnid)) + *meta = *m; + } + return 0; +} + +/** Fill in most of the zeroed #MDB_meta for an empty database environment */ +static void ESECT +mdb_env_init_meta0(MDB_env *env, MDB_meta *meta) +{ + meta->mm_magic = MDB_MAGIC; + meta->mm_version = MDB_DATA_VERSION; + meta->mm_mapsize = env->me_mapsize; + meta->mm_psize = env->me_psize; + meta->mm_last_pg = NUM_METAS-1; + meta->mm_flags = env->me_flags & 0xffff; + meta->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ + meta->mm_dbs[FREE_DBI].md_root = P_INVALID; + meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; +} + +/** Write the environment parameters of a freshly created DB environment. + * @param[in] env the environment handle + * @param[in] meta the #MDB_meta to write + * @return 0 on success, non-zero on failure. + */ +static int ESECT +mdb_env_init_meta(MDB_env *env, MDB_meta *meta) +{ + MDB_page *p, *q; + int rc; + unsigned int psize; +#ifdef _WIN32 + DWORD len; + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); +#define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ + ov.Offset = pos; \ + rc = WriteFile(fd, ptr, size, &len, &ov); } while(0) +#else + int len; +#define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ + len = pwrite(fd, ptr, size, pos); \ + if (len == -1 && ErrCode() == EINTR) continue; \ + rc = (len >= 0); break; } while(1) +#endif + + DPUTS("writing new meta page"); + + psize = env->me_psize; + + p = calloc(NUM_METAS, psize); + if (!p) + return ENOMEM; + + p->mp_pgno = 0; + p->mp_flags = P_META; + *(MDB_meta *)METADATA(p) = *meta; + + q = (MDB_page *)((char *)p + psize); + q->mp_pgno = 1; + q->mp_flags = P_META; + *(MDB_meta *)METADATA(q) = *meta; + + DO_PWRITE(rc, env->me_fd, p, psize * NUM_METAS, len, 0); + if (!rc) + rc = ErrCode(); + else if ((unsigned) len == psize * NUM_METAS) + rc = MDB_SUCCESS; + else + rc = ENOSPC; + free(p); + return rc; +} + +/** Update the environment info to commit a transaction. 
+ * @param[in] txn the transaction that's being committed + * @return 0 on success, non-zero on failure. + */ +static int +mdb_env_write_meta(MDB_txn *txn) +{ + MDB_env *env; + MDB_meta meta, metab, *mp; + unsigned flags; + size_t mapsize; + off_t off; + int rc, len, toggle; + char *ptr; + HANDLE mfd; +#ifdef _WIN32 + OVERLAPPED ov; +#else + int r2; +#endif + + toggle = txn->mt_txnid & 1; + DPRINTF(("writing meta page %d for root page %"Z"u", + toggle, txn->mt_dbs[MAIN_DBI].md_root)); + + env = txn->mt_env; + flags = env->me_flags; + mp = env->me_metas[toggle]; + mapsize = env->me_metas[toggle ^ 1]->mm_mapsize; + /* Persist any increases of mapsize config */ + if (mapsize < env->me_mapsize) + mapsize = env->me_mapsize; + + if (flags & MDB_WRITEMAP) { + mp->mm_mapsize = mapsize; + mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; + mp->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + mp->mm_last_pg = txn->mt_next_pgno - 1; +#if (__GNUC__ * 100 + __GNUC_MINOR__ >= 404) && /* TODO: portability */ \ + !(defined(__i386__) || defined(__x86_64__)) + /* LY: issue a memory barrier, if not x86. ITS#7969 */ + __sync_synchronize(); +#endif + mp->mm_txnid = txn->mt_txnid; + if (!(flags & (MDB_NOMETASYNC|MDB_NOSYNC))) { + unsigned meta_size = env->me_psize; + rc = (env->me_flags & MDB_MAPASYNC) ? 
MS_ASYNC : MS_SYNC; + ptr = (char *)mp - PAGEHDRSZ; +#ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */ + r2 = (ptr - env->me_map) & (env->me_os_psize - 1); + ptr -= r2; + meta_size += r2; +#endif + if (MDB_MSYNC(ptr, meta_size, rc)) { + rc = ErrCode(); + goto fail; + } + } + goto done; + } + metab.mm_txnid = mp->mm_txnid; + metab.mm_last_pg = mp->mm_last_pg; + + meta.mm_mapsize = mapsize; + meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; + meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + meta.mm_last_pg = txn->mt_next_pgno - 1; + meta.mm_txnid = txn->mt_txnid; + + off = offsetof(MDB_meta, mm_mapsize); + ptr = (char *)&meta + off; + len = sizeof(MDB_meta) - off; + off += (char *)mp - env->me_map; + + /* Write to the SYNC fd unless MDB_NOSYNC/MDB_NOMETASYNC. + * (me_mfd goes to the same file as me_fd, but writing to it + * also syncs to disk. Avoids a separate fdatasync() call.) + */ + mfd = (flags & (MDB_NOSYNC|MDB_NOMETASYNC)) ? env->me_fd : env->me_mfd; +#ifdef _WIN32 + { + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov)) + rc = -1; + } +#else +retry_write: + rc = pwrite(mfd, ptr, len, off); +#endif + if (rc != len) { + rc = rc < 0 ? ErrCode() : EIO; +#ifndef _WIN32 + if (rc == EINTR) + goto retry_write; +#endif + DPUTS("write failed, disk error?"); + /* On a failure, the pagecache still contains the new data. + * Write some old data back, to prevent it from being used. + * Use the non-SYNC fd; we know it will fail anyway. + */ + meta.mm_last_pg = metab.mm_last_pg; + meta.mm_txnid = metab.mm_txnid; +#ifdef _WIN32 + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + WriteFile(env->me_fd, ptr, len, NULL, &ov); +#else + r2 = pwrite(env->me_fd, ptr, len, off); + (void)r2; /* Silence warnings. 
We don't care about pwrite's return value */ +#endif +fail: + env->me_flags |= MDB_FATAL_ERROR; + return rc; + } + /* MIPS has cache coherency issues, this is a no-op everywhere else */ + CACHEFLUSH(env->me_map + off, len, DCACHE); +done: + /* Memory ordering issues are irrelevant; since the entire writer + * is wrapped by wmutex, all of these changes will become visible + * after the wmutex is unlocked. Since the DB is multi-version, + * readers will get consistent data regardless of how fresh or + * how stale their view of these values is. + */ + if (env->me_txns) + env->me_txns->mti_txnid = txn->mt_txnid; + + return MDB_SUCCESS; +} + +/** Check both meta pages to see which one is newer. + * @param[in] env the environment handle + * @return newest #MDB_meta. + */ +static MDB_meta * +mdb_env_pick_meta(const MDB_env *env) +{ + MDB_meta *const *metas = env->me_metas; + return metas[ (metas[0]->mm_txnid < metas[1]->mm_txnid) ^ + ((env->me_flags & MDB_PREVSNAPSHOT) != 0) ]; +} + +int ESECT +mdb_env_create(MDB_env **env) +{ + MDB_env *e; + + e = calloc(1, sizeof(MDB_env)); + if (!e) + return ENOMEM; + + e->me_maxreaders = DEFAULT_READERS; + e->me_maxdbs = e->me_numdbs = CORE_DBS; + e->me_fd = INVALID_HANDLE_VALUE; + e->me_lfd = INVALID_HANDLE_VALUE; + e->me_mfd = INVALID_HANDLE_VALUE; +#ifdef MDB_USE_POSIX_SEM + e->me_rmutex = SEM_FAILED; + e->me_wmutex = SEM_FAILED; +#endif + e->me_pid = getpid(); + GET_PAGESIZE(e->me_os_psize); + VGMEMP_CREATE(e,0,0); + *env = e; + return MDB_SUCCESS; +} + +static int ESECT +mdb_fsize(HANDLE fd, size_t* size); + +static int ESECT +mdb_env_map(MDB_env *env, void *addr) +{ + MDB_page *p; + unsigned int flags = env->me_flags; +#ifdef _WIN32 + int rc; + HANDLE mh; + LONG sizelo, sizehi; + size_t msize; + + if (flags & MDB_RDONLY) { + /* Don't set explicit map size, use whatever exists */ + msize = 0; + sizelo = 0; + sizehi = 0; + } else { + msize = env->me_mapsize; + sizelo = msize & 0xffffffff; + sizehi = msize >> 16 >> 16; /* only 
needed on Win64 */ + + /* Windows won't create mappings for zero length files. + * and won't map more than the file size. + * Just set the maxsize right now. + */ + if (!(flags & MDB_WRITEMAP) && (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo + || !SetEndOfFile(env->me_fd) + || SetFilePointer(env->me_fd, 0, NULL, 0) != 0)) + return ErrCode(); + } + + mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ? + PAGE_READWRITE : PAGE_READONLY, + sizehi, sizelo, NULL); + if (!mh) + return ErrCode(); + env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ? + FILE_MAP_WRITE : FILE_MAP_READ, + 0, 0, msize, addr); + rc = env->me_map ? 0 : ErrCode(); + CloseHandle(mh); + if (rc) + return rc; +#else + int prot = PROT_READ; + if (flags & MDB_WRITEMAP) { + size_t fsize; + prot |= PROT_WRITE; + if ( (mdb_fsize(env->me_fd, &fsize)!=MDB_SUCCESS || env->me_mapsize>fsize) + && ftruncate(env->me_fd, env->me_mapsize) < 0) + return ErrCode(); + } + env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED, + env->me_fd, 0); + if (env->me_map == MAP_FAILED) { + env->me_map = NULL; + return ErrCode(); + } + + if (flags & MDB_NORDAHEAD) { + /* Turn off readahead. It's harmful when the DB is larger than RAM. */ +#ifdef MADV_RANDOM + madvise(env->me_map, env->me_mapsize, MADV_RANDOM); +#else +#ifdef POSIX_MADV_RANDOM + posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM); +#endif /* POSIX_MADV_RANDOM */ +#endif /* MADV_RANDOM */ + } +#endif /* _WIN32 */ + + /* Can happen because the address argument to mmap() is just a + * hint. mmap() can pick another, e.g. if the range is in use. + * The MAP_FIXED flag would prevent that, but then mmap could + * instead unmap existing pages to make room for the new map. + */ + if (addr && env->me_map != addr) + return EBUSY; /* TODO: Make a new MDB_* error code? 
*/ + + p = (MDB_page *)env->me_map; + env->me_metas[0] = METADATA(p); + env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + env->me_psize); + + return MDB_SUCCESS; +} + +int ESECT +mdb_env_set_mapsize(MDB_env *env, size_t size) +{ + /* If env is already open, caller is responsible for making + * sure there are no active txns. + */ + if (env->me_map) { + int rc; + MDB_meta *meta; + void *old; + if (env->me_txn) + return EINVAL; + meta = mdb_env_pick_meta(env); + if (!size) + size = meta->mm_mapsize; + { + /* Silently round up to minimum if the size is too small */ + size_t minsize = (meta->mm_last_pg + 1) * env->me_psize; + if (size < minsize) + size = minsize; + } + munmap(env->me_map, env->me_mapsize); + env->me_mapsize = size; + old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; + rc = mdb_env_map(env, old); + if (rc) + return rc; + } + env->me_mapsize = size; + if (env->me_psize) + env->me_maxpg = env->me_mapsize / env->me_psize; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) +{ + if (env->me_map) + return EINVAL; + env->me_maxdbs = dbs + CORE_DBS; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_set_maxreaders(MDB_env *env, unsigned int readers) +{ + if (env->me_map || readers < 1) + return EINVAL; + env->me_maxreaders = readers; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers) +{ + if (!env || !readers) + return EINVAL; + *readers = env->me_maxreaders; + return MDB_SUCCESS; +} + +static int ESECT +mdb_fsize(HANDLE fd, size_t *size) +{ +#ifdef _WIN32 + LARGE_INTEGER fsize; + + if (!GetFileSizeEx(fd, &fsize)) + return ErrCode(); + + *size = fsize.QuadPart; +#else + struct stat st; + + if (fstat(fd, &st)) + return ErrCode(); + + if (S_ISBLK(st.st_mode)) { + size_t ret; + if (ioctl(fd, BLKGETSIZE64, &ret) == 0) { + *size = ret; + return MDB_SUCCESS; + } + else { + return ErrCode(); + } + } + + *size = st.st_size; +#endif + return MDB_SUCCESS; +} + + +#ifdef _WIN32 
+typedef wchar_t	mdb_nchar_t;
+# define MDB_NAME(str)	L##str
+# define mdb_name_cpy	wcscpy
+#else
+/** Character type for file names: char on Unix, wchar_t on Windows */
+typedef char	mdb_nchar_t;
+# define MDB_NAME(str)	str		/**< #mdb_nchar_t[] string literal */
+# define mdb_name_cpy	strcpy	/**< Copy name (#mdb_nchar_t string) */
+#endif
+
+/** Filename - string of #mdb_nchar_t[] */
+typedef struct MDB_name {
+	int mn_len;					/**< Length  */
+	int mn_alloced;				/**< True if #mn_val was malloced */
+	mdb_nchar_t	*mn_val;		/**< Contents */
+} MDB_name;
+
+/** Filename suffixes [datafile,lockfile][without,with MDB_NOSUBDIR] */
+static const mdb_nchar_t *const mdb_suffixes[2][2] = {
+	{ MDB_NAME("/data.mdb"), MDB_NAME("")      },
+	{ MDB_NAME("/lock.mdb"), MDB_NAME("-lock") }
+};
+
+#define MDB_SUFFLEN 9	/**< Max string length in #mdb_suffixes[] */
+
+/** Set up filename + scratch area for filename suffix, for opening files.
+ * It should be freed with #mdb_fname_destroy().
+ * On Windows, paths are converted from char *UTF-8 to wchar_t *UTF-16.
+ *
+ * @param[in] path Pathname for #mdb_env_open().
+ * @param[in] envflags Whether a subdir and/or lockfile will be used.
+ * @param[out] fname Resulting filename, with room for a suffix if necessary.
+ * @return MDB_SUCCESS, or ENOMEM if the scratch copy cannot be allocated
+ * (on Windows: whatever utf8_to_utf16() returns).
+ */
+static int ESECT
+mdb_fname_init(const char *path, unsigned envflags, MDB_name *fname)
+{
+	/* NOTE(review): presumably F_ISSET is true only when both
+	 * MDB_NOSUBDIR and MDB_NOLOCK are set - confirm against its definition.
+	 */
+	int no_suffix = F_ISSET(envflags, MDB_NOSUBDIR|MDB_NOLOCK);
+	fname->mn_alloced = 0;
+#ifdef _WIN32
+	return utf8_to_utf16(path, fname, no_suffix ? 0 : MDB_SUFFLEN);
+#else
+	fname->mn_len = strlen(path);
+	if (no_suffix)
+		/* No suffix will be appended: borrow the caller's string
+		 * (mn_alloced stays 0, so it is never freed or modified here).
+		 */
+		fname->mn_val = (char *) path;
+	else if ((fname->mn_val = malloc(fname->mn_len + MDB_SUFFLEN+1)) != NULL) {
+		/* Private copy with room for the longest suffix plus the NUL */
+		fname->mn_alloced = 1;
+		strcpy(fname->mn_val, path);
+	}
+	else
+		return ENOMEM;
+	return MDB_SUCCESS;
+#endif
+}
+
+/** Destroy \b fname from #mdb_fname_init() */
+#define mdb_fname_destroy(fname) \
+	do { if ((fname).mn_alloced) free((fname).mn_val); } while (0)
+
+#ifdef O_CLOEXEC /* POSIX.1-2008: Set FD_CLOEXEC atomically at open() */
+# define MDB_CLOEXEC		O_CLOEXEC
+#else
+# define MDB_CLOEXEC		0
+#endif
+
+/** File type, access mode etc. for #mdb_fopen() */
+enum mdb_fopen_type {
+#ifdef _WIN32
+	MDB_O_RDONLY, MDB_O_RDWR, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS
+#else
+	/* A comment in mdb_fopen() explains some O_* flag choices. */
+	MDB_O_RDONLY= O_RDONLY,                            /**< for RDONLY me_fd */
+	MDB_O_RDWR  = O_RDWR  |O_CREAT,                    /**< for me_fd */
+	MDB_O_META  = O_WRONLY|MDB_DSYNC     |MDB_CLOEXEC, /**< for me_mfd */
+	MDB_O_COPY  = O_WRONLY|O_CREAT|O_EXCL|MDB_CLOEXEC, /**< for #mdb_env_copy() */
+	/** Bitmask for open() flags in enum #mdb_fopen_type.  The other bits
+	 * distinguish otherwise-equal MDB_O_* constants from each other.
+	 */
+	MDB_O_MASK  = MDB_O_RDWR|MDB_CLOEXEC | MDB_O_RDONLY|MDB_O_META|MDB_O_COPY,
+	MDB_O_LOCKS = MDB_O_RDWR|MDB_CLOEXEC | ((MDB_O_MASK+1) & ~MDB_O_MASK) /**< for me_lfd */
+#endif
+};
+
+/** Open an LMDB file.
+ * @param[in] env	The LMDB environment.
+ * @param[in,out] fname	Path from #mdb_fname_init().  A suffix is
+ * appended if necessary to create the filename, without changing mn_len.
+ * @param[in] which	Determines file type, access mode, etc.
+ * @param[in] mode	The Unix permissions for the file, if we create it.
+ * @param[out] res	Resulting file handle.
+ * @return 0 on success, non-zero on failure.
+ */ +static int ESECT +mdb_fopen(const MDB_env *env, MDB_name *fname, + enum mdb_fopen_type which, mdb_mode_t mode, + HANDLE *res) +{ + int rc = MDB_SUCCESS; + HANDLE fd; +#ifdef _WIN32 + DWORD acc, share, disp, attrs; +#else + int flags; +#endif + + if (fname->mn_alloced) /* modifiable copy */ + mdb_name_cpy(fname->mn_val + fname->mn_len, + mdb_suffixes[which==MDB_O_LOCKS][F_ISSET(env->me_flags, MDB_NOSUBDIR)]); + + /* The directory must already exist. Usually the file need not. + * MDB_O_META requires the file because we already created it using + * MDB_O_RDWR. MDB_O_COPY must not overwrite an existing file. + * + * With MDB_O_COPY we do not want the OS to cache the writes, since + * the source data is already in the OS cache. + * + * The lockfile needs FD_CLOEXEC (close file descriptor on exec*()) + * to avoid the flock() issues noted under Caveats in lmdb.h. + * Also set it for other filehandles which the user cannot get at + * and close himself, which he may need after fork(). I.e. all but + * me_fd, which programs do use via mdb_env_get_fd(). 
+ */ + +#ifdef _WIN32 + acc = GENERIC_READ|GENERIC_WRITE; + share = FILE_SHARE_READ|FILE_SHARE_WRITE; + disp = OPEN_ALWAYS; + attrs = FILE_ATTRIBUTE_NORMAL; + switch (which) { + case MDB_O_RDONLY: /* read-only datafile */ + acc = GENERIC_READ; + disp = OPEN_EXISTING; + break; + case MDB_O_META: /* for writing metapages */ + acc = GENERIC_WRITE; + disp = OPEN_EXISTING; + attrs = FILE_ATTRIBUTE_NORMAL|FILE_FLAG_WRITE_THROUGH; + break; + case MDB_O_COPY: /* mdb_env_copy() & co */ + acc = GENERIC_WRITE; + share = 0; + disp = CREATE_NEW; + attrs = FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH; + break; + default: break; /* silence gcc -Wswitch (not all enum values handled) */ + } + fd = CreateFileW(fname->mn_val, acc, share, NULL, disp, attrs, NULL); +#else + fd = open(fname->mn_val, which & MDB_O_MASK, mode); +#endif + + if (fd == INVALID_HANDLE_VALUE) + rc = ErrCode(); +#ifndef _WIN32 + else { + if (which != MDB_O_RDONLY && which != MDB_O_RDWR) { + /* Set CLOEXEC if we could not pass it to open() */ + if (!MDB_CLOEXEC && (flags = fcntl(fd, F_GETFD)) != -1) + (void) fcntl(fd, F_SETFD, flags | FD_CLOEXEC); + } + if (which == MDB_O_COPY && env->me_psize >= env->me_os_psize) { + /* This may require buffer alignment. There is no portable + * way to ask how much, so we require OS pagesize alignment. + */ +# ifdef F_NOCACHE /* __APPLE__ */ + (void) fcntl(fd, F_NOCACHE, 1); +# elif defined O_DIRECT + /* open(...O_DIRECT...) would break on filesystems without + * O_DIRECT support (ITS#7682). Try to set it here instead. 
+ */ + if ((flags = fcntl(fd, F_GETFL)) != -1) + (void) fcntl(fd, F_SETFL, flags | O_DIRECT); +# endif + } + } +#endif /* !_WIN32 */ + + *res = fd; + return rc; +} + + +#ifdef BROKEN_FDATASYNC +#include +#include +#endif + +static void mdb_env_sel_other(MDB_env* env) +{ + MDB_meta* const* metas = env->me_metas; + metas[metas[0]->mm_txnid > metas[1]->mm_txnid]->mm_txnid += 2; +} + +/** Further setup required for opening an LMDB environment + */ +static int ESECT +mdb_env_open2(MDB_env *env, int prev) +{ + unsigned int flags = env->me_flags; + int i, newenv = 0, rc; + MDB_meta meta; + +#ifdef _WIN32 + /* See if we should use QueryLimited */ + rc = GetVersion(); + if ((rc & 0xff) > 5) + env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION; + else + env->me_pidquery = PROCESS_QUERY_INFORMATION; +#endif /* _WIN32 */ + +#ifdef BROKEN_FDATASYNC + /* ext3/ext4 fdatasync is broken on some older Linux kernels. + * https://lkml.org/lkml/2012/9/3/83 + * Kernels after 3.6-rc6 are known good. + * https://lkml.org/lkml/2012/9/10/556 + * See if the DB is on ext3/ext4, then check for new enough kernel + * Kernels 2.6.32.60, 2.6.34.15, 3.2.30, and 3.5.4 are also known + * to be patched. 
+ */ + { + struct statfs st; + fstatfs(env->me_fd, &st); + while (st.f_type == 0xEF53) { + struct utsname uts; + int i; + uname(&uts); + if (uts.release[0] < '3') { + if (!strncmp(uts.release, "2.6.32.", 7)) { + i = atoi(uts.release+7); + if (i >= 60) + break; /* 2.6.32.60 and newer is OK */ + } else if (!strncmp(uts.release, "2.6.34.", 7)) { + i = atoi(uts.release+7); + if (i >= 15) + break; /* 2.6.34.15 and newer is OK */ + } + } else if (uts.release[0] == '3') { + i = atoi(uts.release+2); + if (i > 5) + break; /* 3.6 and newer is OK */ + if (i == 5) { + i = atoi(uts.release+4); + if (i >= 4) + break; /* 3.5.4 and newer is OK */ + } else if (i == 2) { + i = atoi(uts.release+4); + if (i >= 30) + break; /* 3.2.30 and newer is OK */ + } + } else { /* 4.x and newer is OK */ + break; + } + env->me_flags |= MDB_FSYNCONLY; + break; + } + } +#endif + + if ((i = mdb_env_read_header(env, prev, &meta)) != 0) { + if (i != ENOENT) + return i; + DPUTS("new mdbenv"); + newenv = 1; + env->me_psize = env->me_os_psize; + if (env->me_psize > MAX_PAGESIZE) + env->me_psize = MAX_PAGESIZE; + memset(&meta, 0, sizeof(meta)); + mdb_env_init_meta0(env, &meta); + meta.mm_mapsize = DEFAULT_MAPSIZE; + } else { + env->me_psize = meta.mm_psize; + } + + /* Was a mapsize configured? */ + if (!env->me_mapsize) { + env->me_mapsize = meta.mm_mapsize; + } + { + /* Make sure mapsize >= committed data size. Even when using + * mm_mapsize, which could be broken in old files (ITS#7789). + */ + size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize; + if (env->me_mapsize < minsize) + env->me_mapsize = minsize; + } + meta.mm_mapsize = env->me_mapsize; + + if (newenv && !(flags & MDB_FIXEDMAP)) { + /* mdb_env_map() may grow the datafile. Write the metapages + * first, so the file will be valid if initialization fails. + * Except with FIXEDMAP, since we do not yet know mm_address. 
+ * We could fill in mm_address later, but then a different + * program might end up doing that - one with a memory layout + * and map address which does not suit the main program. + */ + rc = mdb_env_init_meta(env, &meta); + if (rc) + return rc; + newenv = 0; + } + + rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL); + if (rc) + return rc; + + if (newenv) { + if (flags & MDB_FIXEDMAP) + meta.mm_address = env->me_map; + i = mdb_env_init_meta(env, &meta); + if (i != MDB_SUCCESS) { + return i; + } + } + + env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; + env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) + - sizeof(indx_t); +#if !(MDB_MAXKEYSIZE) + env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); +#endif + env->me_maxpg = env->me_mapsize / env->me_psize; + + if (env->me_txns) + env->me_txns->mti_txnid = meta.mm_txnid; + +#if MDB_DEBUG + { + MDB_meta *meta = mdb_env_pick_meta(env); + MDB_db *db = &meta->mm_dbs[MAIN_DBI]; + + DPRINTF(("opened database version %u, pagesize %u", + meta->mm_version, env->me_psize)); + DPRINTF(("using meta page %d", (int) (meta->mm_txnid & 1))); + DPRINTF(("depth: %u", db->md_depth)); + DPRINTF(("entries: %"Z"u", db->md_entries)); + DPRINTF(("branch pages: %"Z"u", db->md_branch_pages)); + DPRINTF(("leaf pages: %"Z"u", db->md_leaf_pages)); + DPRINTF(("overflow pages: %"Z"u", db->md_overflow_pages)); + DPRINTF(("root: %"Z"u", db->md_root)); + } +#endif + + return MDB_SUCCESS; +} + + +/** Release a reader thread's slot in the reader lock table. + * This function is called automatically when a thread exits. + * @param[in] ptr This points to the slot in the reader lock table. + */ +static void +mdb_env_reader_dest(void *ptr) +{ + MDB_reader *reader = ptr; + +#ifndef _WIN32 + if (reader->mr_pid == getpid()) /* catch pthread_exit() in child process */ +#endif + /* We omit the mutex, so do this atomically (i.e. 
skip mr_txnid) */
+		reader->mr_pid = 0;
+}
+
+#ifdef _WIN32
+/** Junk for arranging thread-specific callbacks on Windows. This is
+ *	necessarily platform and compiler-specific. Windows supports up
+ *	to 1088 keys. Let's assume nobody opens more than 64 environments
+ *	in a single process, for now. They can override this if needed.
+ */
+#ifndef MAX_TLS_KEYS
+#define MAX_TLS_KEYS	64
+#endif
+static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS];
+static int mdb_tls_nkeys;
+
+/* NOTE(review): text is missing from this copy between the loop header
+ * `for (i=0; i` below and the `me_lfd, 0, 0, 1, 0, &ov)` arguments that
+ * follow it: the rest of this callback and the start of the next routine
+ * (presumably mdb_env_share_locks(), which mdb_txn_commit() calls) are
+ * gone. Restore the missing lines from the pristine source before
+ * building; the surviving fragments are left untouched here.
+ */
+static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr)
+{
+	int i;
+	switch(reason) {
+	case DLL_PROCESS_ATTACH: break;
+	case DLL_THREAD_ATTACH: break;
+	case DLL_THREAD_DETACH:
+		for (i=0; ime_lfd, 0, 0, 1, 0, &ov)) {
+			rc = ErrCode();
+		} else {
+			UnlockFile(env->me_lfd, 0, 0, 1, 0);
+			*excl = 0;
+		}
+	}
+#else
+	{
+		struct flock lock_info;
+		/* The shared lock replaces the existing lock */
+		memset((void *)&lock_info, 0, sizeof(lock_info));
+		lock_info.l_type = F_RDLCK;
+		lock_info.l_whence = SEEK_SET;
+		lock_info.l_start = 0;
+		lock_info.l_len = 1;
+		/* Retry the non-blocking F_SETLK request while it fails with EINTR */
+		while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
+				(rc = ErrCode()) == EINTR) ;
+		*excl = rc ? -1 : 0;	/* error may mean we lost the lock */
+	}
+#endif
+
+	return rc;
+}
+
+/** Try to get exclusive lock, otherwise shared.
+ *	Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive.
+ */
+static int ESECT
+mdb_env_excl_lock(MDB_env *env, int *excl)
+{
+	int rc = 0;
+#ifdef _WIN32
+	if (LockFile(env->me_lfd, 0, 0, 1, 0)) {
+		*excl = 1;
+	} else {
+		OVERLAPPED ov;
+		memset(&ov, 0, sizeof(ov));
+		if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
+			*excl = 0;
+		} else {
+			rc = ErrCode();
+		}
+	}
+#else
+	struct flock lock_info;
+	memset((void *)&lock_info, 0, sizeof(lock_info));
+	/* First a non-blocking (F_SETLK) request for a write lock on
+	 * byte 0 of the lock file, retried while it fails with EINTR.
+	 */
+	lock_info.l_type = F_WRLCK;
+	lock_info.l_whence = SEEK_SET;
+	lock_info.l_start = 0;
+	lock_info.l_len = 1;
+	while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
+			(rc = ErrCode()) == EINTR) ;
+	if (!rc) {
+		*excl = 1;
+	} else
+# ifndef MDB_USE_POSIX_MUTEX
+	if (*excl < 0) /* always true when MDB_USE_POSIX_MUTEX */
+# endif
+	{
+		/* Fall back to a blocking (F_SETLKW) request for a read lock.
+		 * If this branch is skipped, rc keeps the error code from the
+		 * failed write-lock attempt and *excl is left unchanged.
+		 */
+		lock_info.l_type = F_RDLCK;
+		while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) &&
+				(rc = ErrCode()) == EINTR) ;
+		if (rc == 0)
+			*excl = 0;
+	}
+#endif
+	return rc;
+}
+
+#ifdef MDB_USE_HASH
+/*
+ * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code
+ *
+ * @(#) $Revision: 5.1 $
+ * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $
+ * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $
+ *
+ *	  http://www.isthe.com/chongo/tech/comp/fnv/index.html
+ *
+ ***
+ *
+ * Please do not copyright this code.  This code is in the public domain.
+ *
+ * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO
+ * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ *
+ * By:
+ *	chongo /\oo/\
+ *	  http://www.isthe.com/chongo/
+ *
+ * Share and Enjoy!	:-)
+ */
+
+typedef unsigned long long	mdb_hash_t;
+#define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL)
+
+/** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer
+ * @param[in] val	value to hash
+ * @param[in] hval	initial value for hash
+ * @return 64 bit hash
+ *
+ * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the
+ * 	 hval arg on the first call.
+ */
+static mdb_hash_t
+mdb_hash_val(MDB_val *val, mdb_hash_t hval)
+{
+	unsigned char *s = (unsigned char *)val->mv_data;	/* unsigned string */
+	unsigned char *end = s + val->mv_size;
+	/*
+	 * FNV-1a hash each octet of the string
+	 */
+	while (s < end) {
+		/* xor the bottom with the current octet */
+		hval ^= (mdb_hash_t)*s++;
+
+		/* multiply by the 64 bit FNV magic prime mod 2^64 */
+		hval += (hval << 1) + (hval << 4) + (hval << 5) +
+			(hval << 7) + (hval << 8) + (hval << 40);
+	}
+	/* return our new hash value */
+	return hval;
+}
+
+/** Hash the string and output the encoded hash.
+ * This uses modified RFC1924 Ascii85 encoding to accommodate systems with
+ * very short name limits. We don't care about the encoding being reversible,
+ * we just want to preserve as many bits of the input as possible in a
+ * small printable string.
+ * @param[in] str string to hash
+ * @param[out] encbuf an array of 11 chars to hold the hash
+ */
+static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~";
+
+/** Write exactly 5 characters of #mdb_a85 to \b out: the base-85 digits
+ * of \b l, least significant digit first. No terminating NUL is written.
+ */
+static void ESECT
+mdb_pack85(unsigned long l, char *out)
+{
+	int i;
+
+	for (i=0; i<5; i++) {
+		*out++ = mdb_a85[l % 85];
+		l /= 85;
+	}
+}
+
+/** Hash \b val with #mdb_hash_val() and store it in \b encbuf as two
+ * 5-character groups (from h and from h>>32) followed by a NUL.
+ * Each group is produced from the value as converted to the
+ * unsigned long parameter of #mdb_pack85().
+ */
+static void ESECT
+mdb_hash_enc(MDB_val *val, char *encbuf)
+{
+	mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT);
+
+	mdb_pack85(h, encbuf);
+	mdb_pack85(h>>32, encbuf+5);
+	encbuf[10] = '\0';
+}
+#endif
+
+/** Open and/or initialize the lock region for the environment.
+ * @param[in] env The LMDB environment.
+ * @param[in] fname Filename + scratch area, from #mdb_fname_init().
+ * @param[in] mode The Unix permissions for the file, if we create it.
+ * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive
+ * @return 0 on success, non-zero on failure.
+ */
+static int ESECT
+mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl)
+{
+#ifdef _WIN32
+#	define MDB_ERRCODE_ROFS	ERROR_WRITE_PROTECT
+#else
+#	define MDB_ERRCODE_ROFS	EROFS
+#endif
+	int rc;
+	off_t size, rsize;
+
+	rc = mdb_fopen(env, fname, MDB_O_LOCKS, mode, &env->me_lfd);
+	if (rc) {
+		/* Omit lockfile if read-only env on read-only filesystem */
+		if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) {
+			return MDB_SUCCESS;
+		}
+		goto fail;
+	}
+
+	if (!(env->me_flags & MDB_NOTLS)) {
+		/* mdb_env_reader_dest is registered as the key's destructor */
+		rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
+		if (rc)
+			goto fail;
+		env->me_flags |= MDB_ENV_TXKEY;
+#ifdef _WIN32
+		/* Windows TLS callbacks need help finding their TLS info. */
+		if (mdb_tls_nkeys >= MAX_TLS_KEYS) {
+			rc = MDB_TLS_FULL;
+			goto fail;
+		}
+		mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey;
+#endif
+	}
+
+	/* Try to get exclusive lock. If we succeed, then
+	 * nobody is using the lock region and we should initialize it.
+	 */
+	if ((rc = mdb_env_excl_lock(env, excl))) goto fail;
+
+#ifdef _WIN32
+	size = GetFileSize(env->me_lfd, NULL);
+#else
+	size = lseek(env->me_lfd, 0, SEEK_END);
+	if (size == -1) goto fail_errno;
+#endif
+	/* Bytes needed for me_maxreaders reader slots.
+	 * NOTE(review): the -1 presumably accounts for a reader slot that is
+	 * already part of MDB_txninfo - confirm against the struct definition.
+	 */
+	rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
+	if (size < rsize && *excl > 0) {
+		/* We hold the exclusive lock and the file is too small: grow it */
+#ifdef _WIN32
+		if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize
+			|| !SetEndOfFile(env->me_lfd))
+			goto fail_errno;
+#else
+		if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno;
+#endif
+	} else {
+		/* Otherwise use the file as it is and derive the reader count
+		 * from its current size.
+		 */
+		rsize = size;
+		size = rsize - sizeof(MDB_txninfo);
+		env->me_maxreaders = size/sizeof(MDB_reader) + 1;
+	}
+	{
+		/* Map the whole lock file read/write and shared */
+#ifdef _WIN32
+		HANDLE mh;
+		mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE,
+			0, 0, NULL);
+		if (!mh) goto fail_errno;
+		env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL);
+		CloseHandle(mh);
+		if (!env->me_txns) goto fail_errno;
+#else
+		void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED,
+			env->me_lfd, 0);
+		if (m == MAP_FAILED) goto fail_errno;
+		env->me_txns = m;
+#endif
+	}
+	if (*excl > 0) {
+		/* Exclusive owner: create the mutexes and initialize the header */
+#ifdef _WIN32
+		BY_HANDLE_FILE_INFORMATION stbuf;
+		struct {
+			DWORD volume;
+			DWORD nhigh;
+			DWORD nlow;
+		} idbuf;
+		MDB_val val;
+		char encbuf[11];
+
+		if (!mdb_sec_inited) {
+			InitializeSecurityDescriptor(&mdb_null_sd,
+				SECURITY_DESCRIPTOR_REVISION);
+			SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE);
+			mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES);
+			mdb_all_sa.bInheritHandle = FALSE;
+			mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd;
+			mdb_sec_inited = 1;
+		}
+		/* Mutex names are derived from a hash of the lock file's
+		 * volume serial number and file index.
+		 */
+		if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno;
+		idbuf.volume = stbuf.dwVolumeSerialNumber;
+		idbuf.nhigh  = stbuf.nFileIndexHigh;
+		idbuf.nlow   = stbuf.nFileIndexLow;
+		val.mv_data = &idbuf;
+		val.mv_size = sizeof(idbuf);
+		mdb_hash_enc(&val, encbuf);
+		sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf);
+		sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf);
+		env->me_rmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_rmname);
+		if (!env->me_rmutex) goto fail_errno;
+		env->me_wmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_wmname);
+		if (!env->me_wmutex) goto fail_errno;
+#elif defined(MDB_USE_POSIX_SEM)
+		struct stat stbuf;
+		struct {
+			dev_t dev;
+			ino_t ino;
+		} idbuf;
+		MDB_val val;
+		char encbuf[11];
+
+#if defined(__NetBSD__)
+#define	MDB_SHORT_SEMNAMES	1	/* limited to 14 chars */
+#endif
+		/* Semaphore names are derived from a hash of the lock file's
+		 * device and inode numbers.
+		 */
+		if (fstat(env->me_lfd, &stbuf)) goto fail_errno;
+		idbuf.dev = stbuf.st_dev;
+		idbuf.ino = stbuf.st_ino;
+		val.mv_data = &idbuf;
+		val.mv_size = sizeof(idbuf);
+		mdb_hash_enc(&val, encbuf);
+#ifdef MDB_SHORT_SEMNAMES
+		encbuf[9] = '\0';	/* drop name from 15 chars to 14 chars */
+#endif
+		sprintf(env->me_txns->mti_rmname, "/MDBr%s", encbuf);
+		sprintf(env->me_txns->mti_wmname, "/MDBw%s", encbuf);
+		/* Clean up after a previous run, if needed:  Try to
+		 * remove both semaphores before doing anything else.
+		 */
+		sem_unlink(env->me_txns->mti_rmname);
+		sem_unlink(env->me_txns->mti_wmname);
+		env->me_rmutex = sem_open(env->me_txns->mti_rmname,
+			O_CREAT|O_EXCL, mode, 1);
+		if (env->me_rmutex == SEM_FAILED) goto fail_errno;
+		env->me_wmutex = sem_open(env->me_txns->mti_wmname,
+			O_CREAT|O_EXCL, mode, 1);
+		if (env->me_wmutex == SEM_FAILED) goto fail_errno;
+#else	/* MDB_USE_POSIX_MUTEX: */
+		pthread_mutexattr_t mattr;
+
+		/* Solaris needs this before initing a robust mutex.  Otherwise
+		 * it may skip the init and return EBUSY "seems someone already
+		 * inited" or EINVAL "it was inited differently".
+		 */
+		memset(env->me_txns->mti_rmutex, 0, sizeof(*env->me_txns->mti_rmutex));
+		memset(env->me_txns->mti_wmutex, 0, sizeof(*env->me_txns->mti_wmutex));
+
+		if ((rc = pthread_mutexattr_init(&mattr)))
+			goto fail;
+
+		/* Each step runs only if all previous ones succeeded; the
+		 * attribute object is destroyed in either case.
+		 */
+		rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
+#ifdef MDB_ROBUST_SUPPORTED
+		if (!rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST);
+#endif
+		if (!rc) rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr);
+		if (!rc) rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr);
+		pthread_mutexattr_destroy(&mattr);
+		if (rc)
+			goto fail;
+#endif	/* _WIN32 || MDB_USE_POSIX_SEM */
+
+		env->me_txns->mti_magic = MDB_MAGIC;
+		env->me_txns->mti_format = MDB_LOCK_FORMAT;
+		env->me_txns->mti_txnid = 0;
+		env->me_txns->mti_numreaders = 0;
+
+	} else {
+		/* Not the exclusive owner: validate the existing lock region
+		 * and open the mutexes named in it.
+		 */
+		if (env->me_txns->mti_magic != MDB_MAGIC) {
+			DPUTS("lock region has invalid magic");
+			rc = MDB_INVALID;
+			goto fail;
+		}
+		if (env->me_txns->mti_format != MDB_LOCK_FORMAT) {
+			DPRINTF(("lock region has format+version 0x%x, expected 0x%x",
+				env->me_txns->mti_format, MDB_LOCK_FORMAT));
+			rc = MDB_VERSION_MISMATCH;
+			goto fail;
+		}
+		/* NOTE(review): no call in this branch precedes this ErrCode()
+		 * read; presumably it inspects the error left by the failed
+		 * exclusive-lock attempt above - confirm.
+		 */
+		rc = ErrCode();
+		if (rc && rc != EACCES && rc != EAGAIN) {
+			goto fail;
+		}
+#ifdef _WIN32
+		env->me_rmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname);
+		if (!env->me_rmutex) goto fail_errno;
+		env->me_wmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname);
+		if (!env->me_wmutex) goto fail_errno;
+#elif defined(MDB_USE_POSIX_SEM)
+		env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0);
+		if (env->me_rmutex == SEM_FAILED) goto fail_errno;
+		env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0);
+		if (env->me_wmutex == SEM_FAILED) goto fail_errno;
+#endif
+	}
+	return MDB_SUCCESS;
+
+fail_errno:
+	rc = ErrCode();
+fail:
+	return rc;
+}
+
+	/** Only a subset of the @ref mdb_env flags can be changed
+	 *	at runtime. Changing other flags requires closing the
+	 *	environment and re-opening it with the new flags.
+ */ +#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT) +#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \ + MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD|MDB_PREVSNAPSHOT) + +#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) +# error "Persistent DB flags & env flags overlap, but both go in mm_flags" +#endif + +int ESECT +mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) +{ + int rc, excl = -1; + MDB_name fname; + + if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) + return EINVAL; + + flags |= env->me_flags; + + rc = mdb_fname_init(path, flags, &fname); + if (rc) + return rc; + + if (flags & MDB_RDONLY) { + /* silently ignore WRITEMAP when we're only getting read access */ + flags &= ~MDB_WRITEMAP; + } else { + if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) && + (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) + rc = ENOMEM; + } + env->me_flags = flags |= MDB_ENV_ACTIVE; + if (rc) + goto leave; + + env->me_path = strdup(path); + env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); + env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); + env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned int)); + if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { + rc = ENOMEM; + goto leave; + } + env->me_dbxs[FREE_DBI].md_cmp = mdb_cmp_long; /* aligned MDB_INTEGERKEY */ + + /* For RDONLY, get lockfile after we know datafile exists */ + if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { + rc = mdb_env_setup_locks(env, &fname, mode, &excl); + if (rc) + goto leave; + if ((flags & MDB_PREVSNAPSHOT) && !excl) { + rc = EAGAIN; + goto leave; + } + } + + rc = mdb_fopen(env, &fname, + (flags & MDB_RDONLY) ? 
MDB_O_RDONLY : MDB_O_RDWR, + mode, &env->me_fd); + if (rc) + goto leave; + + if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { + rc = mdb_env_setup_locks(env, &fname, mode, &excl); + if (rc) + goto leave; + } + + if ((rc = mdb_env_open2(env, flags & MDB_PREVSNAPSHOT)) == MDB_SUCCESS) { + if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) { + /* Synchronous fd for meta writes. Needed even with + * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. + */ + rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd); + if (rc) + goto leave; + } + DPRINTF(("opened dbenv %p", (void *) env)); + if (excl > 0 && !(flags & MDB_PREVSNAPSHOT)) { + rc = mdb_env_share_locks(env, &excl); + if (rc) + goto leave; + } + if (!(flags & MDB_RDONLY)) { + MDB_txn *txn; + int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs * + (sizeof(MDB_db)+sizeof(MDB_cursor *)+sizeof(unsigned int)+1); + if ((env->me_pbuf = calloc(1, env->me_psize)) && + (txn = calloc(1, size))) + { + txn->mt_dbs = (MDB_db *)((char *)txn + tsize); + txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs); + txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); + txn->mt_env = env; + txn->mt_dbxs = env->me_dbxs; + txn->mt_flags = MDB_TXN_FINISHED; + env->me_txn0 = txn; + } else { + rc = ENOMEM; + } + } + } + +leave: + if (rc) { + mdb_env_close0(env, excl); + } + mdb_fname_destroy(fname); + return rc; +} + +/** Destroy resources from mdb_env_open(), clear our readers & DBIs */ +static void ESECT +mdb_env_close0(MDB_env *env, int excl) +{ + int i; + + if (!(env->me_flags & MDB_ENV_ACTIVE)) + return; + + /* Doing this here since me_dbxs may not exist during mdb_env_close */ + if (env->me_dbxs) { + for (i = env->me_maxdbs; --i >= CORE_DBS; ) + free(env->me_dbxs[i].md_name.mv_data); + free(env->me_dbxs); + } + + free(env->me_pbuf); + free(env->me_dbiseqs); + free(env->me_dbflags); + free(env->me_path); + 
free(env->me_dirty_list); + free(env->me_txn0); + mdb_midl_free(env->me_free_pgs); + + if (env->me_flags & MDB_ENV_TXKEY) { + pthread_key_delete(env->me_txkey); +#ifdef _WIN32 + /* Delete our key from the global list */ + for (i=0; ime_txkey) { + mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1]; + mdb_tls_nkeys--; + break; + } +#endif + } + + if (env->me_map) { + munmap(env->me_map, env->me_mapsize); + } + if (env->me_mfd != INVALID_HANDLE_VALUE) + (void) close(env->me_mfd); + if (env->me_fd != INVALID_HANDLE_VALUE) + (void) close(env->me_fd); + if (env->me_txns) { + MDB_PID_T pid = getpid(); + /* Clearing readers is done in this function because + * me_txkey with its destructor must be disabled first. + * + * We skip the the reader mutex, so we touch only + * data owned by this process (me_close_readers and + * our readers), and clear each reader atomically. + */ + for (i = env->me_close_readers; --i >= 0; ) + if (env->me_txns->mti_readers[i].mr_pid == pid) + env->me_txns->mti_readers[i].mr_pid = 0; +#ifdef _WIN32 + if (env->me_rmutex) { + CloseHandle(env->me_rmutex); + if (env->me_wmutex) CloseHandle(env->me_wmutex); + } + /* Windows automatically destroys the mutexes when + * the last handle closes. + */ +#elif defined(MDB_USE_POSIX_SEM) + if (env->me_rmutex != SEM_FAILED) { + sem_close(env->me_rmutex); + if (env->me_wmutex != SEM_FAILED) + sem_close(env->me_wmutex); + /* If we have the filelock: If we are the + * only remaining user, clean up semaphores. + */ + if (excl == 0) + mdb_env_excl_lock(env, &excl); + if (excl > 0) { + sem_unlink(env->me_txns->mti_rmname); + sem_unlink(env->me_txns->mti_wmname); + } + } +#endif + //munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); + } + if (env->me_lfd != INVALID_HANDLE_VALUE) { +#ifdef _WIN32 + if (excl >= 0) { + /* Unlock the lockfile. Windows would have unlocked it + * after closing anyway, but not necessarily at once. 
+ */ + UnlockFile(env->me_lfd, 0, 0, 1, 0); + } +#endif + (void) close(env->me_lfd); + } + + env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); +} + +void ESECT +mdb_env_close(MDB_env *env) +{ + MDB_page *dp; + + if (env == NULL) + return; + + VGMEMP_DESTROY(env); + /*while ((dp = env->me_dpages) != NULL) { + VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); + env->me_dpages = dp->mp_next; + free(dp); + }*/ + + mdb_env_close0(env, 0); + free(env); +} + +/** Compare two items pointing at aligned size_t's */ +static int +mdb_cmp_long(const MDB_val *a, const MDB_val *b) +{ + return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 : + *(size_t *)a->mv_data > *(size_t *)b->mv_data; +} + +/** Compare two items pointing at aligned unsigned int's. + * + * This is also set as #MDB_INTEGERDUP|#MDB_DUPFIXED's #MDB_dbx.%md_dcmp, + * but #mdb_cmp_clong() is called instead if the data type is size_t. + */ +static int +mdb_cmp_int(const MDB_val *a, const MDB_val *b) +{ + return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 : + *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data; +} + +/** Compare two items pointing at unsigned ints of unknown alignment. + * Nodes and keys are guaranteed to be 2-byte aligned. 
+ */ +static int +mdb_cmp_cint(const MDB_val *a, const MDB_val *b) +{ +#if BYTE_ORDER == LITTLE_ENDIAN + unsigned short *u, *c; + int x; + + u = (unsigned short *) ((char *) a->mv_data + a->mv_size); + c = (unsigned short *) ((char *) b->mv_data + a->mv_size); + do { + x = *--u - *--c; + } while(!x && u > (unsigned short *)a->mv_data); + return x; +#else + unsigned short *u, *c, *end; + int x; + + end = (unsigned short *) ((char *) a->mv_data + a->mv_size); + u = (unsigned short *)a->mv_data; + c = (unsigned short *)b->mv_data; + do { + x = *u++ - *c++; + } while(!x && u < end); + return x; +#endif +} + +/** Compare two items lexically */ +static int +mdb_cmp_memn(const MDB_val *a, const MDB_val *b) +{ + int diff; + ssize_t len_diff; + unsigned int len; + + len = a->mv_size; + len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; + if (len_diff > 0) { + len = b->mv_size; + len_diff = 1; + } + + diff = memcmp(a->mv_data, b->mv_data, len); + return diff ? diff : len_diff<0 ? -1 : len_diff; +} + +/** Compare two items in reverse byte order */ +static int +mdb_cmp_memnr(const MDB_val *a, const MDB_val *b) +{ + const unsigned char *p1, *p2, *p1_lim; + ssize_t len_diff; + int diff; + + p1_lim = (const unsigned char *)a->mv_data; + p1 = (const unsigned char *)a->mv_data + a->mv_size; + p2 = (const unsigned char *)b->mv_data + b->mv_size; + + len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; + if (len_diff > 0) { + p1_lim += len_diff; + len_diff = 1; + } + + while (p1 > p1_lim) { + diff = *--p1 - *--p2; + if (diff) + return diff; + } + return len_diff<0 ? -1 : len_diff; +} + +/** Search for key within a page, using binary search. + * Returns the smallest entry larger or equal to the key. + * If exactp is non-null, stores whether the found entry was an exact match + * in *exactp (1 or 0). + * Updates the cursor index with the index of the found entry. + * If no entry larger or equal to the key is found, returns NULL. 
+ */
+static MDB_node *
+mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp)
+{
+	unsigned int	 i = 0, nkeys;
+	int		 low, high;
+	int		 rc = 0;
+	MDB_page *mp = mc->mc_pg[mc->mc_top];
+	MDB_node	*node = NULL;
+	MDB_val	 nodekey;
+	MDB_cmp_func *cmp;
+	DKBUF;
+
+	nkeys = NUMKEYS(mp);
+
+	DPRINTF(("searching %u keys in %s %spage %"Z"u",
+	    nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "",
+	    mdb_dbg_pgno(mp)));
+
+	/* On non-leaf pages the search range starts at index 1, so the
+	 * key at index 0 is never compared.
+	 */
+	low = IS_LEAF(mp) ? 0 : 1;
+	high = nkeys - 1;
+	cmp = mc->mc_dbx->md_cmp;
+
+	/* Branch pages have no data, so if using integer keys,
+	 * alignment is guaranteed. Use faster mdb_cmp_int.
+	 */
+	if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) {
+		/* the key size of node 1 selects the size_t or int comparator */
+		if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t))
+			cmp = mdb_cmp_long;
+		else
+			cmp = mdb_cmp_int;
+	}
+
+	if (IS_LEAF2(mp)) {
+		/* LEAF2 page: every key has size md_pad and is located with
+		 * LEAF2KEY(); node is only a placeholder pointer here.
+		 */
+		nodekey.mv_size = mc->mc_db->md_pad;
+		node = NODEPTR(mp, 0);	/* fake */
+		while (low <= high) {
+			i = (low + high) >> 1;
+			nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size);
+			rc = cmp(key, &nodekey);
+			DPRINTF(("found leaf index %u [%s], rc = %i",
+			    i, DKEY(&nodekey), rc));
+			if (rc == 0)
+				break;
+			if (rc > 0)
+				low = i + 1;
+			else
+				high = i - 1;
+		}
+	} else {
+		while (low <= high) {
+			i = (low + high) >> 1;
+
+			node = NODEPTR(mp, i);
+			nodekey.mv_size = NODEKSZ(node);
+			nodekey.mv_data = NODEKEY(node);
+
+			rc = cmp(key, &nodekey);
+#if MDB_DEBUG
+			if (IS_LEAF(mp))
+				DPRINTF(("found leaf index %u [%s], rc = %i",
+				    i, DKEY(&nodekey), rc));
+			else
+				DPRINTF(("found branch index %u [%s -> %"Z"u], rc = %i",
+				    i, DKEY(&nodekey), NODEPGNO(node), rc));
+#endif
+			if (rc == 0)
+				break;
+			if (rc > 0)
+				low = i + 1;
+			else
+				high = i - 1;
+		}
+	}
+
+	/* rc still holds the result of the last comparison made (0 if the
+	 * loop never ran), i the index that was compared.
+	 */
+	if (rc > 0) {	/* Found entry is less than the key. */
+		i++;	/* Skip to get the smallest entry larger than key. */
+		if (!IS_LEAF2(mp))
+			node = NODEPTR(mp, i);
+	}
+	if (exactp)
+		*exactp = (rc == 0 && nkeys > 0);
+	/* store the key index */
+	mc->mc_ki[mc->mc_top] = i;
+	if (i >= nkeys)
+		/* There is no entry larger or equal to the key. */
+		return NULL;
+
+	/* nodeptr is fake for LEAF2 */
+	return node;
+}
+
+/* Disabled code: excluded from compilation by the #if 0 below. */
+#if 0
+static void
+mdb_cursor_adjust(MDB_cursor *mc, func)
+{
+	MDB_cursor *m2;
+
+	for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
+		if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) {
+			func(mc, m2);
+		}
+	}
+}
+#endif
+
+/** Pop a page off the top of the cursor's stack.
+ * Does nothing on an empty stack. Popping the last page clears
+ * #C_INITIALIZED instead of decrementing mc_top.
+ */
+static void
+mdb_cursor_pop(MDB_cursor *mc)
+{
+	if (mc->mc_snum) {
+		DPRINTF(("popping page %"Z"u off db %d cursor %p",
+			mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc));
+
+		mc->mc_snum--;
+		if (mc->mc_snum) {
+			mc->mc_top--;
+		} else {
+			mc->mc_flags &= ~C_INITIALIZED;
+		}
+	}
+}
+
+/** Push a page onto the top of the cursor's stack.
+ * Set #MDB_TXN_ERROR on failure.
+ * @param[in] mc the cursor whose stack is extended.
+ * @param[in] mp the page to push; its key index starts at 0.
+ * @return #MDB_SUCCESS, or #MDB_CURSOR_FULL if the stack already holds
+ * CURSOR_STACK pages.
+ */
+static int
+mdb_cursor_push(MDB_cursor *mc, MDB_page *mp)
+{
+	DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno,
+		DDBI(mc), (void *) mc));
+
+	if (mc->mc_snum >= CURSOR_STACK) {
+		mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
+		return MDB_CURSOR_FULL;
+	}
+
+	/* the new top slot is the old stack size */
+	mc->mc_top = mc->mc_snum++;
+	mc->mc_pg[mc->mc_top] = mp;
+	mc->mc_ki[mc->mc_top] = 0;
+
+	return MDB_SUCCESS;
+}
+
+/** Find the address of the page corresponding to a given page number.
+ * Set #MDB_TXN_ERROR on failure.
+ * @param[in] mc the cursor accessing the page.
+ * @param[in] pgno the page number for the page to retrieve.
+ * @param[out] ret address of a pointer where the page's address will be stored.
+ * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page.
+ * @return 0 on success, non-zero on failure.
+ */
+static int
+mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl)
+{
+	MDB_txn *txn = mc->mc_txn;
+	MDB_env *env = txn->mt_env;
+	MDB_page *p = NULL;
+	int level;
+
+	/* Only txns that are neither read-only nor WRITEMAP are checked for
+	 * a private (dirty or spilled) version of the page.
+	 */
+	if (! (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_WRITEMAP))) {
+		MDB_txn *tx2 = txn;
+		/* level counts the txns visited: 1 for this txn, +1 per parent */
+		level = 1;
+		do {
+			MDB_ID2L dl = tx2->mt_u.dirty_list;
+			const MDB_ID* x;
+			/* Spilled pages were dirtied in this txn and flushed
+			 * because the dirty list got full. Bring this page
+			 * back in from the map (but don't unspill it here,
+			 * leave that unless page_touch happens again).
+			 */
+			if (tx2->mt_spill_pgs) {
+				/* the spill list is searched for pgno << 1 */
+				MDB_ID pn = pgno << 1;
+				x = cppmidl_search(tx2->mt_spill_pgs, pn);
+				if(x!=NULL && *x==pn) {
+					p = (MDB_page *)(env->me_map + env->me_psize * pgno);
+					goto done;
+				}
+			}
+			if (dl[0].mid) {
+				/* this x is an index and shadows the pointer x above */
+				unsigned x = mdb_mid2l_search(dl, pgno);
+				if (x <= dl[0].mid && dl[x].mid == pgno) {
+					p = dl[x].mptr;
+					goto done;
+				}
+			}
+			level++;
+		} while ((tx2 = tx2->mt_parent) != NULL);
+	}
+
+	/* No private copy found: use the page in the map if it exists */
+	if (pgno < txn->mt_next_pgno) {
+		level = 0;
+		p = (MDB_page *)(env->me_map + env->me_psize * pgno);
+	} else {
+		DPRINTF(("page %"Z"u not found", pgno));
+		txn->mt_flags |= MDB_TXN_ERROR;
+		return MDB_PAGE_NOTFOUND;
+	}
+
+done:
+	*ret = p;
+	if (lvl)
+		*lvl = level;
+	return MDB_SUCCESS;
+}
+
+/** Finish #mdb_page_search() / #mdb_page_search_lowest().
+ *	The cursor is at the root page, set up the rest of it.
+ * @param[in,out] mc the cursor; pages are pushed until a leaf is on top.
+ * @param[in] key the key to descend towards; not used when
+ * #MDB_PS_FIRST or #MDB_PS_LAST is set.
+ * @param[in] flags MDB_PS_* flags; with #MDB_PS_MODIFY every page that
+ * is reached gets touched.
+ * @return 0 on success, #MDB_CORRUPTED if the descent does not end on a
+ * leaf page, or the error of mdb_page_get(), mdb_cursor_push() or
+ * mdb_page_touch().
+ */
+static int
+mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags)
+{
+	MDB_page	*mp = mc->mc_pg[mc->mc_top];
+	int rc;
+	DKBUF;
+
+	while (IS_BRANCH(mp)) {
+		MDB_node	*node;
+		indx_t		i;
+
+		DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp)));
+		/* Don't assert on branch pages in the FreeDB. We can get here
+		 * while in the process of rebalancing a FreeDB branch page; we must
+		 * let that proceed. ITS#8336
+		 */
+		mdb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1);
+		DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0))));
+
+		if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) {
+			/* leftmost child for FIRST, rightmost child for LAST */
+			i = 0;
+			if (flags & MDB_PS_LAST) {
+				i = NUMKEYS(mp) - 1;
+				/* if already init'd, see if we're already in right place */
+				if (mc->mc_flags & C_INITIALIZED) {
+					if (mc->mc_ki[mc->mc_top] == i) {
+						/* reuse the page already stored in the next slot */
+						mc->mc_top = mc->mc_snum++;
+						mp = mc->mc_pg[mc->mc_top];
+						goto ready;
+					}
+				}
+			}
+		} else {
+			int	 exact;
+			node = mdb_node_search(mc, key, &exact);
+			if (node == NULL)
+				/* no entry >= key: follow the last child */
+				i = NUMKEYS(mp) - 1;
+			else {
+				i = mc->mc_ki[mc->mc_top];
+				if (!exact) {
+					/* inexact match: follow the child before it */
+					mdb_cassert(mc, i > 0);
+					i--;
+				}
+			}
+			DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
+		}
+
+		mdb_cassert(mc, i < NUMKEYS(mp));
+		node = NODEPTR(mp, i);
+
+		if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)
+			return rc;
+
+		/* record the chosen child index on the parent, then descend */
+		mc->mc_ki[mc->mc_top] = i;
+		if ((rc = mdb_cursor_push(mc, mp)))
+			return rc;
+
+ready:
+		if (flags & MDB_PS_MODIFY) {
+			if ((rc = mdb_page_touch(mc)) != 0)
+				return rc;
+			/* re-read the top page after touching it */
+			mp = mc->mc_pg[mc->mc_top];
+		}
+	}
+
+	if (!IS_LEAF(mp)) {
+		DPRINTF(("internal error, index points to a %02X page!?",
+		    mp->mp_flags));
+		mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
+		return MDB_CORRUPTED;
+	}
+
+	DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno,
+	    key ? DKEY(key) : "null"));
+	mc->mc_flags |= C_INITIALIZED;
+	mc->mc_flags &= ~C_EOF;
+
+	return MDB_SUCCESS;
+}
+
+/** Search for the lowest key under the current branch page.
+ * This just bypasses a NUMKEYS check in the current page
+ * before calling mdb_page_search_root(), because the callers
+ * are all in situations where the current page is known to
+ * be underfilled.
+ */
+static int
+mdb_page_search_lowest(MDB_cursor *mc)
+{
+	MDB_page	*mp = mc->mc_pg[mc->mc_top];
+	MDB_node	*node = NODEPTR(mp, 0);
+	int rc;
+
+	/* descend into the child referenced by node 0 of the current page */
+	if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)
+		return rc;
+
+	mc->mc_ki[mc->mc_top] = 0;
+	if ((rc = mdb_cursor_push(mc, mp)))
+		return rc;
+	return mdb_page_search_root(mc, NULL, MDB_PS_FIRST);
+}
+
+/** Search for the page a given key should be in.
+ * Push it and its parent pages on the cursor stack.
+ * @param[in,out] mc the cursor for this operation.
+ * @param[in] key the key to search for, or NULL for first/last page.
+ * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB
+ *   are touched (updated with new page numbers).
+ *   If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf.
+ *   This is used by #mdb_cursor_first() and #mdb_cursor_last().
+ *   If MDB_PS_ROOTONLY set, just fetch root node, no further lookups.
+ * @return 0 on success, non-zero on failure.
+ */
+static int
+mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
+{
+	int		 rc;
+	pgno_t		 root;
+
+	/* Make sure the txn is still viable, then find the root from
+	 * the txn's db table and set it as the root of the cursor's stack.
+	 */
+	if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) {
+		DPUTS("transaction may not be used now");
+		return MDB_BAD_TXN;
+	} else {
+		/* Make sure we're using an up-to-date root */
+		if (*mc->mc_dbflag & DB_STALE) {
+				MDB_cursor mc2;
+				if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))
+					return MDB_BAD_DBI;
+				/* look the DB up by name in MAIN_DBI and reload its record */
+				mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
+				rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0);
+				if (rc)
+					return rc;
+				{
+					MDB_val data;
+					int exact = 0;
+					/* shadows the flags parameter inside this block */
+					uint16_t flags;
+					MDB_node *leaf = mdb_node_search(&mc2,
+						&mc->mc_dbx->md_name, &exact);
+					if (!exact)
+						return MDB_NOTFOUND;
+					if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)
+						return MDB_INCOMPATIBLE; /* not a named DB */
+					rc = mdb_node_read(&mc2, leaf, &data);
+					if (rc)
+						return rc;
+					/* copy md_flags out of the record with memcpy */
+					memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)),
+						sizeof(uint16_t));
+					/* The txn may not know this DBI, or another process may
+					 * have dropped and recreated the DB with other flags.
+					 */
+					if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags)
+						return MDB_INCOMPATIBLE;
+					memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db));
+				}
+				*mc->mc_dbflag &= ~DB_STALE;
+		}
+		root = mc->mc_db->md_root;
+
+		if (root == P_INVALID) {		/* Tree is empty. */
+			DPUTS("tree is empty");
+			return MDB_NOTFOUND;
+		}
+	}
+
+	mdb_cassert(mc, root > 1);
+	/* fetch the root page unless slot 0 already holds it */
+	if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root)
+		if ((rc = mdb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0)
+			return rc;
+
+	/* reset the stack so that it holds only the root page */
+	mc->mc_snum = 1;
+	mc->mc_top = 0;
+
+	DPRINTF(("db %d root page %"Z"u has flags 0x%X",
+		DDBI(mc), root, mc->mc_pg[0]->mp_flags));
+
+	if (flags & MDB_PS_MODIFY) {
+		if ((rc = mdb_page_touch(mc)))
+			return rc;
+	}
+
+	if (flags & MDB_PS_ROOTONLY)
+		return MDB_SUCCESS;
+
+	return mdb_page_search_root(mc, key, flags);
+}
+
+/** Free a run of overflow pages.
+ * Frees the mp_pages pages starting at page number mp_pgno and
+ * subtracts them from the DB's md_overflow_pages.
+ * @param[in] mc the cursor for this operation.
+ * @param[in] mp the first page of the overflow run.
+ * @return 0 on success, #MDB_CORRUPTED if a dirty page is missing from
+ * the dirty list, or the error of the ID list helper that failed.
+ */
+static int
+mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp)
+{
+	MDB_txn *txn = mc->mc_txn;
+	pgno_t pg = mp->mp_pgno;
+	unsigned x = 0, ovpages = mp->mp_pages;
+	MDB_env *env = txn->mt_env;
+	CPPMIDL sl = txn->mt_spill_pgs;
+	const MDB_ID* px;
+	/* the spill list is searched for the page number shifted left by one */
+	MDB_ID pn = pg << 1;
+	int rc;
+
+	DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages));
+	/* If the page is dirty or on the spill list we just acquired it,
+	 * so we should give it back to our current free list, if any.
+	 * Otherwise put it onto the list of pages we freed in this txn.
+	 *
+	 * Won't create me_pghead: me_pglast must be inited along with it.
+	 * Unsupported in nested txns: They would need to hide the page
+	 * range in ancestor txns' dirty and spilled lists.
+	 */
+	if (env->me_pghead &&
+		!txn->mt_parent &&
+		((mp->mp_flags & P_DIRTY) ||
+		 (sl && (px = cppmidl_search(sl, pn)) !=NULL && *px== pn)))
+	{
+		unsigned i, j;
+		pgno_t *mop;
+		MDB_ID2 *dl, ix, iy;
+		/* make room in me_pghead before any list is modified */
+		rc = mdb_midl_need(&env->me_pghead, ovpages);
+		if (rc)
+			return rc;
+		if (!(mp->mp_flags & P_DIRTY)) {
+			/* This page is no longer spilled */
+			cppmidl_erase(sl, pn);
+			goto release;
+		}
+		/* Remove from dirty list */
+		dl = txn->mt_u.dirty_list;
+		x = dl[0].mid--;
+		/* Walk down from the last entry, moving each entry one slot
+		 * lower, until the entry for mp has been carried out.
+		 */
+		for (ix = dl[x]; ix.mptr != mp; ix = iy) {
+			if (x > 1) {
+				x--;
+				iy = dl[x];
+				dl[x] = ix;
+			} else {
+				/* mp is not on the dirty list: put the carried entry back */
+				mdb_cassert(mc, x > 1);
+				j = ++(dl[0].mid);
+				dl[j] = ix;		/* Unsorted. OK when MDB_TXN_ERROR. */
+				txn->mt_flags |= MDB_TXN_ERROR;
+				return MDB_CORRUPTED;
+			}
+		}
+		txn->mt_dirty_room++;
+		if (!(env->me_flags & MDB_WRITEMAP))
+			mdb_dpage_free(env, mp);
+release:
+		/* Insert in me_pghead */
+		mop = env->me_pghead;
+		j = mop[0] + ovpages;
+		/* shift entries smaller than pg up by ovpages slots... */
+		for (i = mop[0]; i && mop[i] < pg; i--)
+			mop[j--] = mop[i];
+		/* ...and fill the gap with pg, pg+1, ... */
+		while (j>i)
+			mop[j--] = pg++;
+		mop[0] += ovpages;
+	} else {
+		rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages);
+		if (rc)
+			return rc;
+	}
+	mc->mc_db->md_overflow_pages -= ovpages;
+	return 0;
+}
+
+/** Return the data associated with a given node.
+ * @param[in] mc The cursor for this operation.
+ * @param[in] leaf The node being read.
+ * @param[out] data Updated to point to the node's data.
+ * @return 0 on success, non-zero on failure.
+ */
+static int
+mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data)
+{
+	MDB_page	*omp;		/* overflow page */
+	pgno_t		 pgno;
+	int rc;
+
+	/* without F_BIGDATA the data is stored in the node itself */
+	if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) {
+		data->mv_size = NODEDSZ(leaf);
+		data->mv_data = NODEDATA(leaf);
+		return MDB_SUCCESS;
+	}
+
+	/* Read overflow data.
+	 */
+	data->mv_size = NODEDSZ(leaf);
+	/* the node's data area holds the overflow page number; copy it out */
+	memcpy(&pgno, NODEDATA(leaf), sizeof(pgno));
+	if ((rc = mdb_page_get(mc, pgno, &omp, NULL)) != 0) {
+		DPRINTF(("read overflow page %"Z"u failed", pgno));
+		return rc;
+	}
+	data->mv_data = METADATA(omp);
+
+	return MDB_SUCCESS;
+}
+
+/** Look up the data stored for a key.
+ * Positions a temporary cursor on the stack with #MDB_SET.
+ * @param[in] txn the transaction to read in.
+ * @param[in] dbi the database handle.
+ * @param[in] key the key to look up.
+ * @param[out] data receives the result of mdb_cursor_set().
+ * @return EINVAL if key or data is NULL or the DBI is not valid in this
+ * txn, #MDB_BAD_TXN if the txn is blocked, otherwise the result of
+ * mdb_cursor_set().
+ */
+int
+mdb_get(MDB_txn *txn, MDB_dbi dbi,
+    MDB_val *key, MDB_val *data)
+{
+	MDB_cursor	mc;
+	MDB_xcursor	mx;
+	int exact = 0;
+	DKBUF;
+
+	DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key)));
+
+	if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
+		return EINVAL;
+
+	if (txn->mt_flags & MDB_TXN_BLOCKED)
+		return MDB_BAD_TXN;
+
+	mdb_cursor_init(&mc, txn, dbi, &mx);
+	return mdb_cursor_set(&mc, key, data, MDB_SET, &exact);
+}
+
+/** Find a sibling for a page.
+ * Replaces the page at the top of the cursor's stack with the
+ * specified sibling, if one exists.
+ * @param[in] mc The cursor for this operation.
+ * @param[in] move_right Non-zero to step to the right sibling,
+ * zero to step to the left sibling.
+ * @return 0 on success, non-zero on failure.
+ */
+static int
+mdb_cursor_sibling(MDB_cursor *mc, int move_right)
+{
+	int err;
+	int parent_exhausted;
+	MDB_node *child_ref;
+	MDB_page *sibling;
+
+	/* root has no siblings */
+	if (mc->mc_snum < 2)
+		return MDB_NOTFOUND;
+
+	/* Step up to the parent page. */
+	mdb_cursor_pop(mc);
+	DPRINTF(("parent page is page %"Z"u, index %u",
+		mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]));
+
+	/* Is there another child of the parent in the requested direction? */
+	if (move_right)
+		parent_exhausted = (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top]));
+	else
+		parent_exhausted = (mc->mc_ki[mc->mc_top] == 0);
+
+	if (!parent_exhausted) {
+		if (move_right)
+			mc->mc_ki[mc->mc_top]++;
+		else
+			mc->mc_ki[mc->mc_top]--;
+		DPRINTF(("just moving to %s index key %u",
+		    move_right ? "right" : "left", mc->mc_ki[mc->mc_top]));
+	} else {
+		DPRINTF(("no more keys left, moving to %s sibling",
+		    move_right ? "right" : "left"));
+		/* The parent itself has to move to its sibling first. */
+		err = mdb_cursor_sibling(mc, move_right);
+		if (err != MDB_SUCCESS) {
+			/* undo cursor_pop before returning */
+			mc->mc_top++;
+			mc->mc_snum++;
+			return err;
+		}
+	}
+	mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top]));
+
+	/* Load the child page the parent index now refers to. */
+	child_ref = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
+	err = mdb_page_get(mc, NODEPGNO(child_ref), &sibling, NULL);
+	if (err != 0) {
+		/* mc will be inconsistent if caller does mc_snum++ as above */
+		mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
+		return err;
+	}
+
+	mdb_cursor_push(mc, sibling);
+	/* Coming from the right, start on the sibling's last key. */
+	if (!move_right)
+		mc->mc_ki[mc->mc_top] = NUMKEYS(sibling)-1;
+
+	return MDB_SUCCESS;
+}
+
+/** Move the cursor to the next data item.
*/ +static int +mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) +{ + MDB_page *mp; + MDB_node *leaf; + int rc; + + if ((mc->mc_flags & C_DEL && op == MDB_NEXT_DUP)) + return MDB_NOTFOUND; + + if (!(mc->mc_flags & C_INITIALIZED)) + return mdb_cursor_first(mc, key, data); + + mp = mc->mc_pg[mc->mc_top]; + + if (mc->mc_flags & C_EOF) { + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)-1) + return MDB_NOTFOUND; + mc->mc_flags ^= C_EOF; + } + + if (mc->mc_db->md_flags & MDB_DUPSORT) { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_NEXT || op == MDB_NEXT_DUP) { + rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); + if (op != MDB_NEXT || rc != MDB_NOTFOUND) { + if (rc == MDB_SUCCESS) + MDB_GET_KEY(leaf, key); + return rc; + } + } + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if (op == MDB_NEXT_DUP) + return MDB_NOTFOUND; + } + } + + DPRINTF(("cursor_next: top page is %"Z"u in cursor %p", + mdb_dbg_pgno(mp), (void *) mc)); + if (mc->mc_flags & C_DEL) { + mc->mc_flags ^= C_DEL; + goto skip; + } + + if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { + DPUTS("=====> move to next sibling page"); + if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { + mc->mc_flags |= C_EOF; + return rc; + } + mp = mc->mc_pg[mc->mc_top]; + DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); + } else + mc->mc_ki[mc->mc_top]++; + +skip: + DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", + mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); + + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + mdb_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (mc->mc_xcursor==NULL) + return MDB_CORRUPTED; + mdb_xcursor_init1(mc, leaf); + } + if 
(data) { + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) + return rc; + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc != MDB_SUCCESS) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +/** Move the cursor to the previous data item. */ +static int +mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) +{ + MDB_page *mp; + MDB_node *leaf; + int rc; + + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = mdb_cursor_last(mc, key, data); + if (rc) + return rc; + mc->mc_ki[mc->mc_top]++; + } + + mp = mc->mc_pg[mc->mc_top]; + + if (mc->mc_db->md_flags & MDB_DUPSORT) { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_PREV || op == MDB_PREV_DUP) { + rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); + if (op != MDB_PREV || rc != MDB_NOTFOUND) { + if (rc == MDB_SUCCESS) { + MDB_GET_KEY(leaf, key); + mc->mc_flags &= ~C_EOF; + } + return rc; + } + } + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if (op == MDB_PREV_DUP) + return MDB_NOTFOUND; + } + } + + DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", + mdb_dbg_pgno(mp), (void *) mc)); + + mc->mc_flags &= ~(C_EOF|C_DEL); + + if (mc->mc_ki[mc->mc_top] == 0) { + DPUTS("=====> move to prev sibling page"); + if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { + return rc; + } + mp = mc->mc_pg[mc->mc_top]; + mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; + DPRINTF(("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); + } else + mc->mc_ki[mc->mc_top]--; + + DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", + mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); + + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + mdb_cassert(mc, IS_LEAF(mp)); + leaf = 
NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + } + if (data) { + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) + return rc; + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc != MDB_SUCCESS) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +/** Set the cursor on a specific data item. */ +static int +mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op, int *exactp) +{ + int rc; + MDB_page *mp; + MDB_node *leaf = NULL; + DKBUF; + + if (key->mv_size == 0) + return MDB_BAD_VALSIZE; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + + /* See if we're already on the right page */ + if (mc->mc_flags & C_INITIALIZED) { + MDB_val nodekey; + + mp = mc->mc_pg[mc->mc_top]; + if (!NUMKEYS(mp)) { + mc->mc_ki[mc->mc_top] = 0; + return MDB_NOTFOUND; + } + if (mp->mp_flags & P_LEAF2) { + nodekey.mv_size = mc->mc_db->md_pad; + nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); + } else { + leaf = NODEPTR(mp, 0); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + /* Probably happens rarely, but first node on the page + * was the one we wanted. 
+ */ + mc->mc_ki[mc->mc_top] = 0; + if (exactp) + *exactp = 1; + goto set1; + } + if (rc > 0) { + unsigned int i; + unsigned int nkeys = NUMKEYS(mp); + if (nkeys > 1) { + if (mp->mp_flags & P_LEAF2) { + nodekey.mv_data = LEAF2KEY(mp, + nkeys-1, nodekey.mv_size); + } else { + leaf = NODEPTR(mp, nkeys-1); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + /* last node was the one we wanted */ + mc->mc_ki[mc->mc_top] = nkeys-1; + if (exactp) + *exactp = 1; + goto set1; + } + if (rc < 0) { + if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { + /* This is definitely the right page, skip search_page */ + if (mp->mp_flags & P_LEAF2) { + nodekey.mv_data = LEAF2KEY(mp, + mc->mc_ki[mc->mc_top], nodekey.mv_size); + } else { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + /* current node was the one we wanted */ + if (exactp) + *exactp = 1; + goto set1; + } + } + rc = 0; + mc->mc_flags &= ~C_EOF; + goto set2; + } + } + /* If any parents have right-sibs, search. + * Otherwise, there's nothing further. + */ + for (i=0; imc_top; i++) + if (mc->mc_ki[i] < + NUMKEYS(mc->mc_pg[i])-1) + break; + if (i == mc->mc_top) { + /* There are no other pages */ + mc->mc_ki[mc->mc_top] = nkeys; + return MDB_NOTFOUND; + } + } + if (!mc->mc_top) { + /* There are no other pages */ + mc->mc_ki[mc->mc_top] = 0; + if (op == MDB_SET_RANGE && !exactp) { + rc = 0; + goto set1; + } else + return MDB_NOTFOUND; + } + } else { + mc->mc_pg[0] = 0; + } + + rc = mdb_page_search(mc, key, 0); + if (rc != MDB_SUCCESS) + return rc; + + mp = mc->mc_pg[mc->mc_top]; + mdb_cassert(mc, IS_LEAF(mp)); + +set2: + leaf = mdb_node_search(mc, key, exactp); + if (exactp != NULL && !*exactp) { + /* MDB_SET specified and not an exact match. 
*/ + return MDB_NOTFOUND; + } + + if (leaf == NULL) { + DPUTS("===> inexact leaf not found, goto sibling"); + if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { + mc->mc_flags |= C_EOF; + return rc; /* no entries matched */ + } + mp = mc->mc_pg[mc->mc_top]; + mdb_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, 0); + } + +set1: + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + if (IS_LEAF2(mp)) { + if (op == MDB_SET_RANGE || op == MDB_SET_KEY) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + } + return MDB_SUCCESS; + } + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + } + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (mc->mc_xcursor==NULL) + return MDB_CORRUPTED; + if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + } else { + int ex2, *ex2p; + if (op == MDB_GET_BOTH) { + ex2p = &ex2; + ex2 = 0; + } else { + ex2p = NULL; + } + rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); + if (rc != MDB_SUCCESS) + return rc; + } + } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { + MDB_val olddata; + MDB_cmp_func *dcmp; + if ((rc = mdb_node_read(mc, leaf, &olddata)) != MDB_SUCCESS) + return rc; + dcmp = mc->mc_dbx->md_dcmp; +#if UINT_MAX < SIZE_MAX + if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) + dcmp = mdb_cmp_clong; +#endif + rc = dcmp(data, &olddata); + if (rc) { + if (op == MDB_GET_BOTH || rc > 0) + return MDB_NOTFOUND; + rc = 0; + } + *data = olddata; + + } else { + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) + return rc; + } + } + + /* The key already matches in all other cases */ + if (op == MDB_SET_RANGE || op == MDB_SET_KEY) + MDB_GET_KEY(leaf, key); + DPRINTF(("==> cursor placed on key [%s]", DKEY(key))); + + return 
rc; +} + +/** Move the cursor to the first item in the database. */ +static int +mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) +{ + int rc; + MDB_node *leaf; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); + if (rc != MDB_SUCCESS) + return rc; + } + mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + + leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + mc->mc_ki[mc->mc_top] = 0; + + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); + return MDB_SUCCESS; + } + + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (mc->mc_xcursor==NULL) + return MDB_CORRUPTED; + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc) + return rc; + } else { + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) + return rc; + } + } + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +/** Move the cursor to the last item in the database. 
*/ +static int +mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) +{ + int rc; + MDB_node *leaf; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdb_page_search(mc, NULL, MDB_PS_LAST); + if (rc != MDB_SUCCESS) + return rc; + } + mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; + mc->mc_flags |= C_INITIALIZED|C_EOF; + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (mc->mc_xcursor==NULL) + return MDB_CORRUPTED; + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc) + return rc; + } else { + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +int +mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op) +{ + int rc; + int exact = 0; + int (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data); + + if (mc == NULL) + return EINVAL; + + if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + switch (op) { + case MDB_GET_CURRENT: + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = EINVAL; + } else { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + int nkeys = NUMKEYS(mp); + if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { + mc->mc_ki[mc->mc_top] = nkeys; + rc = MDB_NOTFOUND; + break; + } + rc = MDB_SUCCESS; + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + } else { + MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + MDB_GET_KEY(leaf, key); + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) 
{ + if (mc->mc_xcursor==NULL) + rc = MDB_CORRUPTED; + else + rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); + } else { + rc = mdb_node_read(mc, leaf, data); + } + } + } + } + break; + case MDB_GET_BOTH: + case MDB_GET_BOTH_RANGE: + if (data == NULL) { + rc = EINVAL; + break; + } + if (mc->mc_xcursor == NULL) { + rc = MDB_INCOMPATIBLE; + break; + } + /* FALLTHRU */ + case MDB_SET: + case MDB_SET_KEY: + case MDB_SET_RANGE: + if (key == NULL) { + rc = EINVAL; + } else { + rc = mdb_cursor_set(mc, key, data, op, + op == MDB_SET_RANGE ? NULL : &exact); + } + break; + case MDB_GET_MULTIPLE: + if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { + rc = EINVAL; + break; + } + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + rc = MDB_INCOMPATIBLE; + break; + } + rc = MDB_SUCCESS; + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || + (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) + break; + goto fetchm; + case MDB_NEXT_MULTIPLE: + if (data == NULL) { + rc = EINVAL; + break; + } + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + rc = MDB_INCOMPATIBLE; + break; + } + rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); + if (rc == MDB_SUCCESS) { + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + MDB_cursor *mx; +fetchm: + mx = &mc->mc_xcursor->mx_cursor; + data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * + mx->mc_db->md_pad; + data->mv_data = METADATA(mx->mc_pg[mx->mc_top]); + mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1; + } else { + rc = MDB_NOTFOUND; + } + } + break; + case MDB_PREV_MULTIPLE: + if (data == NULL) { + rc = EINVAL; + break; + } + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + rc = MDB_INCOMPATIBLE; + break; + } + if (!(mc->mc_flags & C_INITIALIZED)) + rc = mdb_cursor_last(mc, key, data); + else + rc = MDB_SUCCESS; + if (rc == MDB_SUCCESS) { + MDB_cursor *mx = &mc->mc_xcursor->mx_cursor; + if (mx->mc_flags & C_INITIALIZED) { + rc = mdb_cursor_sibling(mx, 0); + if (rc == MDB_SUCCESS) + goto fetchm; + } 
else { + rc = MDB_NOTFOUND; + } + } + break; + case MDB_NEXT: + case MDB_NEXT_DUP: + case MDB_NEXT_NODUP: + rc = mdb_cursor_next(mc, key, data, op); + break; + case MDB_PREV: + case MDB_PREV_DUP: + case MDB_PREV_NODUP: + rc = mdb_cursor_prev(mc, key, data, op); + break; + case MDB_FIRST: + rc = mdb_cursor_first(mc, key, data); + break; + case MDB_FIRST_DUP: + mfunc = mdb_cursor_first; + mmove: + if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { + rc = EINVAL; + break; + } + if (mc->mc_xcursor == NULL) { + rc = MDB_INCOMPATIBLE; + break; + } + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) { + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); + rc = MDB_NOTFOUND; + break; + } + { + MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + MDB_GET_KEY(leaf, key); + rc = mdb_node_read(mc, leaf, data); + break; + } + } + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { + rc = EINVAL; + break; + } + rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); + break; + case MDB_LAST: + rc = mdb_cursor_last(mc, key, data); + break; + case MDB_LAST_DUP: + mfunc = mdb_cursor_last; + goto mmove; + default: + DPRINTF(("unhandled/unimplemented cursor operation %u", op)); + rc = EINVAL; + break; + } + + if (mc->mc_flags & C_DEL) + mc->mc_flags ^= C_DEL; + + return rc; +} + +/** Touch all the pages in the cursor stack. Set mc_top. + * Makes sure all the pages are writable, before attempting a write operation. + * @param[in] mc The cursor to operate on. 
+ */ +static int +mdb_cursor_touch(MDB_cursor *mc) +{ + int rc = MDB_SUCCESS; + + if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) { + /* Touch DB record of named DB */ + MDB_cursor mc2; + MDB_xcursor mcx; + if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) + return MDB_BAD_DBI; + mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); + rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); + if (rc) + return rc; + *mc->mc_dbflag |= DB_DIRTY; + } + mc->mc_top = 0; + if (mc->mc_snum) { + do { + rc = mdb_page_touch(mc); + } while (!rc && ++(mc->mc_top) < mc->mc_snum); + mc->mc_top = mc->mc_snum-1; + } + return rc; +} + +/** Do not spill pages to disk if txn is getting full, may fail instead */ +#define MDB_NOSPILL 0x8000 + +int +mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, + unsigned int flags) +{ + MDB_env *env; + MDB_node *leaf = NULL; + MDB_page *fp, *mp, *sub_root = NULL; + uint16_t fp_flags; + MDB_val xdata, *rdata, dkey, olddata; + MDB_db dummy; + int do_sub = 0, insert_key, insert_data; + unsigned int mcount = 0, dcount = 0, nospill; + size_t nsize; + int rc, rc2; + unsigned int nflags; + DKBUF; + + if (mc == NULL || key == NULL) + return EINVAL; + + env = mc->mc_txn->mt_env; + + /* Check this first so counter will always be zero on any + * early failures. + */ + if (flags & MDB_MULTIPLE) { + dcount = data[1].mv_size; + data[1].mv_size = 0; + if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED)) + return MDB_INCOMPATIBLE; + } + + nospill = flags & MDB_NOSPILL; + flags &= ~MDB_NOSPILL; + + if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) + return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + if (key->mv_size-1 >= ENV_MAXKEY(env)) + return MDB_BAD_VALSIZE; + +#if SIZE_MAX > MAXDATASIZE + if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? 
ENV_MAXKEY(env) : MAXDATASIZE)) + return MDB_BAD_VALSIZE; +#else + if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env)) + return MDB_BAD_VALSIZE; +#endif + + DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u", + DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size)); + + dkey.mv_size = 0; + + if (flags == MDB_CURRENT) { + if (!(mc->mc_flags & C_INITIALIZED)) + return EINVAL; + rc = MDB_SUCCESS; + } else if (mc->mc_db->md_root == P_INVALID) { + /* new database, cursor has nothing to point to */ + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_flags &= ~C_INITIALIZED; + rc = MDB_NO_ROOT; + } else { + int exact = 0; + MDB_val d2; + if (flags & MDB_APPEND) { + MDB_val k2; + rc = mdb_cursor_last(mc, &k2, &d2); + if (rc == 0) { + rc = mc->mc_dbx->md_cmp(key, &k2); + if (rc > 0) { + rc = MDB_NOTFOUND; + mc->mc_ki[mc->mc_top]++; + } else { + /* new key is <= last key */ + rc = MDB_KEYEXIST; + } + } + } else { + rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact); + } + if ((flags & MDB_NOOVERWRITE) && rc == 0) { + DPRINTF(("duplicate key [%s]", DKEY(key))); + *data = d2; + return MDB_KEYEXIST; + } + if (rc && rc != MDB_NOTFOUND) + return rc; + } + + if (mc->mc_flags & C_DEL) + mc->mc_flags ^= C_DEL; + + /* Cursor is positioned, check for room in the dirty list */ + if (!nospill) { + if (flags & MDB_MULTIPLE) { + rdata = &xdata; + xdata.mv_size = data->mv_size * dcount; + } else { + rdata = data; + } + if ((rc2 = mdb_page_spill(mc, key, rdata))) + return rc2; + } + + if (rc == MDB_NO_ROOT) { + MDB_page *np; + /* new database, write a root leaf page */ + DPUTS("allocating new root leaf page"); + if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) { + return rc2; + } + mdb_cursor_push(mc, np); + mc->mc_db->md_root = np->mp_pgno; + mc->mc_db->md_depth++; + *mc->mc_dbflag |= DB_DIRTY; + if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED)) + == MDB_DUPFIXED) + np->mp_flags |= P_LEAF2; + mc->mc_flags |= C_INITIALIZED; + } else { + /* make sure all 
cursor pages are writable */ + rc2 = mdb_cursor_touch(mc); + if (rc2) + return rc2; + } + + insert_key = insert_data = rc; + if (insert_key) { + /* The key does not exist */ + DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top])); + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + LEAFSIZE(key, data) > env->me_nodemax) + { + /* Too big for a node, insert in sub-DB. Set up an empty + * "old sub-page" for prep_subDB to expand to a full page. + */ + fp_flags = P_LEAF|P_DIRTY; + fp = env->me_pbuf; + fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */ + fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE); + olddata.mv_size = PAGEHDRSZ; + goto prep_subDB; + } + } else { + /* there's only a key anyway, so this is a no-op */ + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + char *ptr; + unsigned int ksize = mc->mc_db->md_pad; + if (key->mv_size != ksize) + return MDB_BAD_VALSIZE; + ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); + memcpy(ptr, key->mv_data, ksize); +fix_parent: + /* if overwriting slot 0 of leaf, need to + * update branch key if there is a parent page + */ + if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { + unsigned short dtop = 1; + mc->mc_top--; + /* slot 0 is always an empty key, find real slot */ + while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { + mc->mc_top--; + dtop++; + } + if (mc->mc_ki[mc->mc_top]) + rc2 = mdb_update_key(mc, key); + else + rc2 = MDB_SUCCESS; + mc->mc_top += dtop; + if (rc2) + return rc2; + } + return MDB_SUCCESS; + } + +more: + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + olddata.mv_size = NODEDSZ(leaf); + olddata.mv_data = NODEDATA(leaf); + + /* DB has dups? */ + if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { + /* Prepare (sub-)page/sub-DB to accept the new item, + * if needed. fp: old sub-page or a header faking + * it. mp: new (sub-)page. offset: growth in page + * size. xdata: node data with new page or DB. 
+ */ + unsigned i, offset = 0; + mp = fp = xdata.mv_data = env->me_pbuf; + mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; + + /* Was a single item before, must convert now */ + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + MDB_cmp_func *dcmp; + /* Just overwrite the current item */ + if (flags == MDB_CURRENT) + goto current; + dcmp = mc->mc_dbx->md_dcmp; +#if UINT_MAX < SIZE_MAX + if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) + dcmp = mdb_cmp_clong; +#endif + /* does data match? */ + if (!dcmp(data, &olddata)) { + if (flags & (MDB_NODUPDATA|MDB_APPENDDUP)) + return MDB_KEYEXIST; + /* overwrite it */ + goto current; + } + + /* Back up original data item */ + dkey.mv_size = olddata.mv_size; + dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size); + + /* Make sub-page header for the dup items, with dummy body */ + fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP; + fp->mp_lower = (PAGEHDRSZ-PAGEBASE); + xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; + if (mc->mc_db->md_flags & MDB_DUPFIXED) { + fp->mp_flags |= P_LEAF2; + fp->mp_pad = data->mv_size; + xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */ + } else { + xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + + (dkey.mv_size & 1) + (data->mv_size & 1); + } + fp->mp_upper = xdata.mv_size - PAGEBASE; + olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */ + } else if (leaf->mn_flags & F_SUBDATA) { + /* Data is on sub-DB, just store it */ + flags |= F_DUPDATA|F_SUBDATA; + goto put_sub; + } else { + /* Data is on sub-page */ + fp = olddata.mv_data; + switch (flags) { + default: + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + offset = EVEN(NODESIZE + sizeof(indx_t) + + data->mv_size); + break; + } + offset = fp->mp_pad; + if (SIZELEFT(fp) < offset) { + offset *= 4; /* space for 4 more */ + break; + } + /* FALLTHRU: Big enough MDB_DUPFIXED sub-page */ + case MDB_CURRENT: + fp->mp_flags |= P_DIRTY; + COPY_PGNO(fp->mp_pgno, mp->mp_pgno); + mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; 
+ flags |= F_DUPDATA; + goto put_sub; + } + xdata.mv_size = olddata.mv_size + offset; + } + + fp_flags = fp->mp_flags; + if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) { + /* Too big for a sub-page, convert to sub-DB */ + fp_flags &= ~P_SUBP; +prep_subDB: + if (mc->mc_db->md_flags & MDB_DUPFIXED) { + fp_flags |= P_LEAF2; + dummy.md_pad = fp->mp_pad; + dummy.md_flags = MDB_DUPFIXED; + if (mc->mc_db->md_flags & MDB_INTEGERDUP) + dummy.md_flags |= MDB_INTEGERKEY; + } else { + dummy.md_pad = 0; + dummy.md_flags = 0; + } + dummy.md_depth = 1; + dummy.md_branch_pages = 0; + dummy.md_leaf_pages = 1; + dummy.md_overflow_pages = 0; + dummy.md_entries = NUMKEYS(fp); + xdata.mv_size = sizeof(MDB_db); + xdata.mv_data = &dummy; + if ((rc = mdb_page_alloc(mc, 1, &mp))) + return rc; + offset = env->me_psize - olddata.mv_size; + flags |= F_DUPDATA|F_SUBDATA; + dummy.md_root = mp->mp_pgno; + sub_root = mp; + } + if (mp != fp) { + mp->mp_flags = fp_flags | P_DIRTY; + mp->mp_pad = fp->mp_pad; + mp->mp_lower = fp->mp_lower; + mp->mp_upper = fp->mp_upper + offset; + if (fp_flags & P_LEAF2) { + memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad); + } else { + memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE, + olddata.mv_size - fp->mp_upper - PAGEBASE); + memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), NUMKEYS(fp) * sizeof(mp->mp_ptrs[0])); + for (i=0; imp_ptrs[i] += offset; + } + } + + rdata = &xdata; + flags |= F_DUPDATA; + do_sub = 1; + if (!insert_key) + mdb_node_del(mc, 0); + goto new_sub; + } +current: + /* LMDB passes F_SUBDATA in 'flags' to write a DB record */ + if ((leaf->mn_flags ^ flags) & F_SUBDATA) + return MDB_INCOMPATIBLE; + /* overflow page overwrites need special handling */ + if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { + MDB_page *omp; + pgno_t pg; + int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); + + memcpy(&pg, olddata.mv_data, sizeof(pg)); + if ((rc2 = mdb_page_get(mc, pg, &omp, 
&level)) != 0) + return rc2; + ovpages = omp->mp_pages; + + /* Is the ov page large enough? */ + if (ovpages >= dpages) { + if (!(omp->mp_flags & P_DIRTY) && + (level || (env->me_flags & MDB_WRITEMAP))) + { + rc = mdb_page_unspill(mc->mc_txn, omp, &omp); + if (rc) + return rc; + level = 0; /* dirty in this txn or clean */ + } + /* Is it dirty? */ + if (omp->mp_flags & P_DIRTY) { + /* yes, overwrite it. Note in this case we don't + * bother to try shrinking the page if the new data + * is smaller than the overflow threshold. + */ + if (level > 1) { + /* It is writable only in a parent txn */ + size_t sz = (size_t) env->me_psize * ovpages, off; + MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); + MDB_ID2 id2; + if (!np) + return ENOMEM; + id2.mid = pg; + id2.mptr = np; + /* Note - this page is already counted in parent's dirty_room */ + rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); + mdb_cassert(mc, rc2 == 0); + /* Currently we make the page look as with put() in the + * parent txn, in case the user peeks at MDB_RESERVEd + * or unused parts. Some users treat ovpages specially. + */ + if (!(flags & MDB_RESERVE)) { + /* Skip the part where LMDB will put *data. + * Copy end of page, adjusting alignment so + * compiler may copy words instead of bytes. + */ + off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); + memcpy((size_t *)((char *)np + off), + (size_t *)((char *)omp + off), sz - off); + sz = PAGEHDRSZ; + } + memcpy(np, omp, sz); /* Copy beginning of page */ + omp = np; + } + SETDSZ(leaf, data->mv_size); + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = METADATA(omp); + else + memcpy(METADATA(omp), data->mv_data, data->mv_size); + return MDB_SUCCESS; + } + } + if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) + return rc2; + } else if (data->mv_size == olddata.mv_size) { + /* same size, just replace it. Note that we could + * also reuse this node if the new data is smaller, + * but instead we opt to shrink the node in that case. 
+ */ + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = olddata.mv_data; + else if (!(mc->mc_flags & C_SUB)) + memcpy(olddata.mv_data, data->mv_data, data->mv_size); + else { + memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); + goto fix_parent; + } + return MDB_SUCCESS; + } + mdb_node_del(mc, 0); + } + + rdata = data; + +new_sub: + nflags = flags & NODE_ADD_FLAGS; + nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata); + if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { + if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA ) + nflags &= ~MDB_APPEND; /* sub-page may need room to grow */ + if (!insert_key) + nflags |= MDB_SPLIT_REPLACE; + rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags); + } else { + /* There is room already in this leaf page. */ + rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); + if (rc == 0) { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + unsigned i = mc->mc_top; + MDB_page *mp = mc->mc_pg[i]; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) continue; + if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) { + m3->mc_ki[i]++; + } + XCURSOR_REFRESH(m3, i, mp); + } + } + } + + if (rc == MDB_SUCCESS) { + /* Now store the actual data in the child DB. Note that we're + * storing the user data in the keys field, so there are strict + * size limits on dupdata. The actual data fields of the child + * DB are all zero size. + */ + if (do_sub) { + int xflags, new_dupdata; + size_t ecount; +put_sub: + xdata.mv_size = 0; + xdata.mv_data = ""; + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (flags & MDB_CURRENT) { + xflags = MDB_CURRENT|MDB_NOSPILL; + } else { + mdb_xcursor_init1(mc, leaf); + xflags = (flags & MDB_NODUPDATA) ? 
+ MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL; + } + if (sub_root) + mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; + new_dupdata = (int)dkey.mv_size; + /* converted, write the original data first */ + if (dkey.mv_size) { + rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); + if (rc) + goto bad_sub; + /* we've done our job */ + dkey.mv_size = 0; + } + if (!(leaf->mn_flags & F_SUBDATA) || sub_root) { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2; + MDB_xcursor *mx = mc->mc_xcursor; + unsigned i = mc->mc_top; + MDB_page *mp = mc->mc_pg[i]; + + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { + if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; + if (!(m2->mc_flags & C_INITIALIZED)) continue; + if (m2->mc_pg[i] == mp) { + if (m2->mc_ki[i] == mc->mc_ki[i]) { + mdb_xcursor_init2(m2, mx, new_dupdata); + } else if (!insert_key) { + XCURSOR_REFRESH(m2, i, mp); + } + } + } + } + ecount = mc->mc_xcursor->mx_db.md_entries; + if (flags & MDB_APPENDDUP) + xflags |= MDB_APPEND; + rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); + if (flags & F_SUBDATA) { + void *db = NODEDATA(leaf); + memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); + } + insert_data = mc->mc_xcursor->mx_db.md_entries - ecount; + } + /* Increment count unless we just replaced an existing item. */ + if (insert_data) + mc->mc_db->md_entries++; + if (insert_key) { + /* Invalidate txn if we created an empty sub-DB */ + if (rc) + goto bad_sub; + /* If we succeeded and the key didn't exist before, + * make sure the cursor is marked valid. 
+ */ + mc->mc_flags |= C_INITIALIZED; + } + if (flags & MDB_MULTIPLE) { + if (!rc) { + mcount++; + /* let caller know how many succeeded, if any */ + data[1].mv_size = mcount; + if (mcount < dcount) { + data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; + insert_key = insert_data = 0; + goto more; + } + } + } + return rc; +bad_sub: + if (rc == MDB_KEYEXIST) /* should not happen, we deleted that item */ + rc = MDB_CORRUPTED; + } + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +int +mdb_cursor_del(MDB_cursor *mc, unsigned int flags) +{ + MDB_node *leaf; + MDB_page *mp; + int rc; + + if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) + return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + if (!(mc->mc_flags & C_INITIALIZED)) + return EINVAL; + + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) + return MDB_NOTFOUND; + + if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL))) + return rc; + + rc = mdb_cursor_touch(mc); + if (rc) + return rc; + + mp = mc->mc_pg[mc->mc_top]; + if (IS_LEAF2(mp)) + goto del_key; + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (mc->mc_xcursor==NULL) + return MDB_CORRUPTED; + if (flags & MDB_NODUPDATA) { + /* mdb_cursor_del0() will subtract the final entry */ + mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; + mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; + } else { + if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { + mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } + rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL); + if (rc) + return rc; + /* If sub-DB still has entries, we're done */ + if (mc->mc_xcursor->mx_db.md_entries) { + if (leaf->mn_flags & F_SUBDATA) { + /* update subDB info */ + void *db = NODEDATA(leaf); + memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); + } else { + MDB_cursor *m2; + /* shrink fake page */ + mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]); + leaf = 
NODEPTR(mp, mc->mc_ki[mc->mc_top]); + mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + /* fix other sub-DB cursors pointed at fake pages on this page */ + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { + if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; + if (!(m2->mc_flags & C_INITIALIZED)) continue; + if (m2->mc_pg[mc->mc_top] == mp) { + XCURSOR_REFRESH(m2, mc->mc_top, mp); + } + } + } + mc->mc_db->md_entries--; + return rc; + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; + } + /* otherwise fall thru and delete the sub-DB */ + } + + if (leaf->mn_flags & F_SUBDATA) { + /* add all the child DB's pages to the free list */ + rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); + if (rc) + goto fail; + } + } + /* LMDB passes F_SUBDATA in 'flags' to delete a DB record */ + else if ((leaf->mn_flags ^ flags) & F_SUBDATA) { + rc = MDB_INCOMPATIBLE; + goto fail; + } + + /* add overflow pages to free list */ + if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { + MDB_page *omp; + pgno_t pg; + + memcpy(&pg, NODEDATA(leaf), sizeof(pg)); + if ((rc = mdb_page_get(mc, pg, &omp, NULL)) || + (rc = mdb_ovpage_free(mc, omp))) + goto fail; + } + +del_key: + return mdb_cursor_del0(mc); + +fail: + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +/** Allocate and initialize new pages for a database. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc a cursor on the database being added to. + * @param[in] flags flags defining what type of page is being allocated. + * @param[in] num the number of pages to allocate. This is usually 1, + * unless allocating overflow pages for a large record. + * @param[out] mp Address of a page, or NULL on failure. + * @return 0 on success, non-zero on failure. 
+ */ +static int +mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) +{ + MDB_page *np; + int rc; + + if ((rc = mdb_page_alloc(mc, num, &np))) + return rc; + DPRINTF(("allocated new mpage %"Z"u, page size %u", + np->mp_pgno, mc->mc_txn->mt_env->me_psize)); + np->mp_flags = flags | P_DIRTY; + np->mp_lower = (PAGEHDRSZ-PAGEBASE); + np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE; + + if (IS_BRANCH(np)) + mc->mc_db->md_branch_pages++; + else if (IS_LEAF(np)) + mc->mc_db->md_leaf_pages++; + else if (IS_OVERFLOW(np)) { + mc->mc_db->md_overflow_pages += num; + np->mp_pages = num; + } + *mp = np; + + return 0; +} + +/** Calculate the size of a leaf node. + * The size depends on the environment's page size; if a data item + * is too large it will be put onto an overflow page and the node + * size will only include the key and not the data. Sizes are always + * rounded up to an even number of bytes, to guarantee 2-byte alignment + * of the #MDB_node headers. + * @param[in] env The environment handle. + * @param[in] key The key for the node. + * @param[in] data The data for the node. + * @return The number of bytes needed to store the node. + */ +static size_t +mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data) +{ + size_t sz; + + sz = LEAFSIZE(key, data); + if (sz > env->me_nodemax) { + /* put on overflow page */ + sz -= data->mv_size - sizeof(pgno_t); + } + + return EVEN(sz + sizeof(indx_t)); +} + +/** Calculate the size of a branch node. + * The size should depend on the environment's page size but since + * we currently don't support spilling large keys onto overflow + * pages, it's simply the size of the #MDB_node header plus the + * size of the key. Sizes are always rounded up to an even number + * of bytes, to guarantee 2-byte alignment of the #MDB_node headers. + * @param[in] env The environment handle. + * @param[in] key The key for the node. + * @return The number of bytes needed to store the node. 
+ */ +static size_t +mdb_branch_size(MDB_env *env, MDB_val *key) +{ + size_t sz; + + sz = INDXSIZE(key); + if (sz > env->me_nodemax) { + /* put on overflow page */ + /* not implemented */ + /* sz -= key->size - sizeof(pgno_t); */ + } + + return sz + sizeof(indx_t); +} + +/** Add a node to the page pointed to by the cursor. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc The cursor for this operation. + * @param[in] indx The index on the page where the new node should be added. + * @param[in] key The key for the new node. + * @param[in] data The data for the new node, if any. + * @param[in] pgno The page number, if adding a branch node. + * @param[in] flags Flags for the node. + * @return 0 on success, non-zero on failure. Possible errors are: + *
    + *
  • ENOMEM - failed to allocate overflow pages for the node. + *
  • MDB_PAGE_FULL - there is insufficient room in the page. This error + * should never happen since all callers already calculate the + * page's free space before calling this function. + *
+ */ +static int +mdb_node_add(MDB_cursor *mc, indx_t indx, + MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags) +{ + unsigned int i; + size_t node_size = NODESIZE; + ssize_t room; + indx_t ofs; + MDB_node *node; + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_page *ofp = NULL; /* overflow page */ + void *ndata; + DKBUF; + + mdb_cassert(mc, mp->mp_upper >= mp->mp_lower); + + DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]", + IS_LEAF(mp) ? "leaf" : "branch", + IS_SUBP(mp) ? "sub-" : "", + mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0, + key ? key->mv_size : 0, key ? DKEY(key) : "null")); + + if (IS_LEAF2(mp)) { + /* Move higher keys up one slot. */ + int ksize = mc->mc_db->md_pad, dif; + char *ptr = LEAF2KEY(mp, indx, ksize); + dif = NUMKEYS(mp) - indx; + if (dif > 0) + memmove(ptr+ksize, ptr, dif*ksize); + /* insert new key */ + memcpy(ptr, key->mv_data, ksize); + + /* Just using these for counting */ + mp->mp_lower += sizeof(indx_t); + mp->mp_upper -= ksize - sizeof(indx_t); + return MDB_SUCCESS; + } + + room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t); + if (key != NULL) + node_size += key->mv_size; + if (IS_LEAF(mp)) { + mdb_cassert(mc, key && data); + if (F_ISSET(flags, F_BIGDATA)) { + /* Data already on overflow page. */ + node_size += sizeof(pgno_t); + } else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) { + int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); + int rc; + /* Put data on overflow page. 
*/ + DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page", + data->mv_size, node_size+data->mv_size)); + node_size = EVEN(node_size + sizeof(pgno_t)); + if ((ssize_t)node_size > room) + goto full; + if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) + return rc; + DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno)); + flags |= F_BIGDATA; + goto update; + } else { + node_size += data->mv_size; + } + } + node_size = EVEN(node_size); + if ((ssize_t)node_size > room) + goto full; + +update: + /* Move higher pointers up one slot. */ + for (i = NUMKEYS(mp); i > indx; i--) + mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; + + /* Adjust free space offsets. */ + ofs = mp->mp_upper - node_size; + mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t)); + mp->mp_ptrs[indx] = ofs; + mp->mp_upper = ofs; + mp->mp_lower += sizeof(indx_t); + + /* Write the node data. */ + node = NODEPTR(mp, indx); + node->mn_ksize = (key == NULL) ? 0 : key->mv_size; + node->mn_flags = flags; + if (IS_LEAF(mp)) + SETDSZ(node,data->mv_size); + else + SETPGNO(node,pgno); + + if (key) + memcpy(NODEKEY(node), key->mv_data, key->mv_size); + + if (IS_LEAF(mp)) { + ndata = NODEDATA(node); + if (ofp == NULL) { + if (F_ISSET(flags, F_BIGDATA)) + memcpy(ndata, data->mv_data, sizeof(pgno_t)); + else if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = ndata; + else + memcpy(ndata, data->mv_data, data->mv_size); + } else { + memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t)); + ndata = METADATA(ofp); + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = ndata; + else + memcpy(ndata, data->mv_data, data->mv_size); + } + } + + return MDB_SUCCESS; + +full: + DPRINTF(("not enough room in page %"Z"u, got %u ptrs", + mdb_dbg_pgno(mp), NUMKEYS(mp))); + DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room)); + DPRINTF(("node size = %"Z"u", node_size)); + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PAGE_FULL; +} + +/** Delete the specified node from a page. 
+ * @param[in] mc Cursor pointing to the node to delete. + * @param[in] ksize The size of a node. Only used if the page is + * part of a #MDB_DUPFIXED database. + */ +static void +mdb_node_del(MDB_cursor *mc, int ksize) +{ + MDB_page *mp = mc->mc_pg[mc->mc_top]; + indx_t indx = mc->mc_ki[mc->mc_top]; + unsigned int sz; + indx_t i, j, numkeys, ptr; + MDB_node *node; + char *base; + + DPRINTF(("delete node %u on %s page %"Z"u", indx, + IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp))); + numkeys = NUMKEYS(mp); + mdb_cassert(mc, indx < numkeys); + + if (IS_LEAF2(mp)) { + int x = numkeys - 1 - indx; + base = LEAF2KEY(mp, indx, ksize); + if (x) + memmove(base, base + ksize, x * ksize); + mp->mp_lower -= sizeof(indx_t); + mp->mp_upper += ksize - sizeof(indx_t); + return; + } + + node = NODEPTR(mp, indx); + sz = NODESIZE + node->mn_ksize; + if (IS_LEAF(mp)) { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + sz += sizeof(pgno_t); + else + sz += NODEDSZ(node); + } + sz = EVEN(sz); + + ptr = mp->mp_ptrs[indx]; + for (i = j = 0; i < numkeys; i++) { + if (i != indx) { + mp->mp_ptrs[j] = mp->mp_ptrs[i]; + if (mp->mp_ptrs[i] < ptr) + mp->mp_ptrs[j] += sz; + j++; + } + } + + base = (char *)mp + mp->mp_upper + PAGEBASE; + memmove(base + sz, base, ptr - mp->mp_upper); + + mp->mp_lower -= sizeof(indx_t); + mp->mp_upper += sz; +} + +/** Compact the main page after deleting a node on a subpage. + * @param[in] mp The main page to operate on. + * @param[in] indx The index of the subpage on the main page. 
+ */ +static void +mdb_node_shrink(MDB_page *mp, indx_t indx) +{ + MDB_node *node; + MDB_page *sp, *xp; + char *base; + indx_t delta, nsize, len, ptr; + int i; + + node = NODEPTR(mp, indx); + sp = (MDB_page *)NODEDATA(node); + delta = SIZELEFT(sp); + nsize = NODEDSZ(node) - delta; + + /* Prepare to shift upward, set len = length(subpage part to shift) */ + if (IS_LEAF2(sp)) { + len = nsize; + if (nsize & 1) + return; /* do not make the node uneven-sized */ + } else { + xp = (MDB_page *)((char *)sp + delta); /* destination subpage */ + for (i = NUMKEYS(sp); --i >= 0; ) + xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta; + len = PAGEHDRSZ; + } + sp->mp_upper = sp->mp_lower; + COPY_PGNO(sp->mp_pgno, mp->mp_pgno); + SETDSZ(node, nsize); + + /* Shift upward */ + base = (char *)mp + mp->mp_upper + PAGEBASE; + memmove(base + delta, base, (char *)sp + len - base); + + ptr = mp->mp_ptrs[indx]; + for (i = NUMKEYS(mp); --i >= 0; ) { + if (mp->mp_ptrs[i] <= ptr) + mp->mp_ptrs[i] += delta; + } + mp->mp_upper += delta; +} + +/** Initial setup of a sorted-dups cursor. + * Sorted duplicates are implemented as a sub-database for the given key. + * The duplicate data items are actually keys of the sub-database. + * Operations on the duplicate data items are performed using a sub-cursor + * initialized when the sub-database is first accessed. This function does + * the preliminary setup of the sub-cursor, filling in the fields that + * depend only on the parent DB. + * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. 
+ */ +static void +mdb_xcursor_init0(MDB_cursor *mc) +{ + MDB_xcursor *mx = mc->mc_xcursor; + + mx->mx_cursor.mc_xcursor = NULL; + mx->mx_cursor.mc_txn = mc->mc_txn; + mx->mx_cursor.mc_db = &mx->mx_db; + mx->mx_cursor.mc_dbx = &mx->mx_dbx; + mx->mx_cursor.mc_dbi = mc->mc_dbi; + mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; + mx->mx_cursor.mc_snum = 0; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_SUB; + mx->mx_dbx.md_name.mv_size = 0; + mx->mx_dbx.md_name.mv_data = NULL; + mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; + mx->mx_dbx.md_dcmp = NULL; + mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; +} + +/** Final setup of a sorted-dups cursor. + * Sets up the fields that depend on the data from the main cursor. + * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. + * @param[in] node The data containing the #MDB_db record for the + * sorted-dup database. + */ +static void +mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) +{ + MDB_xcursor *mx = mc->mc_xcursor; + + if (node->mn_flags & F_SUBDATA) { + memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); + mx->mx_cursor.mc_pg[0] = 0; + mx->mx_cursor.mc_snum = 0; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_SUB; + } else { + MDB_page *fp = NODEDATA(node); + mx->mx_db.md_pad = 0; + mx->mx_db.md_flags = 0; + mx->mx_db.md_depth = 1; + mx->mx_db.md_branch_pages = 0; + mx->mx_db.md_leaf_pages = 1; + mx->mx_db.md_overflow_pages = 0; + mx->mx_db.md_entries = NUMKEYS(fp); + COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); + mx->mx_cursor.mc_snum = 1; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; + mx->mx_cursor.mc_pg[0] = fp; + mx->mx_cursor.mc_ki[0] = 0; + if (mc->mc_db->md_flags & MDB_DUPFIXED) { + mx->mx_db.md_flags = MDB_DUPFIXED; + mx->mx_db.md_pad = fp->mp_pad; + if (mc->mc_db->md_flags & MDB_INTEGERDUP) + mx->mx_db.md_flags |= MDB_INTEGERKEY; + } + } + DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, + mx->mx_db.md_root)); + mx->mx_dbflag = 
DB_VALID|DB_USRVALID|DB_DUPDATA; +#if UINT_MAX < SIZE_MAX + if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) + mx->mx_dbx.md_cmp = mdb_cmp_clong; +#endif +} + + +/** Fixup a sorted-dups cursor due to underlying update. + * Sets up some fields that depend on the data from the main cursor. + * Almost the same as init1, but skips initialization steps if the + * xcursor had already been used. + * @param[in] mc The main cursor whose sorted-dups cursor is to be fixed up. + * @param[in] src_mx The xcursor of an up-to-date cursor. + * @param[in] new_dupdata True if converting from a non-#F_DUPDATA item. + */ +static void +mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) +{ + MDB_xcursor *mx = mc->mc_xcursor; + + if (new_dupdata) { + mx->mx_cursor.mc_snum = 1; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags |= C_INITIALIZED; + mx->mx_cursor.mc_ki[0] = 0; + mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; +#if UINT_MAX < SIZE_MAX + mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; +#endif + } else if (!(mx->mx_cursor.mc_flags & C_INITIALIZED)) { + return; + } + mx->mx_db = src_mx->mx_db; + mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; + DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, + mx->mx_db.md_root)); +} + +/** Initialize a cursor for a given transaction and database. 
*/ +static void +mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) +{ + mc->mc_next = NULL; + mc->mc_backup = NULL; + mc->mc_dbi = dbi; + mc->mc_txn = txn; + mc->mc_db = &txn->mt_dbs[dbi]; + mc->mc_dbx = &txn->mt_dbxs[dbi]; + mc->mc_dbflag = &txn->mt_dbflags[dbi]; + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_pg[0] = 0; + mc->mc_ki[0] = 0; + mc->mc_flags = 0; + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { + mdb_tassert(txn, mx != NULL); + mc->mc_xcursor = mx; + mdb_xcursor_init0(mc); + } else { + mc->mc_xcursor = NULL; + } + if (*mc->mc_dbflag & DB_STALE) { + mdb_page_search(mc, NULL, MDB_PS_ROOTONLY); + } +} + +int +mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) +{ + MDB_cursor *mc; + size_t size = sizeof(MDB_cursor); + + if (!ret || !TXN_DBI_EXIST(txn, dbi, DB_VALID)) + return EINVAL; + + if (txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + if (dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + return EINVAL; + + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) + size += sizeof(MDB_xcursor); + + if ((mc = malloc(size)) != NULL) { + mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); + if (txn->mt_cursors) { + mc->mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = mc; + mc->mc_flags |= C_UNTRACK; + } + } else { + return ENOMEM; + } + + *ret = mc; + + return MDB_SUCCESS; +} + +int +mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) +{ + if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID)) + return EINVAL; + + if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) + return EINVAL; + + if (txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); + return MDB_SUCCESS; +} + +/* Return the count of duplicate data items for the current key */ +int +mdb_cursor_count(MDB_cursor *mc, size_t *countp) +{ + MDB_node *leaf; + + if (mc == NULL || countp == NULL) + return EINVAL; + + if (mc->mc_xcursor == NULL) + return MDB_INCOMPATIBLE; + + if 
(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + if (!(mc->mc_flags & C_INITIALIZED)) + return EINVAL; + + if (!mc->mc_snum) + return MDB_NOTFOUND; + + if (mc->mc_flags & C_EOF) { + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) + return MDB_NOTFOUND; + mc->mc_flags ^= C_EOF; + } + + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + *countp = 1; + } else { + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + return EINVAL; + + *countp = mc->mc_xcursor->mx_db.md_entries; + } + return MDB_SUCCESS; +} + +void +mdb_cursor_close(MDB_cursor *mc) +{ + if (mc && !mc->mc_backup) { + /* remove from txn, if tracked */ + if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { + MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; + while (*prev && *prev != mc) prev = &(*prev)->mc_next; + if (*prev == mc) + *prev = mc->mc_next; + } + free(mc); + } +} + +MDB_txn * +mdb_cursor_txn(MDB_cursor *mc) +{ + if (!mc) return NULL; + return mc->mc_txn; +} + +MDB_dbi +mdb_cursor_dbi(MDB_cursor *mc) +{ + return mc->mc_dbi; +} + +/** Replace the key for a branch node with a new key. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc Cursor pointing to the node to operate on. + * @param[in] key The new key to use. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_update_key(MDB_cursor *mc, MDB_val *key) +{ + MDB_page *mp; + MDB_node *node; + char *base; + size_t len; + int delta, ksize, oksize; + indx_t ptr, i, numkeys, indx; + DKBUF; + + indx = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + node = NODEPTR(mp, indx); + ptr = mp->mp_ptrs[indx]; +#if MDB_DEBUG + { + MDB_val k2; + char kbuf2[DKBUF_MAXKEYSIZE*2+1]; + k2.mv_data = NODEKEY(node); + k2.mv_size = node->mn_ksize; + DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u", + indx, ptr, + mdb_dkey(&k2, kbuf2), + DKEY(key), + mp->mp_pgno)); + } +#endif + + /* Sizes must be 2-byte aligned. 
*/ + ksize = EVEN(key->mv_size); + oksize = EVEN(node->mn_ksize); + delta = ksize - oksize; + + /* Shift node contents if EVEN(key length) changed. */ + if (delta) { + if (delta > 0 && SIZELEFT(mp) < delta) { + pgno_t pgno; + /* not enough space left, do a delete and split */ + DPRINTF(("Not enough room, delta = %d, splitting...", delta)); + pgno = NODEPGNO(node); + mdb_node_del(mc, 0); + return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); + } + + numkeys = NUMKEYS(mp); + for (i = 0; i < numkeys; i++) { + if (mp->mp_ptrs[i] <= ptr) + mp->mp_ptrs[i] -= delta; + } + + base = (char *)mp + mp->mp_upper + PAGEBASE; + len = ptr - mp->mp_upper + NODESIZE; + memmove(base - delta, base, len); + mp->mp_upper -= delta; + + node = NODEPTR(mp, indx); + } + + /* But even if no shift was needed, update ksize */ + if (node->mn_ksize != key->mv_size) + node->mn_ksize = key->mv_size; + + if (key->mv_size) + memcpy(NODEKEY(node), key->mv_data, key->mv_size); + + return MDB_SUCCESS; +} + +static void +mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); + +/** Perform \b act while tracking temporary cursor \b mn */ +#define WITH_CURSOR_TRACKING(mn, act) do { \ + MDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ + if ((mn).mc_flags & C_SUB) { \ + dummy.mc_flags = C_INITIALIZED; \ + dummy.mc_xcursor = (MDB_xcursor *)&(mn); \ + tracked = &dummy; \ + } else { \ + tracked = &(mn); \ + } \ + tracked->mc_next = *tp; \ + *tp = tracked; \ + { act; } \ + *tp = tracked->mc_next; \ +} while (0) + +/** Move a node from csrc to cdst. + */ +static int +mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) +{ + MDB_node *srcnode; + MDB_val key, data; + pgno_t srcpg; + MDB_cursor mn; + int rc; + unsigned short flags; + + DKBUF; + + /* Mark src and dst as dirty. 
*/ + if ((rc = mdb_page_touch(csrc)) || + (rc = mdb_page_touch(cdst))) + return rc; + + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size); + data.mv_size = 0; + data.mv_data = NULL; + srcpg = 0; + flags = 0; + } else { + srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]); + mdb_cassert(csrc, !((size_t)srcnode & 1)); + srcpg = NODEPGNO(srcnode); + flags = srcnode->mn_flags; + if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { + unsigned int snum = csrc->mc_snum; + MDB_node *s2; + /* must find the lowest key below src */ + rc = mdb_page_search_lowest(csrc); + if (rc) + return rc; + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + } else { + s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + key.mv_size = NODEKSZ(s2); + key.mv_data = NODEKEY(s2); + } + csrc->mc_snum = snum--; + csrc->mc_top = snum; + } else { + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + data.mv_size = NODEDSZ(srcnode); + data.mv_data = NODEDATA(srcnode); + } + mn.mc_xcursor = NULL; + if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { + unsigned int snum = cdst->mc_snum; + MDB_node *s2; + MDB_val bkey; + /* must find the lowest key below dst */ + mdb_cursor_copy(cdst, &mn); + rc = mdb_page_search_lowest(&mn); + if (rc) + return rc; + if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { + bkey.mv_size = mn.mc_db->md_pad; + bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size); + } else { + s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); + bkey.mv_size = NODEKSZ(s2); + bkey.mv_data = NODEKEY(s2); + } + mn.mc_snum = snum--; + mn.mc_top = snum; + mn.mc_ki[snum] = 0; + rc = mdb_update_key(&mn, &bkey); + if (rc) + return rc; + } + + DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u", + 
IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch", + csrc->mc_ki[csrc->mc_top], + DKEY(&key), + csrc->mc_pg[csrc->mc_top]->mp_pgno, + cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno)); + + /* Add the node to the destination page. + */ + rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags); + if (rc != MDB_SUCCESS) + return rc; + + /* Delete the node from the source page. + */ + mdb_node_del(csrc, key.mv_size); + + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = csrc->mc_dbi; + MDB_page *mpd, *mps; + + mps = csrc->mc_pg[csrc->mc_top]; + /* If we're adding on the left, bump others up */ + if (fromleft) { + mpd = cdst->mc_pg[csrc->mc_top]; + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) + continue; + if (m3 != cdst && + m3->mc_pg[csrc->mc_top] == mpd && + m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) { + m3->mc_ki[csrc->mc_top]++; + } + if (m3 !=csrc && + m3->mc_pg[csrc->mc_top] == mps && + m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; + m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; + m3->mc_ki[csrc->mc_top-1]++; + } + if (IS_LEAF(mps)) + XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]); + } + } else + /* Adding on the right, bump others down */ + { + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == csrc) continue; + if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) + continue; + if (m3->mc_pg[csrc->mc_top] == mps) { + if (!m3->mc_ki[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; + m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; + m3->mc_ki[csrc->mc_top-1]--; + } else { + 
m3->mc_ki[csrc->mc_top]--; + } + if (IS_LEAF(mps)) + XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]); + } + } + } + } + + /* Update the parent separators. + */ + if (csrc->mc_ki[csrc->mc_top] == 0) { + if (csrc->mc_ki[csrc->mc_top-1] != 0) { + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + } else { + srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + DPRINTF(("update separator for source page %"Z"u to [%s]", + csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key))); + mdb_cursor_copy(csrc, &mn); + mn.mc_snum--; + mn.mc_top--; + /* We want mdb_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, + rc = mdb_update_key(&mn, &key)); + if (rc) + return rc; + } + if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { + MDB_val nullkey; + indx_t ix = csrc->mc_ki[csrc->mc_top]; + nullkey.mv_size = 0; + csrc->mc_ki[csrc->mc_top] = 0; + rc = mdb_update_key(csrc, &nullkey); + csrc->mc_ki[csrc->mc_top] = ix; + mdb_cassert(csrc, rc == MDB_SUCCESS); + } + } + + if (cdst->mc_ki[cdst->mc_top] == 0) { + if (cdst->mc_ki[cdst->mc_top-1] != 0) { + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); + } else { + srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + DPRINTF(("update separator for destination page %"Z"u to [%s]", + cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key))); + mdb_cursor_copy(cdst, &mn); + mn.mc_snum--; + mn.mc_top--; + /* We want mdb_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, + rc = mdb_update_key(&mn, &key)); + if (rc) + return rc; + } + if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { + MDB_val nullkey; + indx_t ix = cdst->mc_ki[cdst->mc_top]; + nullkey.mv_size = 0; + cdst->mc_ki[cdst->mc_top] = 0; + rc = mdb_update_key(cdst, &nullkey); + cdst->mc_ki[cdst->mc_top] = ix; 
+ mdb_cassert(cdst, rc == MDB_SUCCESS); + } + } + + return MDB_SUCCESS; +} + +/** Merge one page into another. + * The nodes from the page pointed to by \b csrc will + * be copied to the page pointed to by \b cdst and then + * the \b csrc page will be freed. + * @param[in] csrc Cursor pointing to the source page. + * @param[in] cdst Cursor pointing to the destination page. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) +{ + MDB_page *psrc, *pdst; + MDB_node *srcnode; + MDB_val key, data; + unsigned nkeys; + int rc; + indx_t i, j; + + psrc = csrc->mc_pg[csrc->mc_top]; + pdst = cdst->mc_pg[cdst->mc_top]; + + DPRINTF(("merging page %"Z"u into %"Z"u", psrc->mp_pgno, pdst->mp_pgno)); + + mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ + mdb_cassert(csrc, cdst->mc_snum > 1); + + /* Mark dst as dirty. */ + if ((rc = mdb_page_touch(cdst))) + return rc; + + /* get dst page again now that we've touched it. */ + pdst = cdst->mc_pg[cdst->mc_top]; + + /* Move all nodes from src to dst. 
+ */ + j = nkeys = NUMKEYS(pdst); + if (IS_LEAF2(psrc)) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = METADATA(psrc); + for (i = 0; i < NUMKEYS(psrc); i++, j++) { + rc = mdb_node_add(cdst, j, &key, NULL, 0, 0); + if (rc != MDB_SUCCESS) + return rc; + key.mv_data = (char *)key.mv_data + key.mv_size; + } + } else { + for (i = 0; i < NUMKEYS(psrc); i++, j++) { + srcnode = NODEPTR(psrc, i); + if (i == 0 && IS_BRANCH(psrc)) { + MDB_cursor mn; + MDB_node *s2; + mdb_cursor_copy(csrc, &mn); + mn.mc_xcursor = NULL; + /* must find the lowest key below src */ + rc = mdb_page_search_lowest(&mn); + if (rc) + return rc; + if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { + key.mv_size = mn.mc_db->md_pad; + key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size); + } else { + s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); + key.mv_size = NODEKSZ(s2); + key.mv_data = NODEKEY(s2); + } + } else { + key.mv_size = srcnode->mn_ksize; + key.mv_data = NODEKEY(srcnode); + } + + data.mv_size = NODEDSZ(srcnode); + data.mv_data = NODEDATA(srcnode); + rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags); + if (rc != MDB_SUCCESS) + return rc; + } + } + + DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)", + pdst->mp_pgno, NUMKEYS(pdst), + (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10)); + + /* Unlink the src page from parent and add to free list. + */ + csrc->mc_top--; + mdb_node_del(csrc, 0); + if (csrc->mc_ki[csrc->mc_top] == 0) { + key.mv_size = 0; + rc = mdb_update_key(csrc, &key); + if (rc) { + csrc->mc_top++; + return rc; + } + } + csrc->mc_top++; + + psrc = csrc->mc_pg[csrc->mc_top]; + /* If not operating on FreeDB, allow this page to be reused + * in this txn. Otherwise just add to free list. 
+ */ + rc = mdb_page_loose(csrc, psrc); + if (rc) + return rc; + if (IS_LEAF(psrc)) + csrc->mc_db->md_leaf_pages--; + else + csrc->mc_db->md_branch_pages--; + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = csrc->mc_dbi; + unsigned int top = csrc->mc_top; + + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == csrc) continue; + if (m3->mc_snum < csrc->mc_snum) continue; + if (m3->mc_pg[top] == psrc) { + m3->mc_pg[top] = pdst; + m3->mc_ki[top] += nkeys; + m3->mc_ki[top-1] = cdst->mc_ki[top-1]; + } else if (m3->mc_pg[top-1] == csrc->mc_pg[top-1] && + m3->mc_ki[top-1] > csrc->mc_ki[top-1]) { + m3->mc_ki[top-1]--; + } + if (IS_LEAF(psrc)) + XCURSOR_REFRESH(m3, top, m3->mc_pg[top]); + } + } + { + unsigned int snum = cdst->mc_snum; + uint16_t depth = cdst->mc_db->md_depth; + mdb_cursor_pop(cdst); + rc = mdb_rebalance(cdst); + /* Did the tree height change? */ + if (depth != cdst->mc_db->md_depth) + snum += cdst->mc_db->md_depth - depth; + cdst->mc_snum = snum; + cdst->mc_top = snum-1; + } + return rc; +} + +/** Copy the contents of a cursor. + * Copies the txn, dbi, db/dbx pointers, stack depth, flags and the + * page/index stack for levels 0..mc_snum-1 of \b csrc. + * Fields not assigned here (mc_next, mc_backup, mc_xcursor, mc_dbflag) + * keep whatever value \b cdst already had. + * @param[in] csrc The cursor to copy from. + * @param[out] cdst The cursor to copy to. + */ +static void +mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) +{ + unsigned int i; + + cdst->mc_txn = csrc->mc_txn; + cdst->mc_dbi = csrc->mc_dbi; + cdst->mc_db = csrc->mc_db; + cdst->mc_dbx = csrc->mc_dbx; + cdst->mc_snum = csrc->mc_snum; + cdst->mc_top = csrc->mc_top; + cdst->mc_flags = csrc->mc_flags; + + /* Only the stack levels in use by csrc are copied. */ + for (i=0; i<csrc->mc_snum; i++) { + cdst->mc_pg[i] = csrc->mc_pg[i]; + cdst->mc_ki[i] = csrc->mc_ki[i]; + } +} + +/** Rebalance the tree after a delete operation. + * @param[in] mc Cursor pointing to the page where rebalancing + * should begin. + * @return 0 on success, non-zero on failure. 
+ */ +static int +mdb_rebalance(MDB_cursor *mc) +{ + MDB_node *node; + int rc, fromleft; + unsigned int ptop, minkeys, thresh; + MDB_cursor mn; + indx_t oldki; + + if (IS_BRANCH(mc->mc_pg[mc->mc_top])) { + minkeys = 2; + thresh = 1; + } else { + minkeys = 1; + thresh = FILL_THRESHOLD; + } + DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)", + IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", + mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), + (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10)); + + if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && + NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { + DPRINTF(("no need to rebalance page %"Z"u, above fill threshold", + mdb_dbg_pgno(mc->mc_pg[mc->mc_top]))); + return MDB_SUCCESS; + } + + if (mc->mc_snum < 2) { + MDB_page *mp = mc->mc_pg[0]; + if (IS_SUBP(mp)) { + DPUTS("Can't rebalance a subpage, ignoring"); + return MDB_SUCCESS; + } + if (NUMKEYS(mp) == 0) { + DPUTS("tree is completely empty"); + mc->mc_db->md_root = P_INVALID; + mc->mc_db->md_depth = 0; + mc->mc_db->md_leaf_pages = 0; + rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + if (rc) + return rc; + /* Adjust cursors pointing to mp */ + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_flags &= ~C_INITIALIZED; + { + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum)) + continue; + if (m3->mc_pg[0] == mp) { + m3->mc_snum = 0; + m3->mc_top = 0; + m3->mc_flags &= ~C_INITIALIZED; + } + } + } + } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { + int i; + DPUTS("collapsing root page!"); + rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + if (rc) + return rc; + mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); + rc = mdb_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], NULL); + 
if (rc) + return rc; + mc->mc_db->md_depth--; + mc->mc_db->md_branch_pages--; + mc->mc_ki[0] = mc->mc_ki[1]; + /* Slide the deeper stack entries down one slot; level 0 was set just above. */ + for (i = 1; i<mc->mc_db->md_depth; i++) { + mc->mc_pg[i] = mc->mc_pg[i+1]; + mc->mc_ki[i] = mc->mc_ki[i+1]; + } + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc) continue; + if (!(m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_pg[0] == mp) { + /* This cursor was rooted at the freed page: shift its whole stack down. */ + for (i=0; i<mc->mc_db->md_depth; i++) { + m3->mc_pg[i] = m3->mc_pg[i+1]; + m3->mc_ki[i] = m3->mc_ki[i+1]; + } + m3->mc_snum--; + m3->mc_top--; + } + } + } + } else + DPUTS("root page doesn't need rebalancing"); + return MDB_SUCCESS; + } + + /* The parent (branch page) must have at least 2 pointers, + * otherwise the tree is invalid. + */ + ptop = mc->mc_top-1; + mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); + + /* Leaf page fill factor is below the threshold. + * Try to move keys from left or right neighbor, or + * merge with a neighbor page. + */ + + /* Find neighbors. + */ + mdb_cursor_copy(mc, &mn); + mn.mc_xcursor = NULL; + + oldki = mc->mc_ki[mc->mc_top]; + if (mc->mc_ki[ptop] == 0) { + /* We're the leftmost leaf in our parent. + */ + DPUTS("reading right neighbor"); + mn.mc_ki[ptop]++; + node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); + rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); + if (rc) + return rc; + mn.mc_ki[mn.mc_top] = 0; + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); + fromleft = 0; + } else { + /* There is at least one neighbor to the left. 
+ */ + DPUTS("reading left neighbor"); + mn.mc_ki[ptop]--; + node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); + rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); + if (rc) + return rc; + mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; + mc->mc_ki[mc->mc_top] = 0; + fromleft = 1; + } + + DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)", + mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), + (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10)); + + /* If the neighbor page is above threshold and has enough keys, + * move one key from it. Otherwise we should try to merge them. + * (A branch page must never have less than 2 keys.) + */ + if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { + rc = mdb_node_move(&mn, mc, fromleft); + if (fromleft) { + /* if we inserted on left, bump position up */ + oldki++; + } + } else { + if (!fromleft) { + rc = mdb_page_merge(&mn, mc); + } else { + oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); + mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; + /* We want mdb_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, + rc = mdb_page_merge(mc, &mn)); + mdb_cursor_copy(&mn, mc); + } + mc->mc_flags &= ~C_EOF; + } + mc->mc_ki[mc->mc_top] = oldki; + return rc; +} + +/** Complete a delete operation started by #mdb_cursor_del(). */ +static int +mdb_cursor_del0(MDB_cursor *mc) +{ + int rc; + MDB_page *mp; + indx_t ki; + unsigned int nkeys; + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + ki = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + mdb_node_del(mc, mc->mc_db->md_pad); + mc->mc_db->md_entries--; + { + /* Adjust other cursors pointing to mp */ + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (! 
(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3 == mc || m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] == ki) { + m3->mc_flags |= C_DEL; + if (mc->mc_db->md_flags & MDB_DUPSORT) { + /* Sub-cursor referred into dataset which is gone */ + m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + } + continue; + } else if (m3->mc_ki[mc->mc_top] > ki) { + m3->mc_ki[mc->mc_top]--; + } + XCURSOR_REFRESH(m3, mc->mc_top, mp); + } + } + } + rc = mdb_rebalance(mc); + + if (rc == MDB_SUCCESS) { + /* DB is totally empty now, just bail out. + * Other cursors adjustments were already done + * by mdb_rebalance and aren't needed here. + */ + if (!mc->mc_snum) + return rc; + + mp = mc->mc_pg[mc->mc_top]; + nkeys = NUMKEYS(mp); + + /* Adjust other cursors pointing to mp */ + for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) { + m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + /* if m3 points past last node in page, find next sibling */ + if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = mdb_cursor_sibling(m3, 1); + if (rc == MDB_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDB_SUCCESS; + continue; + } + } + if (mc->mc_db->md_flags & MDB_DUPSORT) { + MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); + /* If this node has dupdata, it may need to be reinited + * because its data has moved. + * If the xcursor was not initd it must be reinited. + * Else if node points to a subDB, nothing is needed. + * Else (xcursor was initd, not a subDB) needs mc_pg[0] reset. 
+ */ + if (node->mn_flags & F_DUPDATA) { + if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + if (!(node->mn_flags & F_SUBDATA)) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + } else { + mdb_xcursor_init1(m3, node); + m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; + } + } + } + } + } + } + mc->mc_flags |= C_DEL; + } + + if (rc) + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +int +mdb_del(MDB_txn *txn, MDB_dbi dbi, + MDB_val *key, MDB_val *data) +{ + if (!key || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) + return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { + /* must ignore any data */ + data = NULL; + } + + return mdb_del0(txn, dbi, key, data, 0); +} + +static int +mdb_del0(MDB_txn *txn, MDB_dbi dbi, + MDB_val *key, MDB_val *data, unsigned flags) +{ + MDB_cursor mc; + MDB_xcursor mx; + MDB_cursor_op op; + MDB_val rdata, *xdata; + int rc, exact = 0; + DKBUF; + + DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key))); + + mdb_cursor_init(&mc, txn, dbi, &mx); + + if (data) { + op = MDB_GET_BOTH; + rdata = *data; + xdata = &rdata; + } else { + op = MDB_SET; + xdata = NULL; + flags |= MDB_NODUPDATA; + } + rc = mdb_cursor_set(&mc, key, xdata, op, &exact); + if (rc == 0) { + /* let mdb_page_split know about this cursor if needed: + * delete will trigger a rebalance; if it needs to move + * a node from one page to another, it will have to + * update the parent's separator key(s). If the new sepkey + * is larger than the current one, the parent page may + * run out of space, triggering a split. We need this + * cursor to be consistent until the end of the rebalance. + */ + mc.mc_flags |= C_UNTRACK; + mc.mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = &mc; + rc = mdb_cursor_del(&mc, flags); + txn->mt_cursors[dbi] = mc.mc_next; + } + return rc; +} + +/** Split a page and insert a new node. 
+ * Set #MDB_TXN_ERROR on failure. + * @param[in,out] mc Cursor pointing to the page and desired insertion index. + * The cursor will be updated to point to the actual page and index where + * the node got inserted after the split. + * @param[in] newkey The key for the newly inserted node. + * @param[in] newdata The data for the newly inserted node. + * @param[in] newpgno The page number, if the new node is a branch node. + * @param[in] nflags The #NODE_ADD_FLAGS for the new node. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, + unsigned int nflags) +{ + unsigned int flags; + int rc = MDB_SUCCESS, new_root = 0, did_split = 0; + indx_t newindx; + pgno_t pgno = 0; + int i, j, split_indx, nkeys, pmax; + MDB_env *env = mc->mc_txn->mt_env; + MDB_node *node; + MDB_val sepkey, rkey, xdata, *rdata = &xdata; + MDB_page *copy = NULL; + MDB_page *mp, *rp, *pp; + int ptop; + MDB_cursor mn; + DKBUF; + + mp = mc->mc_pg[mc->mc_top]; + newindx = mc->mc_ki[mc->mc_top]; + nkeys = NUMKEYS(mp); + + DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i", + IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, + DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); + + /* Create a right sibling. */ + if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) + return rc; + rp->mp_pad = mp->mp_pad; + DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno)); + + /* Usually when splitting the root page, the cursor + * height is 1. But when called from mdb_update_key, + * the cursor height may be greater because it walks + * up the stack while finding the branch slot to update. 
+ */ + if (mc->mc_top < 1) { + if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) + goto done; + /* shift current top to make room for new parent */ + for (i=mc->mc_snum; i>0; i--) { + mc->mc_pg[i] = mc->mc_pg[i-1]; + mc->mc_ki[i] = mc->mc_ki[i-1]; + } + mc->mc_pg[0] = pp; + mc->mc_ki[0] = 0; + mc->mc_db->md_root = pp->mp_pgno; + DPRINTF(("root split! new root = %"Z"u", pp->mp_pgno)); + new_root = mc->mc_db->md_depth++; + + /* Add left (implicit) pointer. */ + if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) { + /* undo the pre-push */ + mc->mc_pg[0] = mc->mc_pg[1]; + mc->mc_ki[0] = mc->mc_ki[1]; + mc->mc_db->md_root = mp->mp_pgno; + mc->mc_db->md_depth--; + goto done; + } + mc->mc_snum++; + mc->mc_top++; + ptop = 0; + } else { + ptop = mc->mc_top-1; + DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno)); + } + + mdb_cursor_copy(mc, &mn); + mn.mc_xcursor = NULL; + mn.mc_pg[mn.mc_top] = rp; + mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; + + if (nflags & MDB_APPEND) { + mn.mc_ki[mn.mc_top] = 0; + sepkey = *newkey; + split_indx = newindx; + nkeys = 0; + } else { + + split_indx = (nkeys+1) / 2; + + if (IS_LEAF2(rp)) { + char *split, *ins; + int x; + unsigned int lsize, rsize, ksize; + /* Move half of the keys to the right sibling */ + x = mc->mc_ki[mc->mc_top] - split_indx; + ksize = mc->mc_db->md_pad; + split = LEAF2KEY(mp, split_indx, ksize); + rsize = (nkeys - split_indx) * ksize; + lsize = (nkeys - split_indx) * sizeof(indx_t); + mp->mp_lower -= lsize; + rp->mp_lower += lsize; + mp->mp_upper += rsize - lsize; + rp->mp_upper -= rsize - lsize; + sepkey.mv_size = ksize; + if (newindx == split_indx) { + sepkey.mv_data = newkey->mv_data; + } else { + sepkey.mv_data = split; + } + if (x<0) { + ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); + memcpy(rp->mp_ptrs, split, rsize); + sepkey.mv_data = rp->mp_ptrs; + memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); + memcpy(ins, newkey->mv_data, ksize); + mp->mp_lower += 
sizeof(indx_t); + mp->mp_upper -= ksize - sizeof(indx_t); + } else { + if (x) + memcpy(rp->mp_ptrs, split, x * ksize); + ins = LEAF2KEY(rp, x, ksize); + memcpy(ins, newkey->mv_data, ksize); + memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); + rp->mp_lower += sizeof(indx_t); + rp->mp_upper -= ksize - sizeof(indx_t); + mc->mc_ki[mc->mc_top] = x; + } + } else { + int psize, nsize, k; + /* Maximum free space in an empty page */ + pmax = env->me_psize - PAGEHDRSZ; + if (IS_LEAF(mp)) + nsize = mdb_leaf_size(env, newkey, newdata); + else + nsize = mdb_branch_size(env, newkey); + nsize = EVEN(nsize); + + /* grab a page to hold a temporary copy */ + copy = mdb_page_malloc(mc->mc_txn, 1); + if (copy == NULL) { + rc = ENOMEM; + goto done; + } + copy->mp_pgno = mp->mp_pgno; + copy->mp_flags = mp->mp_flags; + copy->mp_lower = (PAGEHDRSZ-PAGEBASE); + copy->mp_upper = env->me_psize - PAGEBASE; + + /* prepare to insert */ + for (i=0, j=0; i<nkeys; i++) { + if (i == newindx) { + copy->mp_ptrs[j++] = 0; + } + copy->mp_ptrs[j++] = mp->mp_ptrs[i]; + } + + /* When items are relatively large the split point needs + * to be checked, because being off-by-one will make the + * difference between success or failure in mdb_node_add. + * + * It's also relevant if a page happens to be laid out + * such that one half of its nodes are all "small" and + * the other half of its nodes are "large." If the new + * item is also "large" and falls on the half with + * "large" nodes, it also may not fit. + * + * As a final tweak, if the new item goes on the last + * spot on the page (and thus, onto the new page), bias + * the split so the new page is emptier than the old page. + * This yields better packing during sequential inserts. + */ + if (nkeys < 32 || nsize > pmax/16 || newindx >= nkeys) { + /* Find split point */ + psize = 0; + if (newindx <= split_indx || newindx >= nkeys) { + i = 0; j = 1; + k = newindx >= nkeys ? 
nkeys : split_indx+1+IS_LEAF(mp); + } else { + i = nkeys; j = -1; + k = split_indx-1; + } + for (; i!=k; i+=j) { + if (i == newindx) { + psize += nsize; + node = NULL; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); + psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); + if (IS_LEAF(mp)) { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + psize += sizeof(pgno_t); + else + psize += NODEDSZ(node); + } + psize = EVEN(psize); + } + if (psize > pmax || i == k-j) { + split_indx = i + (j<0); + break; + } + } + } + if (split_indx == newindx) { + sepkey.mv_size = newkey->mv_size; + sepkey.mv_data = newkey->mv_data; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); + sepkey.mv_size = node->mn_ksize; + sepkey.mv_data = NODEKEY(node); + } + } + } + + DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey))); + + /* Copy separator key to the parent. + */ + if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { + int snum = mc->mc_snum; + mn.mc_snum--; + mn.mc_top--; + did_split = 1; + /* We want other splits to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, + rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0)); + if (rc) + goto done; + + /* root split? */ + if (mc->mc_snum > snum) { + ptop++; + } + /* Right page might now have changed parent. + * Check if left page also changed parent. 
+ */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; imc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + mc->mc_pg[ptop] = mn.mc_pg[ptop]; + if (mn.mc_ki[ptop]) { + mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; + } else { + /* find right page's left sibling */ + mc->mc_ki[ptop] = mn.mc_ki[ptop]; + mdb_cursor_sibling(mc, 0); + } + } + } else { + mn.mc_top--; + rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); + mn.mc_top++; + } + if (rc != MDB_SUCCESS) { + goto done; + } + if (nflags & MDB_APPEND) { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[mc->mc_top] = 0; + rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags); + if (rc) + goto done; + for (i=0; imc_top; i++) + mc->mc_ki[i] = mn.mc_ki[i]; + } else if (!IS_LEAF2(mp)) { + /* Move nodes */ + mc->mc_pg[mc->mc_top] = rp; + i = split_indx; + j = 0; + do { + if (i == newindx) { + rkey.mv_data = newkey->mv_data; + rkey.mv_size = newkey->mv_size; + if (IS_LEAF(mp)) { + rdata = newdata; + } else + pgno = newpgno; + flags = nflags; + /* Update index for the new key. */ + mc->mc_ki[mc->mc_top] = j; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); + rkey.mv_data = NODEKEY(node); + rkey.mv_size = node->mn_ksize; + if (IS_LEAF(mp)) { + xdata.mv_data = NODEDATA(node); + xdata.mv_size = NODEDSZ(node); + rdata = &xdata; + } else + pgno = NODEPGNO(node); + flags = node->mn_flags; + } + + if (!IS_LEAF(mp) && j == 0) { + /* First branch index doesn't need key data. 
*/ + rkey.mv_size = 0; + } + + rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); + if (rc) + goto done; + if (i == nkeys) { + i = 0; + j = 0; + mc->mc_pg[mc->mc_top] = copy; + } else { + i++; + j++; + } + } while (i != split_indx); + + nkeys = NUMKEYS(copy); + for (i=0; i<nkeys; i++) + mp->mp_ptrs[i] = copy->mp_ptrs[i]; + mp->mp_lower = copy->mp_lower; + mp->mp_upper = copy->mp_upper; + memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), + env->me_psize - copy->mp_upper - PAGEBASE); + + /* reset back to original page */ + if (newindx < split_indx) { + mc->mc_pg[mc->mc_top] = mp; + } else { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[ptop]++; + /* Make sure mc_ki is still valid. + */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; i<=ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + } + } + if (nflags & MDB_RESERVE) { + node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!(node->mn_flags & F_BIGDATA)) + newdata->mv_data = NODEDATA(node); + } + } else { + if (newindx >= split_indx) { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[ptop]++; + /* Make sure mc_ki is still valid. 
+ */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; i<=ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + } + } + } + + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + nkeys = NUMKEYS(mp); + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc) + continue; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (new_root) { + int k; + /* sub cursors may be on different DB */ + if (m3->mc_pg[0] != mp) + continue; + /* root split */ + for (k=new_root; k>=0; k--) { + m3->mc_ki[k+1] = m3->mc_ki[k]; + m3->mc_pg[k+1] = m3->mc_pg[k]; + } + if (m3->mc_ki[0] >= nkeys) { + m3->mc_ki[0] = 1; + } else { + m3->mc_ki[0] = 0; + } + m3->mc_pg[0] = mc->mc_pg[0]; + m3->mc_snum++; + m3->mc_top++; + } + if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) + m3->mc_ki[mc->mc_top]++; + if (m3->mc_ki[mc->mc_top] >= nkeys) { + m3->mc_pg[mc->mc_top] = rp; + m3->mc_ki[mc->mc_top] -= nkeys; + for (i=0; i<mc->mc_top; i++) { + m3->mc_ki[i] = mn.mc_ki[i]; + m3->mc_pg[i] = mn.mc_pg[i]; + } + } + } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && + m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { + m3->mc_ki[ptop]++; + } + if (IS_LEAF(mp)) + XCURSOR_REFRESH(m3, mc->mc_top, m3->mc_pg[mc->mc_top]); + } + } + DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); + +done: + if (copy) /* tmp page */ + mdb_page_free(env, copy); + if (rc) + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +int +mdb_put(MDB_txn *txn, MDB_dbi dbi, + MDB_val *key, MDB_val *data, unsigned int flags) +{ + MDB_cursor mc; + MDB_xcursor mx; + int rc; + + if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + if (flags & 
~(MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) + return EINVAL; + + if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) + return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + mdb_cursor_init(&mc, txn, dbi, &mx); + mc.mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = &mc; + rc = mdb_cursor_put(&mc, key, data, flags); + txn->mt_cursors[dbi] = mc.mc_next; + return rc; +} + +#ifndef MDB_WBUF +#define MDB_WBUF (1024*1024) +#endif +#define MDB_EOF 0x10 /**< #mdb_env_copyfd1() is done reading */ + + /** State needed for a double-buffering compacting copy. */ +typedef struct mdb_copy { + MDB_env *mc_env; + MDB_txn *mc_txn; + pthread_mutex_t mc_mutex; + pthread_cond_t mc_cond; /**< Condition variable for #mc_new */ + char *mc_wbuf[2]; + char *mc_over[2]; + int mc_wlen[2]; + int mc_olen[2]; + pgno_t mc_next_pgno; + HANDLE mc_fd; + int mc_toggle; /**< Buffer number in provider */ + int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */ + /** Error code. Never cleared if set. Both threads can set nonzero + * to fail the copy. Not mutex-protected, LMDB expects atomic int. + */ + volatile int mc_error; +} mdb_copy; + + /** Dedicated writer thread for compacting copy. 
*/ +static THREAD_RET ESECT CALL_CONV +mdb_env_copythr(void *arg) +{ + mdb_copy *my = arg; + char *ptr; + int toggle = 0, wsize, rc; +#ifdef _WIN32 + DWORD len; +#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) +#else + int len; +#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) +#ifdef SIGPIPE + sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGPIPE); + if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0) + my->mc_error = rc; +#endif +#endif + + pthread_mutex_lock(&my->mc_mutex); + for(;;) { + while (!my->mc_new) + pthread_cond_wait(&my->mc_cond, &my->mc_mutex); + if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */ + break; + wsize = my->mc_wlen[toggle]; + ptr = my->mc_wbuf[toggle]; +again: + rc = MDB_SUCCESS; + while (wsize > 0 && !my->mc_error) { + DO_WRITE(rc, my->mc_fd, ptr, wsize, len); + if (!rc) { + rc = ErrCode(); +#if defined(SIGPIPE) && !defined(_WIN32) + if (rc == EPIPE) { + /* Collect the pending SIGPIPE, otherwise at least OS X + * gives it to the process on thread-exit (ITS#8504). + */ + int tmp; + sigwait(&set, &tmp); + } +#endif + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + wsize -= len; + continue; + } else { + rc = EIO; + break; + } + } + if (rc) { + my->mc_error = rc; + } + /* If there's an overflow page tail, write it too */ + if (my->mc_olen[toggle]) { + wsize = my->mc_olen[toggle]; + ptr = my->mc_over[toggle]; + my->mc_olen[toggle] = 0; + goto again; + } + my->mc_wlen[toggle] = 0; + toggle ^= 1; + /* Return the empty buffer to provider */ + my->mc_new--; + pthread_cond_signal(&my->mc_cond); + } + pthread_mutex_unlock(&my->mc_mutex); + return (THREAD_RET)0; +#undef DO_WRITE +} + + /** Give buffer and/or #MDB_EOF to writer thread, await unused buffer. + * + * @param[in] my control structure. + * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). 
+ */ +static int ESECT +mdb_env_cthr_toggle(mdb_copy *my, int adjust) +{ + pthread_mutex_lock(&my->mc_mutex); + my->mc_new += adjust; + pthread_cond_signal(&my->mc_cond); + while (my->mc_new & 2) /* both buffers in use */ + pthread_cond_wait(&my->mc_cond, &my->mc_mutex); + pthread_mutex_unlock(&my->mc_mutex); + + my->mc_toggle ^= (adjust & 1); + /* Both threads reset mc_wlen, to be safe from threading errors */ + my->mc_wlen[my->mc_toggle] = 0; + return my->mc_error; +} + + /** Depth-first tree traversal for compacting copy. + * @param[in] my control structure. + * @param[in,out] pg database root. + * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB. + */ +static int ESECT +mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) +{ + MDB_cursor mc = {0}; + MDB_node *ni; + MDB_page *mo, *mp, *leaf; + char *buf, *ptr; + int rc, toggle; + unsigned int i; + + /* Empty DB, nothing to do */ + if (*pg == P_INVALID) + return MDB_SUCCESS; + + mc.mc_snum = 1; + mc.mc_txn = my->mc_txn; + + rc = mdb_page_get(&mc, *pg, &mc.mc_pg[0], NULL); + if (rc) + return rc; + rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST); + if (rc) + return rc; + + /* Make cursor pages writable */ + buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum); + if (buf == NULL) + return ENOMEM; + + for (i=0; i<mc.mc_top; i++) { + mdb_page_copy((MDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize); + mc.mc_pg[i] = (MDB_page *)ptr; + ptr += my->mc_env->me_psize; + } + + /* This is writable space for a leaf page. Usually not needed. 
*/ + leaf = (MDB_page *)ptr; + + toggle = my->mc_toggle; + while (mc.mc_snum > 0) { + unsigned n; + mp = mc.mc_pg[mc.mc_top]; + n = NUMKEYS(mp); + + if (IS_LEAF(mp)) { + if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) { + for (i=0; imn_flags & F_BIGDATA) { + MDB_page *omp; + pgno_t pg; + + /* Need writable leaf */ + if (mp != leaf) { + mc.mc_pg[mc.mc_top] = leaf; + mdb_page_copy(leaf, mp, my->mc_env->me_psize); + mp = leaf; + ni = NODEPTR(mp, i); + } + + memcpy(&pg, NODEDATA(ni), sizeof(pg)); + memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); + rc = mdb_page_get(&mc, pg, &omp, NULL); + if (rc) + goto done; + if (my->mc_wlen[toggle] >= MDB_WBUF) { + rc = mdb_env_cthr_toggle(my, 1); + if (rc) + goto done; + toggle = my->mc_toggle; + } + mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); + memcpy(mo, omp, my->mc_env->me_psize); + mo->mp_pgno = my->mc_next_pgno; + my->mc_next_pgno += omp->mp_pages; + my->mc_wlen[toggle] += my->mc_env->me_psize; + if (omp->mp_pages > 1) { + my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1); + my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize; + rc = mdb_env_cthr_toggle(my, 1); + if (rc) + goto done; + toggle = my->mc_toggle; + } + } else if (ni->mn_flags & F_SUBDATA) { + MDB_db db; + + /* Need writable leaf */ + if (mp != leaf) { + mc.mc_pg[mc.mc_top] = leaf; + mdb_page_copy(leaf, mp, my->mc_env->me_psize); + mp = leaf; + ni = NODEPTR(mp, i); + } + + memcpy(&db, NODEDATA(ni), sizeof(db)); + my->mc_toggle = toggle; + rc = mdb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA); + if (rc) + goto done; + toggle = my->mc_toggle; + memcpy(NODEDATA(ni), &db, sizeof(db)); + } + } + } + } else { + mc.mc_ki[mc.mc_top]++; + if (mc.mc_ki[mc.mc_top] < n) { + pgno_t pg; +again: + ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]); + pg = NODEPGNO(ni); + rc = mdb_page_get(&mc, pg, &mp, NULL); + if (rc) + goto done; + mc.mc_top++; + mc.mc_snum++; + mc.mc_ki[mc.mc_top] = 0; + if (IS_BRANCH(mp)) { + /* Whenever we advance to a 
sibling branch page, + * we must proceed all the way down to its first leaf. + */ + mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize); + goto again; + } else + mc.mc_pg[mc.mc_top] = mp; + continue; + } + } + if (my->mc_wlen[toggle] >= MDB_WBUF) { + rc = mdb_env_cthr_toggle(my, 1); + if (rc) + goto done; + toggle = my->mc_toggle; + } + mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); + mdb_page_copy(mo, mp, my->mc_env->me_psize); + mo->mp_pgno = my->mc_next_pgno++; + my->mc_wlen[toggle] += my->mc_env->me_psize; + if (mc.mc_top) { + /* Update parent if there is one */ + ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]); + SETPGNO(ni, mo->mp_pgno); + mdb_cursor_pop(&mc); + } else { + /* Otherwise we're done */ + *pg = mo->mp_pgno; + break; + } + } +done: + free(buf); + return rc; +} + + /** Copy environment with compaction. */ +static int ESECT +mdb_env_copyfd1(MDB_env *env, HANDLE fd) +{ + MDB_meta *mm; + MDB_page *mp; + mdb_copy my = {0}; + MDB_txn *txn = NULL; + pthread_t thr; + pgno_t root, new_root; + int rc = MDB_SUCCESS; + +#ifdef _WIN32 + if (!(my.mc_mutex = CreateMutex(NULL, FALSE, NULL)) || + !(my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL))) { + rc = ErrCode(); + goto done; + } + my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize); + if (my.mc_wbuf[0] == NULL) { + /* _aligned_malloc() sets errno, but we use Windows error codes */ + rc = ERROR_NOT_ENOUGH_MEMORY; + goto done; + } +#else + if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0) + return rc; + if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0) + goto done2; +#ifdef HAVE_MEMALIGN + my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2); + if (my.mc_wbuf[0] == NULL) { + rc = errno; + goto done; + } +#else + { + void *p; + if ((rc = posix_memalign(&p, env->me_os_psize, MDB_WBUF*2)) != 0) + goto done; + my.mc_wbuf[0] = p; + } +#endif +#endif + memset(my.mc_wbuf[0], 0, MDB_WBUF*2); + my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; + my.mc_next_pgno = 
NUM_METAS; + my.mc_env = env; + my.mc_fd = fd; + rc = THREAD_CREATE(thr, mdb_env_copythr, &my); + if (rc) + goto done; + + rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) + goto finish; + + mp = (MDB_page *)my.mc_wbuf[0]; + memset(mp, 0, NUM_METAS * env->me_psize); + mp->mp_pgno = 0; + mp->mp_flags = P_META; + mm = (MDB_meta *)METADATA(mp); + mdb_env_init_meta0(env, mm); + mm->mm_address = env->me_metas[0]->mm_address; + + mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize); + mp->mp_pgno = 1; + mp->mp_flags = P_META; + *(MDB_meta *)METADATA(mp) = *mm; + mm = (MDB_meta *)METADATA(mp); + + /* Set metapage 1 with current main DB */ + root = new_root = txn->mt_dbs[MAIN_DBI].md_root; + if (root != P_INVALID) { + /* Count free pages + freeDB pages. Subtract from last_pg + * to find the new last_pg, which also becomes the new root. + */ + MDB_ID freecount = 0; + MDB_cursor mc; + MDB_val key, data; + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) + freecount += *(MDB_ID *)data.mv_data; + if (rc != MDB_NOTFOUND) + goto finish; + freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + + txn->mt_dbs[FREE_DBI].md_leaf_pages + + txn->mt_dbs[FREE_DBI].md_overflow_pages; + + new_root = txn->mt_next_pgno - 1 - freecount; + mm->mm_last_pg = new_root; + mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + mm->mm_dbs[MAIN_DBI].md_root = new_root; + } else { + /* When the DB is empty, handle it specially to + * fix any breakage like page leaks from ITS#8174. 
+ */ + mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags; + } + if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) { + mm->mm_txnid = 1; /* use metapage 1 */ + } + + my.mc_wlen[0] = env->me_psize * NUM_METAS; + my.mc_txn = txn; + rc = mdb_env_cwalk(&my, &root, 0); + if (rc == MDB_SUCCESS && root != new_root) { + rc = MDB_INCOMPATIBLE; /* page leak or corrupt DB */ + } + +finish: + if (rc) + my.mc_error = rc; + mdb_env_cthr_toggle(&my, 1 | MDB_EOF); + rc = THREAD_FINISH(thr); + mdb_txn_abort(txn); + +done: +#ifdef _WIN32 + if (my.mc_wbuf[0]) _aligned_free(my.mc_wbuf[0]); + if (my.mc_cond) CloseHandle(my.mc_cond); + if (my.mc_mutex) CloseHandle(my.mc_mutex); +#else + free(my.mc_wbuf[0]); + pthread_cond_destroy(&my.mc_cond); +done2: + pthread_mutex_destroy(&my.mc_mutex); +#endif + return rc ? rc : my.mc_error; +} + + /** Copy environment as-is. */ +static int ESECT +mdb_env_copyfd0(MDB_env *env, HANDLE fd) +{ + MDB_txn *txn = NULL; + mdb_mutexref_t wmutex = NULL; + int rc; + size_t wsize, w3; + char *ptr; +#ifdef _WIN32 + DWORD len, w2; +#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) +#else + ssize_t len; + size_t w2; +#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) +#endif + + /* Do the lock/unlock of the reader mutex before starting the + * write txn. Otherwise other read txns could block writers. 
+ */ + rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) + return rc; + + if (env->me_txns) { + /* We must start the actual read txn after blocking writers */ + mdb_txn_end(txn, MDB_END_RESET_TMP); + + /* Temporarily block writers until we snapshot the meta pages */ + wmutex = env->me_wmutex; + if (LOCK_MUTEX(rc, env, wmutex)) + goto leave; + + rc = mdb_txn_renew0(txn); + if (rc) { + UNLOCK_MUTEX(wmutex); + goto leave; + } + } + + wsize = env->me_psize * NUM_METAS; + ptr = env->me_map; + w2 = wsize; + while (w2 > 0) { + DO_WRITE(rc, fd, ptr, w2, len); + if (!rc) { + rc = ErrCode(); + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + w2 -= len; + continue; + } else { + /* Non-blocking or async handles are not supported */ + rc = EIO; + break; + } + } + if (wmutex) + UNLOCK_MUTEX(wmutex); + + if (rc) + goto leave; + + w3 = txn->mt_next_pgno * env->me_psize; + { + size_t fsize = 0; + if ((rc = mdb_fsize(env->me_fd, &fsize))) + goto leave; + if (w3 > fsize) + w3 = fsize; + } + wsize = w3 - wsize; + while (wsize > 0) { + if (wsize > MAX_WRITE) + w2 = MAX_WRITE; + else + w2 = wsize; + DO_WRITE(rc, fd, ptr, w2, len); + if (!rc) { + rc = ErrCode(); + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + wsize -= len; + continue; + } else { + rc = EIO; + break; + } + } + +leave: + mdb_txn_abort(txn); + return rc; +} + +int ESECT +mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned int flags) +{ + if (flags & MDB_CP_COMPACT) + return mdb_env_copyfd1(env, fd); + else + return mdb_env_copyfd0(env, fd); +} + +int ESECT +mdb_env_copyfd(MDB_env *env, HANDLE fd) +{ + return mdb_env_copyfd2(env, fd, 0); +} + +int ESECT +mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags) +{ + int rc; + MDB_name fname; + HANDLE newfd = INVALID_HANDLE_VALUE; + + rc = mdb_fname_init(path, env->me_flags | MDB_NOLOCK, &fname); + if (rc == MDB_SUCCESS) { + rc = mdb_fopen(env, &fname, MDB_O_COPY, 0666, &newfd); + mdb_fname_destroy(fname); + } + if (rc == 
MDB_SUCCESS) { + rc = mdb_env_copyfd2(env, newfd, flags); + if (close(newfd) < 0 && rc == MDB_SUCCESS) + rc = ErrCode(); + } + return rc; +} + +int ESECT +mdb_env_copy(MDB_env *env, const char *path) +{ + return mdb_env_copy2(env, path, 0); +} + +int ESECT +mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff) +{ + if (flag & ~CHANGEABLE) + return EINVAL; + if (onoff) + env->me_flags |= flag; + else + env->me_flags &= ~flag; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_flags(MDB_env *env, unsigned int *arg) +{ + if (!env || !arg) + return EINVAL; + + *arg = env->me_flags & (CHANGEABLE|CHANGELESS); + return MDB_SUCCESS; +} + +int ESECT +mdb_env_set_userctx(MDB_env *env, void *ctx) +{ + if (!env) + return EINVAL; + env->me_userctx = ctx; + return MDB_SUCCESS; +} + +void * ESECT +mdb_env_get_userctx(MDB_env *env) +{ + return env ? env->me_userctx : NULL; +} + +int ESECT +mdb_env_set_assert(MDB_env *env, MDB_assert_func *func) +{ + if (!env) + return EINVAL; +#ifndef NDEBUG + env->me_assert_func = func; +#endif + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_path(MDB_env *env, const char **arg) +{ + if (!env || !arg) + return EINVAL; + + *arg = env->me_path; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg) +{ + if (!env || !arg) + return EINVAL; + + *arg = env->me_fd; + return MDB_SUCCESS; +} + +/** Common code for #mdb_stat() and #mdb_env_stat(). + * @param[in] env the environment to operate in. + * @param[in] db the #MDB_db record containing the stats to return. + * @param[out] arg the address of an #MDB_stat structure to receive the stats. + * @return 0, this function always succeeds. 
+ */ +static int ESECT +mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg) +{ + arg->ms_psize = env->me_psize; + arg->ms_depth = db->md_depth; + arg->ms_branch_pages = db->md_branch_pages; + arg->ms_leaf_pages = db->md_leaf_pages; + arg->ms_overflow_pages = db->md_overflow_pages; + arg->ms_entries = db->md_entries; + + return MDB_SUCCESS; +} + +int ESECT +mdb_env_stat(MDB_env *env, MDB_stat *arg) +{ + MDB_meta *meta; + + if (env == NULL || arg == NULL) + return EINVAL; + + meta = mdb_env_pick_meta(env); + + return mdb_stat0(env, &meta->mm_dbs[MAIN_DBI], arg); +} + +int ESECT +mdb_env_info(MDB_env *env, MDB_envinfo *arg) +{ + MDB_meta *meta; + + if (env == NULL || arg == NULL) + return EINVAL; + + meta = mdb_env_pick_meta(env); + arg->me_mapaddr = meta->mm_address; + arg->me_last_pgno = meta->mm_last_pg; + arg->me_last_txnid = meta->mm_txnid; + + arg->me_mapsize = env->me_mapsize; + arg->me_maxreaders = env->me_maxreaders; + arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : 0; + return MDB_SUCCESS; +} + +/** Set the default comparison functions for a database. + * Called immediately after a database is opened to set the defaults. + * The user can then override them with #mdb_set_compare() or + * #mdb_set_dupsort(). + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + */ +static void +mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi) +{ + uint16_t f = txn->mt_dbs[dbi].md_flags; + + txn->mt_dbxs[dbi].md_cmp = + (f & MDB_REVERSEKEY) ? mdb_cmp_memnr : + (f & MDB_INTEGERKEY) ? mdb_cmp_cint : mdb_cmp_memn; + + txn->mt_dbxs[dbi].md_dcmp = + !(f & MDB_DUPSORT) ? 0 : + ((f & MDB_INTEGERDUP) + ? ((f & MDB_DUPFIXED) ? mdb_cmp_int : mdb_cmp_cint) + : ((f & MDB_REVERSEDUP) ? 
mdb_cmp_memnr : mdb_cmp_memn)); +} + +int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi) +{ + MDB_val key, data; + MDB_dbi i; + MDB_cursor mc; + MDB_db dummy; + int rc, dbflag, exact; + unsigned int unused = 0, seq; + char *namedup; + size_t len; + + if (flags & ~VALID_FLAGS) + return EINVAL; + if (txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + /* main DB? */ + if (!name) { + *dbi = MAIN_DBI; + if (flags & PERSISTENT_FLAGS) { + uint16_t f2 = flags & PERSISTENT_FLAGS; + /* make sure flag changes get committed */ + if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) { + txn->mt_dbs[MAIN_DBI].md_flags |= f2; + txn->mt_flags |= MDB_TXN_DIRTY; + } + } + mdb_default_cmp(txn, MAIN_DBI); + return MDB_SUCCESS; + } + + if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { + mdb_default_cmp(txn, MAIN_DBI); + } + + /* Is the DB already open? */ + len = strlen(name); + for (i=CORE_DBS; i<txn->mt_numdbs; i++) { + if (!txn->mt_dbxs[i].md_name.mv_size) { + /* Remember this free slot */ + if (!unused) unused = i; + continue; + } + if (len == txn->mt_dbxs[i].md_name.mv_size && + !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) { + *dbi = i; + return MDB_SUCCESS; + } + } + + /* If no free slot and max hit, fail */ + if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs) + return MDB_DBS_FULL; + + /* Cannot mix named databases with some mainDB flags */ + if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY)) + return (flags & MDB_CREATE) ? 
MDB_INCOMPATIBLE : MDB_NOTFOUND; + + /* Find the DB info */ + dbflag = DB_NEW|DB_VALID|DB_USRVALID; + exact = 0; + key.mv_size = len; + key.mv_data = (void *)name; + mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); + rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact); + if (rc == MDB_SUCCESS) { + /* make sure this is actually a DB */ + MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); + if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA) + return MDB_INCOMPATIBLE; + } else { + if (rc != MDB_NOTFOUND || !(flags & MDB_CREATE)) + return rc; + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + return EACCES; + } + + /* Done here so we cannot fail after creating a new DB */ + if ((namedup = strdup(name)) == NULL) + return ENOMEM; + + if (rc) { + /* MDB_NOTFOUND and MDB_CREATE: Create new DB */ + data.mv_size = sizeof(MDB_db); + data.mv_data = &dummy; + memset(&dummy, 0, sizeof(dummy)); + dummy.md_root = P_INVALID; + dummy.md_flags = flags & PERSISTENT_FLAGS; + WITH_CURSOR_TRACKING(mc, + rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA)); + dbflag |= DB_DIRTY; + } + + if (rc) { + free(namedup); + } else { + /* Got info, register DBI in this txn */ + unsigned int slot = unused ? 
unused : txn->mt_numdbs; + txn->mt_dbxs[slot].md_name.mv_data = namedup; + txn->mt_dbxs[slot].md_name.mv_size = len; + txn->mt_dbxs[slot].md_rel = NULL; + txn->mt_dbflags[slot] = dbflag; + /* txn-> and env-> are the same in read txns, use + * tmp variable to avoid undefined assignment + */ + seq = ++txn->mt_env->me_dbiseqs[slot]; + txn->mt_dbiseqs[slot] = seq; + + memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); + *dbi = slot; + mdb_default_cmp(txn, slot); + if (!unused) { + txn->mt_numdbs++; + } + } + + return rc; +} + +int ESECT +mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) +{ + if (!arg || !TXN_DBI_EXIST(txn, dbi, DB_VALID)) + return EINVAL; + + if (txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + if (txn->mt_dbflags[dbi] & DB_STALE) { + MDB_cursor mc; + MDB_xcursor mx; + /* Stale, must read the DB's root. cursor_init does it for us. */ + mdb_cursor_init(&mc, txn, dbi, &mx); + } + return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); +} + +void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) +{ + char *ptr; + if (dbi < CORE_DBS || dbi >= env->me_maxdbs) + return; + ptr = env->me_dbxs[dbi].md_name.mv_data; + /* If there was no name, this was already closed */ + if (ptr) { + env->me_dbxs[dbi].md_name.mv_data = NULL; + env->me_dbxs[dbi].md_name.mv_size = 0; + env->me_dbflags[dbi] = 0; + env->me_dbiseqs[dbi]++; + free(ptr); + } +} + +int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags) +{ + /* We could return the flags for the FREE_DBI too but what's the point? */ + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; + return MDB_SUCCESS; +} + +/** Add all the DB's pages to the free list. + * @param[in] mc Cursor on the DB to free. + * @param[in] subs non-Zero to check for sub-DBs in this DB. + * @return 0 on success, non-zero on failure. 
+ */ +static int +mdb_drop0(MDB_cursor *mc, int subs) +{ + int rc; + + rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); + if (rc == MDB_SUCCESS) { + MDB_txn *txn = mc->mc_txn; + MDB_node *ni; + MDB_cursor mx; + unsigned int i; + + /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. + * This also avoids any P_LEAF2 pages, which have no nodes. + * Also if the DB doesn't have sub-DBs and has no overflow + * pages, omit scanning leaves. + */ + if ((mc->mc_flags & C_SUB) || + (!subs && !mc->mc_db->md_overflow_pages)) + mdb_cursor_pop(mc); + + mdb_cursor_copy(mc, &mx); + while (mc->mc_snum > 0) { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + unsigned n = NUMKEYS(mp); + if (IS_LEAF(mp)) { + for (i=0; imn_flags & F_BIGDATA) { + MDB_page *omp; + pgno_t pg; + memcpy(&pg, NODEDATA(ni), sizeof(pg)); + rc = mdb_page_get(mc, pg, &omp, NULL); + if (rc != 0) + goto done; + mdb_cassert(mc, IS_OVERFLOW(omp)); + rc = mdb_midl_append_range(&txn->mt_free_pgs, + pg, omp->mp_pages); + if (rc) + goto done; + mc->mc_db->md_overflow_pages -= omp->mp_pages; + if (!mc->mc_db->md_overflow_pages && !subs) + break; + } else if (subs && (ni->mn_flags & F_SUBDATA)) { + mdb_xcursor_init1(mc, ni); + rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); + if (rc) + goto done; + } + } + if (!subs && !mc->mc_db->md_overflow_pages) + goto pop; + } else { + if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0) + goto done; + for (i=0; imt_free_pgs, pg); + } + } + if (!mc->mc_top) + break; + mc->mc_ki[mc->mc_top] = i; + rc = mdb_cursor_sibling(mc, 1); + if (rc) { + if (rc != MDB_NOTFOUND) + goto done; + /* no more siblings, go back to beginning + * of previous level. 
+ */ +pop: + mdb_cursor_pop(mc); + mc->mc_ki[0] = 0; + for (i=1; imc_snum; i++) { + mc->mc_ki[i] = 0; + mc->mc_pg[i] = mx.mc_pg[i]; + } + } + } + /* free it */ + rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); +done: + if (rc) + txn->mt_flags |= MDB_TXN_ERROR; + } else if (rc == MDB_NOTFOUND) { + rc = MDB_SUCCESS; + } + mc->mc_flags &= ~C_INITIALIZED; + return rc; +} + +int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) +{ + MDB_cursor *mc, *m2; + int rc; + + if ((unsigned)del > 1 || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + return EACCES; + + if (TXN_DBI_CHANGED(txn, dbi)) + return MDB_BAD_DBI; + + rc = mdb_cursor_open(txn, dbi, &mc); + if (rc) + return rc; + + rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); + /* Invalidate the dropped DB's cursors */ + for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) + m2->mc_flags &= ~(C_INITIALIZED|C_EOF); + if (rc) + goto leave; + + /* Can't delete the main DB */ + if (del && dbi >= CORE_DBS) { + rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); + if (!rc) { + txn->mt_dbflags[dbi] = DB_STALE; + mdb_dbi_close(txn->mt_env, dbi); + } else { + txn->mt_flags |= MDB_TXN_ERROR; + } + } else { + /* reset the DB record, mark it dirty */ + txn->mt_dbflags[dbi] |= DB_DIRTY; + txn->mt_dbs[dbi].md_depth = 0; + txn->mt_dbs[dbi].md_branch_pages = 0; + txn->mt_dbs[dbi].md_leaf_pages = 0; + txn->mt_dbs[dbi].md_overflow_pages = 0; + txn->mt_dbs[dbi].md_entries = 0; + txn->mt_dbs[dbi].md_root = P_INVALID; + + txn->mt_flags |= MDB_TXN_DIRTY; + } +leave: + mdb_cursor_close(mc); + return rc; +} + +int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) +{ + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + txn->mt_dbxs[dbi].md_cmp = cmp; + return MDB_SUCCESS; +} + +int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) +{ + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + 
txn->mt_dbxs[dbi].md_dcmp = cmp; + return MDB_SUCCESS; +} + +int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) +{ + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + txn->mt_dbxs[dbi].md_rel = rel; + return MDB_SUCCESS; +} + +int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) +{ + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + txn->mt_dbxs[dbi].md_relctx = ctx; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_maxkeysize(MDB_env *env) +{ + return ENV_MAXKEY(env); +} + +int ESECT +mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) +{ + unsigned int i, rdrs; + MDB_reader *mr; + char buf[64]; + int rc = 0, first = 1; + + if (!env || !func) + return -1; + if (!env->me_txns) { + return func("(no reader locks)\n", ctx); + } + rdrs = env->me_txns->mti_numreaders; + mr = env->me_txns->mti_readers; + for (i=0; i> 1; + cursor = base + pivot + 1; + val = pid - ids[cursor]; + + if( val < 0 ) { + n = pivot; + + } else if ( val > 0 ) { + base = cursor; + n -= pivot + 1; + + } else { + /* found, so it's a duplicate */ + return -1; + } + } + + if( val > 0 ) { + ++cursor; + } + ids[0]++; + for (n = ids[0]; n > cursor; n--) + ids[n] = ids[n-1]; + ids[n] = pid; + return 0; +} + +int ESECT +mdb_reader_check(MDB_env *env, int *dead) +{ + if (!env) + return EINVAL; + if (dead) + *dead = 0; + return env->me_txns ? mdb_reader_check0(env, 0, dead) : MDB_SUCCESS; +} + +/** As #mdb_reader_check(). \b rlocked is set if caller locked #me_rmutex. */ +static int ESECT +mdb_reader_check0(MDB_env *env, int rlocked, int *dead) +{ + mdb_mutexref_t rmutex = rlocked ? 
NULL : env->me_rmutex; + unsigned int i, j, rdrs; + MDB_reader *mr; + MDB_PID_T *pids, pid; + int rc = MDB_SUCCESS, count = 0; + + rdrs = env->me_txns->mti_numreaders; + pids = malloc((rdrs+1) * sizeof(MDB_PID_T)); + if (!pids) + return ENOMEM; + pids[0] = 0; + mr = env->me_txns->mti_readers; + for (i=0; ime_pid) { + if (mdb_pid_insert(pids, pid) == 0) { + if (!mdb_reader_pid(env, Pidcheck, pid)) { + /* Stale reader found */ + j = i; + if (rmutex) { + if ((rc = LOCK_MUTEX0(rmutex)) != 0) { + if ((rc = mdb_mutex_failed(env, rmutex, rc))) + break; + rdrs = 0; /* the above checked all readers */ + } else { + /* Recheck, a new process may have reused pid */ + if (mdb_reader_pid(env, Pidcheck, pid)) + j = rdrs; + } + } + for (; jme_rmutex); + if (!rlocked) { + /* Keep mti_txnid updated, otherwise next writer can + * overwrite data which latest meta page refers to. + */ + meta = mdb_env_pick_meta(env); + env->me_txns->mti_txnid = meta->mm_txnid; + /* env is hosed if the dead thread was ours */ + if (env->me_txn) { + env->me_flags |= MDB_FATAL_ERROR; + env->me_txn = NULL; + rc = MDB_PANIC; + } + } + DPRINTF(("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), + (rc ? 
"this process' env is hosed" : "recovering"))); + rc2 = mdb_reader_check0(env, rlocked, NULL); + if (rc2 == 0) + rc2 = mdb_mutex_consistent(mutex); + if (rc || (rc = rc2)) { + DPRINTF(("LOCK_MUTEX recovery failed, %s", mdb_strerror(rc))); + UNLOCK_MUTEX(mutex); + } + } else { +#ifdef _WIN32 + rc = ErrCode(); +#endif + DPRINTF(("LOCK_MUTEX failed, %s", mdb_strerror(rc))); + } + + return rc; +} +#endif /* MDB_ROBUST_SUPPORTED */ + +#if defined(_WIN32) +/** Convert \b src to new wchar_t[] string with room for \b xtra extra chars */ +static int ESECT +utf8_to_utf16(const char *src, MDB_name *dst, int xtra) +{ + int rc, need = 0; + wchar_t *result = NULL; + for (;;) { /* malloc result, then fill it in */ + need = MultiByteToWideChar(CP_UTF8, 0, src, -1, result, need); + if (!need) { + rc = ErrCode(); + free(result); + return rc; + } + if (!result) { + result = malloc(sizeof(wchar_t) * (need + xtra)); + if (!result) + return ENOMEM; + continue; + } + dst->mn_alloced = 1; + dst->mn_len = need - 1; + dst->mn_val = result; + return MDB_SUCCESS; + } +} +#endif /* defined(_WIN32) */ + +int mdb_madvise(MDB_env *env, int random) +{ +#ifdef _WIN32 + return 1; +#else + return madvise(env->me_map, env->me_mapsize, random>0 ? 
MADV_RANDOM : MADV_SEQUENTIAL); +#endif +} + +void mdb_env_unmap(MDB_env *env) +{ + if (env->me_txns) { + munmap((void *)env->me_txns, (env->me_maxreaders - 1) * sizeof(MDB_reader) + sizeof(MDB_txninfo)); + env->me_txns = NULL; + } +} + +size_t mdb_get_txnid(MDB_txn * txn) +{ + return txn->mt_txnid; +} + +void mdb_get_map(MDB_env *env, char** p_mmap, size_t* p_size) +{ + *p_mmap = env->me_map; + *p_size = env->me_mapsize; +} + +void mdb_env_get_txnids(MDB_env* env, size_t* txnid1, size_t* txnid2) +{ + *txnid1 = env->me_metas[0]->mm_txnid; + *txnid2 = env->me_metas[1]->mm_txnid; +} + +int mdb_cursor_first_leaf_page(MDB_cursor* cursor, size_t* pgno) +{ + if (pgno == NULL) + return MDB_INVALID; + + MDB_val tkey; + MDB_val tval; + + int rc = mdb_cursor_get(cursor, &tkey, &tval, MDB_FIRST); + + if (rc != MDB_SUCCESS) + return rc; + + *pgno = cursor->mc_pg[cursor->mc_top]->mp_pgno; + + return MDB_SUCCESS; +} + +int mdb_cursor_next_leaf_page(MDB_cursor* cursor, size_t* pgno) +{ + if (pgno == NULL) + return MDB_INVALID; + + int rc = mdb_cursor_sibling(cursor, 1); + + if (rc != MDB_SUCCESS) + return rc; + + *pgno = cursor->mc_pg[cursor->mc_top]->mp_pgno; + + return MDB_SUCCESS; +} + +int mdb_cursor_get_pageno(MDB_cursor* cursor, size_t* pgno) +{ + if (pgno == NULL) + return MDB_INVALID; + + *pgno = cursor->mc_pg[cursor->mc_top]->mp_pgno; + + return MDB_SUCCESS; +} + +int mdb_page_get_nkeys(MDB_cursor* mc, size_t pgno, unsigned int* nkeys) +{ + if (nkeys == NULL) + return MDB_INVALID; + + MDB_page* pg; + int rc = mdb_page_get(mc, pgno, &pg, NULL); + if (rc != MDB_SUCCESS) + return rc; + + if (!IS_LEAF(pg)) + return MDB_CORRUPTED; + + if (IS_LEAF2(pg)) + return MDB_CORRUPTED; + + *nkeys = NUMKEYS(pg); + + return MDB_SUCCESS; +} + +int mdb_page_get_val(MDB_cursor* mc, size_t pgno, unsigned int idx, + MDB_val* key, MDB_val* data) +{ + MDB_page* pg; + int rc = mdb_page_get(mc, pgno, &pg, NULL); + if (rc != MDB_SUCCESS) + return rc; + + if (!IS_LEAF(pg)) + return MDB_CORRUPTED; + 
+ if (IS_LEAF2(pg)) + return MDB_CORRUPTED; + + unsigned int nkeys = NUMKEYS(pg); + + if (idx >= nkeys) + return MDB_INVALID; + + MDB_node* node = NODEPTR(pg, idx); + + MDB_GET_KEY(node, key); + + rc = mdb_node_read(mc, node, data); + if (rc != MDB_SUCCESS) + return rc; + + return MDB_SUCCESS; +} + +int mdb_page_is_dirty(MDB_cursor* mc, size_t pgno, int* dirty) +{ + MDB_page* pg; + int rc = mdb_page_get(mc, pgno, &pg, NULL); + if (rc != MDB_SUCCESS) + return rc; + + *dirty = ((char*)pg < mc->mc_txn->mt_env->me_map || + (char*)pg > mc->mc_txn->mt_env->me_map + mc->mc_txn->mt_env->me_mapsize) ? 1 : 0; + + return MDB_SUCCESS; +} + +/** @} */ diff --git a/lmdb/midl.cpp b/lmdb/midl.cpp new file mode 100644 index 0000000..d434809 --- /dev/null +++ b/lmdb/midl.cpp @@ -0,0 +1,359 @@ +/** @file midl.c + * @brief ldap bdb back-end ID List functions */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software . + * + * Copyright 2000-2019 The OpenLDAP Foundation. + * Portions Copyright 2001-2018 Howard Chu, Symas Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include +#include +#include +#include +#include +#include "midl.h" + +/** @defgroup internal LMDB Internals + * @{ + */ +/** @defgroup idls ID List Management + * @{ + */ +#define CMP(x,y) ( (x) < (y) ? 
-1 : (x) > (y) ) + +unsigned mdb_midl_search( MDB_IDL ids, MDB_ID id ) +{ + /* + * binary search of id in ids + * if found, returns position of id + * if not found, returns first position greater than id + */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = ids[0]; + + while( 0 < n ) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = CMP( ids[cursor], id ); + + if( val < 0 ) { + n = pivot; + + } else if ( val > 0 ) { + base = cursor; + n -= pivot + 1; + + } else { + return cursor; + } + } + + if( val > 0 ) { + ++cursor; + } + return cursor; +} + +#if 0 /* superseded by append/sort */ +int mdb_midl_insert( MDB_IDL ids, MDB_ID id ) +{ + unsigned x, i; + + x = mdb_midl_search( ids, id ); + assert( x > 0 ); + + if( x < 1 ) { + /* internal error */ + return -2; + } + + if ( x <= ids[0] && ids[x] == id ) { + /* duplicate */ + assert(0); + return -1; + } + + if ( ++ids[0] >= MDB_IDL_DB_MAX ) { + /* no room */ + --ids[0]; + return -2; + + } else { + /* insert id */ + for (i=ids[0]; i>x; i--) + ids[i] = ids[i-1]; + ids[x] = id; + } + + return 0; +} +#endif + +MDB_IDL mdb_midl_alloc(int num) +{ + MDB_IDL ids = reinterpret_cast(malloc((num+2) * sizeof(MDB_ID))); + if (ids) { + *ids++ = num; + *ids = 0; + } + return ids; +} + +void mdb_midl_free(MDB_IDL ids) +{ + if (ids) + free(ids-1); +} + +void mdb_midl_shrink( MDB_IDL *idp ) +{ + MDB_IDL ids = *idp; + if (*(--ids) > MDB_IDL_UM_MAX && + (ids = reinterpret_cast(realloc(ids, (MDB_IDL_UM_MAX+2) * sizeof(MDB_ID))))) + { + *ids++ = MDB_IDL_UM_MAX; + *idp = ids; + } +} + +static int mdb_midl_grow( MDB_IDL *idp, int num ) +{ + MDB_IDL idn = *idp-1; + /* grow it */ + idn = reinterpret_cast(realloc(idn, (*idn + num + 2) * sizeof(MDB_ID))); + if (!idn) + return ENOMEM; + *idn++ += num; + *idp = idn; + return 0; +} + +int mdb_midl_need( MDB_IDL *idp, unsigned num ) +{ + MDB_IDL ids = *idp; + num += ids[0]; + if (num > ids[-1]) { + num = (num + num/4 + (256 + 2)) & -256; + if (!(ids = 
reinterpret_cast(realloc(ids-1, num * sizeof(MDB_ID))))) + return ENOMEM; + *ids++ = num - 2; + *idp = ids; + } + return 0; +} + +int mdb_midl_append( MDB_IDL *idp, MDB_ID id ) +{ + MDB_IDL ids = *idp; + /* Too big? */ + if (ids[0] >= ids[-1]) { + if (mdb_midl_grow(idp, MDB_IDL_UM_MAX)) + return ENOMEM; + ids = *idp; + } + ids[0]++; + ids[ids[0]] = id; + return 0; +} + +int mdb_midl_append_list( MDB_IDL *idp, MDB_IDL app ) +{ + MDB_IDL ids = *idp; + /* Too big? */ + if (ids[0] + app[0] >= ids[-1]) { + if (mdb_midl_grow(idp, app[0])) + return ENOMEM; + ids = *idp; + } + memcpy(&ids[ids[0]+1], &app[1], app[0] * sizeof(MDB_ID)); + ids[0] += app[0]; + return 0; +} + +int mdb_midl_append_range( MDB_IDL *idp, MDB_ID id, unsigned n ) +{ + MDB_ID *ids = *idp, len = ids[0]; + /* Too big? */ + if (len + n > ids[-1]) { + if (mdb_midl_grow(idp, n | MDB_IDL_UM_MAX)) + return ENOMEM; + ids = *idp; + } + ids[0] = len + n; + ids += len; + while (n) + ids[n--] = id++; + return 0; +} + +void mdb_midl_xmerge( MDB_IDL idl, MDB_IDL merge ) +{ + MDB_ID old_id, merge_id, i = merge[0], j = idl[0], k = i+j, total = k; + idl[0] = (MDB_ID)-1; /* delimiter for idl scan below */ + old_id = idl[j]; + while (i) { + merge_id = merge[i--]; + for (; old_id < merge_id; old_id = idl[--j]) + idl[k--] = old_id; + idl[k--] = merge_id; + } + idl[0] = total; +} + +/* Quicksort + Insertion sort for small arrays */ + +#define SMALL 8 +#define MIDL_SWAP(a,b) { itmp=(a); (a)=(b); (b)=itmp; } + +void +mdb_midl_sort( MDB_IDL ids ) +{ + /* Max possible depth of int-indexed tree * 2 items/level */ + int istack[sizeof(int)*CHAR_BIT * 2]; + int i,j,k,l,ir,jstack; + MDB_ID a, itmp; + + ir = (int)ids[0]; + l = 1; + jstack = 0; + for(;;) { + if (ir - l < SMALL) { /* Insertion sort */ + for (j=l+1;j<=ir;j++) { + a = ids[j]; + for (i=j-1;i>=1;i--) { + if (ids[i] >= a) break; + ids[i+1] = ids[i]; + } + ids[i+1] = a; + } + if (jstack == 0) break; + ir = istack[jstack--]; + l = istack[jstack--]; + } else { + k = (l + ir) 
>> 1; /* Choose median of left, center, right */ + MIDL_SWAP(ids[k], ids[l+1]); + if (ids[l] < ids[ir]) { + MIDL_SWAP(ids[l], ids[ir]); + } + if (ids[l+1] < ids[ir]) { + MIDL_SWAP(ids[l+1], ids[ir]); + } + if (ids[l] < ids[l+1]) { + MIDL_SWAP(ids[l], ids[l+1]); + } + i = l+1; + j = ir; + a = ids[l+1]; + for(;;) { + do i++; while(ids[i] > a); + do j--; while(ids[j] < a); + if (j < i) break; + MIDL_SWAP(ids[i],ids[j]); + } + ids[l+1] = ids[j]; + ids[j] = a; + jstack += 2; + if (ir-i+1 >= j-l) { + istack[jstack] = ir; + istack[jstack-1] = i; + ir = j-1; + } else { + istack[jstack] = j-1; + istack[jstack-1] = l; + l = i; + } + } + } +} + +unsigned mdb_mid2l_search( MDB_ID2L ids, MDB_ID id ) +{ + /* + * binary search of id in ids + * if found, returns position of id + * if not found, returns first position greater than id + */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = (unsigned)ids[0].mid; + + while( 0 < n ) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = CMP( id, ids[cursor].mid ); + + if( val < 0 ) { + n = pivot; + + } else if ( val > 0 ) { + base = cursor; + n -= pivot + 1; + + } else { + return cursor; + } + } + + if( val > 0 ) { + ++cursor; + } + return cursor; +} + +int mdb_mid2l_insert( MDB_ID2L ids, MDB_ID2 *id ) +{ + unsigned x, i; + + x = mdb_mid2l_search( ids, id->mid ); + + if( x < 1 ) { + /* internal error */ + return -2; + } + + if ( x <= ids[0].mid && ids[x].mid == id->mid ) { + /* duplicate */ + return -1; + } + + if ( ids[0].mid >= MDB_IDL_UM_MAX ) { + /* too big */ + return -2; + + } else { + /* insert id */ + ids[0].mid++; + for (i=(unsigned)ids[0].mid; i>x; i--) + ids[i] = ids[i-1]; + ids[x] = *id; + } + + return 0; +} + +int mdb_mid2l_append( MDB_ID2L ids, MDB_ID2 *id ) +{ + /* Too big? 
*/ + if (ids[0].mid >= MDB_IDL_UM_MAX) { + return -2; + } + ids[0].mid++; + ids[ids[0].mid] = *id; + return 0; +} + +/** @} */ +/** @} */ diff --git a/lmdb/midl.h b/lmdb/midl.h new file mode 100644 index 0000000..5fde4c8 --- /dev/null +++ b/lmdb/midl.h @@ -0,0 +1,187 @@ +/** @file midl.h + * @brief LMDB ID List header file. + * + * This file was originally part of back-bdb but has been + * modified for use in libmdb. Most of the macros defined + * in this file are unused, just left over from the original. + * + * This file is only used internally in libmdb and its definitions + * are not exposed publicly. + */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software . + * + * Copyright 2000-2019 The OpenLDAP Foundation. + * Portions Copyright 2001-2018 Howard Chu, Symas Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#ifndef _MDB_MIDL_H_ +#define _MDB_MIDL_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @defgroup internal LMDB Internals + * @{ + */ + +/** @defgroup idls ID List Management + * @{ + */ + /** A generic unsigned ID number. These were entryIDs in back-bdb. + * Preferably it should have the same size as a pointer. + */ +typedef size_t MDB_ID; + + /** An IDL is an ID List, a sorted array of IDs. The first + * element of the array is a counter for how many actual + * IDs are in the list. In the original back-bdb code, IDLs are + * sorted in ascending order. For libmdb IDLs are sorted in + * descending order. 
+ */ +typedef MDB_ID *MDB_IDL; + +/* IDL sizes - likely should be even bigger + * limiting factors: sizeof(ID), thread stack size + */ +#define MDB_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ +#define MDB_IDL_DB_SIZE (1< +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "s3handler.h" +#include "SingleFileStorage.h" +#include + +using namespace std::chrono_literals; + +DEFINE_int32(http_port, 11000, "Port to listen on with HTTP protocol"); +DEFINE_int32(h2_port, -1, "Port to listen on with HTTP/2 protocol (-1 if disabled)"); +DEFINE_string(ip, "localhost", "IP/Hostname to bind to"); +DEFINE_string(root_key, "", "Secret access key for 'root'"); +DEFINE_int64(data_file_size_limit_mb, 0, "Max data file size (0 for unlimited)"); +DEFINE_int64(data_file_alloc_chunk_size_mb, 512, "Data file chunk allocation size"); +DEFINE_int32(threads, + 0, + "Number of threads to listen on. Numbers <= 0 " + "will use the number of cores on this machine."); +DEFINE_bool(manual_commit, false, "Manually commit by putting to a711e93e-93b4-4a9e-8a0b-688797470002"); +DEFINE_string(index_path, ".", "Path where to put the index file"); +DEFINE_string(data_path, ".", "Path where to put the data file"); +DEFINE_bool(stop_on_error, false, "Stop on write/read errors"); +DEFINE_bool(punch_holes, true, "Free up space if not enough free space is left by punching holes"); + +namespace { +class S3HandlerFactory : public proxygen::RequestHandlerFactory { + SingleFileStorage sfs; + std::string root_key; + public: + S3HandlerFactory(SingleFileStorage::SFSOptions sfsoptions) + : sfs(std::move(sfsoptions)), + root_key(FLAGS_root_key) + { + sfs.start_thread(sfs.get_transid()); + } + + void onServerStart(folly::EventBase* /*evb*/) noexcept override { + } + + void onServerStop() noexcept override { + } + + proxygen::RequestHandler* onRequest(proxygen::RequestHandler*, proxygen::HTTPMessage*) 
noexcept override { + return new S3Handler(sfs, root_key); + } +}; +} + +int main(int argc, char* argv[]) +{ + folly::init(&argc, &argv, true); + SingleFileStorage::init_mutex(); + + std::vector IPs = { + {folly::SocketAddress(FLAGS_ip, FLAGS_http_port, true), proxygen::HTTPServer::Protocol::HTTP}, + }; + + if(FLAGS_h2_port!=-1) + { + IPs.push_back({folly::SocketAddress(FLAGS_ip, FLAGS_h2_port, true), proxygen::HTTPServer::Protocol::HTTP2}); + } + + SingleFileStorage::SFSOptions sfsoptions; + sfsoptions.data_path = FLAGS_data_path; + sfsoptions.db_path = FLAGS_index_path; + sfsoptions.data_file_size_limit_mb = FLAGS_data_file_size_limit_mb; + sfsoptions.alloc_chunk_size = FLAGS_data_file_alloc_chunk_size_mb*1024*1024; + std::vector runtime_id(32); + folly::Random::secureRandom(runtime_id.data(), runtime_id.size()); + sfsoptions.runtime_id = folly::hexlify(folly::ByteRange(runtime_id.data(), runtime_id.size())); + sfsoptions.manual_commit = FLAGS_manual_commit; + sfsoptions.stop_on_error = FLAGS_stop_on_error; + sfsoptions.punch_holes = FLAGS_punch_holes; + + proxygen::HTTPServerOptions options; + options.threads = static_cast(FLAGS_threads); + options.idleTimeout = 60s; + options.shutdownOn = {SIGINT, SIGTERM}; + options.enableContentCompression = false; + options.handlerFactories = + proxygen::RequestHandlerChain().addThen(sfsoptions).build(); + options.h2cEnabled = true; + + proxygen::HTTPServer server(std::move(options)); + server.bind(IPs); + + std::thread t([&]() { server.start(); }); + t.join(); + return 0; +} + diff --git a/os_functions.cpp b/os_functions.cpp new file mode 100644 index 0000000..287c448 --- /dev/null +++ b/os_functions.cpp @@ -0,0 +1,213 @@ +/** + * Copyright Martin Raiber. All Rights Reserved. 
+ * SPDX-License-Identifier: AGPL-3.0-or-later + */ +#include "os_functions.h" +#include +#include +#include +#include + +int64_t os_total_space(const std::string &path) +{ + std::string cp=path; + if(path.size()==0) + return -1; + if(cp[cp.size()-1]=='/') + cp.erase(cp.size()-1, 1); + if(cp[cp.size()-1]!='/') + cp+='/'; + + struct statvfs64 buf; + int rc=statvfs64((path).c_str(), &buf); + if(rc==0) + { + fsblkcnt64_t used=buf.f_blocks-buf.f_bfree; +#if defined(__FreeBSD__) || defined(__APPLE__) + int64 total = (int64)(used+buf.f_bavail)*buf.f_frsize; +#else + fsblkcnt64_t total = (used+buf.f_bavail)*buf.f_bsize; +#endif + if(total>LLONG_MAX) + { + return LLONG_MAX; + } + return total; + } + else + return -1; +} + +int64_t os_free_space(const std::string &path) +{ + std::string cp=path; + if(path.size()==0) + return -1; + if(cp[cp.size()-1]=='/') + cp.erase(cp.size()-1, 1); + if(cp[cp.size()-1]!='/') + cp+='/'; + + struct statvfs64 buf = {}; + int rc=statvfs64((path).c_str(), &buf); + if(rc==0) + { +#if defined(__FreeBSD__) || defined(__APPLE__) + int64 free = (int64)buf.f_frsize*buf.f_bavail; +#else + fsblkcnt64_t blocksize = buf.f_frsize ? 
buf.f_frsize : buf.f_bsize; + fsblkcnt64_t free = blocksize*buf.f_bavail; +#endif + if(free>LLONG_MAX) + { + return LLONG_MAX; + } + return free; + } + else + { + return -1; + } +} + +unsigned int os_get_file_type(const std::string &path) +{ + int ret = 0; + struct stat64 f_info; + int rc1=stat64((path).c_str(), &f_info); + if(rc1==0) + { + if ( S_ISDIR(f_info.st_mode) ) + { + ret |= EFileType_Directory; + } + else + { + ret |= EFileType_File; + } + } + + int rc2 = lstat64((path).c_str(), &f_info); + if(rc2==0) + { + if(S_ISLNK(f_info.st_mode)) + { + ret |= EFileType_Symlink; + } + + if(!S_ISDIR(f_info.st_mode) + && !S_ISREG(f_info.st_mode) ) + { + ret |= EFileType_Special; + } + + if(rc1!=0) + { + ret |= EFileType_File; + } + } + + return ret; +} + +int os_popen(const std::string& cmd, std::string& ret) +{ + ret.clear(); + +#ifdef __ANDROID__ + POFILE* pin = NULL; +#endif + + FILE* in = NULL; + +#ifndef _WIN32 +#define _popen popen +#define _pclose pclose +#endif + +#ifdef __ANDROID__ + pin = and_popen(cmd.c_str(), "r"); + if(pin!=NULL) in=pin->fp; +#elif __linux__ + in = _popen(cmd.c_str(), "re"); + if(!in) in = _popen(cmd.c_str(), "r"); +#else + in = _popen(cmd.c_str(), "r"); +#endif + + if(in==NULL) + { + return -1; + } + + char buf[4096]; + size_t read; + do + { + read=fread(buf, 1, sizeof(buf), in); + if(read>0) + { + ret.append(buf, buf+read); + } + } + while(read==sizeof(buf)); + +#ifdef __ANDROID__ + return and_pclose(pin); +#else + return _pclose(in); +#endif +} + +#define BTRFS_IOCTL_MAGIC 0x94 +#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) + +bool os_sync(const std::string & path) +{ +#if defined(__linux__) + int fd = open(path.c_str(), O_RDONLY|O_CLOEXEC); + + if(fd!=-1) + { + if(ioctl(fd, BTRFS_IOC_SYNC, NULL)==-1) + { + if(errno!=ENOTTY && errno!=ENOSYS + && errno!=EINVAL) + { + close(fd); + return false; + } + } + else + { + close(fd); + return true; + } + + if(syncfs(fd)!=0) + { + if(errno==ENOSYS) + { + close(fd); + sync(); + return true; + } + 
close(fd); + return false; + } + else + { + close(fd); + return true; + } + } + else + { + sync(); + return true; + } +#else + sync(); + return true; +#endif +} \ No newline at end of file diff --git a/os_functions.h b/os_functions.h new file mode 100644 index 0000000..7293348 --- /dev/null +++ b/os_functions.h @@ -0,0 +1,53 @@ +#include +#include +#include +#include +#include +#include +#include + +static std::string os_file_sep() { + return "/"; +} + +static pid_t getThreadID() { + return syscall(__NR_gettid); +} + +static int64_t fileSize(const folly::File& file) +{ + struct stat64 st; + folly::checkUnixError(fstat64(file.fd(), &st), "fstat() failed"); + return st.st_size; +} + +static int64_t fileSize(int fd) +{ + struct stat64 st; + folly::checkUnixError(fstat64(fd, &st), "fstat() failed"); + return st.st_size; +} + +int64_t os_total_space(const std::string& path); + +int64_t os_free_space(const std::string& path); + +enum EFileType +{ + EFileType_File = 1, + EFileType_Directory = 2, + EFileType_Symlink = 4, + EFileType_Special = 8 +}; + +unsigned int os_get_file_type(const std::string& path); + +int os_popen(const std::string& cmd, std::string& ret); + +static bool punchHole(int fd, __off64_t spos, __off64_t size) +{ + int rc = fallocate64(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, spos, size); + return rc == 0; +} + +bool os_sync(const std::string & path); \ No newline at end of file diff --git a/relaxed_atomic.h b/relaxed_atomic.h new file mode 100644 index 0000000..bc21c10 --- /dev/null +++ b/relaxed_atomic.h @@ -0,0 +1,72 @@ +#pragma once +#include + +template +struct relaxed_atomic : std::atomic +{ + relaxed_atomic() + : std::atomic() {} + + relaxed_atomic(const relaxed_atomic&) = delete; + relaxed_atomic& operator=(const relaxed_atomic&) = delete; + + relaxed_atomic(const T val) + : std::atomic(val) + { + + } + + T operator=(const T val) volatile noexcept { + this->store(val, std::memory_order_relaxed); + return val; + } + + T operator=(const T 
val) noexcept { + this->store(val, std::memory_order_relaxed); + return val; + } + + T operator++(int) noexcept { + return this->fetch_add(1, std::memory_order_relaxed); + } + + T operator++() noexcept { + T tmp = this->fetch_add(1, std::memory_order_relaxed); + ++tmp; + return tmp; + } + + T operator--(int) noexcept { + return this->fetch_sub(1, std::memory_order_relaxed); + } + + T operator--() noexcept { + T tmp = this->fetch_sub(1, std::memory_order_relaxed); + --tmp; + return tmp; + } + + T operator+=(const T val) noexcept { + return this->fetch_add(val, std::memory_order_relaxed); + } + + T operator+=(const T val) volatile noexcept { + return this->fetch_add(val, std::memory_order_relaxed); + } + + T operator-=(const T val) noexcept { + return this->fetch_sub(val, std::memory_order_relaxed); + } + + T operator-=(const T val) volatile noexcept { + return this->fetch_sub(val, std::memory_order_relaxed); + } + + operator T() const volatile noexcept { + return this->load(std::memory_order_relaxed); + } + + operator T() const noexcept { + return this->load(std::memory_order_relaxed); + } +}; diff --git a/s3handler.cpp b/s3handler.cpp new file mode 100644 index 0000000..0d39111 --- /dev/null +++ b/s3handler.cpp @@ -0,0 +1,822 @@ +/** + * Copyright Martin Raiber. All Rights Reserved. 
+ * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#include "s3handler.h" +#include "SingleFileStorage.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace proxygen; + +const char* c_commit_uuid = "a711e93e-93b4-4a9e-8a0b-688797470002"; + +std::string hashSha256Hex(const std::string &payload) +{ + unsigned char md[SHA256_DIGEST_LENGTH]; + SHA256(reinterpret_cast(payload.data()), + payload.size(), md); + return folly::hexlify( + folly::ByteRange(md, SHA256_DIGEST_LENGTH)); +} + +std::string hmacSha256Binary(const std::string &key, + const std::string &payload) +{ + std::string ret; + ret.resize(SHA256_DIGEST_LENGTH); + unsigned int len = SHA256_DIGEST_LENGTH; + HMAC(EVP_sha256(), key.data(), key.size(), + reinterpret_cast(payload.data()), payload.size(), + reinterpret_cast(&ret[0]), &len); + assert(len == SHA256_DIGEST_LENGTH); + return ret; +} + +std::string currDate() +{ + time_t t = toTimeT(getCurrentTime()); + struct tm final_tm; + localtime_r(&t, &final_tm); + std::string ret; + ret.resize(8); + strftime(&ret[0], ret.size(), "%Y%m%d", &final_tm); + return ret; +} + +bool checkSig(HTTPMessage &headers, const std::string &secretKey, + const folly::StringPiece &authorization, + const std::string &payload) +{ + const char alg_name[] = "AWS4-HMAC-SHA256"; + const char alg[] = "AWS4-HMAC-SHA256 "; + if (authorization.find(alg) != 0) + return false; + + std::vector authorizationVec; + folly::split(',', authorization.subpiece(sizeof(alg)), authorizationVec); + + std::map authorizationMap; + + for (auto ave : authorizationVec) + { + size_t eq = ave.find_first_of('='); + if (eq != std::string::npos) + { + authorizationMap.insert( + std::make_pair(ave.subpiece(0, eq), ave.subpiece(eq + 1))); + } + } + + const char signedHeadersKey[] = "SignedHeaders"; + auto itSignedHeaders = 
authorizationMap.find(signedHeadersKey); + if (itSignedHeaders == authorizationMap.end()) + return false; + + const char credentialHeaderKey[] = "Credential"; + auto itCredential = authorizationMap.find(credentialHeaderKey); + if (itCredential == authorizationMap.end()) + return false; + + const char signatureHeaderKey[] = "Signature"; + auto itSignature = authorizationMap.find(signatureHeaderKey); + if (itSignature == authorizationMap.end()) + return false; + + std::vector credentialScopeToks; + folly::split('/', itCredential->second, credentialScopeToks); + + if (credentialScopeToks.size() != 5) + return false; + + std::vector signedHeadersVec; + folly::split(';', itSignedHeaders->second, signedHeadersVec); + + std::string canonicalHeaders; + std::optional > prevSignedHeader; + bool hasHost = false; + for (auto signedHeader : signedHeadersVec) + { + if (prevSignedHeader && prevSignedHeader >= signedHeader) + { + return false; + } + auto fullVal = headers.getHeaders().getSingleOrEmpty(signedHeader); + auto val = folly::trimWhitespace(fullVal); + canonicalHeaders += folly::sformat("{}:{}\n", signedHeader, val); + prevSignedHeader = signedHeader; + if (signedHeader == "host" && !val.empty()) + hasHost = true; + } + + if (!hasHost) + return false; + + auto params = headers.getQueryParams(); + std::string canonicalParamStr; + for (auto param : params) + { + if (!canonicalParamStr.empty()) + canonicalParamStr += "&"; + canonicalParamStr += param.first + "=" + + folly::uriEscape( + param.second, folly::UriEscapeMode::QUERY); + } + std::string canonicalRequest = folly::sformat( + "{}\n{}\n{}\n{}\n{}\n{}\n", headers.getMethodString(), + headers.getPathAsStringPiece(), canonicalParamStr, canonicalHeaders, + itSignedHeaders->second, hashSha256Hex(payload)); + + std::string hashedCanonicalRequest = hashSha256Hex(canonicalRequest); + std::string requestDateTime = + headers.getHeaders().getSingleOrEmpty("X-Amz-Date"); + + std::string stringToSign = folly::sformat( + 
"{}\n{}\n{}{}{}{}\n{}\n", alg_name, requestDateTime, + credentialScopeToks[1], credentialScopeToks[2], credentialScopeToks[3], + credentialScopeToks[4], hashedCanonicalRequest); + + std::string signingKey = hmacSha256Binary( + hmacSha256Binary( + hmacSha256Binary(hmacSha256Binary("AWS4" + secretKey, currDate()), + credentialScopeToks[1].toString()), + credentialScopeToks[2].toString()), + "aws4_request"); + + std::string sig = folly::hexlify(hmacSha256Binary(signingKey, stringToSign)); + + return sig == itSignature->second; +} + +/** + * Handles requests by serving the file named in path. Only supports GET. + * reads happen in a CPU thread pool since read(2) is blocking. + * If egress pauses, file reading is also paused. + */ + +void S3Handler::onRequest(std::unique_ptr headers) noexcept +{ + if (headers->getMethod() != HTTPMethod::PUT && + headers->getMethod() != HTTPMethod::GET && + headers->getMethod() != HTTPMethod::HEAD && + headers->getMethod() != HTTPMethod::DELETE) + { + ResponseBuilder(downstream_) + .status(400, "Bad method") + .body("Only GET/PUT is supported") + .sendWithEOM(); + return; + } + + if (headers->getMethod() == HTTPMethod::GET || headers->getMethod() == HTTPMethod::HEAD) + { + request_type = headers->getMethod() == HTTPMethod::GET ? 
RequestType::GetObject : RequestType::HeadObject; + + auto header_path = headers->getPathAsStringPiece(); + if(!header_path.empty()) + { + fpath = std::string(header_path.subpiece(1)); + } + + running = true; + + if(fpath.find('/')==std::string::npos) + { + listObjects(*headers); + return; + } + + if(fpath.find(c_commit_uuid)!=std::string::npos) + { + getCommitObject(*headers); + return; + } + + getObject(*headers); + return; + } + else if (headers->getMethod() == HTTPMethod::PUT) + { + request_type = RequestType::PutObject; + fpath = std::string(headers->getPathAsStringPiece().subpiece(1)); + std::string cl = headers->getHeaders().getSingleOrEmpty( + proxygen::HTTP_HEADER_CONTENT_LENGTH); + if (cl.empty()) + { + ResponseBuilder(downstream_) + .status(500, "Internal error") + .body("Content-Length header not set") + .sendWithEOM(); + return; + } + put_remaining = std::atoll(cl.c_str()); + + XLOGF(DBG0, "PutObject {} length {}", fpath, put_remaining); + + if(fpath.find(c_commit_uuid)!=std::string::npos) + { + commit(*headers); + return; + } + + putObject(*headers); + return; + } + else if(headers->getMethod() == HTTPMethod::DELETE) + { + request_type = RequestType::DeleteObject; + deleteObject(*headers); + } +} + +void S3Handler::listObjects(proxygen::HTTPMessage& headers) +{ + request_type = RequestType::ListObjects; + auto marker = headers.getQueryParam("marker"); + auto max_keys = headers.getIntQueryParam("max-keys", 1000); + auto prefix = headers.getQueryParam("prefix"); + auto delimiter = headers.getQueryParam("delimiter"); + + auto evb = folly::EventBaseManager::get()->getEventBase(); + + folly::getGlobalCPUExecutor()->add( + [self = self, evb, marker, max_keys, prefix, delimiter]() + { + self->listObjects(evb, self, marker, std::max(0, std::min(10000, max_keys)), prefix, delimiter); + }); +} + +void S3Handler::getCommitObject(proxygen::HTTPMessage& headers) +{ + if(request_type==RequestType::HeadObject) + { + ResponseBuilder(self->downstream_).status(200, 
"OK").header(proxygen::HTTP_HEADER_CONTENT_LENGTH, std::to_string(sfs.get_runtime_id().size())).sendWithEOM(); + return; + } + + ResponseBuilder(self->downstream_) + .status(200, "OK") + .body(fmt::format("{}", sfs.get_runtime_id())) + .sendWithEOM(); +} + +void S3Handler::commit(proxygen::HTTPMessage& headers) +{ + if(put_remaining>0) + { + ResponseBuilder(self->downstream_) + .status(500, "Internal error") + .body(fmt::format("Body length != 0")) + .sendWithEOM(); + } + + auto evb = folly::EventBaseManager::get()->getEventBase(); + folly::getGlobalCPUExecutor()->add( + [self = this->self, evb]() + { + bool b = self->sfs.commit(false, -1); + + evb->runInEventBaseThread([self = self, b]() + { + if(!b) + { + ResponseBuilder(self->downstream_) + .status(500, "Internal error") + .body(fmt::format("Commit error")) + .sendWithEOM(); + } + else { + ResponseBuilder(self->downstream_) + .status(200, "OK") + .sendWithEOM(); + } + }); + }); +} + +void S3Handler::getObject(proxygen::HTTPMessage& headers) +{ + auto evb = folly::EventBaseManager::get()->getEventBase(); + folly::getGlobalCPUExecutor()->add( + [self = self, evb]() + { + unsigned int flags = 0; + if(self->request_type == RequestType::HeadObject) + flags |= SingleFileStorage::ReadMetaOnly; + + auto res = self->sfs.read_prepare(self->fpath, flags); + + if (res.err != 0) + { + evb->runInEventBaseThread([self = self, res]() + { + + if(res.err==ENOENT) + { + ResponseBuilder(self->downstream_) + .status(404, "Not found") + .body(fmt::format("Object not found")) + .sendWithEOM(); + } + else if(res.err==ENOTRECOVERABLE) + { + ResponseBuilder(self->downstream_) + .status(500, "Internal error") + .body(fmt::format("Storage is dead")) + .sendWithEOM(); + } + else + { + ResponseBuilder(self->downstream_) + .status(500, "Internal error") + .body(fmt::format("Error code: {}", res.err)) + .sendWithEOM(); + } + }); + return; + } + + evb->runInEventBaseThread([self = self, total_len = res.total_len]() + { + auto resp = 
std::move(ResponseBuilder(self->downstream_).status(200, "OK").header(proxygen::HTTP_HEADER_CONTENT_LENGTH, std::to_string(total_len))); + + if(self->request_type==RequestType::HeadObject) + { + XLOGF(DBG0, "Content length {} bytes for readObject HEAD of {}", total_len, self->fpath); + resp.sendWithEOM(); + return; + } + else + { + XLOGF(DBG0, "Content length {} bytes for readObject GET of {}", total_len, self->fpath); + resp.send(); + } + }); + + if (self->request_type == RequestType::HeadObject) + return; + + self->extents = std::move(res.extents); + self->put_remaining.store(res.total_len, std::memory_order_relaxed); + + self->readObject(evb, std::move(self), 0); + }); +} + +void S3Handler::putObject(proxygen::HTTPMessage& headers) +{ + auto evb = folly::EventBaseManager::get()->getEventBase(); + folly::getGlobalCPUExecutor()->add( + [self = this->self, evb]() + { + auto res = self->sfs.write_prepare(self->fpath, self->put_remaining, std::string::npos); + if (res.err != 0) + { + evb->runInEventBaseThread([self = self, res]() + { + ResponseBuilder(self->downstream_) + .status(500, "Internal error") + .body(fmt::format("Error preparing writing. 
Errno {}", res.err)) + .sendWithEOM(); + std::lock_guard lock(self->extents_mutex); + self->finished_ = true; + self->extents_cond.notify_all(); }); + return; + } + + std::lock_guard lock(self->extents_mutex); + self->extents = std::move(res.extents); + self->extents_cond.notify_all(); + }); +} + +void S3Handler::deleteObject(proxygen::HTTPMessage& headers) +{ + fpath = std::string(headers.getPathAsStringPiece().subpiece(1)); + auto evb = folly::EventBaseManager::get()->getEventBase(); + + folly::getGlobalCPUExecutor()->add( + [self = this->self, evb]() + { + auto res = self->sfs.del(self->fpath, SingleFileStorage::DelAction::Del, false); + + if(res && !self->sfs.get_manual_commit()) + { + res = self->sfs.commit(false, -1); + } + + evb->runInEventBaseThread([self = self, res]() + { + if(!res && self->sfs.get_is_dead()) + { + ResponseBuilder(self->downstream_) + .status(500, "Internal error") + .body(fmt::format("Storage is dead")) + .sendWithEOM(); + } + else if(!res) + { + ResponseBuilder(self->downstream_) + .status(404, "Not found") + .body(fmt::format("Object not found")) + .sendWithEOM(); + } + else + { + ResponseBuilder(self->downstream_) + .status(200, "OK") + .sendWithEOM(); + } + self->finished_ = true; + }); + + }); +} + +void S3Handler::readObject(folly::EventBase *evb, std::shared_ptr self, int64_t offset) +{ + const size_t bufsize = 32768; + folly::IOBufQueue buf; + + bool did_pause = false; + bool has_error = false; + while(offset < put_remaining.load(std::memory_order_relaxed)) + { + if (self->paused_) + { + XLOGF(DBG0, "Sending of {} paused at {} done bytes. 
Finished={} Running={}", self->fpath, self->done_bytes, self->finished_, self->running); + did_pause = true; + break; + } + + auto it = std::upper_bound(extents.begin(), extents.end(), SingleFileStorage::Ext(offset, 0, 0)); + if(!extents.empty()) + --it; + assert(it != extents.end()); + if(it==extents.end()) + break; + + assert(it->obj_offset <= offset && it->obj_offset + it->len > offset); + + int64_t ext_offset = offset - it->obj_offset; + auto curr_ext = SingleFileStorage::Ext(it->obj_offset + ext_offset, it->data_file_offset + ext_offset, it->len - ext_offset); + int64_t rlen = std::min(static_cast(bufsize), curr_ext.len); + + auto res = sfs.read_ext(curr_ext, 0, bufsize, buf); + + if(res.err!=0) + { + XLOGF(WARN, "Error reading extent code {}", res.err); + evb->runInEventBaseThread([self = self]() mutable + { + self->downstream_->sendAbort(); + self->finished_ = true; + self->running = false; + } ); + has_error = true; + break; + } + + offset += res.buf->length(); + + XLOGF(DBG0, "Sending body len {} of fpath {} total_len {}", res.buf->length(), self->fpath, put_remaining.load(std::memory_order_relaxed)); + + evb->runInEventBaseThread([self = self, body = std::move(res.buf), total_len = put_remaining.load(std::memory_order_relaxed)]() mutable + { + if(self->finished_) + return; + + self->done_bytes += body->length(); + auto resp = std::move(ResponseBuilder(self->downstream_).body(std::move(body))); + if(self->done_bytes == total_len) + { + resp.sendWithEOM(); + self->finished_ = true; + self->running = false; + } + else + { + resp.send(); + } + }); + + + } + + if(!has_error && offset < put_remaining.load(std::memory_order_relaxed)) + { + evb->runInEventBaseThread([self = self, did_pause] + { + if(self->finished_) + { + auto rc = self->sfs.read_finalize(self->fpath, self->extents, 0); + assert(rc==0); + return; + } + + XLOG(DBG0) << "Setting running=false"; + self->running = false; + + if (did_pause) + { + XLOG(DBG0) << "Resuming deferred readObject"; + 
if(!self->paused_ && !self->running) + { + XLOG(DBG0) << "Was unpaused. Resuming."; + self->onEgressResumed(); + } + } + + }); + } + else + { + auto rc = sfs.read_finalize(self->fpath, self->extents, 0); + assert(rc==0); + } +} + +void S3Handler::listObjects(folly::EventBase *evb, std::shared_ptr self, const std::string& marker, int max_keys, const std::string& prefix, const std::string& delimiter) +{ + SingleFileStorage::IterData iter_data = {}; + if(!sfs.iter_start(marker, false, iter_data)) + { + evb->runInEventBaseThread([self = self]() + { + ResponseBuilder(self->downstream_) + .status(500, "Internal error") + .body(fmt::format("Error listing")) + .sendWithEOM(); }); + return; + } + + std::string val_data; + + int i; + bool truncated = true; + for(i=0;i extra_exts; + if(!sfs.iter_curr_val(key, offset, size, extra_exts, last_modified, md5sum, iter_data)) + { + truncated = false; + break; + } + + for(const auto& ext: extra_exts) + { + size += ext.len; + } + + val_data += fmt::format("\t\n" + "\t\t{}\n" + "\t\t2009-10-12T17:50:30.000Z\n" + "\t\t\"{}\"\n" + "\t\t{}\n" + "\t\tSTANDARD\n" + "\t\t\n" + "\t\t\t75aa57f09aa0c8caeab4f8c24e99d10f8e7faeebf76c078efc7c6caea54ba06a\n" + "\t\t\tmtd@amazon.com\n" + "\t\t\n" + "\t", key, folly::hexlify(md5sum), size); + + if(!sfs.iter_next(iter_data)) + { + evb->runInEventBaseThread([self = self]() + { + ResponseBuilder(self->downstream_) + .status(500, "Internal error") + .body(fmt::format("Error listing (in iteration)")) + .sendWithEOM(); }); + sfs.iter_stop(iter_data); + return; + } + } + + std::string next_maker; + if(!truncated) + { + std::string data; + sfs.iter_curr_val(next_maker, data, iter_data); + } + + sfs.iter_stop(iter_data); + + std::string resp = fmt::format("\n" + "\n" + "\t{}\n" + "\t{}\n" + "\t{}\n" + "\t{}\n" + "\t{}\n" + "{}" + "", truncated ? 
"true" : "false", marker, max_keys, delimiter, next_maker, val_data); + + evb->runInEventBaseThread([self = self, resp = std::move(resp)]() + { + ResponseBuilder(self->downstream_) + .status(200, "OK") + .body(resp) + .sendWithEOM(); }); +} + +void S3Handler::onEgressPaused() noexcept +{ + // This will terminate readFile soon + XLOG(DBG0) << "S3Handler paused"; + paused_ = true; +} + +void S3Handler::onEgressResumed() noexcept +{ + XLOG(DBG0) << "S3Handler resumed"; + paused_ = false; + // If readFileScheduled_, it will reschedule itself + if (!running && !fpath.empty() && !finished_) + { + running = true; + XLOGF(DBG0, "Starting readObject of {} offset {}", fpath, done_bytes); + folly::getGlobalCPUExecutor()->add( + [self = self, evb = folly::EventBaseManager::get()->getEventBase(), offset = done_bytes]() + { + std::string fpath = self->fpath; + self->readObject(evb, std::move(self), offset); + }); + } + else + { + XLOGF(DBG0, "Deferred scheduling readFile finished={}", finished_); + } +} + +void S3Handler::onBody(std::unique_ptr body) noexcept +{ + auto evb = folly::EventBaseManager::get()->getEventBase(); + + size_t body_bytes = body->length(); + + folly::getGlobalCPUExecutor()->add( + [self = this->self, evb, offset = done_bytes, lbody = std::move(body)]() mutable + { + self->onBodyCPU(evb, offset, std::move(lbody)); + }); + + done_bytes += body_bytes; +} + +void S3Handler::onBodyCPU(folly::EventBase *evb, int64_t offset, std::unique_ptr body) +{ + { + std::unique_lock lock(extents_mutex); + while (extents.empty() && !finished_) + { + extents_cond.wait(lock); + } + + if (finished_) + { + return; + } + } + + if(extents.size()>1) + { + assert(extents[0] < extents[1]); + } + + const uint8_t *data = body->data(); + size_t data_size = body->length(); + while(data_size > 0) + { + auto it = std::upper_bound(extents.begin(), extents.end(), SingleFileStorage::Ext(offset, 0, 0)); + if(!extents.empty()) + --it; + assert(it != extents.end()); + if(it==extents.end()) + 
break; + + if(!(it->obj_offset <= offset && it->obj_offset + it->len > offset)) + { + XLOGF(DBG0, "Selected ext obj_offset={} len={} data_file_offset={} offset={} exts={}", it->obj_offset, it->len, it->data_file_offset, offset, extents.size()); + std::sort(extents.begin(), extents.end()); + auto it2= std::upper_bound(extents.begin(), extents.end(), SingleFileStorage::Ext(offset, 0, 0)); + XLOGF(DBG0, "Selected ext obj_offset={} len={} data_file_offset={} offset={} exts={}", it2->obj_offset, it2->len, it2->data_file_offset, offset, extents.size()); + break; + } + assert(it->obj_offset <= offset && it->obj_offset + it->len > offset); + + int64_t ext_offset = offset - it->obj_offset; + auto curr_ext = SingleFileStorage::Ext(it->obj_offset + ext_offset, it->data_file_offset + ext_offset, it->len - ext_offset); + int64_t wlen = std::min(static_cast(data_size), curr_ext.len); + + auto rc = sfs.write_ext(curr_ext, data, data_size); + if (rc != 0) + { + evb->runInEventBaseThread([self = self]() + { + ResponseBuilder(self->downstream_) + .status(500, "Internal error") + .body("Write ext error") + .sendWithEOM(); + self->finished_ = true; }); + return; + } + + data += wlen; + data_size -= wlen; + offset += wlen; + } + + assert(data_size == 0); + + if (put_remaining.fetch_sub(body->length(), std::memory_order_release) == body->length()) + { + auto rc = sfs.write_finalize(fpath, extents, 0, std::string(), false, true); + + if (rc != 0) + { + evb->runInEventBaseThread([self = self]() + { + ResponseBuilder(self->downstream_) + .status(500, "Internal error") + .body("Write finalization error") + .sendWithEOM(); + self->finished_ = true; }); + return; + } + + if(!sfs.get_manual_commit()) + { + bool b = sfs.commit(false, -1); + + if(!b) + { + evb->runInEventBaseThread([self = self]() + { + ResponseBuilder(self->downstream_) + .status(500, "Internal error") + .body("Commit error") + .sendWithEOM(); + self->finished_ = true; }); + return; + } + } + + evb->runInEventBaseThread([self = 
self]() + { + ResponseBuilder(self->downstream_) + .status(200, "OK") + .sendWithEOM(); + self->finished_ = true; }); + } +} + +void S3Handler::onEOM() noexcept {} + +void S3Handler::onUpgrade(UpgradeProtocol /*protocol*/) noexcept +{ + // handler doesn't support upgrades +} + +void S3Handler::requestComplete() noexcept +{ + XLOG(DBG0, "Request complete"); + finished_ = true; + paused_ = true; + self.reset(); +} + +void S3Handler::onError(ProxygenError /*err*/) noexcept +{ + XLOG(DBG0, "onError"); + finished_ = true; + paused_ = true; + + if (request_type == RequestType::PutObject) + { + // TODO: Free extents + } + + self.reset(); +} \ No newline at end of file diff --git a/s3handler.h b/s3handler.h new file mode 100644 index 0000000..0b46396 --- /dev/null +++ b/s3handler.h @@ -0,0 +1,76 @@ +#pragma once + +#include "SingleFileStorage.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class S3Handler : public proxygen::RequestHandler +{ + SingleFileStorage &sfs; + const std::string& root_key; + +public: + S3Handler(SingleFileStorage &sfs, const std::string& root_key) : sfs(sfs), self(this), root_key(root_key) {} + + void + onRequest(std::unique_ptr headers) noexcept override; + + void onBody(std::unique_ptr body) noexcept override; + + void onEOM() noexcept override; + + void onUpgrade(proxygen::UpgradeProtocol proto) noexcept override; + + void requestComplete() noexcept override; + + void onError(proxygen::ProxygenError err) noexcept override; + + void onEgressPaused() noexcept override; + + void onEgressResumed() noexcept override; + +private: + void readFile(folly::EventBase *evb); + void readObject(folly::EventBase *evb, std::shared_ptr self, int64_t offset); + void onBodyCPU(folly::EventBase *evb, int64_t offs, std::unique_ptr body); + void listObjects(proxygen::HTTPMessage& headers); + void getCommitObject(proxygen::HTTPMessage& headers); + void getObject(proxygen::HTTPMessage& headers); + void 
putObject(proxygen::HTTPMessage& headers); + void commit(proxygen::HTTPMessage& headers); + void deleteObject(proxygen::HTTPMessage& headers); + + void listObjects(folly::EventBase *evb, std::shared_ptr self, const std::string& marker, int max_keys, const std::string& prefix, const std::string& delimiter); + + enum class RequestType + { + Unknown, + GetObject, + HeadObject, + PutObject, + DeleteObject, + ListObjects + }; + + std::shared_ptr self; + RequestType request_type = RequestType::Unknown; + + std::string fpath; + std::atomic paused_{ false }; + int64_t done_bytes = 0; + bool running = false; + bool finished_ = false; + std::atomic put_remaining = -1; + + std::mutex extents_mutex; + std::condition_variable extents_cond; + + std::vector extents; +}; diff --git a/test/hs5_fixture/__init__.py b/test/hs5_fixture/__init__.py new file mode 100644 index 0000000..4fd2b9c --- /dev/null +++ b/test/hs5_fixture/__init__.py @@ -0,0 +1,67 @@ +# Copyright Martin Raiber. All Rights Reserved. +# SPDX-License-Identifier: AGPL-3.0-or-later + +from pathlib import Path +from shutil import rmtree +import subprocess +import sys +import uuid +import botocore +import pytest +import os +import boto3 +from mypy_boto3_s3.client import S3Client + +curr_port = 11000 + +class Hs5Runner: + + def __init__(self, workdir : Path) -> None: + global curr_port + + curr_port += 1 + + self._port = curr_port + self._workdir = workdir + self._root_key = uuid.uuid4().hex + + self._process = subprocess.Popen( + [f"{os.getcwd()}/build/hs5", + "-http_port", + str(curr_port), + "-root_key", + self._root_key, + "-data_file_size_limit_mb", + "100", + "-data_file_alloc_chunk_size_mb", + "10", + "-logging", "DBG0"], + stdout=sys.stdout, + stderr=sys.stderr, + cwd=workdir + ) + pass + + def stop(self) -> None: + + with pytest.raises(subprocess.TimeoutExpired): + self._process.wait(0.001) + + self._process.kill() + self._process.wait() + + rmtree(self._workdir) + + def get_url(self) -> str: + return 
def create_random_file(fn: Path, size: int) -> int:
    """Fill ``fn`` with ``size`` random bytes and return their CRC-32.

    The data is produced by ``os.urandom`` and written in pieces of at most
    512 KiB; the checksum is accumulated over the pieces in write order.
    A ``size`` of 0 (or less) creates an empty file and returns 0.
    """
    max_piece = 512 * 1024
    checksum = 0
    written = 0

    with open(fn, "wb") as out:
        while written < size:
            piece_len = min(size - written, max_piece)
            piece = os.urandom(piece_len)
            out.write(piece)
            checksum = binascii.crc32(piece, checksum)
            written += piece_len

    return checksum
in objs[0] and objs[0]["Size"] == 3 + + s3_client.delete_object(Bucket="testbucket", Key="upload.txt") + with pytest.raises(ClientError): + s3_client.download_file("testbucket", "upload.txt", str(dl_path)) + + list_resp = s3_client.list_objects(Bucket="testbucket") + assert not list_resp["IsTruncated"] + assert "Contents" not in list_resp + +def test_get_commit_obj(tmp_path: Path, hs5: Hs5Runner): + s3_client = hs5.get_s3_client() + fpath = tmp_path / "commit_uuid.txt" + s3_client.download_file("testbucket", "a711e93e-93b4-4a9e-8a0b-688797470002", str(fpath)) + + with open(fpath, "r") as f: + assert len(f.read())>30 + +def test_put_multipart(tmp_path: Path, hs5: Hs5Runner): + + with open(tmp_path / "upload_multipart.dat", "wb") as upload_file: + size = 50*1024*1024 + + while size > 0: + buf = os.urandom(512*1024) + upload_file.write(buf) + size -= len(buf) + + s3_client = hs5.get_s3_client() + s3_client.upload_file(upload_file.name, "testbucket", "upload.txt") + dl_path = tmp_path / "download.dat" + s3_client.download_file("testbucket", "upload.txt", str(dl_path)) + + assert filecmp.cmp(upload_file.name, dl_path) + + +def test_put_get_del_stress(tmp_path: Path, hs5: Hs5Runner): + s3_client = hs5.get_s3_client() + + @dataclass + class DlInfo: + allow_throttle = True + running_downloads = 0 + + def put_get_del(n: int, obj_size: int, throttle: bool, dl_info: DlInfo): + for i in range(0, n): + fname = uuid4().hex + ".dat" + fpath = tmp_path / fname + ul_crc = create_random_file(fpath, obj_size) + + config = TransferConfig(multipart_threshold=5*1024*1024*1024) + + s3_client.upload_file(str(fpath), "testbucket", fname, Config=config) + + def del_thread(fpath): + s3_client.delete_object(Bucket="testbucket", Key=fname) + + t = threading.Thread(target=del_thread, args=(fpath,)) + + resp = s3_client.get_object(Bucket="testbucket", Key=fname) + crc = 0 + body = resp["Body"] + cl = resp["ContentLength"] + assert cl == obj_size + cl -= 1 + b1 = body.read(1) + crc = 
binascii.crc32(b1, crc) + + dl_info.running_downloads += 1 + + t.start() + + if throttle: + while cl > 0: + b2 = body.read(min(4096, cl)) + crc = binascii.crc32(b2, crc) + if dl_info.allow_throttle: + time.sleep(0.01) + cl-=len(b2) + else: + b2 = body.read() + crc = binascii.crc32(b2, crc) + + assert crc == ul_crc + + t.join() + + dl_info.running_downloads -= 1 + + dl_info = DlInfo() + + t0 = threading.Thread(target=put_get_del, args=(1, 90*1024*1024, True, dl_info)) + t0.start() + + while dl_info.running_downloads==0: + time.sleep(0.1) + + threads : list[threading.Thread] = [] + for i in range(0, 1): + t = threading.Thread(target=put_get_del, args=(100, 1*1024*1024, False, dl_info)) + t.start() + threads.append(t) + + for t in threads: + t.join() + + dl_info.allow_throttle = False + + t0.join() + + + diff --git a/utils.cpp b/utils.cpp new file mode 100644 index 0000000..faff208 --- /dev/null +++ b/utils.cpp @@ -0,0 +1,24 @@ +/** + * Copyright Martin Raiber. All Rights Reserved. + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#include "utils.h" +#include +#include + +std::string random_uuid() +{ + std::string rnd; + rnd.resize(16); + folly::Random::secureRandom(&rnd[0], 16); + return folly::hexlify(rnd); +} + +std::string random_uuid_binary() +{ + std::string rnd; + rnd.resize(16); + folly::Random::secureRandom(&rnd[0], 16); + return rnd; +} \ No newline at end of file diff --git a/utils.h b/utils.h new file mode 100644 index 0000000..d406e6a --- /dev/null +++ b/utils.h @@ -0,0 +1,68 @@ +#include +#include + + +static bool next(const std::string &pData, const size_t & doff, const std::string &pStr) +{ + for(size_t i=0;i=pData.size() ) + return false; + if( pData[doff+i]!=pStr[i] ) + return false; + } + return true; +} + +static std::string getafter(const std::string &str,const std::string &data) +{ + size_t pos=data.find(str); + if(pos!=std::string::npos) + { + return data.substr(pos + str.size()); + } + else + { + return std::string(); + } +} + +static 
/**
 * Tells whether str is a hex encoding of whole bytes: an even number of
 * characters, each one of 0-9, a-f or A-F. The empty string qualifies.
 */
static bool isHex(const std::string &str)
{
    // Two digits per byte, so an odd length can never be valid.
    if ((str.size() & 1u) != 0)
    {
        return false;
    }

    for (const char digit : str)
    {
        const bool is_decimal = digit >= '0' && digit <= '9';
        const bool is_lower = digit >= 'a' && digit <= 'f';
        const bool is_upper = digit >= 'A' && digit <= 'F';

        if (!(is_decimal || is_lower || is_upper))
        {
            return false;
        }
    }

    return true;
}