From: Kunal Mehta Date: Mon, 5 Nov 2018 04:31:24 +0000 (+0000) Subject: Import zimlib_4.0.4.orig.tar.gz X-Git-Tag: archive/raspbian/4.0.4-5.1+rpi1~3 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=37a2838166b9c97bce781e27c01dc35f4fbcb0a0;p=zimlib.git Import zimlib_4.0.4.orig.tar.gz [dgit import orig zimlib_4.0.4.orig.tar.gz] --- 37a2838166b9c97bce781e27c01dc35f4fbcb0a0 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1d89127 --- /dev/null +++ b/.gitignore @@ -0,0 +1,34 @@ +*~ +*#* +autom4te.cache +build +compile +config.h +configure +depcomp +.deps +.dirstamp +INSTALL +install-sh +*.kate-swp +*.la +.libs +libtool +*.lo +ltmain.sh +*.m4 +Makefile +Makefile.in +missing +*.o +stamp-h1 +.svn +.*.swp +*.zim +examples/createZimExample +src/tools/zimdump +src/tools/zimsearch +libzim.pc +test-driver +test/zimlib-test* +test/test-suite.log diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..8544c6a --- /dev/null +++ b/.travis.yml @@ -0,0 +1,37 @@ +language: cpp +dist: trusty +sudo: required +cache: ccache +before_install: +- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then eval "${MATRIX_EVAL}"; fi +- PATH=$PATH:$HOME/bin +install: travis/install_deps.sh +script: travis/compile.sh +env: + matrix: + - PLATFORM="native_static" + - PLATFORM="native_dyn" + - PLATFORM="win32_static" + - PLATFORM="win32_dyn" + - PLATFORM="android_arm" + - PLATFORM="android_arm64" + +addons: + apt: + packages: + - cmake + - python3-pip + - libbz2-dev + - ccache + - zlib1g-dev + - uuid-dev + - cython3 + - g++-mingw-w64-i686 + - gcc-mingw-w64-i686 + - gcc-mingw-w64-base + - mingw-w64-tools + +matrix: + include: + - env: PLATFORM="native_dyn" + os: osx diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..1197f56 --- /dev/null +++ b/AUTHORS @@ -0,0 +1 @@ +Tommi Maekitalo diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..e2683b5 --- /dev/null +++ b/COPYING @@ -0,0 +1,280 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 
1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..8a66dff --- /dev/null +++ b/ChangeLog @@ -0,0 +1,141 @@ +libzim 4.0.4 +============ + + * Fix opening of multi-part zim. 
+ * Fix conversion of path to wpath on Windows. + +libzim 4.0.3 +============ + + * Implement low level file manipulation using different backends + +libzim 4.0.2 +============ + + * [Windows] Fix opening of zim file bigger than 4GiB + +libzim 4.0.1 +============ + + * [Writer] Fix wrong redirection log message + * Make libzim compile natively on windows using MSVC + * Better message when failing to read a zim file. + * Make libzim on windows correctly open unicode path. + * Add compilation option to use less memory (but more I/O). + Useful on low memory devices (android) + * Small fixes + +libzim 4.0.0 +============ + + * [Writer] Remove a lot of memory copy. + * [Writer] Add xapian indexing directly in libzim. + * [Writer] Better API. + * [Writer] Use multi-threading to write clusters. + * [Writer] Ensure mimetype of articles is not null. + * Extend test timeout for cluster's test. + * Less memory copy for cluster's test. + * Allow skipping test using a lot of memory using env variable + `SKIP_BIG_MEMORY_TEST=1` + * Explicitly use the icu namespace to allow using of packaged icu lib. + * Use a temporary file name as long as the ZIM writing process is + not finished (#163) + * [Travis] Do not compile using gcc-5 (but the default trusty's one 4.8) + +libzim 3.3.0 +============ + + * Fix handling of big cluster (>4GiB) on 32 bits architecture. This is mainly + done by : + * Do not mmap the whole cluster by default. + * MMap only the memory associated to an article. + * If an article is > 4GiB, the blob associated to it is invalid + (data==size==0). + * Other information are still valid (directAccessInformation, ...) + * Fix writing of extended cluster in writer. + * Compile libzim on macos. + * Build libzim setting RPATH. + * Search result urls are now what is stored in the zim file. They should not + start with a `/`. This is a revert of the change made in last release. + (See kiwix/kiwix-lib#123) + * Spelling corrections in README. 
+ +libzim 3.2.0 +============ + + * Support geo query if the xapian database has indexed localisation. + * Handle articles bigger than 4GiB in the zim file (#110). + * Use AND operator between search term. + * Fix compilation with recent clang (#95). + * Add method to get article's data localisation in the zim file. + * Be able to get only a part of article (#77). + * Do not crash if we cannot open the xapian Database for some reason. + (kiwix/kiwix-tools#153) + * Do not assume there is always a checksum in the zim file. + (kiwix/kiwix-tools#150) + * Try to do some sanity checks when opening a zim file. + * Use pytest to do some tests (when cython is available). + * Use levenshtein distance to sort and have better suggestion results. + * Search result urls are now always absolute (starts with a '/'). + (kiwix/kiwix-lib#110) + * Open the file readonly when checking the zim file (and so be able to check + read only file). + * Accept absolute url starting with '/' when searching for article. + * Fix various bugs + +libzim 3.1.0 +============ + + * Lzma is not an optional dependency anymore. + * Better handle (report and not crash) invalid zim file. + * Embed source of gtest (used only if gtest is not available on the system) + * Move zimDump tools out of libzim repository to zim-tools + * ZimCreator tools doesn't read command line to set options. + +libzim 3.0.0 +============ + +This is a major change of the libzim. +Expect a lot of new improvements and API changes. + + * Add a suggestion mode to the search + * Fix licensing issues + * Fix wrong stemming of the query when searching + * Deactivate searching (and so crash) in the embedded database if the zim is + split + * Rewrite the low level memory management of libzim when reading a zim file: + * We use a buffer base entity to handle memory and reading file instead of + reading file using stream. + * MMap the memory when possible to avoid memory copy. 
+ * Use const when possible (API break) + * Move to googletest instead of cxxtools for unit-tests. + * Fix endianness bug on arm. + * Do not install private headers. Those headers declare private structure and + should not be visible (API break) + * Compile libzim with `-Werror` and `-Wall` options. + * Make libzim thread safe for reading article. + The search part is not thread safe, and all search operations must be + protected by a lock. + * Add method to get only a part of an article. + * Move some tools to zim-tools repository. + + +libzim 2.0.0 +============ + + * Move to meson build system + `libzim` now uses `meson` as build system instead of `autotools` + * Move to C++11 standard. + * Fulltext search in zim file. + We have integrated the xapian fulltext search in libzim. + So now, libzim provides an API to search in a zim containing embedded fulltext + index. This means that : + * libzim needs xapian as an (optional) dependency (if you want to compile with + xapian support). + * The old and unused search API has been removed. + * Remove bzip2 support. + * Remove Symbian support. + * Few API changes + * Make some header files private (not installed); + * A `Blob` can now be cast to a `string` directly; + * Change a lot of `File` methods to const methods. diff --git a/README.md b/README.md new file mode 100644 index 0000000..62eabde --- /dev/null +++ b/README.md @@ -0,0 +1,141 @@ +ZIM library +=========== + +The ZIM library is the reference implementation for the ZIM file +format. It's a solution to read and write ZIM files on many systems +and architectures. More information about the ZIM format and the +openZIM project at http://www.openzim.org/ + +Disclaimer +---------- + +This document assumes you have a little knowledge about software +compilation. If you experience difficulties with the dependencies or +with the ZIM library compilation itself, we recommend having a look at +[kiwix-build](https://github.com/kiwix/kiwix-build). 
+ +Preamble +-------- + +Although the ZIM library can be compiled/cross-compiled on/for many +systems, the following documentation explains how to do it on POSIX +ones. It is primarily intended for GNU/Linux systems and has been tested +on recent releases of Ubuntu and Fedora. + +Dependencies +------------ + +The ZIM library relies on many third-party software libraries. They +are prerequisites to the ZIM library compilation. The following +libraries need to be available: + +* Z ................................................. http://zlib.net/ +(package zlib1g-dev on Ubuntu) +* LZMA ...................................... http://tukaani.org/lzma/ +(package lzma-dev on Ubuntu) +* ICU ................................... http://site.icu-project.org/ +(package libicu-dev on Ubuntu) +* Xapian (optional) .............................. https://xapian.org/ +(package libxapian-dev on Ubuntu) +* Google test (optional) ........ https://github.com/google/googletest +(No valid package on Ubuntu, if gtest is not present, libzim will use +the embedded one) + +These dependencies may or may not be packaged by your operating +system. They may also be packaged but only in an older version. The +compilation script will tell you if one of them is missing or too old. +In the worst case, you will have to download and compile a more recent +version by hand. + +If you want to install these dependencies locally, then ensure that +meson (through pkg-config) will properly find them. + +Environment +------------- + +The ZIM library builds using [Meson](http://mesonbuild.com/) version +0.39 or higher. Meson itself relies on Ninja, pkg-config and a few other +compilation tools. + +First install these common compilation tools: +* Meson +* Ninja +* Pkg-config + +These tools should be packaged if you use a cutting edge operating +system. If not, have a look at the "Troubleshooting" section. + +Compilation +----------- + +Once all dependencies are installed, you can compile the ZIM library with: +``` +meson . 
build +ninja -C build +``` + +By default, it will compile dynamically linked libraries. All binary files +will be created in the "build" directory created automatically by +Meson. If you want statically linked libraries, you can add the +`--default-library=static` option to the Meson command. + +Depending on your system, `ninja` may be called `ninja-build`. + +Installation +------------ + +If you want to install the libzim and the headers you have just +compiled on your system, here we go: + +``` +ninja -C build install +``` + +You might need to run the command as root (or using 'sudo'), depending +on where you want to install the libraries. After the installation +has succeeded, you may need to run ldconfig (as root). + +Uninstallation +-------------- + +If you want to uninstall the libzim: + +``` +ninja -C build uninstall +``` + +As with the installation, you might need to run the command as root +(or using 'sudo'). + +Troubleshooting +--------------- + +If you need to install Meson "manually": +``` +virtualenv -p python3 ./ # Create virtualenv +source bin/activate # Activate the virtualenv +pip3 install meson # Install Meson +hash -r # Refresh bash paths +``` + +If you need to install Ninja "manually": +``` +git clone https://github.com/ninja-build/ninja.git +cd ninja +git checkout release +./configure.py --bootstrap +mkdir ../bin +cp ninja ../bin +cd .. +``` + +If the compilation still fails, you might need to get a more recent +version of a dependency than the one packaged by your Linux +distribution. Try then with a source tarball distributed by the +problematic upstream project or even directly from the source code +repository. + +License +------- + +GPLv2 or later, see COPYING for more details. 
diff --git a/examples/createZimExample.cpp b/examples/createZimExample.cpp new file mode 100644 index 0000000..fc74173 --- /dev/null +++ b/examples/createZimExample.cpp @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2012 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ +/* NOTE(review): the five include targets below, and template arguments such as the element type of std::vector in main, are missing in this copy. Presumably they were stripped as angle-bracket markup during text extraction; restore them from the upstream tarball before using this patch. */ +#include +#include +#include +#include +#include +/* TestArticle: minimal in-memory zim::writer::Article used by this example. It stores an id and a text body. getAid, getUrl and getTitle all return the id, the namespace is A, the MIME type is text/plain, it is never a redirect, it asks for compression and it opts out of indexing. */ +class TestArticle : public zim::writer::Article +{ + std::string _id; + std::string _data; + + public: + TestArticle() { } + explicit TestArticle(const std::string& id); + virtual ~TestArticle() = default; + + virtual std::string getAid() const; + virtual char getNamespace() const; + virtual std::string getUrl() const; + virtual std::string getTitle() const; + virtual bool isRedirect() const; + virtual bool shouldCompress() const { return true; } + virtual std::string getMimeType() const; + virtual std::string getRedirectAid() const; + virtual bool shouldIndex() const { return false; } + virtual zim::size_type getSize() const { return _data.size(); } + virtual std::string getFilename() const { return ""; } +/* Returns a Blob built from a pointer into _data and its size. NOTE(review): presumably a non-owning view that stays valid only while this article is alive; confirm against the Blob implementation. */ + virtual zim::Blob getData() const + { return zim::Blob(&_data[0], _data.size()); } +}; +/* Stores the id and builds the body: the text this is article, then the id, then a newline written through std::endl. */ +TestArticle::TestArticle(const std::string& id) + : _id(id) +{ + std::ostringstream data; + data << "this is article " << id << 
std::endl; + _data = data.str(); +} + +std::string TestArticle::getAid() const +{ + return _id; +} + +char TestArticle::getNamespace() const +{ + return 'A'; +} + +std::string TestArticle::getUrl() const +{ + return _id; +} + +std::string TestArticle::getTitle() const +{ + return _id; +} + +bool TestArticle::isRedirect() const +{ + return false; +} + +std::string TestArticle::getMimeType() const +{ + return "text/plain"; +} + +std::string TestArticle::getRedirectAid() const +{ + return ""; +} +/* Example entry point: creates 16 TestArticle objects with ids 1 to 16 and adds them to foo.zim through zim::writer::ZimCreator. Any std::exception is reported on std::cerr. Note that main still falls off the end after a failure, so the exit status is 0 either way, and argc and argv are unused. */ +int main(int argc, char* argv[]) +{ + std::vector _articles; + unsigned max = 16; + _articles.resize(max); + for (unsigned n = 0; n < max; ++n) + { + std::ostringstream id; + id << (n + 1); + _articles[n] = TestArticle(id.str()); + } + try + { + zim::writer::ZimCreator c; + c.startZimCreation("foo.zim"); + for (auto& article:_articles) + { + c.addArticle(article); + } + c.finishZimCreation(); + } + catch (const std::exception& e) + { + std::cerr << e.what() << std::endl; + } +} + diff --git a/examples/meson.build b/examples/meson.build new file mode 100644 index 0000000..fb6b77c --- /dev/null +++ b/examples/meson.build @@ -0,0 +1,6 @@ + +executable('createZimExample', 'createZimExample.cpp', + link_with: libzim, + link_args: extra_link_args, + include_directories: include_directory, + dependencies: [thread_dep, xapian_dep, icu_dep, zlib_dep, lzma_dep]) diff --git a/include/meson.build b/include/meson.build new file mode 100644 index 0000000..6b4630f --- /dev/null +++ b/include/meson.build @@ -0,0 +1,22 @@ +include_directory = include_directories('.') + +install_headers( + 'zim/article.h', + 'zim/blob.h', + 'zim/error.h', + 'zim/file.h', + 'zim/fileheader.h', + 'zim/fileiterator.h', + 'zim/search.h', + 'zim/search_iterator.h', + 'zim/uuid.h', + 'zim/zim.h', + subdir:'zim' +) + +install_headers( + 'zim/writer/article.h', + 'zim/writer/zimcreator.h', + subdir:'zim/writer' +) + diff --git a/include/zim/article.h b/include/zim/article.h new file mode 100644 index 
0000000..b85d9e9 --- /dev/null +++ b/include/zim/article.h @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_ARTICLE_H +#define ZIM_ARTICLE_H + +#include +#include "zim.h" +#include "blob.h" +#include +#include + +namespace zim +{ + class Cluster; + class Dirent; + class FileImpl; +/* Article: value-type handle made of a shared pointer member named file and an index idx. NOTE(review): the template arguments of std::shared_ptr, std::numeric_limits and std::pair, and the angle-bracket include targets above, are missing in this copy. Presumably they were stripped during text extraction; restore them from the upstream tarball. */ + class Article + { + private: + std::shared_ptr file; + article_index_type idx; + + std::shared_ptr getDirent() const; + + public: + Article() + : idx(std::numeric_limits::max()) + { } + + Article(std::shared_ptr file_, article_index_type idx_) + : file(file_), + idx(idx_) + { } + + std::string getParameter() const; + + std::string getTitle() const; + std::string getUrl() const; + std::string getLongUrl() const; + + uint16_t getLibraryMimeType() const; + const std::string& getMimeType() const; + + bool isRedirect() const; + bool isLinktarget() const; + bool isDeleted() const; + + char getNamespace() const; + + article_index_type getRedirectIndex() const; + Article getRedirectArticle() const; + + size_type getArticleSize() const; +/* Orders articles by namespace first and, within the same namespace, by title. */ + bool operator< (const Article& a) const + { return getNamespace() < a.getNamespace() + || (getNamespace() == a.getNamespace() + && getTitle() < a.getTitle()); } + + 
std::shared_ptr getCluster() const; + + Blob getData(offset_type offset=0) const; + Blob getData(offset_type offset, size_type size) const; + + offset_type getOffset() const; + std::pair getDirectAccessInformation() const; + + std::string getPage(bool layout = true, unsigned maxRecurse = 10); + void getPage(std::ostream&, bool layout = true, unsigned maxRecurse = 10); + + article_index_type getIndex() const { return idx; } +/* True unless idx equals the numeric_limits max sentinel that the default constructor stores, so a default-constructed Article reports false. */ + bool good() const { return idx != std::numeric_limits::max(); } + }; + +} + +#endif // ZIM_ARTICLE_H + + diff --git a/include/zim/blob.h b/include/zim/blob.h new file mode 100644 index 0000000..928394e --- /dev/null +++ b/include/zim/blob.h @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_BLOB_H +#define ZIM_BLOB_H + +#include "zim.h" + +#include +#include +#include +#include + +namespace zim +{ + class Buffer; + class Blob + { + const char* _data; + size_type _size; + std::shared_ptr _buffer; + + public: + Blob(); + Blob(const char* data, size_type size); + Blob(std::shared_ptr buffer); + + operator std::string() const { return std::string(_data, _size); } + const char* data() const { return _data; } + const char* end() const { return _data + _size; } + size_type size() const { return _size; } + }; + + inline std::ostream& operator<< (std::ostream& out, const Blob& blob) + { + if (blob.data()) + out.write(blob.data(), blob.size()); + return out; + } + + inline bool operator== (const Blob& b1, const Blob& b2) + { + return b1.size() == b2.size() + && std::equal(b1.data(), b1.data() + b1.size(), b2.data()); + } +} + +#endif // ZIM_BLOB_H diff --git a/include/zim/error.h b/include/zim/error.h new file mode 100644 index 0000000..fb59e0d --- /dev/null +++ b/include/zim/error.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_ERROR_H +#define ZIM_ERROR_H + +#include + +namespace zim +{ + class ZimFileFormatError : public std::runtime_error + { + public: + explicit ZimFileFormatError(const std::string& msg) + : std::runtime_error(msg) + { } + }; + +} + +#endif // ZIM_ERROR_H + diff --git a/include/zim/file.h b/include/zim/file.h new file mode 100644 index 0000000..cfad05c --- /dev/null +++ b/include/zim/file.h @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2006,2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FILE_H +#define ZIM_FILE_H + +#include +#include +#include +#include "zim.h" +#include "article.h" +#include "blob.h" +#include "fileheader.h" + +class ZimDumper; + +namespace zim +{ + class Search; + class FileImpl; + class Cluster; + + class File + { + friend class ::ZimDumper; + std::shared_ptr impl; + + public: + File() + { } + explicit File(const std::string& fname); + + const std::string& getFilename() const; + const Fileheader& getFileheader() const; + offset_type getFilesize() const; + + article_index_type getCountArticles() const; + + Article getArticle(article_index_type idx) const; + Article getArticle(char ns, const std::string& url) const; + Article getArticleByUrl(const std::string& url) const; + Article getArticleByTitle(article_index_type idx) const; + Article getArticleByTitle(char ns, const std::string& title) const; + + std::shared_ptr getCluster(cluster_index_type idx) const; + cluster_index_type getCountClusters() const; + offset_type getClusterOffset(cluster_index_type idx) const; + + Blob getBlob(cluster_index_type clusterIdx, blob_index_type blobIdx) const; + offset_type getOffset(cluster_index_type clusterIdx, blob_index_type blobIdx) const; + + article_index_type getNamespaceBeginOffset(char ch) const; + article_index_type getNamespaceEndOffset(char ch) const; + article_index_type getNamespaceCount(char ns) const; + + std::string getNamespaces() const; + bool hasNamespace(char ch) const; + + class const_iterator; + + const_iterator begin() const; + const_iterator beginByTitle() const; + const_iterator end() const; + const_iterator findByTitle(char ns, const std::string& title) const; + const_iterator find(char ns, const std::string& url) const; + const_iterator find(const std::string& url) const; + + 
const Search* search(const std::string& query, int start, int end) const; + const Search* suggestions(const std::string& query, int start, int end) const; + + time_t getMTime() const; + + const std::string& getMimeType(uint16_t idx) const; + + std::string getChecksum(); + bool verify(); + + bool is_multiPart() const; + }; + + std::string urldecode(const std::string& url); + +} + +#endif // ZIM_FILE_H + diff --git a/include/zim/fileheader.h b/include/zim/fileheader.h new file mode 100644 index 0000000..6bf60da --- /dev/null +++ b/include/zim/fileheader.h @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2008 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FILEHEADER_H +#define ZIM_FILEHEADER_H + +#include +#include "zim.h" +#include "uuid.h" +#include +#include + +#ifdef _WIN32 +#define NOMINMAX 1 +#include +#undef NOMINMAX +#undef max +#endif + +namespace zim +{ + class Buffer; + class Fileheader + { + public: + static const uint32_t zimMagic; + static const uint16_t zimClassicMajorVersion; + static const uint16_t zimExtendedMajorVersion; + static const uint16_t zimMinorVersion; + static const size_type size; + + private: + uint16_t majorVersion; + uint16_t minorVersion; + Uuid uuid; + article_index_type articleCount; + offset_type titleIdxPos; + offset_type urlPtrPos; + offset_type mimeListPos; + cluster_index_type clusterCount; + offset_type clusterPtrPos; + article_index_type mainPage; + article_index_type layoutPage; + offset_type checksumPos; + + public: + Fileheader() + : majorVersion(zimClassicMajorVersion), + minorVersion(zimMinorVersion), + articleCount(0), + titleIdxPos(0), + urlPtrPos(0), + clusterCount(0), + clusterPtrPos(0), + mainPage(std::numeric_limits::max()), + layoutPage(std::numeric_limits::max()), + checksumPos(std::numeric_limits::max()) + {} + + void read(std::shared_ptr buffer); + + // Do some sanity check, raise a ZimFileFormatError if + // something is wrong. 
+ void sanity_check() const; + + uint16_t getMajorVersion() const { return majorVersion; } + void setMajorVersion(uint16_t v) { majorVersion = v; } + + uint16_t getMinorVersion() const { return minorVersion; } + void setMinorVersion(uint16_t v) { minorVersion = v; } + + const Uuid& getUuid() const { return uuid; } + void setUuid(const Uuid& uuid_) { uuid = uuid_; } + + article_index_type getArticleCount() const { return articleCount; } + void setArticleCount(article_index_type s) { articleCount = s; } + + offset_type getTitleIdxPos() const { return titleIdxPos; } + void setTitleIdxPos(offset_type p) { titleIdxPos = p; } + + offset_type getUrlPtrPos() const { return urlPtrPos; } + void setUrlPtrPos(offset_type p) { urlPtrPos = p; } + + offset_type getMimeListPos() const { return mimeListPos; } + void setMimeListPos(offset_type p) { mimeListPos = p; } + + cluster_index_type getClusterCount() const { return clusterCount; } + void setClusterCount(cluster_index_type s) { clusterCount = s; } + + offset_type getClusterPtrPos() const { return clusterPtrPos; } + void setClusterPtrPos(offset_type p) { clusterPtrPos = p; } + + bool hasMainPage() const { return mainPage != std::numeric_limits::max(); } + article_index_type getMainPage() const { return mainPage; } + void setMainPage(article_index_type s){ mainPage = s; } + + bool hasLayoutPage() const { return layoutPage != std::numeric_limits::max(); } + article_index_type getLayoutPage() const { return layoutPage; } + void setLayoutPage(article_index_type s) { layoutPage = s; } + + bool hasChecksum() const { return getMimeListPos() >= 80; } + offset_type getChecksumPos() const { return hasChecksum() ? 
checksumPos : 0; } + void setChecksumPos(offset_type p) { checksumPos = p; } + }; + + std::ostream& operator<< (std::ostream& out, const Fileheader& fh); + +} + +#endif // ZIM_FILEHEADER_H diff --git a/include/zim/fileiterator.h b/include/zim/fileiterator.h new file mode 100644 index 0000000..3533e52 --- /dev/null +++ b/include/zim/fileiterator.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FILEITERATOR_H +#define ZIM_FILEITERATOR_H + +#include +#include "article.h" + +namespace zim +{ + class File::const_iterator : public std::iterator + { + public: + enum Mode { + UrlIterator, + ArticleIterator + }; + + private: + const File* file; + article_index_type idx; + mutable Article article; + Mode mode; + + bool is_end() const { return file == 0 || idx >= file->getCountArticles(); } + + public: + explicit const_iterator(const File* file_ = 0, article_index_type idx_ = 0, Mode mode_ = UrlIterator) + : file(file_), + idx(idx_), + mode(mode_) + { } + + article_index_type getIndex() const { return idx; } + const File& getFile() const { return *file; } + + bool operator== (const const_iterator& it) const + { return (is_end() && it.is_end()) + || (file == it.file && idx == it.idx); } 
+ bool operator!= (const const_iterator& it) const + { return !operator==(it); } + + const_iterator& operator++() + { + ++idx; + article = Article(); + return *this; + } + + const_iterator operator++(int) + { + const_iterator it = *this; + operator++(); + return it; + } + + const_iterator& operator--() + { + --idx; + article = Article(); + return *this; + } + + const_iterator operator--(int) + { + const_iterator it = *this; + operator--(); + return it; + } + + const Article& operator*() const + { + if (!article.good()) + article = mode == UrlIterator ? file->getArticle(idx) + : file->getArticleByTitle(idx); + return article; + } + + pointer operator->() const + { + operator*(); + return &article; + } + + }; + +} + +#endif // ZIM_FILEITERATOR_H + diff --git a/include/zim/search.h b/include/zim/search.h new file mode 100644 index 0000000..bc8d9a5 --- /dev/null +++ b/include/zim/search.h @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2007 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_SEARCH_H +#define ZIM_SEARCH_H + +#include "search_iterator.h" +#include +#include +#include + +namespace zim +{ + +class File; +class Search +{ + friend class search_iterator; + friend struct search_iterator::InternalData; + public: + typedef search_iterator iterator; + + explicit Search(const std::vector zimfiles); + explicit Search(const File* zimfile); + Search(const Search& it); + Search& operator=(const Search& it); + Search(Search&& it); + Search& operator=(Search&& it); + ~Search(); + + void set_verbose(bool verbose); + + Search& add_zimfile(const File* zimfile); + Search& set_query(const std::string& query); + Search& set_georange(float latitude, float longitude, float distance); + Search& set_range(int start, int end); + Search& set_suggestion_mode(bool suggestion_mode); + + search_iterator begin() const; + search_iterator end() const; + int get_matches_estimated() const; + + private: + struct InternalData; + std::unique_ptr internal; + std::vector zimfiles; + + mutable std::map valuesmap; + mutable std::string prefixes; + std::string query; + float latitude; + float longitude; + float distance; + int range_start; + int range_end; + bool suggestion_mode; + bool geo_query; + mutable bool search_started; + mutable bool has_database; + mutable bool verbose; + mutable int estimated_matches_number; +}; + +} //namespace zim + +#endif // ZIM_SEARCH_H diff --git a/include/zim/search_iterator.h b/include/zim/search_iterator.h new file mode 100644 index 0000000..9a44f32 --- /dev/null +++ b/include/zim/search_iterator.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by 
the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_SEARCH_ITERATOR_H +#define ZIM_SEARCH_ITERATOR_H + +#include +#include +#include "article.h" + +namespace zim +{ +class Search; +class search_iterator : public std::iterator +{ + friend class zim::Search; + public: + search_iterator(); + search_iterator(const search_iterator& it); + search_iterator& operator=(const search_iterator& it); + search_iterator(search_iterator&& it); + search_iterator& operator=(search_iterator&& it); + ~search_iterator(); + + bool operator== (const search_iterator& it) const; + bool operator!= (const search_iterator& it) const; + + search_iterator& operator++(); + search_iterator operator++(int); + search_iterator& operator--(); + search_iterator operator--(int); + + std::string get_url() const; + std::string get_title() const; + int get_score() const; + std::string get_snippet() const; + int get_wordCount() const; + int get_size() const; + int get_fileIndex() const; + reference operator*() const; + pointer operator->() const; + + private: + struct InternalData; + std::unique_ptr internal; + search_iterator(InternalData* internal_data); + + bool is_end() const; +}; + +} // namespace zim + +#endif // ZIM_SEARCH_ITERATOR_H diff --git a/include/zim/uuid.h b/include/zim/uuid.h new file mode 100644 index 0000000..f86b51c --- /dev/null +++ b/include/zim/uuid.h @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_UUID_H +#define ZIM_UUID_H + +#include +#include +#include +#include + +namespace zim +{ + struct Uuid + { + Uuid() + { + std::memset(data, 0, 16); + } + + Uuid(const char uuid[16]) + { + std::copy(uuid, uuid+16, data); + } + + static Uuid generate(std::string value = ""); + + bool operator== (const Uuid& other) const + { return std::equal(data, data+16, other.data); } + bool operator!= (const Uuid& other) const + { return !(*this == other); } + unsigned size() const { return 16; } + + char data[16]; + }; + + std::ostream& operator<< (std::ostream& out, const Uuid& uuid); + +} + +#endif // ZIM_UUID_H diff --git a/include/zim/writer/article.h b/include/zim/writer/article.h new file mode 100644 index 0000000..2de0725 --- /dev/null +++ b/include/zim/writer/article.h @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_ARTICLESOURCE_H +#define ZIM_WRITER_ARTICLESOURCE_H + +#include +#include +#include +#include +#include + +namespace zim +{ + namespace writer + { + class ArticleSource; + class Article + { + public: + virtual std::string getAid() const = 0; + virtual char getNamespace() const = 0; + virtual std::string getUrl() const = 0; + virtual std::string getTitle() const = 0; + virtual bool isRedirect() const = 0; + virtual bool isLinktarget() const; + virtual bool isDeleted() const; + virtual std::string getMimeType() const = 0; + virtual bool shouldCompress() const = 0; + virtual bool shouldIndex() const = 0; + virtual std::string getRedirectAid() const = 0; + virtual std::string getParameter() const; + virtual zim::size_type getSize() const = 0; + virtual Blob getData() const = 0; + virtual std::string getFilename() const = 0; + virtual ~Article() = default; + + // returns the next category id, to which the article is assigned to + virtual std::string getNextCategory(); + }; + + class Category + { + public: + virtual Blob getData() = 0; + virtual std::string getUrl() const = 0; + virtual std::string getTitle() const = 0; + virtual ~Category() = default; + }; + + class ArticleSource + { + public: + virtual void setFilename(const std::string& fname) { } + virtual const Article* getNextArticle() = 0; + + // After fetching the articles and for each article the category ids + // using Article::getNextCategory, the writer has a list of category + // 
ids. Using this list, the writer fetches the category data using + // this method. + virtual Category* getCategory(const std::string& cid); + virtual ~ArticleSource() = default; + }; + + } +} + +#endif // ZIM_WRITER_ARTICLESOURCE_H diff --git a/include/zim/writer/zimcreator.h b/include/zim/writer/zimcreator.h new file mode 100644 index 0000000..439ebae --- /dev/null +++ b/include/zim/writer/zimcreator.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_ZIMCREATOR_H +#define ZIM_WRITER_ZIMCREATOR_H + +#include +#include +#include + +namespace zim +{ + class Fileheader; + namespace writer + { + class ZimCreatorData; + class ZimCreator + { + public: + ZimCreator(bool verbose = false); + virtual ~ZimCreator(); + + zim::size_type getMinChunkSize() const { return minChunkSize; } + void setMinChunkSize(zim::size_type s) { minChunkSize = s; } + void setIndexing(bool indexing, std::string language) + { withIndex = indexing; indexingLanguage = language; } + void setCompressionThreads(unsigned ct) { compressionThreads = ct; } + + virtual void startZimCreation(const std::string& fname); + virtual void addArticle(const Article& article); + virtual void finishZimCreation(); + + virtual std::string getMainPage() { return ""; } + virtual std::string getLayoutPage() { return ""; } + virtual zim::Uuid getUuid() { return Uuid::generate(); } + + private: + std::unique_ptr data; + bool verbose; + bool withIndex = false; + size_t minChunkSize = 1024-64; + std::string indexingLanguage; + unsigned compressionThreads = 4; + + void fillHeader(Fileheader* header); + void write(const Fileheader& header, const std::string& fname) const; + static void* clusterWriter(void* arg); + }; + } + +} + +#endif // ZIM_WRITER_ZIMCREATOR_H diff --git a/include/zim/zim.h b/include/zim/zim.h new file mode 100644 index 0000000..9836c46 --- /dev/null +++ b/include/zim/zim.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_ZIM_H +#define ZIM_ZIM_H + +#include + +namespace zim +{ + // An index of an article (in a zim file) + typedef uint32_t article_index_type; + + // An index of a cluster (in a zim file) + typedef uint32_t cluster_index_type; + + // An index of a blob (in a cluster) + typedef uint32_t blob_index_type; + + // The size of something (article, zim, cluster, blob, ...) + typedef uint64_t size_type; + + // An offset. + typedef uint64_t offset_type; + + enum CompressionType + { + zimcompDefault, + zimcompNone, + zimcompZip, + zimcompBzip2, // Not supported anymore in the libzim + zimcompLzma + }; + + static const char MimeHtmlTemplate[] = "text/x-zim-htmltemplate"; +} + +#endif // ZIM_ZIM_H + diff --git a/meson.build b/meson.build new file mode 100644 index 0000000..6b674e5 --- /dev/null +++ b/meson.build @@ -0,0 +1,72 @@ +project('libzim', ['c', 'cpp'], + version : '4.0.4', + license : 'GPL2', + default_options : ['c_std=c11', 'cpp_std=c++11', 'werror=true']) + +if build_machine.system() != 'windows' + add_project_arguments('-D_LARGEFILE64_SOURCE=1', '-D_FILE_OFFSET_BITS=64', language: 'cpp') +endif + +sizeof_off_t = meson.get_compiler('cpp').sizeof('off_t') + +conf = configuration_data() +conf.set('VERSION', '"@0@"'.format(meson.project_version())) +conf.set('DIRENT_CACHE_SIZE', get_option('DIRENT_CACHE_SIZE')) +conf.set('CLUSTER_CACHE_SIZE', get_option('CLUSTER_CACHE_SIZE')) +conf.set('LZMA_MEMORY_SIZE', get_option('LZMA_MEMORY_SIZE')) 
+conf.set10('MMAP_SUPPORT_64', sizeof_off_t==8) +if target_machine.system() == 'windows' + conf.set('ENABLE_USE_MMAP', false) +else + conf.set('ENABLE_USE_MMAP', get_option('USE_MMAP')) +endif +conf.set('ENABLE_USE_BUFFER_HEADER', get_option('USE_BUFFER_HEADER')) + +zlib_dep = dependency('zlib', required:false) +conf.set('ENABLE_ZLIB', zlib_dep.found()) + +lzma_dep = dependency('liblzma') + +xapian_dep = dependency('xapian-core', + required:false, + static:(get_option('default_library')=='static')) +conf.set('ENABLE_XAPIAN', xapian_dep.found()) + +pkg_requires = ['liblzma'] +if build_machine.system() == 'windows' + thread_dep = dependency('libpthreadVC2') + pkg_requires += ['libpthreadVC2'] + extra_link_args = ['-lRpcrt4', '-lWs2_32', '-lwinmm', '-licuuc', '-licuin'] + extra_cpp_args = ['-DSORTPP_PASS'] +else + thread_dep = dependency('threads') + extra_link_args = [] + extra_cpp_args = [] +endif +if zlib_dep.found() + pkg_requires += ['zlib'] +endif +if xapian_dep.found() + pkg_requires += ['xapian-core'] + icu_dep = dependency('icu-i18n') + pkg_requires += ['icu-i18n'] +else + icu_dep = dependency('icu-i18n', required:false) +endif + +inc = include_directories('include') + +subdir('include') +subdir('scripts') +subdir('static') +subdir('src') +subdir('examples') +subdir('test') + +pkg_mod = import('pkgconfig') +pkg_mod.generate(libraries : libzim, + version : meson.project_version(), + name : 'libzim', + filebase : 'libzim', + description : 'A Library to zim.', + requires : pkg_requires) diff --git a/meson_options.txt b/meson_options.txt new file mode 100644 index 0000000..3689089 --- /dev/null +++ b/meson_options.txt @@ -0,0 +1,12 @@ +option('CLUSTER_CACHE_SIZE', type : 'string', value : '16', + description : 'set cluster cache size to number (default:16)') +option('DIRENT_CACHE_SIZE', type : 'string', value : '512', + description : 'set dirent cache size to number (default:512)') +option('LZMA_MEMORY_SIZE', type : 'string', value : '128', + description : 'set 
lzma uncompress memory in MB (default:128)') +option('USE_MMAP', type: 'boolean', value: true, + description: 'Use mmap to avoid copy from file. (default:true, always false on windows)') +option('USE_BUFFER_HEADER', type: 'boolean', value: true, + description: 'Copy (or use mmap) header index buffers. (default:true) +Header index are used to access articles, having them in memory can improve access speed but on low memory devices it may use to many memory. +If false, we directly read the index in the file at each article access.') diff --git a/scripts/libzim-compile-resources b/scripts/libzim-compile-resources new file mode 100755 index 0000000..e4993ba --- /dev/null +++ b/scripts/libzim-compile-resources @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 + +''' +Copyright 2016 Matthieu Gautier + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or any +later version. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301, USA. 
+''' + +import argparse +import os.path +import re + +def full_identifier(filename): + parts = os.path.normpath(filename).split(os.sep) + parts = [to_identifier(part) for part in parts] + print(filename, parts) + return parts + +def to_identifier(name): + ident = re.sub(r'[^0-9a-zA-Z]', '_', name) + if ident[0].isnumeric(): + return "_"+ident + return ident + +resource_impl_template = """ +static const unsigned char {data_identifier}[] = {{ + {resource_content} +}}; + +namespace RESOURCE {{ +{namespaces_open} +const std::string {identifier} = init_resource("{env_identifier}", {data_identifier}, {resource_len}); +{namespaces_close} +}} +""" + +resource_getter_template = """ + if (name == "{common_name}") + return RESOURCE::{identifier}; +""" + +resource_decl_template = """{namespaces_open} +extern const std::string {identifier}; +{namespaces_close}""" + +class Resource: + def __init__(self, base_dirs, filename): + filename = filename.strip() + self.filename = filename + self.identifier = full_identifier(filename) + found = False + for base_dir in base_dirs: + try: + with open(os.path.join(base_dir, filename), 'rb') as f: + self.data = f.read() + found = True + break + except FileNotFoundError: + continue + if not found: + raise Exception("Impossible to found {}".format(filename)) + + def dump_impl(self): + nb_row = len(self.data)//16 + (1 if len(self.data) % 16 else 0) + sliced = (self.data[i*16:(i+1)*16] for i in range(nb_row)) + + return resource_impl_template.format( + data_identifier="_".join([""]+self.identifier), + resource_content=",\n ".join(", ".join("{:#04x}".format(i) for i in r) for r in sliced), + resource_len=len(self.data), + namespaces_open=" ".join("namespace {} {{".format(id) for id in self.identifier[:-1]), + namespaces_close=" ".join(["}"]*(len(self.identifier)-1)), + identifier=self.identifier[-1], + env_identifier="RES_"+"_".join(self.identifier)+"_PATH" + ) + + def dump_getter(self): + return resource_getter_template.format( + 
common_name=self.filename, + identifier="::".join(self.identifier) + ) + + def dump_decl(self): + return resource_decl_template.format( + namespaces_open=" ".join("namespace {} {{".format(id) for id in self.identifier[:-1]), + namespaces_close=" ".join(["}"]*(len(self.identifier)-1)), + identifier=self.identifier[-1] + ) + + + +master_c_template = """//This file is automaically generated. Do not modify it. + +#include +#include +#include "{include_file}" + +static std::string init_resource(const char* name, const unsigned char* content, int len) +{{ + char * resPath = getenv(name); + if (NULL == resPath) + return std::string(reinterpret_cast(content), len); + + std::ifstream ifs(resPath); + if (!ifs.good()) + return std::string(reinterpret_cast(content), len); + return std::string( (std::istreambuf_iterator(ifs)), + (std::istreambuf_iterator() )); +}} + +const std::string& getResource_{basename}(const std::string& name) {{ +{RESOURCES_GETTER} + throw ResourceNotFound("Resource not found."); +}} + +{RESOURCES} + +""" + +def gen_c_file(resources, basename): + return master_c_template.format( + RESOURCES="\n\n".join(r.dump_impl() for r in resources), + RESOURCES_GETTER="\n\n".join(r.dump_getter() for r in resources), + include_file=basename, + basename=to_identifier(basename) + ) + + + +master_h_template = """//This file is automaically generated. Do not modify it. 
+#ifndef KIWIX_{BASENAME} +#define KIWIX_{BASENAME} + +#include +#include + +namespace RESOURCE {{ + {RESOURCES} +}}; + +class ResourceNotFound : public std::runtime_error {{ + public: + ResourceNotFound(const std::string& what_arg): + std::runtime_error(what_arg) + {{ }}; +}}; + +const std::string& getResource_{basename}(const std::string& name); + +#define getResource(a) (getResource_{basename}(a)) + +#endif // KIWIX_{BASENAME} + +""" + +def gen_h_file(resources, basename): + return master_h_template.format( + RESOURCES="\n ".join(r.dump_decl() for r in resources), + BASENAME=basename.upper(), + basename=basename, + ) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--cxxfile', + help='The Cpp file name to generate') + parser.add_argument('--hfile', + help='The h file name to generate') + parser.add_argument('--source_dir', + help="Additional directory where to look for resources.", + action='append') + parser.add_argument('resource_file', + help='The list of resources to compile.') + args = parser.parse_args() + + base_dir = os.path.dirname(os.path.realpath(args.resource_file)) + source_dir = args.source_dir or [] + with open(args.resource_file, 'r') as f: + resources = [Resource([base_dir]+source_dir, filename) + for filename in f.readlines()] + + h_identifier = to_identifier(os.path.basename(args.hfile)) + with open(args.hfile, 'w') as f: + f.write(gen_h_file(resources, h_identifier)) + + with open(args.cxxfile, 'w') as f: + f.write(gen_c_file(resources, os.path.basename(args.hfile))) + diff --git a/scripts/meson.build b/scripts/meson.build new file mode 100644 index 0000000..e1437ae --- /dev/null +++ b/scripts/meson.build @@ -0,0 +1,2 @@ + +res_compiler = find_program('libzim-compile-resources') diff --git a/src/_dirent.h b/src/_dirent.h new file mode 100644 index 0000000..e3c0024 --- /dev/null +++ b/src/_dirent.h @@ -0,0 +1,149 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you 
can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_DIRENT_H +#define ZIM_DIRENT_H + +#include +#include +#include +#include + +#include "zim_types.h" +#include "debug.h" + +namespace zim +{ + class Buffer; + class InvalidSize : public std::exception {}; + class Dirent + { + protected: + uint16_t mimeType; + + uint32_t version; + + cluster_index_t clusterNumber; // only used when redirect is false + blob_index_t blobNumber; // only used when redirect is false + + article_index_t redirectIndex; // only used when redirect is true + + char ns; + std::string title; + std::string url; + std::string parameter; + + public: + // these constants are put into mimeType field + static const uint16_t redirectMimeType = 0xffff; + static const uint16_t linktargetMimeType = 0xfffe; + static const uint16_t deletedMimeType = 0xfffd; + + Dirent() + : mimeType(0), + version(0), + clusterNumber(0), + blobNumber(0), + redirectIndex(0), + ns('\0') + {} + + Dirent(std::unique_ptr buffer); + + bool isRedirect() const { return mimeType == redirectMimeType; } + bool isLinktarget() const { return mimeType == linktargetMimeType; } + bool isDeleted() const { return mimeType == deletedMimeType; } + bool isArticle() const { return !isRedirect() && !isLinktarget() && !isDeleted(); } + uint16_t getMimeType() const { 
return mimeType; } + + uint32_t getVersion() const { return version; } + void setVersion(uint32_t v) { version = v; } + + cluster_index_t getClusterNumber() const { return isRedirect() ? cluster_index_t(0) : clusterNumber; } + blob_index_t getBlobNumber() const { return isRedirect() ? blob_index_t(0) : blobNumber; } + + article_index_t getRedirectIndex() const { return isRedirect() ? redirectIndex : article_index_t(0); } + + char getNamespace() const { return ns; } + const std::string& getTitle() const { return title.empty() ? url : title; } + const std::string& getUrl() const { return url; } + std::string getLongUrl() const; + const std::string& getParameter() const { return parameter; } + + size_t getDirentSize() const + { + size_t ret = (isRedirect() ? 12 : 16) + url.size() + parameter.size() + 2; + if (title != url) + ret += title.size(); + return ret; + } + + void setTitle(const std::string& title_) + { + title = title_; + } + + void setUrl(char ns_, const std::string& url_) + { + ns = ns_; + url = url_; + } + + void setParameter(const std::string& parameter_) + { + parameter = parameter_; + } + + void setRedirect(article_index_t idx) + { + redirectIndex = idx; + mimeType = redirectMimeType; + } + + void setMimeType(uint16_t mime) + { + mimeType = mime; + } + + void setLinktarget() + { + ASSERT(mimeType, ==, 0); + mimeType = linktargetMimeType; + } + + void setDeleted() + { + ASSERT(mimeType, ==, 0); + mimeType = deletedMimeType; + } + + void setArticle(uint16_t mimeType_, cluster_index_t clusterNumber_, blob_index_t blobNumber_) + { + ASSERT(mimeType, ==, 0); + mimeType = mimeType_; + clusterNumber = clusterNumber_; + blobNumber = blobNumber_; + } + + }; + +} + +#endif // ZIM_DIRENT_H + diff --git a/src/article.cpp b/src/article.cpp new file mode 100644 index 0000000..684da71 --- /dev/null +++ b/src/article.cpp @@ -0,0 +1,279 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under 
the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include "template.h" +#include "_dirent.h" +#include "cluster.h" +#include +#include "fileimpl.h" +#include "file_part.h" +#include +#include +#include +#include "log.h" + +log_define("zim.article") + +namespace zim +{ + size_type Article::getArticleSize() const + { + auto dirent = getDirent(); + return size_type(file->getCluster(dirent->getClusterNumber()) + ->getBlobSize(dirent->getBlobNumber())); + } + + namespace + { + class Ev : public TemplateParser::Event + { + std::ostream& out; + Article& article; + std::shared_ptr file; + unsigned maxRecurse; + + public: + Ev(std::ostream& out_, Article& article_, std::shared_ptr file_, unsigned maxRecurse_) + : out(out_), + article(article_), + file(file_), + maxRecurse(maxRecurse_) + { } + void onData(const std::string& data); + void onToken(const std::string& token); + void onLink(char ns, const std::string& title); + }; + + void Ev::onData(const std::string& data) + { + out << data; + } + + void Ev::onToken(const std::string& token) + { + log_trace("onToken(\"" << token << "\")"); + + if (token == "title") + out << article.getTitle(); + else if (token == "url") + out << article.getUrl(); + else if (token == "namespace") + out << article.getNamespace(); + else if (token == "content") + { + if (maxRecurse <= 0) + throw 
std::runtime_error("maximum recursive limit is reached"); + article.getPage(out, false, maxRecurse - 1); + } + else + { + log_warn("unknown token \"" << token << "\" found in template"); + out << "<%" << token << "%>"; + } + } + + void Ev::onLink(char ns, const std::string& url) + { + if (maxRecurse <= 0) + throw std::runtime_error("maximum recursive limit is reached"); + std::pair r = file->findx(ns, url); + if (r.first) { + Article(file, article_index_type(r.second)).getPage(out, false, maxRecurse - 1); + } else { + throw std::runtime_error(std::string("impossible to find article ") + std::string(1, ns) + std::string("/") + url); + } + } + + } + + std::shared_ptr Article::getDirent() const + { + return file->getDirent(article_index_t(idx)); + } + + std::string Article::getParameter() const + { + return getDirent()->getParameter(); + } + + std::string Article::getTitle() const + { + return getDirent()->getTitle(); + } + + std::string Article::getUrl() const + { + return getDirent()->getUrl(); + } + + std::string Article::getLongUrl() const + { + return getDirent()->getLongUrl(); + } + + uint16_t Article::getLibraryMimeType() const + { + return getDirent()->getMimeType(); + } + + const std::string& Article::getMimeType() const + { + return file->getMimeType(getLibraryMimeType()); + } + + bool Article::isRedirect() const + { + return getDirent()->isRedirect(); + } + + bool Article::isLinktarget() const + { + return getDirent()->isLinktarget(); + } + + bool Article::isDeleted() const + { + return getDirent()->isDeleted(); + } + + char Article::getNamespace() const + { + return getDirent()->getNamespace(); + } + + article_index_type Article::getRedirectIndex() const + { + return article_index_type(getDirent()->getRedirectIndex()); + } + + Article Article::getRedirectArticle() const + { + return Article(file, getRedirectIndex()); + } + + std::shared_ptr Article::getCluster() const + { + auto dirent = getDirent(); + if ( dirent->isRedirect() + || dirent->isLinktarget() 
+ || dirent->isDeleted() ) { + return std::shared_ptr(); + } + return file->getCluster(dirent->getClusterNumber()); + } + + Blob Article::getData(offset_type offset) const + { + auto size = getArticleSize()-offset; + return getData(offset, size); + } + + Blob Article::getData(offset_type offset, size_type size) const + { + std::shared_ptr cluster = getCluster(); + if (!cluster) { + return Blob(); + } + return cluster->getBlob(getDirent()->getBlobNumber(), offset_t(offset), zsize_t(size)); + } + + offset_type Article::getOffset() const + { + auto dirent = getDirent(); + if (dirent->isRedirect() + || dirent->isLinktarget() + || dirent->isDeleted()) + return 0; + return offset_type(file->getBlobOffset(dirent->getClusterNumber(), dirent->getBlobNumber())); + } + + std::pair Article::getDirectAccessInformation() const + { + auto dirent = getDirent(); + if ( dirent->isRedirect() + || dirent->isLinktarget() + || dirent->isDeleted() ) { + return std::make_pair("", 0); + } + + auto full_offset = file->getBlobOffset(dirent->getClusterNumber(), + dirent->getBlobNumber()); + + if (!full_offset) { + // cluster is compressed + return std::make_pair("", 0); + } + auto part_its = file->getFileParts(full_offset, zsize_t(getArticleSize())); + auto range = part_its.first->first; + auto part = part_its.first->second; + if (++part_its.first != part_its.second) { + return std::make_pair("", 0); + } + auto local_offset = full_offset - range.min; + return std::make_pair(part->filename(), offset_type(local_offset)); + + } + + std::string Article::getPage(bool layout, unsigned maxRecurse) + { + std::ostringstream s; + getPage(s, layout, maxRecurse); + return s.str(); + } + + void Article::getPage(std::ostream& out, bool layout, unsigned maxRecurse) + { + log_trace("Article::getPage(" << layout << ", " << maxRecurse << ')'); + + if (getMimeType().compare(0, 9, "text/html") == 0 || getMimeType() == MimeHtmlTemplate) + { + if (layout && file->getFileheader().hasLayoutPage()) + { + Article 
layoutPage(file, file->getFileheader().getLayoutPage()); + Blob data = layoutPage.getData(); + + Ev ev(out, *this, file, maxRecurse); + log_debug("call template parser"); + TemplateParser parser(&ev); + for (const char* p = data.data(); p != data.end(); ++p) + parser.parse(*p); + parser.flush(); + + return; + } + else if (getMimeType() == MimeHtmlTemplate) + { + Blob data = getData(); + + Ev ev(out, *this, file, maxRecurse); + TemplateParser parser(&ev); + for (const char* p = data.data(); p != data.end(); ++p) + parser.parse(*p); + parser.flush(); + + return; + } + } + + // default case - template cases has return above + out << getData(); + } + +} diff --git a/src/blob.cpp b/src/blob.cpp new file mode 100644 index 0000000..fe5b82f --- /dev/null +++ b/src/blob.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + + +#include "zim/blob.h" +#include "debug.h" +#include "buffer.h" + +namespace zim { + +Blob::Blob() + : _data(0), + _size(0) +{} + +Blob::Blob(const char* data, size_type size) + : _data(data), + _size(size) +{ + ASSERT(size, <, SIZE_MAX); + ASSERT(data, <, (void*)(SIZE_MAX-size)); +} + +Blob::Blob(std::shared_ptr buffer) + : _data(buffer->data()), + _size(size_type(buffer->size())), + _buffer(buffer) +{} + + + + +} //zim diff --git a/src/buffer.cpp b/src/buffer.cpp new file mode 100644 index 0000000..8c14889 --- /dev/null +++ b/src/buffer.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "buffer.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifndef _WIN32 +# include +# include +#endif + +namespace zim { + +std::shared_ptr Buffer::sub_buffer(offset_t offset, zsize_t size) const +{ + return std::make_shared(shared_from_this(), offset, size); +} + +#ifdef ENABLE_USE_MMAP +MMapBuffer::MMapBuffer(int fd, offset_t offset, zsize_t size): + Buffer(size), + _offset(0) +{ + offset_t pa_offset(offset.v & ~(sysconf(_SC_PAGE_SIZE) - 1)); + _offset = offset-pa_offset; +#if defined(__APPLE__) + #define MAP_FLAGS MAP_PRIVATE +#else + #define MAP_FLAGS MAP_PRIVATE|MAP_POPULATE +#endif +#if !MMAP_SUPPORT_64 + if(pa_offset.v >= INT32_MAX) { + throw MMapException(); + } +#endif + _data = (char*)mmap(NULL, size.v + _offset.v, PROT_READ, MAP_FLAGS, fd, pa_offset.v); + if (_data == MAP_FAILED ) + { + std::ostringstream s; + s << "Cannot mmap size " << size.v << " at off " << offset.v << " : " << strerror(errno); + throw std::runtime_error(s.str()); + } +#undef MAP_FLAGS +} + +MMapBuffer::~MMapBuffer() +{ + munmap(_data, size_.v + _offset.v); +} + +#endif + +} //zim diff --git a/src/buffer.h b/src/buffer.h new file mode 100644 index 0000000..5d07aea --- /dev/null +++ b/src/buffer.h @@ -0,0 +1,125 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_BUFFER_H_ +#define ZIM_BUFFER_H_ + +#include +#include +#include +#include + +#include "config.h" +#include "zim_types.h" +#include "endian_tools.h" +#include "debug.h" + +namespace zim { + +class MMapException : std::exception {}; + +class Buffer : public std::enable_shared_from_this { + public: + Buffer(zsize_t size) + : size_(size) + { + ASSERT(size_.v, <, SIZE_MAX); + }; + virtual ~Buffer() {}; + virtual const char* data(offset_t offset=offset_t(0)) const = 0; + virtual char at(offset_t offset) const { + return *(data(offset)); + } + zsize_t size() const { return size_; } + virtual std::shared_ptr sub_buffer(offset_t offset, zsize_t size) const; + + template + T as(offset_t offset) const { + ASSERT(offset.v, <, size_.v); + ASSERT(offset.v+sizeof(T), <=, size_.v); + return fromLittleEndian(data(offset)); + } + + protected: + const zsize_t size_; +}; + + +template +class MemoryBuffer : public Buffer { + public: + MemoryBuffer(const char* buffer, zsize_t size) + : Buffer(size), + _data(buffer) + {} + + virtual ~MemoryBuffer() { + if ( CLEAN_AT_END ) { + delete [] _data; + } + } + + const char* data(offset_t offset) const { + ASSERT(offset.v, <=, size_.v); + return _data + offset.v; + } + private: + const char* _data; +}; + + +#ifdef ENABLE_USE_MMAP +class MMapBuffer : public Buffer { + public: + MMapBuffer(int fd, offset_t offset, zsize_t size); + ~MMapBuffer(); + + const char* data(offset_t offset) const { + offset += _offset; + return _data 
+ offset.v; + } + + private: + offset_t _offset; + char* _data; +}; +#endif + + +class SubBuffer : public Buffer { + public: + SubBuffer(const std::shared_ptr src, offset_t offset, zsize_t size) + : Buffer(size), + _data(src, src->data(offset)) + { + ASSERT(offset.v+size.v, <=, src->size().v); + } + + const char* data(offset_t offset) const { + ASSERT(offset.v, <=, size_.v); + return _data.get() + offset.v; + } + + private: + std::shared_ptr _data; +}; + +}; + +#endif //ZIM_BUFFER_H_ diff --git a/src/cache.h b/src/cache.h new file mode 100644 index 0000000..a91ee66 --- /dev/null +++ b/src/cache.h @@ -0,0 +1,353 @@ +/* + * Copyright (C) 2008 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_CACHE_H +#define ZIM_CACHE_H + +#include +#include +#include + +#ifdef _WIN32 +#define NOMINMAX +#include +#undef NOMINMAX +#undef max +#endif + +namespace zim +{ + /** + Implements a container for caching elements. + + The cache holds a list of key-value-pairs. There are 2 main operations for + accessing the cache: put and get. Put takes a key and a value and puts the + element into the list. Get takes a key and optional a value. If the value + for the key is found, it is returned. The passed value otherwise. 
By
+    default the value is constructed with the default constructor of the
+    value-type.
+
+    The cache has a maximum size, after which key-value-pairs are dropped,
+    when a new item is put into the cache.
+
+    The algorithm for this cache is as follows:
+      - when the cache is not full, new elements are appended
+      - new elements are put into the middle of the list otherwise
+      - the last element of the list is then dropped
+      - when getting a value and the value is found, it is put to the
+        beginning of the list
+
+    Elements are looked up by key with the find() of the underlying map,
+    not with a linear search. Choosing the element to drop or to demote
+    does scan all cached elements.
+
+    The caching algorithm keeps elements, which are fetched more than once in
+    the first half of the list. In the second half the elements are either new
+    or the elements are pushed from the first half to the second half by other
+    elements, which are found in the cache.
+
+    You should be aware, that the key type should be simple. Comparing keys
+    must be cheap. Copying elements (both key and value) must be possible and
+    should be cheap, since they are moved in the underlying container.
+ + */ + template + class Cache + { + struct Data + { + bool winner; + unsigned serial; + Value value; + Data() { } + Data(bool winner_, unsigned serial_, const Value& value_) + : winner(winner_), + serial(serial_), + value(value_) + { } + }; + + typedef std::map DataType; + DataType data; + + typename DataType::size_type maxElements; + unsigned serial; + unsigned hits; + unsigned misses; + + unsigned _nextSerial() + { + if (serial == std::numeric_limits::max()) + { + for (typename DataType::iterator it = data.begin(); it != data.end(); ++it) + it->second.serial = 0; + serial = 1; + } + + return serial++; + } + + typename DataType::iterator _getOldest(bool winner) + { + typename DataType::iterator foundElement = data.begin(); + + typename DataType::iterator it = data.begin(); + + for (++it; it != data.end(); ++it) + if (it->second.winner == winner + && (foundElement->second.winner != winner || it->second.serial < foundElement->second.serial)) + foundElement = it; + + return foundElement; + } + + typename DataType::iterator _getNewest(bool winner) + { + typename DataType::iterator foundElement = data.begin(); + + typename DataType::iterator it = data.begin(); + + for (++it; it != data.end(); ++it) + if (it->second.winner == winner + && (foundElement->second.winner != winner || it->second.serial > foundElement->second.serial)) + foundElement = it; + + return foundElement; + } + + // drop one element + void _dropLooser() + { + // look for the oldest element in the list of loosers to drop it + data.erase(_getOldest(false)); + } + + void _makeLooser() + { + // look for the oldest element in the list of winners to make it a looser + typename DataType::iterator it = _getOldest(true); + it->second.winner = false; + it->second.serial = _nextSerial(); + } + + public: + typedef typename DataType::size_type size_type; + typedef Value value_type; + + explicit Cache(size_type maxElements_) + : maxElements(maxElements_ + (maxElements_ & 1)), + serial(0), + hits(0), + misses(0) + 
{ }
+
+    /// returns the number of elements currently in the cache
+    size_type size() const { return data.size(); }
+
+    /// returns the maximum number of elements in the cache
+    size_type getMaxElements() const { return maxElements; }
+
+    /// changes the maximum number of elements. The value is rounded up to
+    /// an even number, like in the constructor. When the cache grows, the
+    /// newest loosers are promoted until half of the new capacity are
+    /// winners. When it shrinks, the oldest winners are demoted and the
+    /// oldest loosers are dropped until the cache fits again.
+    void setMaxElements(size_type maxElements_)
+    {
+      // put() fills the winner half first, so the number of winners is
+      // bounded by both the element count and half of the old capacity
+      size_type numWinners = size() < maxElements / 2 ? size() : maxElements / 2;
+
+      maxElements_ += (maxElements_ & 1);
+
+      if (maxElements_ > maxElements)
+      {
+        maxElements = maxElements_;
+
+        // stop as soon as every element is a winner; this also keeps
+        // _getNewest() away from an empty map
+        while (numWinners < maxElements / 2 && numWinners < size())
+        {
+          _getNewest(false)->second.winner = true;
+          ++numWinners;
+        }
+      }
+      else
+      {
+        maxElements = maxElements_;
+
+        // demote first, so that the evictions below hit the demoted
+        // elements and not the winners that are kept
+        while (numWinners > maxElements / 2)
+        {
+          _makeLooser();
+          --numWinners;
+        }
+
+        // only evict what does not fit any more; a cache that is not full
+        // may not need to drop anything
+        while (size() > maxElements)
+          _dropLooser();
+      }
+
+    }
+
+    /// removes a element from the cache and returns true, if found
+    bool erase(const Key& key)
+    {
+      typename DataType::iterator it = data.find(key);
+      if (it == data.end())
+        return false;
+
+      // a winner leaves: promote the newest looser to take its place
+      if (it->second.winner)
+        _getNewest(false)->second.winner = true;
+
+      data.erase(it);
+      return true;
+    }
+
+    /// clears the cache. The hit and miss counters are only reset when
+    /// stats is true.
+    void clear(bool stats = false)
+    {
+      data.clear();
+      if (stats)
+        hits = misses = 0;
+    }
+
+    /// puts a new element in the cache. If the element is already found in
+    /// the cache, it is considered a cache hit and pushed to the top of the
+    /// list.
+    void put(const Key& key, const Value& value)
+    {
+      // a cache without capacity stores nothing; without this check the
+      // full-cache branch would call _dropLooser() on an empty map
+      if (maxElements == 0)
+        return;
+
+      // look the key up first: map::insert does not replace an existing
+      // entry, so a known key has to take the "found" path at any fill level
+      typename DataType::iterator it = data.find(key);
+      if (it != data.end())
+      {
+        // element found
+        it->second.serial = _nextSerial();
+        if (!it->second.winner)
+        {
+          // move element to the winner part
+          it->second.winner = true;
+          _makeLooser();
+        }
+      }
+      else if (data.size() < maxElements)
+      {
+        // room left: the first half of the capacity is filled with winners
+        data.insert(data.begin(),
+          typename DataType::value_type(key,
+            Data(data.size() < maxElements / 2, _nextSerial(), value)));
+      }
+      else
+      {
+        // cache full and element not found
+        _dropLooser();
+        data.insert(data.begin(),
+          typename DataType::value_type(key,
+            Data(false, _nextSerial(), value)));
+      }
+    }
+
+    /// puts a new element on the top of the cache. If the element is already
+    /// found in the cache, it is considered a cache hit and pushed to the
+    /// top of the list. This method actually overrides the need, that a element
+    /// needs a hit to get to the top of the cache.
+    void put_top(const Key& key, const Value& value)
+    {
+      if (maxElements == 0)
+        return;
+
+      // same order as in put(): an existing key must not demote a winner
+      // for an insert that would do nothing
+      typename DataType::iterator it = data.find(key);
+      if (it != data.end())
+      {
+        // element found
+        it->second.serial = _nextSerial();
+        if (!it->second.winner)
+        {
+          // move element to the winner part
+          it->second.winner = true;
+          _makeLooser();
+        }
+      }
+      else if (data.size() < maxElements)
+      {
+        // winner half already full: make room by demoting the oldest winner
+        if (data.size() >= maxElements / 2)
+          _makeLooser();
+
+        data.insert(data.begin(),
+          typename DataType::value_type(key,
+            Data(true, _nextSerial(), value)));
+      }
+      else
+      {
+        // cache full and element not found
+        _dropLooser();
+        _makeLooser();
+        data.insert(data.begin(),
+          typename DataType::value_type(key,
+            Data(true, _nextSerial(), value)));
+      }
+    }
+
+    /// returns a pointer to the cached value or 0, if the key is not in the
+    /// cache. Each call is counted as a hit or as a miss. On a hit the
+    /// element gets a new serial and, if it was a looser, changes place
+    /// with the oldest winner.
+    Value* getptr(const Key& key)
+    {
+      typename DataType::iterator it = data.find(key);
+      if (it == data.end())
+      {
+        ++misses;
+        return 0;
+      }
+
+      ++hits;
+      it->second.serial = _nextSerial();
+
+      if (!it->second.winner)
+      {
+        // move element to the winner part
+        it->second.winner = true;
+        _makeLooser();
+      }
+
+      return &it->second.value;
+    }
+
+    /// returns a pair of values - a flag, if the value was
found and the + /// value if found or the passed default otherwise. If the value is + /// found it is a cahce hit and pushed to the top of the list. + std::pair getx(const Key& key, Value def = Value()) + { + Value* v = getptr(key); + return v ? std::pair(true, *v) + : std::pair(false, def); + } + + /// returns the value to a key or the passed default value if not found. + /// If the value is found it is a cahce hit and pushed to the top of the + /// list. + Value get(const Key& key, Value def = Value()) + { + return getx(key, def).second; + } + + /// returns the number of hits. + unsigned getHits() const { return hits; } + /// returns the number of misses. + unsigned getMisses() const { return misses; } + /// returns the cache hit ratio between 0 and 1. + double hitRatio() const { return hits+misses > 0 ? static_cast(hits)/static_cast(hits+misses) : 0; } + /// returns the ratio, between held elements and maximum elements. + double fillfactor() const { return static_cast(data.size()) / static_cast(maxElements); } + +/* + void dump(std::ostream& out) const + { + out << "cache max size=" << maxElements << " current size=" << size() << '\n'; + for (typename DataType::const_iterator it = data.begin(); it != data.end(); ++it) + { + out << "\tkey=\"" << it->first << "\" value=\"" << it->second.value << "\" serial=" << it->second.serial << " winner=" << it->second.winner << '\n'; + } + out << "--------\n"; + } +*/ + + }; + +} + +#endif // ZIM_CACHE_H diff --git a/src/cluster.cpp b/src/cluster.cpp new file mode 100644 index 0000000..11afda4 --- /dev/null +++ b/src/cluster.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "cluster.h" +#include +#include +#include "file_reader.h" +#include "endian_tools.h" +#include +#include +#include + +#include "log.h" + +#include "config.h" + +log_define("zim.cluster") + +#define log_debug1(e) + +
namespace zim
{
  /* Builds a cluster on top of `reader_`.
   *
   * The offset table at the start of the reader is parsed with 64-bit
   * entries when `isExtended` is set and 32-bit entries otherwise; the
   * reader is then narrowed to the blob data that follows the table.
   *
   * NOTE(review): the template arguments in this file were reconstructed
   * from usage (the extracted text had lost everything between angle
   * brackets) -- confirm against the upstream sources.
   */
  Cluster::Cluster(std::shared_ptr<const Reader> reader_, CompressionType comp, bool isExtended)
    : compression(comp),
      isExtended(isExtended),
      reader(reader_),
      startOffset(0)
  {
    auto d = reader->offset();
    if (isExtended) {
      startOffset = read_header<uint64_t>();
    } else {
      startOffset = read_header<uint32_t>();
    }
    reader = reader->sub_reader(startOffset);
    auto d1 = reader->offset();
    ASSERT(d+startOffset, ==, d1);
  }

  /* Parses the offset table and fills `offsets` with blob positions
   * relative to the start of the blob data.
   *
   * Returns the size of the table in bytes, i.e. the number of chars read.
   */
  template<typename OFFSET_TYPE>
  offset_t Cluster::read_header()
  {
    // read first offset, which specifies, how many offsets we need to read
    OFFSET_TYPE offset;
    offset = reader->read<OFFSET_TYPE>(offset_t(0));

    size_t n_offset = offset / sizeof(OFFSET_TYPE);
    offset_t data_address(offset);

    // read offsets
    offsets.clear();
    offsets.reserve(n_offset);
    offsets.push_back(offset_t(0));

    auto buffer = reader->get_buffer(offset_t(0), zsize_t(offset));
    offset_t current = offset_t(sizeof(OFFSET_TYPE));
    // Entry 0 was consumed above. Counting upwards (rather than the former
    // `while (--n_offset)`) keeps a corrupt first offset smaller than
    // sizeof(OFFSET_TYPE) -- which makes n_offset == 0 -- from wrapping the
    // counter around and walking far beyond the table.
    for (size_t i = 1; i < n_offset; ++i)
    {
      OFFSET_TYPE new_offset = buffer->as<OFFSET_TYPE>(current);
      ASSERT(new_offset, >=, offset);
      ASSERT(offset, >=, data_address.v);
      ASSERT(offset, <=, reader->size().v);

      offset = new_offset;
      offsets.push_back(offset_t(offset - data_address.v));
      current += sizeof(OFFSET_TYPE);
    }
    ASSERT(offset, ==, reader->size().v);
    return data_address;
  }

  /* Returns blob `n` in full, or an empty Blob when the cluster is empty or
   * the blob is too large to be addressed with size_t. */
  Blob Cluster::getBlob(blob_index_t n) const
  {
    if (size()) {
      const auto blobSize = getBlobSize(n);
      if (blobSize.v > SIZE_MAX) {
        return Blob();
      }
      auto buffer = reader->get_buffer(offsets[blob_index_type(n)], blobSize);
      return Blob(buffer);
    } else {
      return Blob();
    }
  }

  /* Returns up to `size` bytes of blob `n`, starting `offset` bytes into it.
   *
   * The request is clamped to the part of the blob that lies behind
   * `offset`; an `offset` beyond the end of the blob yields an empty Blob.
   * (Previously only `size` was clamped, against the full blob size, so a
   * non-zero `offset` could read into the following blob or past the
   * cluster.)
   */
  Blob Cluster::getBlob(blob_index_t n, offset_t offset, zsize_t size) const
  {
    if (this->size()) {
      const auto blobSize = getBlobSize(n);
      if (offset.v > blobSize.v) {
        return Blob();
      }
      size = std::min(size, zsize_t(blobSize.v - offset.v));
      if (size.v > SIZE_MAX) {
        return Blob();
      }
      offset += offsets[blob_index_type(n)];
      auto buffer = reader->get_buffer(offset, size);
      return Blob(buffer);
    } else {
      return Blob();
    }
  }

  /* Size of the cluster: the offset table (one entry per element of
   * `offsets`, 8 bytes each when extended, 4 otherwise) plus the blob data. */
  zsize_t Cluster::size() const
  {
    if (isExtended)
      return zsize_t(offsets.size() * sizeof(uint64_t) + reader->size().v);
    else
      return zsize_t(offsets.size() * sizeof(uint32_t) + reader->size().v);
  }

}
diff --git a/src/cluster.h b/src/cluster.h new file mode 100644 index 0000000..a04b76d --- /dev/null +++ b/src/cluster.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_CLUSTER_H +#define ZIM_CLUSTER_H + +#include +#include "buffer.h" +#include "zim_types.h" +#include +#include +#include + +#include "zim_types.h" + +namespace zim +{ + class Blob; + class Reader; + + class Cluster : public std::enable_shared_from_this { + typedef std::vector Offsets; + + const CompressionType compression; + const bool isExtended; + Offsets offsets; + std::shared_ptr reader; + offset_t startOffset; + + template + offset_t read_header(); + + public: + Cluster(std::shared_ptr reader, CompressionType comp, bool isExtended); + CompressionType getCompression() const { return compression; } + bool isCompressed() const { return compression == zimcompZip || compression == zimcompBzip2 || compression == zimcompLzma; } + + blob_index_t count() const { return blob_index_t(offsets.size() - 1); } + zsize_t size() const; + + zsize_t getBlobSize(blob_index_t n) const { return zsize_t(offsets[blob_index_type(n)+1].v + - offsets[blob_index_type(n)].v); } + offset_t getBlobOffset(blob_index_t n) const { return startOffset + offsets[blob_index_type(n)]; } + Blob getBlob(blob_index_t n) const; + Blob getBlob(blob_index_t n, offset_t offset, zsize_t size) const; + void clear(); + + void init_from_buffer(Buffer& buffer); + }; + +} + +#endif // ZIM_CLUSTER_H diff --git a/src/config.h.in b/src/config.h.in new file mode 100644 index 0000000..fc22757 --- /dev/null +++ b/src/config.h.in @@ -0,0 +1,18 @@ + +#mesondefine VERSION + +#mesondefine DIRENT_CACHE_SIZE + +#mesondefine CLUSTER_CACHE_SIZE + +#mesondefine LZMA_MEMORY_SIZE + +#mesondefine ENABLE_ZLIB + +#mesondefine ENABLE_XAPIAN + +#mesondefine ENABLE_USE_MMAP + +#mesondefine ENABLE_USE_BUFFER_HEADER + +#mesondefine MMAP_SUPPORT_64 diff --git a/src/debug.h b/src/debug.h new file mode 
100644 index 0000000..2cfe8e1 --- /dev/null +++ b/src/debug.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef DEBUG_H_ +#define DEBUG_H_ + +#include +#include + +#if defined (NDEBUG) +# define ASSERT(left, operator, right) (void(0)) +#else + +#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__ANDROID__) +#include +#endif + +template +void _on_assert_fail(const char* vara, const char* op, const char* varb, + T a, U b, const char* file, int line) { + std::cerr << "\nAssertion failed at "<< file << ":" << line << "\n " << + vara << "[" << a << "] " << op << " " << varb << "[" << b << "]" << + std::endl; + +#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__ANDROID__) + void *callstack[64]; + size_t size; + size = backtrace(callstack, 64); + char** strings = backtrace_symbols(callstack, size); + for (size_t i=0; i +#include "buffer.h" +#include "endian_tools.h" +#include "log.h" +#include +#include + +log_define("zim.dirent") + +namespace zim +{ + ////////////////////////////////////////////////////////////////////// + // Dirent + // + + const uint16_t Dirent::redirectMimeType; + const uint16_t Dirent::linktargetMimeType; + const uint16_t Dirent::deletedMimeType; + + 
/* Decodes a directory entry from its serialized form.
 *
 * Layout as read below: mime type (uint16 at offset 0), length of the
 * trailing parameter (1 byte at offset 2), namespace (1 byte at offset 3),
 * version (uint32 at offset 4), then either a redirect index or a
 * cluster/blob number pair, followed by the NUL-terminated url, the
 * NUL-terminated title and `extraLen` bytes of parameter data.
 *
 * Throws InvalidSize when the url or title is not terminated inside the
 * buffer, or when the parameter would extend past its end.
 *
 * NOTE(review): the template arguments were reconstructed from the types
 * of the receiving variables (the extracted text had lost everything
 * between angle brackets) -- confirm against the upstream sources.
 */
Dirent::Dirent(std::unique_ptr<Buffer> buffer)
  : Dirent()
{
  uint16_t mimeType = buffer->as<uint16_t>(offset_t(0));
  bool redirect = (mimeType == Dirent::redirectMimeType);
  bool linktarget = (mimeType == Dirent::linktargetMimeType);
  bool deleted = (mimeType == Dirent::deletedMimeType);
  uint8_t extraLen = buffer->data()[2];
  char ns = buffer->data()[3];
  uint32_t version = buffer->as<uint32_t>(offset_t(4));
  setVersion(version);

  offset_t current = offset_t(8);

  if (redirect)
  {
    article_index_t redirectIndex(buffer->as<article_index_type>(current));
    current += sizeof(article_index_t);

    log_debug("redirectIndex=" << redirectIndex);

    setRedirect(article_index_t(redirectIndex));
  }
  else if (linktarget || deleted)
  {
    log_debug("linktarget or deleted entry");
    setArticle(mimeType, cluster_index_t(0), blob_index_t(0));
  }
  else
  {
    log_debug("read article entry");

    uint32_t clusterNumber = buffer->as<uint32_t>(current);
    current += sizeof(uint32_t);
    uint32_t blobNumber = buffer->as<uint32_t>(current);
    current += sizeof(uint32_t);

    log_debug("mimeType=" << mimeType << " clusterNumber=" << clusterNumber << " blobNumber=" << blobNumber);

    setArticle(mimeType, cluster_index_t(clusterNumber), blob_index_t(blobNumber));
  }

  log_debug("read url, title and parameters");

  // Length of the NUL-terminated string starting at `start`. The scan is
  // limited to the bytes the buffer actually holds: running strlen() first
  // and checking afterwards (as this code used to) reads past the end of
  // the buffer whenever the terminator is missing.
  auto boundedLength = [&buffer](offset_t start) -> offset_type {
    const auto total = buffer->size().v;
    if (start.v >= total) {
      throw(InvalidSize());
    }
    const char* first = buffer->data(start);
    const void* terminator = memchr(first, '\0', static_cast<size_t>(total - start.v));
    if (!terminator) {
      throw(InvalidSize());
    }
    return offset_type(static_cast<const char*>(terminator) - first);
  };

  offset_type url_size = boundedLength(current);
  std::string url(buffer->data(current), url_size);
  current += url_size + 1;

  offset_type title_size = boundedLength(current);
  std::string title(buffer->data(current), title_size);
  current += title_size + 1;

  if (current.v + extraLen > buffer->size().v) {
    throw(InvalidSize());
  }
  std::string parameter(buffer->data(current), extraLen);

  setUrl(ns, url);
  setTitle(title);
  setParameter(parameter);

}

std::string Dirent::getLongUrl() const + { + log_trace("Dirent::getLongUrl()"); + log_debug("namespace=" << getNamespace() << " title=" << getTitle()); + + return std::string(1, getNamespace()) + '/' + getUrl(); + } + +} diff --git a/src/endian_tools.h b/src/endian_tools.h new file mode 100644 index 0000000..9bf6bf7 --- /dev/null +++ b/src/endian_tools.h @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ENDIAN_H +#define ENDIAN_H + +#include +#include +#include + +namespace zim +{ + +template +struct ToLittleEndianImpl; + +template +struct ToLittleEndianImpl{ + static void write(const T& d, char* dst) { + uint16_t v = static_cast(d); + dst[0] = static_cast(v); + dst[1] = static_cast(v>>8); + } +}; + +template +struct ToLittleEndianImpl{ + static void write(const T& d, char* dst) { + uint32_t v = static_cast(d); + dst[0] = static_cast(v); + dst[1] = static_cast(v>>8); + dst[2] = static_cast(v>>16); + dst[3] = static_cast(v>>24); +} +}; + +template +struct ToLittleEndianImpl{ + static void write(const T& d, char* dst) { + uint64_t v = static_cast(d); + dst[0] = static_cast(v); + dst[1] = static_cast(v>>8); + dst[2] = static_cast(v>>16); + dst[3] = static_cast(v>>24); + dst[4] = 
static_cast(v>>32); + dst[5] = static_cast(v>>40); + dst[6] = static_cast(v>>48); + dst[7] = static_cast(v>>56); + } +}; + +//////////////////////////////////////////////////////////////////////// +template +inline void toLittleEndian(T d, char* dst) +{ + ToLittleEndianImpl::write(d, dst); +} + +template +inline T fromLittleEndian(const char* ptr) +{ + T ret = 0; + for(size_t i=0; i(static_cast(ptr[i])) << (i*8)); + } + return ret; +} + +} + +#endif // ENDIAN_H + diff --git a/src/envvalue.cpp b/src/envvalue.cpp new file mode 100644 index 0000000..1d5c64f --- /dev/null +++ b/src/envvalue.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include + +namespace zim +{ + unsigned envValue(const char* env, unsigned def) + { + const char* v = ::getenv(env); + if (v) + { + std::istringstream s(v); + s >> def; + } + return def; + } + + unsigned envMemSize(const char* env, unsigned def) + { + const char* v = ::getenv(env); + if (v) + { + char unit = '\0'; + std::istringstream s(v); + s >> def >> unit; + + switch (unit) + { + case 'k': + case 'K': def *= 1024; break; + case 'm': + case 'M': def *= 1024 * 1024; break; + case 'g': + case 'G': def *= 1024 * 1024 * 1024; break; + } + } + return def; + } +} + diff --git a/src/envvalue.h b/src/envvalue.h new file mode 100644 index 0000000..d6dffd4 --- /dev/null +++ b/src/envvalue.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_ENVVALUE_H +#define ZIM_ENVVALUE_H + +namespace zim +{ + unsigned envValue(const char* env, unsigned def); + unsigned envMemSize(const char* env, unsigned def); +} + +#endif // ZIM_ENVVALUE_H diff --git a/src/file.cpp b/src/file.cpp new file mode 100644 index 0000000..635f4ed --- /dev/null +++ b/src/file.cpp @@ -0,0 +1,301 @@ +/* + * Copyright (C) 2006,2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include "fileimpl.h" +#include +#include +#include "log.h" +#include +#include + +log_define("zim.file") + +namespace zim +{ + namespace + { + int hexval(char ch) + { + if (ch >= '0' && ch <= '9') + return ch - '0'; + if (ch >= 'a' && ch <= 'f') + return ch - 'a' + 10; + if (ch >= 'A' && ch <= 'F') + return ch - 'A' + 10; + return -1; + } + } + + File::File(const std::string& fname) + : impl(new FileImpl(fname)) + { } + + const std::string& File::getFilename() const + { + return impl->getFilename(); + } + + const Fileheader& File::getFileheader() const + { + return impl->getFileheader(); + } + + size_type File::getFilesize() const + { + return impl->getFilesize().v; + } + + article_index_type File::getCountArticles() const + { + return article_index_type(impl->getCountArticles()); + } + + + Article File::getArticle(article_index_type idx) const + { + if (idx >= article_index_type(impl->getCountArticles())) + throw ZimFileFormatError("article index out of range"); + return Article(impl, idx); + } + + Article File::getArticle(char ns, const std::string& url) const + { + log_trace("File::getArticle('" << ns << "', \"" << url << ')'); + std::pair r = impl->findx(ns, url); + return r.first ? Article(impl, article_index_type(r.second)) : Article(); + } + + Article File::getArticleByUrl(const std::string& url) const + { + log_trace("File::getArticle(\"" << url << ')'); + std::pair r = impl->findx(url); + return r.first ? 
Article(impl, article_index_type(r.second)) : Article(); + } + + Article File::getArticleByTitle(article_index_type idx) const + { + return Article(impl, article_index_type(impl->getIndexByTitle(article_index_t(idx)))); + } + + Article File::getArticleByTitle(char ns, const std::string& title) const + { + log_trace("File::getArticleByTitle('" << ns << "', \"" << title << ')'); + std::pair r = impl->findxByTitle(ns, title); + return r.first + ? Article(impl, article_index_type(impl->getIndexByTitle(r.second))) + : Article(); + } + + std::shared_ptr File::getCluster(cluster_index_type idx) const + { + return impl->getCluster(cluster_index_t(idx)); + } + + cluster_index_type File::getCountClusters() const + { + return cluster_index_type(impl->getCountClusters()); + } + + offset_type File::getClusterOffset(cluster_index_type idx) const + { + return offset_type(impl->getClusterOffset(cluster_index_t(idx))); + } + + Blob File::getBlob(cluster_index_type clusterIdx, blob_index_type blobIdx) const + { + return impl->getCluster(cluster_index_t(clusterIdx))->getBlob(blob_index_t(blobIdx)); + } + + article_index_type File::getNamespaceBeginOffset(char ch) const + { + return article_index_type(impl->getNamespaceBeginOffset(ch)); + } + + article_index_type File::getNamespaceEndOffset(char ch) const + { + return article_index_type(impl->getNamespaceEndOffset(ch)); + } + + article_index_type File::getNamespaceCount(char ns) const + { + return getNamespaceEndOffset(ns) - getNamespaceBeginOffset(ns); + } + + std::string File::getNamespaces() const + { + return impl->getNamespaces(); + } + + bool File::hasNamespace(char ch) const + { + article_index_t off = impl->getNamespaceBeginOffset(ch); + return off < impl->getCountArticles() && impl->getDirent(off)->getNamespace() == ch; + } + + File::const_iterator File::begin() const + { return const_iterator(this, 0); } + + File::const_iterator File::beginByTitle() const + { return const_iterator(this, 0, const_iterator::ArticleIterator); } 
+ + File::const_iterator File::end() const + { return const_iterator(this, getCountArticles()); } + + File::const_iterator File::find(char ns, const std::string& url) const + { + std::pair r = impl->findx(ns, url); + return File::const_iterator(this, article_index_type(r.second)); + } + + File::const_iterator File::find(const std::string& url) const + { + std::pair r = impl->findx(url); + return File::const_iterator(this, article_index_type(r.second)); + } + + File::const_iterator File::findByTitle(char ns, const std::string& title) const + { + std::pair r = impl->findxByTitle(ns, title); + return File::const_iterator(this, article_index_type(r.second), const_iterator::ArticleIterator); + } + + const Search* File::search(const std::string& query, int start, int end) const { + Search* search = new Search(this); + search->set_query(query); + search->set_range(start, end); + return search; + } + + const Search* File::suggestions(const std::string& query, int start, int end) const { + Search* search = new Search(this); + search->set_query(query); + search->set_range(start, end); + search->set_suggestion_mode(true); + return search; + } + + offset_type File::getOffset(cluster_index_type clusterIdx, blob_index_type blobIdx) const + { + return offset_type(impl->getBlobOffset( + cluster_index_t(clusterIdx), + blob_index_t(blobIdx))); + } + + time_t File::getMTime() const + { + return impl->getMTime(); + } + + const std::string& File::getMimeType(uint16_t idx) const + { + return impl->getMimeType(idx); + } + + std::string File::getChecksum() + { + return impl->getChecksum(); + } + + bool File::verify() + { + return impl->verify(); + } + + bool File::is_multiPart() const + { + return impl->is_multiPart(); + } + + + std::string urldecode(const std::string& url) + { + std::string ret; + enum { + state_0, + state_h1, + state_h2 + } state = state_0; + + char ch = '\0'; + for (std::string::const_iterator it = url.begin(); it != url.end(); ++it) + { + switch (state) + { + case 
state_0: + if (*it == '+') + ret += ' '; + else if (*it == '%') + state = state_h1; + else + ret += *it; + break; + + case state_h1: + if ( (*it >= '0' && *it <= '9') + || (*it >= 'A' && *it <= 'F') + || (*it >= 'a' && *it <= 'f')) + { + ch = *it; + state = state_h2; + } + else + { + ret += '%'; + ret += *it; + state = state_0; + } + break; + + case state_h2: + if ( (*it >= '0' && *it <= '9') + || (*it >= 'A' && *it <= 'F') + || (*it >= 'a' && *it <= 'f')) + { + ret += static_cast(hexval(ch) * 16 + hexval(*it)); + } + else + { + ret += static_cast(hexval(ch)); + ret += *it; + } + state = state_0; + break; + } + + } + + switch (state) + { + case state_0: + break; + + case state_h1: + ret += '%'; + break; + + case state_h2: + ret += '%'; + ret += ch; + break; + } + + return ret; + } +} diff --git a/src/file_compound.cpp b/src/file_compound.cpp new file mode 100644 index 0000000..6d52639 --- /dev/null +++ b/src/file_compound.cpp @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "file_compound.h" +#include "buffer.h" + +#include +#include +#include +#include + +#ifdef _WIN32 +# include +#else +# include +#endif + +
namespace zim {

/* Opens `filename` as a single part. If that fails, the split parts
 * `filename` + "aa", "ab", ... are probed instead and appended in order.
 *
 * Throws std::runtime_error when no part at all could be opened.
 */
FileCompound::FileCompound(const std::string& filename):
  _fsize(0),
  mtime(0)   // 0 == "not looked up yet"; getMTime() tests it before stat()
{
  try {
    auto part = new FilePart<>(filename);
    emplace(Range(offset_t(0), offset_t(part->size().v)), part);
    _fsize = part->size();
  } catch(...) {
    // Keep the errno of the failed single-file open for the error message.
    int errnoSave = errno;
    _fsize = zsize_t(0);
    for (char ch0 = 'a'; ch0 <= 'z'; ++ch0)
    {
      std::string fname0 = filename + ch0;
      for (char ch1 = 'a'; ch1 <= 'z'; ++ch1)
      {
        std::string fname1 = fname0 + ch1;

        try {
          auto currentPart = new FilePart<>(fname1);
          emplace(Range(offset_t(_fsize.v), offset_t((_fsize+currentPart->size()).v)), currentPart);
          _fsize += currentPart->size();
        } catch (...) {
          break;
        }
      }
    }

    if (empty())
    {
      std::ostringstream msg;
      msg << "error " << errnoSave << " opening file \"" << filename;
      throw std::runtime_error(msg.str());
    }
  }
}

/* Wraps a single, already opened part. The part is deleted by the
 * destructor. */
FileCompound::FileCompound(FilePart<>* filePart):
  _fsize(0),
  mtime(0)   // 0 == "not looked up yet"; getMTime() tests it before stat()
{
  emplace(Range(offset_t(0), offset_t(filePart->size().v)), filePart);
  _fsize = filePart->size();
}

/* Deletes every part stored in the map. */
FileCompound::~FileCompound() {
  for (auto& entry : *this) {
    delete entry.second;
  }
}

/* Returns the modification time of the first part.
 *
 * The value is looked up with stat() on first use and cached in `mtime`.
 * An empty compound has no file to inspect and yields the cached value.
 *
 * Throws std::runtime_error when stat() fails.
 */
time_t FileCompound::getMTime() const {
  if (mtime || empty())
    return mtime;

  const char* fname = begin()->second->filename().c_str();

  #if defined(HAVE_STAT64) && ! defined(__APPLE__)
    struct stat64 st;
    int ret = ::stat64(fname, &st);
  #else
    struct stat st;
    int ret = ::stat(fname, &st);
  #endif
  if (ret != 0)
  {
    std::ostringstream msg;
    msg << "stat failed with errno " << errno << " : " << strerror(errno);
    throw std::runtime_error(msg.str());
  }
  mtime = st.st_mtime;

  return mtime;

}

} // zim
diff --git a/src/file_compound.h b/src/file_compound.h new file mode 100644 index 0000000..a6b7490 --- /dev/null +++ b/src/file_compound.h @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FILE_COMPOUND_H_ +#define ZIM_FILE_COMPOUND_H_ + +#include "file_part.h" +#include "zim_types.h" +#include +#include +#include + +namespace zim { + +class FileReader; + +struct Range { + Range(const offset_t point ) : min(point), max(point) {} + Range(const offset_t min, const offset_t max) : min(min), max(max) {} + const offset_t min; + const offset_t max; +}; + +struct less_range : public std::binary_function< Range, Range, bool> +{ + bool operator()(const Range& lhs, const Range& rhs) const { + return lhs.min < rhs.min && lhs.max <= rhs.min; + } +}; + +class FileCompound : public std::map*, less_range> { + public: + FileCompound(const std::string& filename); + FileCompound(FilePart<>* fpart); + ~FileCompound(); + + zsize_t fsize() const { return _fsize; }; + time_t getMTime() const; + bool fail() const { return empty(); }; + bool is_multiPart() const { return size() > 1; }; + + std::pair + locate(offset_t offset, zsize_t size) const { + return equal_range(Range(offset, offset+size)); + } + + private: + zsize_t _fsize; + mutable time_t mtime; +}; + + +}; + + +#endif //ZIM_FILE_COMPOUND_H_ diff --git a/src/file_part.h b/src/file_part.h new file mode 100644 index 0000000..3867d29 --- /dev/null +++ b/src/file_part.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FILE_PART_H_ +#define ZIM_FILE_PART_H_ + +#include +#include + +#include + +#include "zim_types.h" +#include "fs.h" + +namespace zim { + +template +class FilePart { + public: + FilePart(const std::string& filename) : + m_filename(filename), + m_fhandle(FS::openFile(filename)), + m_size(m_fhandle.getSize()) {} + FilePart(int fd) : + m_filename(""), + m_fhandle(fd), + m_size(m_fhandle.getSize()) {} + ~FilePart() = default; + const std::string& filename() const { return m_filename; }; + const typename FS::FD& fhandle() const { return m_fhandle; }; + + zsize_t size() const { return m_size; }; + bool fail() const { return !m_size; }; + bool good() const { return bool(m_size); }; + + private: + const std::string m_filename; + typename FS::FD m_fhandle; + zsize_t m_size; +}; + +}; + +#endif //ZIM_FILE_PART_H_ diff --git a/src/file_reader.cpp b/src/file_reader.cpp new file mode 100644 index 0000000..a752d70 --- /dev/null +++ b/src/file_reader.cpp @@ -0,0 +1,366 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include "file_reader.h" +#include "file_compound.h" +#include "buffer.h" +#include "config.h" +#include "envvalue.h" +#include +#include +#include +#include +#include +#include +#include +#include + + +#if defined(_MSC_VER) +# include +# include + typedef SSIZE_T ssize_t; +#endif + +#if defined(ENABLE_ZLIB) +#include +#endif + +namespace zim { + +FileReader::FileReader(std::shared_ptr source) + : FileReader(source, offset_t(0), source->fsize()) {} + +FileReader::FileReader(std::shared_ptr source, offset_t offset) + : FileReader(source, offset, zsize_t(source->fsize().v-offset.v)) {} + +FileReader::FileReader(std::shared_ptr source, offset_t offset, zsize_t size) + : source(source), + _offset(offset), + _size(size) +{ + ASSERT(offset.v, <, source->fsize().v); + ASSERT(offset.v+size.v, <=, source->fsize().v); +} + +char FileReader::read(offset_t offset) const { + ASSERT(offset.v, <, _size.v); + offset += _offset; + auto part_pair = source->lower_bound(offset); + auto& fhandle = part_pair->second->fhandle(); + offset_t local_offset = offset - part_pair->first.min; + ASSERT(local_offset, <=, part_pair->first.max); + char ret; + try { + fhandle.readAt(&ret, zsize_t(1), local_offset); + } catch (std::runtime_error& e) { + //Error while reading. 
+ std::ostringstream s; + s << "Cannot read a char.\n"; + s << " - File part is " << part_pair->second->filename() << "\n"; + s << " - File part size is " << part_pair->second->size().v << "\n"; + s << " - File part range is " << part_pair->first.min << "-" << part_pair->first.max << "\n"; + s << " - Reading offset at " << offset.v << "\n"; + s << " - local offset is " << local_offset.v << "\n"; + s << " - error is " << strerror(errno) << "\n"; + std::error_code ec(errno, std::generic_category()); + throw std::system_error(ec, s.str()); + }; + return ret; +} + + +void FileReader::read(char* dest, offset_t offset, zsize_t size) const { + ASSERT(offset.v, <, _size.v); + ASSERT(offset.v+size.v, <=, _size.v); + if (! size ) { + return; + } + offset += _offset; + auto found_range = source->locate(offset, size); + for(auto current = found_range.first; current!=found_range.second; current++){ + auto part = current->second; + Range partRange = current->first; + offset_t local_offset = offset-partRange.min; + ASSERT(size.v, >, 0U); + zsize_t size_to_get = zsize_t(std::min(size.v, part->size().v-local_offset.v)); + try { + part->fhandle().readAt(dest, size_to_get, local_offset); + } catch (std::runtime_error& e) { + std::ostringstream s; + s << "Cannot read chars.\n"; + s << " - File part is " << part->filename() << "\n"; + s << " - File part size is " << part->size().v << "\n"; + s << " - File part range is " << partRange.min << "-" << partRange.max << "\n"; + s << " - size_to_get is " << size_to_get.v << "\n"; + s << " - total size is " << size.v << "\n"; + s << " - Reading offset at " << offset.v << "\n"; + s << " - local offset is " << local_offset.v << "\n"; + s << " - error is " << strerror(errno) << "\n"; + std::error_code ec(errno, std::generic_category()); + throw std::system_error(ec, s.str()); + }; + ASSERT(size_to_get, <=, size); + dest += size_to_get.v; + size -= size_to_get; + offset += size_to_get; + } + ASSERT(size.v, ==, 0U); +} + + +std::shared_ptr 
FileReader::get_buffer(offset_t offset, zsize_t size) const { + ASSERT(size, <=, _size); +#ifdef ENABLE_USE_MMAP + try { + auto found_range = source->locate(_offset+offset, size); + auto first_part_containing_it = found_range.first; + if (++first_part_containing_it != found_range.second) { + throw MMapException(); + } + + // The range is in only one part + auto range = found_range.first->first; + auto part = found_range.first->second; + auto local_offset = offset + _offset - range.min; + ASSERT(size, <=, part->size()); + int fd = part->fhandle().getNativeHandle(); + auto buffer = std::shared_ptr(new MMapBuffer(fd, local_offset, size)); + return buffer; + } catch(MMapException& e) +#endif + { + // The range is several part, or we are on Windows. + // We will have to do some memory copies :/ + // [TODO] Use Windows equivalent for mmap. + char* p = new char[size.v]; + auto ret_buffer = std::shared_ptr(new MemoryBuffer(p, size)); + read(p, offset, size); + return ret_buffer; + } +} + +bool Reader::can_read(offset_t offset, zsize_t size) +{ + return (offset.v <= this->size().v && (offset.v+size.v) <= this->size().v); +} + +char* lzma_uncompress(const char* raw_data, zsize_t raw_size, zsize_t* dest_size) { + // We don't know what will be the result size. 
+ // Let's assume it will be something like the minChunkSize used at creation + zsize_t _dest_size = zsize_t(1024*1024); + char* ret_data = new char[_dest_size.v]; + lzma_stream stream = LZMA_STREAM_INIT; + unsigned memsize = envMemSize("ZIM_LZMA_MEMORY_SIZE", LZMA_MEMORY_SIZE * 1024 * 1024); + auto errcode = lzma_stream_decoder(&stream, memsize, 0); + if (errcode != LZMA_OK) { + throw std::runtime_error("Impossible to allocated needed memory to uncompress lzma stream"); + } + + stream.next_in = (const unsigned char*)raw_data; + stream.avail_in = raw_size.v; + stream.next_out = (unsigned char*) ret_data; + stream.avail_out = _dest_size.v; + do { + errcode = lzma_code(&stream, LZMA_FINISH); + if (errcode == LZMA_BUF_ERROR) { + if (stream.avail_in == 0 && stream.avail_out != 0) { + // End of input stream. + // lzma haven't recognize the end of the input stream but there is no + // more input. + // As we know that we should have all the input stream, it is probably + // because the stream has not been close correctly at zim creation. + // It means that the lzma stream is not full and this is an error in the + // zim file. 
+ } else { + //Not enought output size + _dest_size.v *= 2; + char * new_ret_data = new char[_dest_size.v]; + memcpy(new_ret_data, ret_data, stream.total_out); + stream.next_out = (unsigned char*)(new_ret_data + stream.total_out); + stream.avail_out = _dest_size.v - stream.total_out; + delete [] ret_data; + ret_data = new_ret_data; + continue; + } + } + if (errcode != LZMA_STREAM_END && errcode != LZMA_OK) { + throw ZimFileFormatError("Invalid lzma stream for cluster."); + } + } while (errcode != LZMA_STREAM_END); + dest_size->v = stream.total_out; + lzma_end(&stream); + return ret_data; +} + +#if defined(ENABLE_ZLIB) +char* zip_uncompress(const char* raw_data, zsize_t raw_size, zsize_t* dest_size) { + zsize_t _dest_size = zsize_t(1024*1024); + char* ret_data = new char[_dest_size.v]; + + z_stream stream; + memset(&stream, 0, sizeof(stream)); + + stream.next_in = (unsigned char*) raw_data; + stream.avail_in = raw_size.v; + stream.next_out = (unsigned char*) ret_data; + stream.avail_out = _dest_size.v; + auto errcode = ::inflateInit(&stream); + if (errcode != Z_OK) { + throw std::runtime_error("Impossible to allocated needed memory to uncompress zlib stream"); + } + do { + + errcode = ::inflate(&stream, Z_FINISH); + if (errcode == Z_BUF_ERROR ) { + if (stream.avail_in == 0 && stream.avail_out != 0) { + // End of input stream. + // zlib haven't recognize the end of the input stream but there is no + // more input. + // As we know that we should have all the input stream, it is probably + // because the stream has not been close correctly at zim creation. + // It means that the zlib stream is not full and this is an error in the + // zim file. 
+ } else { + //Not enought output size + _dest_size.v *= 2; + char * new_ret_data = new char[_dest_size.v]; + memcpy(new_ret_data, ret_data, stream.total_out); + stream.next_out = (unsigned char*)(new_ret_data + stream.total_out); + stream.avail_out = _dest_size.v - stream.total_out; + delete [] ret_data; + ret_data = new_ret_data; + continue; + } + } + if (errcode != Z_STREAM_END && errcode != Z_OK) { + throw ZimFileFormatError("Invalid zlib stream for cluster."); + } + } while ( errcode != Z_STREAM_END ); + dest_size->v = stream.total_out; + ::inflateEnd(&stream); + return ret_data; +} +#endif + +std::shared_ptr Reader::get_clusterBuffer(offset_t offset, zsize_t size, CompressionType comp) const +{ + auto raw_buffer = get_buffer(offset, size); + zsize_t uncompressed_size(0); + char* uncompressed_data = nullptr; + switch (comp) { + case zimcompLzma: + uncompressed_data = lzma_uncompress(raw_buffer->data(), size, &uncompressed_size); + break; + case zimcompZip: +#if defined(ENABLE_ZLIB) + uncompressed_data = zip_uncompress(raw_buffer->data(), size, &uncompressed_size); +#else + throw std::runtime_error("zlib not enabled in this library"); +#endif + break; + default: + throw std::logic_error("compressions should not be something else than zimcompLzma or zimComZip."); + } + return std::shared_ptr(new MemoryBuffer(uncompressed_data, uncompressed_size)); +} + +std::unique_ptr Reader::sub_clusterReader(offset_t offset, zsize_t size, CompressionType* comp, bool* extended) const { + uint8_t clusterInfo = read(offset); + *comp = static_cast(clusterInfo & 0x0F); + *extended = clusterInfo & 0x10; + + switch (*comp) { + case zimcompDefault: + case zimcompNone: + { + // No compression, just a sub_reader + return sub_reader(offset+offset_t(1), size-zsize_t(1)); + } + break; + case zimcompLzma: + case zimcompZip: + { + auto buffer = get_clusterBuffer(offset+offset_t(1), size-zsize_t(1), *comp); + return std::unique_ptr(new BufferReader(buffer)); + } + break; + case zimcompBzip2: 
+ throw std::runtime_error("bzip2 not enabled in this library"); + default: + throw ZimFileFormatError("Invalid compression flag"); + } +} + +std::unique_ptr FileReader::sub_reader(offset_t offset, zsize_t size) const +{ + ASSERT(size, <=, _size); + return std::unique_ptr(new FileReader(source, _offset+offset, size)); +} + + +//BufferReader::BufferReader(std::shared_ptr source) +// : source(source) {} + +std::shared_ptr BufferReader::get_buffer(offset_t offset, zsize_t size) const +{ + return source->sub_buffer(offset, size); +} + +std::unique_ptr BufferReader::sub_reader(offset_t offset, zsize_t size) const +{ + //auto source_addr = source->data(0); + auto sub_buff = get_buffer(offset, size); + //auto buff_addr = sub_buff->data(0); + std::unique_ptr sub_read(new BufferReader(sub_buff)); + return sub_read; +} + +zsize_t BufferReader::size() const +{ + return source->size(); +} + +offset_t BufferReader::offset() const +{ + return offset_t((offset_type)(static_cast(source->data(offset_t(0))))); +} + + +void BufferReader::read(char* dest, offset_t offset, zsize_t size) const { + ASSERT(offset.v, <, source->size().v); + ASSERT(offset+offset_t(size.v), <=, offset_t(source->size().v)); + if (! size ) { + return; + } + memcpy(dest, source->data(offset), size.v); +} + + +char BufferReader::read(offset_t offset) const { + ASSERT(offset.v, <, source->size().v); + char dest; + dest = *source->data(offset); + return dest; +} + + +} // zim diff --git a/src/file_reader.h b/src/file_reader.h new file mode 100644 index 0000000..ceec9ce --- /dev/null +++ b/src/file_reader.h @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FILE_READER_H_ +#define ZIM_FILE_READER_H_ + +#include + +#include "zim_types.h" +#include "endian_tools.h" +#include "debug.h" + +namespace zim { + +class Buffer; +class FileCompound; + +class Reader { + public: + Reader() {}; + virtual zsize_t size() const = 0; + virtual ~Reader() {}; + + virtual void read(char* dest, offset_t offset, zsize_t size) const = 0; + template + T read(offset_t offset) const { + ASSERT(offset.v, <, size().v); + ASSERT(offset.v+sizeof(T), <=, size().v); + char tmp_buf[sizeof(T)]; + read(tmp_buf, offset, zsize_t(sizeof(T))); + return fromLittleEndian(tmp_buf); + } + virtual char read(offset_t offset) const = 0; + + virtual std::shared_ptr get_buffer(offset_t offset, zsize_t size) const = 0; + std::shared_ptr get_buffer(offset_t offset) const { + return get_buffer(offset, zsize_t(size().v-offset.v)); + } + virtual std::unique_ptr sub_reader(offset_t offset, zsize_t size) const = 0; + std::unique_ptr sub_reader(offset_t offset) const { + return sub_reader(offset, zsize_t(size().v-offset.v)); + } + virtual offset_t offset() const = 0; + + std::unique_ptr sub_clusterReader(offset_t offset, + zsize_t size, + CompressionType* comp, + bool* extented) const; + + bool can_read(offset_t offset, zsize_t size); + + private: + std::shared_ptr get_clusterBuffer(offset_t offset, zsize_t size, CompressionType comp) const; +}; + +class FileReader : public Reader { + public: + FileReader(std::shared_ptr source); + ~FileReader() {}; + + 
zsize_t size() const { return _size; }; + offset_t offset() const { return _offset; }; + + char read(offset_t offset) const; + void read(char* dest, offset_t offset, zsize_t size) const; + std::shared_ptr get_buffer(offset_t offset, zsize_t size) const; + + std::unique_ptr sub_reader(offset_t offest, zsize_t size) const; + + private: + FileReader(std::shared_ptr source, offset_t offset); + FileReader(std::shared_ptr source, offset_t offset, zsize_t size); + + std::shared_ptr source; + offset_t _offset; + zsize_t _size; +}; + +class BufferReader : public Reader { + public: + BufferReader(std::shared_ptr source) + : source(source) {} + virtual ~BufferReader() {}; + + zsize_t size() const; + offset_t offset() const; + + void read(char* dest, offset_t offset, zsize_t size) const; + char read(offset_t offset) const; + std::shared_ptr get_buffer(offset_t offset, zsize_t size) const; + std::unique_ptr sub_reader(offset_t offset, zsize_t size) const; + + private: + std::shared_ptr source; +}; + +}; + +#endif // ZIM_FILE_READER_H_ diff --git a/src/fileheader.cpp b/src/fileheader.cpp new file mode 100644 index 0000000..9e03c6c --- /dev/null +++ b/src/fileheader.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2008 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include +#include +#include "log.h" +#include "endian_tools.h" +#include "buffer.h" + +log_define("zim.file.header") + +namespace zim +{ + const uint32_t Fileheader::zimMagic = 0x044d495a; // ="ZIM^d" + const uint16_t Fileheader::zimClassicMajorVersion = 5; + const uint16_t Fileheader::zimExtendedMajorVersion = 6; + const uint16_t Fileheader::zimMinorVersion = 0; + const offset_type Fileheader::size = 80; // This is also mimeListPos (so an offset) + + std::ostream& operator<< (std::ostream& out, const Fileheader& fh) + { + char header[Fileheader::size]; + toLittleEndian(Fileheader::zimMagic, header); + toLittleEndian(fh.getMajorVersion(), header + 4); + toLittleEndian(fh.getMinorVersion(), header + 6); + std::copy(fh.getUuid().data, fh.getUuid().data + sizeof(Uuid), header + 8); + toLittleEndian(fh.getArticleCount(), header + 24); + toLittleEndian(fh.getClusterCount(), header + 28); + toLittleEndian(fh.getUrlPtrPos(), header + 32); + toLittleEndian(fh.getTitleIdxPos(), header + 40); + toLittleEndian(fh.getClusterPtrPos(), header + 48); + toLittleEndian(fh.getMimeListPos(), header + 56); + toLittleEndian(fh.getMainPage(), header + 64); + toLittleEndian(fh.getLayoutPage(), header + 68); + toLittleEndian(fh.getChecksumPos(), header + 72); + + out.write(header, Fileheader::size); + + return out; + } + + void Fileheader::read(std::shared_ptr buffer) + { + uint32_t magicNumber = buffer->as(offset_t(0)); + if (magicNumber != Fileheader::zimMagic) + { + log_error("invalid magic number " << magicNumber << " found - " + << Fileheader::zimMagic << " expected"); + throw ZimFileFormatError("Invalid magic number"); + } + + uint16_t major_version = buffer->as(offset_t(4)); + if (major_version != zimClassicMajorVersion && major_version 
!= zimExtendedMajorVersion) + { + log_error("invalid zimfile major version " << major_version << " found - " + << Fileheader::zimMajorVersion << " expected"); + throw ZimFileFormatError("Invalid version"); + } + setMajorVersion(major_version); + + setMinorVersion(buffer->as(offset_t(6))); + + Uuid uuid; + std::copy(buffer->data(offset_t(8)), buffer->data(offset_t(24)), uuid.data); + setUuid(uuid); + + setArticleCount(buffer->as(offset_t(24))); + setClusterCount(buffer->as(offset_t(28))); + setUrlPtrPos(buffer->as(offset_t(32))); + setTitleIdxPos(buffer->as(offset_t(40))); + setClusterPtrPos(buffer->as(offset_t(48))); + setMimeListPos(buffer->as(offset_t(56))); + setMainPage(buffer->as(offset_t(64))); + setLayoutPage(buffer->as(offset_t(68))); + setChecksumPos(buffer->as(offset_t(72))); + + sanity_check(); + } + + void Fileheader::sanity_check() const { + if (!!articleCount != !!clusterCount) { + throw ZimFileFormatError("No article <=> No cluster"); + } + + if (mimeListPos != size && mimeListPos != 72) { + throw ZimFileFormatError("mimelistPos must be 80."); + } + + if (urlPtrPos < mimeListPos) { + throw ZimFileFormatError("urlPtrPos must be > mimelistPos."); + } + if (titleIdxPos < mimeListPos) { + throw ZimFileFormatError("titleIdxPos must be > mimelistPos."); + } + if (clusterPtrPos < mimeListPos) { + throw ZimFileFormatError("clusterPtrPos must be > mimelistPos."); + } + + if (clusterCount > articleCount) { + throw ZimFileFormatError("Cluster count cannot be higher than article count."); + } + + if (checksumPos != 0 && checksumPos < mimeListPos) { + throw ZimFileFormatError("checksumPos must be > mimeListPos."); + } + } + +} diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp new file mode 100644 index 0000000..e255fb3 --- /dev/null +++ b/src/fileimpl.cpp @@ -0,0 +1,585 @@ +/* + * Copyright (C) 2006,2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * 
published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "fileimpl.h" +#include +#include "_dirent.h" +#include "file_compound.h" +#include "file_reader.h" +#include +#include +#include +#include +#include +#include +#include +#include "config.h" +#include "log.h" +#include "envvalue.h" +#include "md5stream.h" + +log_define("zim.file.impl") + +namespace zim +{ + ////////////////////////////////////////////////////////////////////// + // FileImpl + // + FileImpl::FileImpl(const std::string& fname) + : zimFile(new FileCompound(fname)), + zimReader(new FileReader(zimFile)), + bufferDirentZone(256), + bufferDirentLock(PTHREAD_MUTEX_INITIALIZER), + filename(fname), + direntCache(envValue("ZIM_DIRENTCACHE", DIRENT_CACHE_SIZE)), + direntCacheLock(PTHREAD_MUTEX_INITIALIZER), + clusterCache(envValue("ZIM_CLUSTERCACHE", CLUSTER_CACHE_SIZE)), + clusterCacheLock(PTHREAD_MUTEX_INITIALIZER), + cacheUncompressedCluster(envValue("ZIM_CACHEUNCOMPRESSEDCLUSTER", false)), + namespaceBeginLock(PTHREAD_MUTEX_INITIALIZER), + namespaceEndLock(PTHREAD_MUTEX_INITIALIZER) + { + log_trace("read file \"" << fname << '"'); + + if (zimFile->fail()) + throw ZimFileFormatError(std::string("can't open zim-file \"") + fname + '"'); + + filename = fname; + + // read header + if (size_type(zimReader->size()) < Fileheader::size) { + throw ZimFileFormatError("zim-file is too small to contain a header"); + } + try { + 
header.read(zimReader->get_buffer(offset_t(0), zsize_t(Fileheader::size))); + } catch (ZimFileFormatError& e) { + throw e; + } catch (...) { + throw ZimFileFormatError("error reading zim-file header."); + } + + // urlPtrOffsetReader + zsize_t size(header.getArticleCount() * 8); + if (!zimReader->can_read(offset_t(header.getUrlPtrPos()), size)) { + throw ZimFileFormatError("Reading out of zim file."); + } +#ifdef ENABLE_USE_BUFFER_HEADER + urlPtrOffsetReader = std::unique_ptr(new BufferReader( + zimReader->get_buffer(offset_t(header.getUrlPtrPos()), size))); +#else + urlPtrOffsetReader = zimReader->sub_reader(offset_t(header.getUrlPtrPos()), size); +#endif + + // Create titleIndexBuffer + size = zsize_t(header.getArticleCount() * 4); + if (!zimReader->can_read(offset_t(header.getTitleIdxPos()), size)) { + throw ZimFileFormatError("Reading out of zim file."); + } +#ifdef ENABLE_USE_BUFFER_HEADER + titleIndexReader = std::unique_ptr(new BufferReader( + zimReader->get_buffer(offset_t(header.getTitleIdxPos()), size))); +#else + titleIndexReader = zimReader->sub_reader(offset_t(header.getTitleIdxPos()), size); +#endif + + // clusterOffsetBuffer + size = zsize_t(header.getClusterCount() * 8); + if (!zimReader->can_read(offset_t(header.getClusterPtrPos()), size)) { + throw ZimFileFormatError("Reading out of zim file."); + } +#ifdef ENABLE_USE_BUFFER_HEADER + clusterOffsetReader = std::unique_ptr(new BufferReader( + zimReader->get_buffer(offset_t(header.getClusterPtrPos()), size))); +#else + clusterOffsetReader = zimReader->sub_reader(offset_t(header.getClusterPtrPos()), size); +#endif + + if (!getCountClusters()) + log_warn("no clusters found"); + else + { + offset_t lastOffset = getClusterOffset(cluster_index_t(cluster_index_type(getCountClusters()) - 1)); + log_debug("last offset=" << lastOffset.v << " file size=" << zimFile->fsize().v); + if (lastOffset.v > zimFile->fsize().v) + { + log_fatal("last offset (" << lastOffset << ") larger than file size (" << 
zimFile->fsize() << ')'); + throw ZimFileFormatError("last cluster offset larger than file size; file corrupt"); + } + } + + if (header.hasChecksum() && header.getChecksumPos() != (zimFile->fsize().v-16) ) { + throw ZimFileFormatError("Checksum position is not valid"); + } + + // read mime types + size = zsize_t(header.getUrlPtrPos() - header.getMimeListPos()); + // No need to check access, getUrlPtrPos is in the zim file, and we are + // sure that getMimeListPos is 80. + auto buffer = zimReader->get_buffer(offset_t(header.getMimeListPos()), size); + offset_t current = offset_t(0); + while (current.v < size.v) + { + offset_type len = strlen(buffer->data(current)); + + if (len == 0) { + break; + } + + if (current.v + len >= size.v) { + throw(ZimFileFormatError("Error getting mimelists.")); + } + + std::string mimeType(buffer->data(current), len); + mimeTypes.push_back(mimeType); + + current += (len + 1); + } + } + + std::pair FileImpl::findx(char ns, const std::string& url) + { + log_debug("find article by url " << ns << " \"" << url << "\", in file \"" << getFilename() << '"'); + + article_index_type l = article_index_type(getNamespaceBeginOffset(ns)); + article_index_type u = article_index_type(getNamespaceEndOffset(ns)); + + if (l == u) + { + log_debug("namespace " << ns << " not found"); + return std::pair(false, article_index_t(0)); + } + + unsigned itcount = 0; + while (u - l > 1) + { + ++itcount; + article_index_type p = l + (u - l) / 2; + auto d = getDirent(article_index_t(p)); + + int c = ns < d->getNamespace() ? -1 + : ns > d->getNamespace() ? 
1 + : url.compare(d->getUrl()); + + if (c < 0) + u = p; + else if (c > 0) + l = p; + else + { + log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << p); + return std::pair(true, article_index_t(p)); + } + } + + auto d = getDirent(article_index_t(l)); + int c = url.compare(d->getUrl()); + + if (c == 0) + { + log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << l); + return std::pair(true, article_index_t(l)); + } + + // d is a pointer-like handle (see d->getUrl() above), so it must be dereferenced with -> here too. + log_debug("article not found after " << itcount << " iterations (\"" << d->getUrl() << "\" does not match)"); + return std::pair(false, article_index_t(c < 0 ? l : u)); + } + + // Lookup by full url of the form "N/path" (one optional leading '/' is skipped): + // the character before the separator is the namespace, the rest is the url. + // Anything not matching that form yields (false, 0). + std::pair FileImpl::findx(const std::string& url) + { + size_t start = 0; + if (url[0] == '/') { + start = 1; + } + if (url.size() < (2+start) || url[1+start] != '/') + return std::pair(false, article_index_t(0)); + return findx(url[start], url.substr(2+start)); + } + + // Same binary search as findx(ns, url), but comparing titles and reading the + // dirents through getDirentByTitle. + std::pair FileImpl::findxByTitle(char ns, const std::string& title) + { + log_debug("find article by title " << ns << " \"" << title << "\", in file \"" << getFilename() << '"'); + + article_index_type l = article_index_type(getNamespaceBeginOffset(ns)); + article_index_type u = article_index_type(getNamespaceEndOffset(ns)); + + if (l == u) + { + log_debug("namespace " << ns << " not found"); + return std::pair(false, article_index_t(0)); + } + + unsigned itcount = 0; + while (u - l > 1) + { + ++itcount; + article_index_type p = l + (u - l) / 2; + auto d = getDirentByTitle(article_index_t(p)); + + int c = ns < d->getNamespace() ? -1 + : ns > d->getNamespace() ? 
1 + : title.compare(d->getTitle()); + + if (c < 0) + u = p; + else if (c > 0) + l = p; + else + { + log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << p); + return std::pair(true, article_index_t(p)); + } + } + + auto d = getDirentByTitle(article_index_t(l)); + int c = title.compare(d->getTitle()); + + if (c == 0) + { + log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << l); + return std::pair(true, article_index_t(l)); + } + + // d is a pointer-like handle (see d->getTitle() above), so it must be dereferenced with -> here too. + log_debug("article not found after " << itcount << " iterations (\"" << d->getTitle() << "\" does not match)"); + return std::pair(false, article_index_t(c < 0 ? l : u)); + } + + // Forwards to the underlying file compound's locate(). + std::pair + FileImpl::getFileParts(offset_t offset, zsize_t size) + { + return zimFile->locate(offset, size); + } + + // Returns the dirent stored at index idx, taken from direntCache when it is + // already there. Throws ZimFileFormatError when idx is out of range. + std::shared_ptr FileImpl::getDirent(article_index_t idx) + { + log_trace("FileImpl::getDirent(" << idx << ')'); + + if (idx >= getCountArticles()) + throw ZimFileFormatError("article index out of range"); + + pthread_mutex_lock(&direntCacheLock); + auto v = direntCache.getx(idx); + if (v.first) + { + log_debug("dirent " << idx << " found in cache; hits " + << direntCache.getHits() << " misses " + << direntCache.getMisses() << " ratio " + << direntCache.hitRatio() * 100 << "% fillfactor " + << direntCache.fillfactor()); + pthread_mutex_unlock(&direntCacheLock); + return v.second; + } + + log_debug("dirent " << idx << " not found in cache; hits " + << direntCache.getHits() << " misses " << direntCache.getMisses() + << " ratio " << direntCache.hitRatio() * 100 << "% fillfactor " + << direntCache.fillfactor()); + pthread_mutex_unlock(&direntCacheLock); + + offset_t indexOffset = getOffset(urlPtrOffsetReader.get(), idx.v); + // We don't know the size of the dirent because it depends on the size of + // the title, url and extra parameters. + // This is a pity but we have no choice. 
+ // We cannot take a buffer of the size of the file, it would be really inefficient. + // Let's do try, catch and retry while chosing a smart value for the buffer size. + // Most dirent will be "Article" entry (header's size == 16) without extra parameters. + // Let's hope that url + title size will be < 256 and if not try again with a bigger size. + + pthread_mutex_lock(&bufferDirentLock); + zsize_t bufferSize = zsize_t(256); + std::shared_ptr dirent; + while (true) { + bufferDirentZone.reserve(size_type(bufferSize)); + zimReader->read(bufferDirentZone.data(), indexOffset, bufferSize); + auto direntBuffer = std::unique_ptr(new MemoryBuffer(bufferDirentZone.data(), bufferSize)); + try { + dirent = std::make_shared(std::move(direntBuffer)); + } catch (InvalidSize&) { + // buffer size is not enougth, try again : + bufferSize += 256; + continue; + } + // Success ! + break; + } + pthread_mutex_unlock(&bufferDirentLock); + + log_debug("dirent read from " << indexOffset); + pthread_mutex_lock(&direntCacheLock); + direntCache.put(idx, dirent); + pthread_mutex_unlock(&direntCacheLock); + + return dirent; + } + + std::shared_ptr FileImpl::getDirentByTitle(article_index_t idx) + { + if (idx >= getCountArticles()) + throw ZimFileFormatError("article index out of range"); + return getDirent(getIndexByTitle(idx)); + } + + article_index_t FileImpl::getIndexByTitle(article_index_t idx) + { + if (idx >= getCountArticles()) + throw ZimFileFormatError("article index out of range"); + + article_index_t ret(titleIndexReader->read( + offset_t(sizeof(article_index_t)*idx.v))); + + return ret; + } + + std::shared_ptr FileImpl::getCluster(cluster_index_t idx) + { + if (idx >= getCountClusters()) + throw ZimFileFormatError("cluster index out of range"); + + pthread_mutex_lock(&clusterCacheLock); + auto cluster(clusterCache.get(idx)); + pthread_mutex_unlock(&clusterCacheLock); + if (cluster) + { + log_debug("cluster " << idx << " found in cache; hits " << clusterCache.getHits() << " misses 
" << clusterCache.getMisses() << " ratio " << clusterCache.hitRatio() * 100 << "% fillfactor " << clusterCache.fillfactor()); + return cluster; + } + + offset_t clusterOffset(getClusterOffset(idx)); + cluster_index_t next_idx(idx.v + 1); + offset_t nextClusterOffset( (next_idx < getCountClusters()) + ? getClusterOffset(next_idx).v + : (header.hasChecksum()) + ? header.getChecksumPos() + : zimFile->fsize().v ); + zsize_t clusterSize(nextClusterOffset.v - clusterOffset.v); + log_debug("read cluster " << idx << " from offset " << clusterOffset); + CompressionType comp; + bool extended; + std::shared_ptr reader = zimReader->sub_clusterReader(clusterOffset, clusterSize, &comp, &extended); + cluster = std::shared_ptr(new Cluster(reader, comp, extended)); + + log_debug("put cluster " << idx << " into cluster cache; hits " << clusterCache.getHits() << " misses " << clusterCache.getMisses() << " ratio " << clusterCache.hitRatio() * 100 << "% fillfactor " << clusterCache.fillfactor()); + pthread_mutex_lock(&clusterCacheLock); + clusterCache.put(idx, cluster); + pthread_mutex_unlock(&clusterCacheLock); + + return cluster; + } + + offset_t FileImpl::getOffset(const Reader* reader, size_t idx) + { + offset_t offset(reader->read(offset_t(sizeof(offset_type)*idx))); + return offset; + } + + offset_t FileImpl::getClusterOffset(cluster_index_t idx) + { + return getOffset(clusterOffsetReader.get(), idx.v); + } + + offset_t FileImpl::getBlobOffset(cluster_index_t clusterIdx, blob_index_t blobIdx) + { + auto cluster = getCluster(clusterIdx); + if (cluster->isCompressed()) + return offset_t(0); + return getClusterOffset(clusterIdx) + offset_t(1) + cluster->getBlobOffset(blobIdx); + } + + article_index_t FileImpl::getNamespaceBeginOffset(char ch) + { + log_trace("getNamespaceBeginOffset(" << ch << ')'); + + pthread_mutex_lock(&namespaceBeginLock); + NamespaceCache::const_iterator it = namespaceBeginCache.find(ch); + if (it != namespaceBeginCache.end()) + { + article_index_t 
ret(it->second); + pthread_mutex_unlock(&namespaceBeginLock); + return ret; + } + pthread_mutex_unlock(&namespaceBeginLock); + + article_index_type lower = 0; + article_index_type upper = article_index_type(getCountArticles()); + auto d = getDirent(article_index_t(0)); + while (upper - lower > 1) + { + article_index_type m = lower + (upper - lower) / 2; + auto d = getDirent(article_index_t(m)); + if (d->getNamespace() >= ch) + upper = m; + else + lower = m; + } + + article_index_t ret = article_index_t(d->getNamespace() < ch ? upper : lower); + pthread_mutex_lock(&namespaceBeginLock); + namespaceBeginCache[ch] = ret; + pthread_mutex_unlock(&namespaceBeginLock); + + return ret; + } + + article_index_t FileImpl::getNamespaceEndOffset(char ch) + { + log_trace("getNamespaceEndOffset(" << ch << ')'); + + pthread_mutex_lock(&namespaceEndLock); + NamespaceCache::const_iterator it = namespaceEndCache.find(ch); + if (it != namespaceEndCache.end()) + { + article_index_t ret = it->second; + pthread_mutex_unlock(&namespaceEndLock); + return ret; + } + pthread_mutex_unlock(&namespaceEndLock); + + article_index_type lower = 0; + article_index_type upper = article_index_type(getCountArticles()); + log_debug("namespace " << ch << " lower=" << lower << " upper=" << upper); + while (upper - lower > 1) + { + article_index_type m = lower + (upper - lower) / 2; + auto d = getDirent(article_index_t(m)); + if (d->getNamespace() > ch) + upper = m; + else + lower = m; + log_debug("namespace " << d->getNamespace() << " m=" << m << " lower=" << lower << " upper=" << upper); + } + + pthread_mutex_lock(&namespaceEndLock); + namespaceEndCache[ch] = article_index_t(upper); + pthread_mutex_unlock(&namespaceEndLock); + + return article_index_t(upper); + + } + + std::string FileImpl::getNamespaces() + { + std::string namespaces; + + auto d = getDirent(article_index_t(0)); + namespaces = d->getNamespace(); + + article_index_t idx(0); + while ((idx = getNamespaceEndOffset(d->getNamespace())) < 
getCountArticles()) + { + d = getDirent(idx); + namespaces += d->getNamespace(); + } + + return namespaces; + } + + const std::string& FileImpl::getMimeType(uint16_t idx) const + { + if (idx > mimeTypes.size()) + { + std::ostringstream msg; + msg << "unknown mime type code " << idx; + throw std::runtime_error(msg.str()); + } + + return mimeTypes[idx]; + } + + std::string FileImpl::getChecksum() + { + if (!header.hasChecksum()) + return std::string(); + + std::shared_ptr chksum; + try { + chksum = zimReader->get_buffer(offset_t(header.getChecksumPos()), zsize_t(16)); + } catch (...) + { + log_warn("error reading checksum"); + return std::string(); + } + + char hexdigest[33]; + hexdigest[32] = '\0'; + static const char hex[] = "0123456789abcdef"; + char* p = hexdigest; + for (int i = 0; i < 16; ++i) + { + uint8_t v = chksum->at(offset_t(i)); + *p++ = hex[v >> 4]; + *p++ = hex[v & 0xf]; + } + log_debug("chksum=" << hexdigest); + return hexdigest; + } + + bool FileImpl::verify() + { + if (!header.hasChecksum()) + return false; + + Md5stream md5; + + offset_type checksumPos = header.getChecksumPos(); + offset_type currentPos = 0; + for(auto part = zimFile->begin(); + part != zimFile->end(); + part++) { + std::ifstream stream(part->second->filename()); + char ch; + for(/*NOTHING*/ ; currentPos < checksumPos && stream.get(ch).good(); currentPos++) { + md5 << ch; + } + if (stream.bad()) { + perror("error while reading file"); + return false; + } + if (currentPos == checksumPos) { + break; + } + } + + if (currentPos != checksumPos) { + return false; + } + + + unsigned char chksumCalc[16]; + auto chksumFile = zimReader->get_buffer(offset_t(header.getChecksumPos()), zsize_t(16)); + + md5.getDigest(chksumCalc); + if (std::memcmp(chksumFile->data(), chksumCalc, 16) != 0) + { + return false; + } + + return true; + } + + time_t FileImpl::getMTime() const { + return zimFile->getMTime(); + } + + zim::zsize_t FileImpl::getFilesize() const { + return zimFile->fsize(); + } + + bool 
FileImpl::is_multiPart() const { + return zimFile->is_multiPart(); + } +} diff --git a/src/fileimpl.h b/src/fileimpl.h new file mode 100644 index 0000000..8ec5fa2 --- /dev/null +++ b/src/fileimpl.h @@ -0,0 +1,117 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FILEIMPL_H +#define ZIM_FILEIMPL_H + +#include +#include +#include +#include +#include +#include +#include +#include "cache.h" +#include "_dirent.h" +#include "cluster.h" +#include "buffer.h" +#include "file_reader.h" +#include "file_compound.h" +#include "zim_types.h" + +namespace zim +{ + class FileImpl + { + std::shared_ptr zimFile; + std::shared_ptr zimReader; + std::vector bufferDirentZone; + pthread_mutex_t bufferDirentLock; + Fileheader header; + std::string filename; + + std::unique_ptr titleIndexReader; + std::unique_ptr urlPtrOffsetReader; + std::unique_ptr clusterOffsetReader; + + offset_t getOffset(const Reader* reader, size_t idx); + + Cache> direntCache; + pthread_mutex_t direntCacheLock; + + Cache> clusterCache; + pthread_mutex_t clusterCacheLock; + + bool cacheUncompressedCluster; + typedef std::map NamespaceCache; + + NamespaceCache namespaceBeginCache; + pthread_mutex_t namespaceBeginLock; + NamespaceCache 
namespaceEndCache; + pthread_mutex_t namespaceEndLock; + + typedef std::vector MimeTypes; + MimeTypes mimeTypes; + + public: + + explicit FileImpl(const std::string& fname); + + time_t getMTime() const; + + const std::string& getFilename() const { return filename; } + const Fileheader& getFileheader() const { return header; } + zsize_t getFilesize() const; + + std::pair + getFileParts(offset_t offset, zsize_t size); + std::shared_ptr getDirent(article_index_t idx); + std::shared_ptr getDirentByTitle(article_index_t idx); + article_index_t getIndexByTitle(article_index_t idx); + article_index_t getCountArticles() const { return article_index_t(header.getArticleCount()); } + + + std::pair findx(char ns, const std::string& url); + std::pair findx(const std::string& url); + std::pair findxByTitle(char ns, const std::string& title); + + std::shared_ptr getCluster(cluster_index_t idx); + cluster_index_t getCountClusters() const { return cluster_index_t(header.getClusterCount()); } + offset_t getClusterOffset(cluster_index_t idx); + offset_t getBlobOffset(cluster_index_t clusterIdx, blob_index_t blobIdx); + + article_index_t getNamespaceBeginOffset(char ch); + article_index_t getNamespaceEndOffset(char ch); + article_index_t getNamespaceCount(char ns) + { return getNamespaceEndOffset(ns) - getNamespaceBeginOffset(ns); } + + std::string getNamespaces(); + bool hasNamespace(char ch) const; + + const std::string& getMimeType(uint16_t idx) const; + + std::string getChecksum(); + bool verify(); + bool is_multiPart() const; + }; + +} + +#endif // ZIM_FILEIMPL_H + diff --git a/src/fs.h b/src/fs.h new file mode 100644 index 0000000..5736a5e --- /dev/null +++ b/src/fs.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2018 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later 
version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FS_H_ +#define ZIM_FS_H_ + +#ifdef _WIN32 +# include "fs_windows.h" +#else +# include "fs_unix.h" +#endif + +namespace zim { + +#ifdef _WIN32 +using DEFAULTFS = windows::FS; +#else +using DEFAULTFS = unix::FS; +#endif +}; + +#endif //ZIM_FS_H_ diff --git a/src/fs_unix.cpp b/src/fs_unix.cpp new file mode 100644 index 0000000..e0dee54 --- /dev/null +++ b/src/fs_unix.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2018 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "fs_unix.h" +#include + +#include +#include +#include +#include +#include +#include + +namespace zim +{ + +namespace unix { + +zsize_t FD::readAt(char* dest, zsize_t size, offset_t offset) const +{ +#ifdef __APPLE__ +# define PREAD pread +#else +# define PREAD pread64 +#endif + ssize_t full_size_read = 0; + auto size_to_read = size.v; + auto current_offset = offset.v; + errno = 0; + while (size_to_read > 0) { + auto size_read = PREAD(m_fd, dest, size_to_read, current_offset); + if (size_read == -1) { + return zsize_t(-1); + } + size_to_read -= size_read; + current_offset += size_read; + full_size_read += size_read; + } + return zsize_t(full_size_read); +#undef PREAD +} + +zsize_t FD::getSize() const +{ + struct stat sb; + fstat(m_fd, &sb); + return zsize_t(sb.st_size); +} + +bool FD::seek(offset_t offset) +{ + return static_cast(offset.v) == lseek(m_fd, offset.v, SEEK_SET); +} + +bool FD::close() { + if (m_fd != -1) { + return ::close(m_fd); + } + return -1; +} + +FD FS::openFile(path_t filepath) +{ + int fd = open(filepath.c_str(), O_RDONLY); + if (fd == -1) { + throw std::runtime_error(""); + } + return FD(fd); +} + +bool FS::makeDirectory(path_t path) +{ + return !mkdir(path.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); +} + +void FS::rename(path_t old_path, path_t new_path) +{ + ::rename(old_path.c_str(), new_path.c_str()); +} + +std::string FS::join(path_t base, path_t name) +{ + return base + "/" + name; +} + +bool FS::remove(path_t path) +{ + DIR* dir; + /* It's a directory, remove all its entries first */ + if ((dir = opendir(path.c_str())) != NULL) { + struct dirent* ent; + while ((ent = readdir(dir)) != NULL) { + std::string childName = ent->d_name; + if (childName != "." 
&& childName != "..") { + auto childPath = join(path, childName); + remove(childPath); + } + } + closedir(dir); + return removeDir(path); + } + + /* It's a file */ + else { + return removeFile(path); + } +} + +bool FS::removeDir(path_t path) { + return rmdir(path.c_str()); +} + +bool FS::removeFile(path_t path) { + return ::remove(path.c_str()); +} + + +}; // unix namespace + +}; // zim namespace + diff --git a/src/fs_unix.h b/src/fs_unix.h new file mode 100644 index 0000000..8ec69ad --- /dev/null +++ b/src/fs_unix.h @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2018 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FS_UNIX_H_ +#define ZIM_FS_UNIX_H_ + +#include "zim_types.h" + +#include + +#include +#include +#include +#include +#include + +namespace zim { + +namespace unix { + +using path_t = const std::string&; + +class FD { + public: + using fd_t = int; + private: + fd_t m_fd = -1; + + public: + FD() = default; + FD(fd_t fd): + m_fd(fd) {}; + FD(const FD& o) = delete; + FD(FD&& o) : + m_fd(o.m_fd) { o.m_fd = -1; } + FD& operator=(FD&& o) { + m_fd = o.m_fd; + o.m_fd = -1; + return *this; + } + ~FD() { close(); } + zsize_t readAt(char* dest, zsize_t size, offset_t offset) const; + zsize_t getSize() const; + fd_t getNativeHandle() const + { + return m_fd; + } + fd_t release() + { + int ret = m_fd; + m_fd = -1; + return ret; + } + bool seek(offset_t offset); + bool close(); +}; + +struct FS { + using FD = zim::unix::FD; + static std::string join(path_t base, path_t name); + static FD openFile(path_t filepath); + static bool makeDirectory(path_t path); + static void rename(path_t old_path, path_t new_path); + static bool remove(path_t path); + static bool removeDir(path_t path); + static bool removeFile(path_t path); +}; + +}; // unix namespace + +}; // zim namespace + +#endif //ZIM_FS_UNIX_H_ diff --git a/src/fs_windows.cpp b/src/fs_windows.cpp new file mode 100644 index 0000000..e4df1e4 --- /dev/null +++ b/src/fs_windows.cpp @@ -0,0 +1,199 @@ +/* + * Copyright (C) 2018 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "fs_windows.h" +#include + +#include +#include +#include +#include +#include + +#include +#include + +namespace zim { + +namespace windows { + +struct ImplFD { + HANDLE m_handle = INVALID_HANDLE_VALUE; + CRITICAL_SECTION m_criticalSection; + + ImplFD() { + InitializeCriticalSection(&m_criticalSection); + } + ImplFD(HANDLE handle) : + m_handle(handle) + { + InitializeCriticalSection(&m_criticalSection); + } + + ~ImplFD() { + DeleteCriticalSection(&m_criticalSection); + } +}; + +FD::FD() : + mp_impl(new ImplFD()) {} + +FD::FD(fd_t handle) : + mp_impl(new ImplFD(handle)) {} + +FD::FD(int fd): + mp_impl(new ImplFD(reinterpret_cast(_get_osfhandle(fd)))) {} + +FD::FD(FD&& o) = default; +FD& FD::operator=(FD&& o) = default; + +FD::~FD() +{ + if (mp_impl) + close(); +} + +zsize_t FD::readAt(char* dest, zsize_t size, offset_t offset) const +{ + if (!mp_impl) + return zsize_t(-1); + EnterCriticalSection(&mp_impl->m_criticalSection); + LARGE_INTEGER off; + off.QuadPart = offset.v; + if (!SetFilePointerEx(mp_impl->m_handle, off, NULL, FILE_BEGIN)) { + goto err; + } + + DWORD size_read; + if (!ReadFile(mp_impl->m_handle, dest, size.v, &size_read, NULL)) { + goto err; + } + if (size_read != size.v) { + goto err; + } + LeaveCriticalSection(&mp_impl->m_criticalSection); + return size; +err: + LeaveCriticalSection(&mp_impl->m_criticalSection); + return zsize_t(-1); +} + +bool FD::seek(offset_t offset) +{ + if(!mp_impl) + return false; + LARGE_INTEGER off; + 
off.QuadPart = offset.v; + return SetFilePointerEx(mp_impl->m_handle, off, NULL, FILE_BEGIN); +} + +zsize_t FD::getSize() const +{ + if(!mp_impl) + return zsize_t(0); + LARGE_INTEGER size; + if (!GetFileSizeEx(mp_impl->m_handle, &size)) { + size.QuadPart = 0; + } + return zsize_t(size.QuadPart); +} + +int FD::release() +{ + if(!mp_impl) + return -1; + int ret = _open_osfhandle(reinterpret_cast(mp_impl->m_handle), 0); + mp_impl->m_handle = INVALID_HANDLE_VALUE; + return ret; +} + +bool FD::close() +{ + if (!mp_impl || mp_impl->m_handle == INVALID_HANDLE_VALUE) { + return false; + } + return CloseHandle(mp_impl->m_handle); +} + +std::unique_ptr FS::toWideChar(path_t path) +{ + auto size = MultiByteToWideChar(CP_UTF8, 0, + path.c_str(), -1, nullptr, 0); + auto wdata = std::unique_ptr(new wchar_t[size]); + auto ret = MultiByteToWideChar(CP_UTF8, 0, + path.c_str(), -1, wdata.get(), size); + if (0 == ret) { + std::ostringstream oss; + oss << "Cannot convert path to wchar : " << GetLastError(); + throw std::runtime_error(oss.str()); + } + return wdata; +} + +FD FS::openFile(path_t filepath) +{ + auto wpath = toWideChar(filepath); + FD::fd_t handle; + handle = CreateFileW(wpath.get(), + GENERIC_READ, + FILE_SHARE_READ, + NULL, + OPEN_EXISTING, + FILE_ATTRIBUTE_READONLY|FILE_FLAG_RANDOM_ACCESS, + NULL); + if (handle == INVALID_HANDLE_VALUE) { + std::ostringstream oss; + oss << "Cannot open file : " << GetLastError(); + throw std::runtime_error(oss.str()); + } + return FD(handle); +} + +bool FS::makeDirectory(path_t path) +{ + auto wpath = toWideChar(path); + auto ret = CreateDirectoryW(wpath.get(), NULL); + return ret; +} + + +void FS::rename(path_t old_path, path_t new_path) +{ + MoveFileW(toWideChar(old_path).get(), toWideChar(new_path).get()); +} + +std::string FS::join(path_t base, path_t name) +{ + return base + "\\" + name; +} + +bool FS::removeDir(path_t path) +{ + return RemoveDirectoryW(toWideChar(path).get()); +} + +bool FS::removeFile(path_t path) +{ + return 
DeleteFileW(toWideChar(path).get()); +} + +}; // windows namespace + +}; // zim namespace + diff --git a/src/fs_windows.h b/src/fs_windows.h new file mode 100644 index 0000000..60d1062 --- /dev/null +++ b/src/fs_windows.h @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2018 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FS_WINDOWS_H_ +#define ZIM_FS_WINDOWS_H_ + +#include "zim_types.h" + +#include +#include + +typedef void* HANDLE; + +namespace zim { + +namespace windows { + +using path_t = const std::string&; + +struct ImplFD; + +class FD { + public: + typedef HANDLE fd_t; + private: + std::unique_ptr mp_impl; + + public: + FD(); + FD(fd_t handle); + FD(int fd); + FD(const FD& o) = delete; + FD(FD&& o); + FD& operator=(FD&& o); + FD& operator=(const FD& o) = delete; + ~FD(); + zsize_t readAt(char* dest, zsize_t size, offset_t offset) const; + zsize_t getSize() const; + int release(); + bool seek(offset_t offset); + bool close(); +}; + +struct FS { + using FD = zim::windows::FD; + static std::string join(path_t base, path_t name); + static std::unique_ptr toWideChar(path_t path); + static FD openFile(path_t filepath); + static bool makeDirectory(path_t path); + static void rename(path_t old_path, path_t new_path); + static bool 
#include <algorithm>
#include <numeric>
#include <string>
#include <vector>

/**
 * Compute the Levenshtein (edit) distance between two strings, compared
 * byte by byte.
 *
 * Only one column of the dynamic-programming table is kept, so the extra
 * memory is O(|s1|) and the running time O(|s1| * |s2|).
 *
 * @param s1  first string.
 * @param s2  second string.
 * @return    minimum number of single-byte insertions, deletions and
 *            substitutions that turn `s1` into `s2`.
 */
int levenshtein_distance(const std::string &s1, const std::string &s2)
{
  const int s1len = static_cast<int>(s1.size());
  const int s2len = static_cast<int>(s2.size());

  // column[y] holds the distance between the first y bytes of s1 and the
  // prefix of s2 processed so far. A vector owns the storage, so it is
  // released on every exit path (the manual new[]/delete[] pair was not
  // exception-safe).
  std::vector<int> column(static_cast<std::size_t>(s1len) + 1);
  std::iota(column.begin(), column.end(), 0);

  for (int x = 1; x <= s2len; ++x) {
    column[0] = x;
    int last_diagonal = x - 1;   // value of the cell up-left of (x, 1)
    for (int y = 1; y <= s1len; ++y) {
      const int old_diagonal = column[y];
      column[y] = std::min({
        column[y] + 1,                                        // deletion
        column[y - 1] + 1,                                    // insertion
        last_diagonal + (s1[y - 1] == s2[x - 1] ? 0 : 1)      // substitution
      });
      last_diagonal = old_diagonal;
    }
  }
  return column[s1len];
}
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "config.h" + +#ifdef WITH_CXXTOOLS + +#include + +#else + +#define log_define(e) +#define log_fatal(e) +#define log_error(e) +#define log_warn(e) +#define log_info(e) +#define log_debug(e) +#define log_trace(e) +#define log_init() + +#endif diff --git a/src/md5.c b/src/md5.c new file mode 100644 index 0000000..bae002e --- /dev/null +++ b/src/md5.c @@ -0,0 +1,340 @@ +/* MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm + */ + +/* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All +rights reserved. + +License to copy and use this software is granted provided that it +is identified as the "RSA Data Security, Inc. MD5 Message-Digest +Algorithm" in all material mentioning or referencing this software +or this function. + +License is also granted to make and use derivative works provided +that such works are identified as "derived from the RSA Data +Security, Inc. MD5 Message-Digest Algorithm" in all material +mentioning or referencing the derived work. + +RSA Data Security, Inc. makes no representations concerning either +the merchantability of this software or the suitability of this +software for any particular purpose. It is provided "as is" +without express or implied warranty of any kind. + +These notices must be retained in any copies of any part of this +documentation and/or software. + */ + +#include "md5.h" +#include + +#define MD5_CTX struct zim_MD5_CTX + +/* Constants for MD5Transform routine. 
+ */ +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + +static void MD5Transform PROTO_LIST ((UINT4 [4], const unsigned char [64])); +static void Encode PROTO_LIST + ((unsigned char *, UINT4 *, unsigned int)); +static void Decode PROTO_LIST + ((UINT4 *, const unsigned char *, unsigned int)); +/* +static void MD5_memcpy PROTO_LIST ((POINTER, POINTER, unsigned int)); +static void MD5_memset PROTO_LIST ((POINTER, int, unsigned int)); +*/ +#define MD5_memcpy memcpy +#define MD5_memset memset + +static unsigned char PADDING[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* F, G, H and I are basic MD5 functions. + */ +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +/* ROTATE_LEFT rotates x left n bits. + */ +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. +Rotation is separate from addition to prevent recomputation. 
+ */ +#define FF(a, b, c, d, x, s, ac) { \ + (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) { \ + (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) { \ + (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) { \ + (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } + +/* MD5 initialization. Begins an MD5 operation, writing a new context. + */ +void zim_MD5Init (MD5_CTX* context) +{ + context->count[0] = context->count[1] = 0; + /* Load magic initialization constants. +*/ + context->state[0] = 0x67452301; + context->state[1] = 0xefcdab89; + context->state[2] = 0x98badcfe; + context->state[3] = 0x10325476; +} + +/* MD5 block update operation. Continues an MD5 message-digest + operation, processing another message block, and updating the + context. + */ +void zim_MD5Update ( +MD5_CTX *context, +const unsigned char *input, /* input block */ +unsigned int inputLen) /* length of input block */ +{ + unsigned int i, index, partLen; + + /* Compute number of bytes mod 64 */ + index = (unsigned int)((context->count[0] >> 3) & 0x3F); + + /* Update number of bits */ + if ((context->count[0] += ((UINT4)inputLen << 3)) + < ((UINT4)inputLen << 3)) + context->count[1]++; + context->count[1] += ((UINT4)inputLen >> 29); + + partLen = 64 - index; + + /* Transform as many times as possible. 
+*/ + if (inputLen >= partLen) { + MD5_memcpy + ((POINTER)&context->buffer[index], (POINTER)input, partLen); + MD5Transform (context->state, context->buffer); + + for (i = partLen; i + 63 < inputLen; i += 64) + MD5Transform (context->state, &input[i]); + + index = 0; + } + else + i = 0; + + /* Buffer remaining input */ + MD5_memcpy + ((POINTER)&context->buffer[index], (POINTER)&input[i], + inputLen-i); +} + +/* MD5 finalization. Ends an MD5 message-digest operation, writing the + the message digest and zeroizing the context. + */ +void zim_MD5Final ( +unsigned char digest[16], /* message digest */ +MD5_CTX *context) /* context */ +{ + unsigned char bits[8]; + unsigned int index, padLen; + + /* Save number of bits */ + Encode (bits, context->count, 8); + + /* Pad out to 56 mod 64. +*/ + index = (unsigned int)((context->count[0] >> 3) & 0x3f); + padLen = (index < 56) ? (56 - index) : (120 - index); + zim_MD5Update (context, PADDING, padLen); + + /* Append length (before padding) */ + zim_MD5Update (context, bits, 8); + /* Store state in digest */ + Encode (digest, context->state, 16); + + /* Zeroize sensitive information. +*/ + MD5_memset ((POINTER)context, 0, sizeof (*context)); +} + +/* MD5 basic transformation. Transforms state based on block. 
+ */ +static void MD5Transform ( +UINT4 state[4], +const unsigned char block[64]) +{ + UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + + Decode (x, block, 64); + + /* Round 1 */ + FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */ + FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */ + FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */ + FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */ + FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */ + FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */ + FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */ + FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */ + FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */ + FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */ + FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ + FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ + FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ + FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ + FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ + FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ + + /* Round 2 */ + GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */ + GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */ + GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ + GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */ + GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */ + GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */ + GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ + GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */ + GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */ + GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ + GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */ + GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */ + GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ + GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */ + GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */ + GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */ + + /* Round 3 */ + HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */ + HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */ + HH 
(c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */ + HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */ + HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */ + HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */ + HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */ + HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ + HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ + HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */ + HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */ + HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */ + HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */ + HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ + HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */ + HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */ + + /* Round 4 */ + II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */ + II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */ + II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ + II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */ + II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ + II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */ + II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ + II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */ + II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */ + II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */ + II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */ + II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */ + II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */ + II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */ + II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */ + II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */ + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + + /* Zeroize sensitive information. +*/ + MD5_memset ((POINTER)x, 0, sizeof (x)); +} + +/* Encodes input (UINT4) into output (unsigned char). Assumes len is + a multiple of 4. 
+ */ +static void Encode ( +unsigned char *output, +UINT4 *input, +unsigned int len) +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) { + output[j] = (unsigned char)(input[i] & 0xff); + output[j+1] = (unsigned char)((input[i] >> 8) & 0xff); + output[j+2] = (unsigned char)((input[i] >> 16) & 0xff); + output[j+3] = (unsigned char)((input[i] >> 24) & 0xff); + } +} + +/* Decodes input (unsigned char) into output (UINT4). Assumes len is + a multiple of 4. + */ +static void Decode ( +UINT4 *output, +const unsigned char *input, +unsigned int len) +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) + output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) | + (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24); +} + +#if 0 +/* Note: Replace "for loop" with standard memcpy if possible. + */ + +static void MD5_memcpy ( +POINTER output, +POINTER input, +unsigned int len) +{ + unsigned int i; + + for (i = 0; i < len; i++) + output[i] = input[i]; +} + +/* Note: Replace "for loop" with standard memset if possible. + */ +static void MD5_memset ( +POINTER output, +int value, +unsigned int len) +{ + unsigned int i; + + for (i = 0; i < len; i++) + ((char *)output)[i] = (char)value; +} +#endif diff --git a/src/md5.h b/src/md5.h new file mode 100644 index 0000000..29bdc39 --- /dev/null +++ b/src/md5.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2003 Tommi Maekitalo + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * As a special exception, you may use this file as part of a free + * software library without restriction. 
Specifically, if other files + * instantiate templates or use macros or inline functions from this + * file, or you compile this file and link it with other files to + * produce an executable, this file does not by itself cause the + * resulting executable to be covered by the GNU General Public + * License. This exception does not however invalidate any other + * reasons why the executable file might be covered by the GNU Library + * General Public License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All +rights reserved. + +License to copy and use this software is granted provided that it +is identified as the "RSA Data Security, Inc. MD5 Message-Digest +Algorithm" in all material mentioning or referencing this software +or this function. + +License is also granted to make and use derivative works provided +that such works are identified as "derived from the RSA Data +Security, Inc. MD5 Message-Digest Algorithm" in all material +mentioning or referencing the derived work. + +RSA Data Security, Inc. makes no representations concerning either +the merchantability of this software or the suitability of this +software for any particular purpose. It is provided "as is" +without express or implied warranty of any kind. + +These notices must be retained in any copies of any part of this +documentation and/or software. + */ + +/* RSAREF types and constants + */ + +/* PROTOTYPES should be set to one if and only if the compiler supports + function argument prototyping. 
+The following makes PROTOTYPES default to 0 if it has not already + been defined with C compiler flags. + */ + +#ifndef ZIM_MD5_H +#define ZIM_MD5_H + +#ifndef PROTOTYPES +#define PROTOTYPES 1 +#endif + +/* POINTER defines a generic pointer type */ +typedef unsigned char *POINTER; + +/* UINT2 defines a two byte word */ +typedef unsigned short int UINT2; + +/* UINT4 defines a four byte word */ +typedef unsigned int UINT4; + +/* PROTO_LIST is defined depending on how PROTOTYPES is defined above. + If using PROTOTYPES, then PROTO_LIST returns the list, otherwise it + returns an empty list. + */ + +#if PROTOTYPES +#define PROTO_LIST(list) list +#else +#define PROTO_LIST(list) () +#endif + +/* MD5 context. */ +struct zim_MD5_CTX { + UINT4 state[4]; /* state (ABCD) */ + UINT4 count[2]; /* number of bits, modulo 2^64 (lsb first) */ + unsigned char buffer[64]; /* input buffer */ +}; + +#ifdef __cplusplus +extern "C" { +#endif + +void zim_MD5Init PROTO_LIST ((struct zim_MD5_CTX *)); +void zim_MD5Update PROTO_LIST + ((struct zim_MD5_CTX *, const unsigned char *, unsigned int)); +void zim_MD5Final PROTO_LIST ((unsigned char [16], struct zim_MD5_CTX *)); + +#ifdef __cplusplus +} +#endif + +#endif /* ZIM_MD5_H */ diff --git a/src/md5stream.cpp b/src/md5stream.cpp new file mode 100644 index 0000000..799f89f --- /dev/null +++ b/src/md5stream.cpp @@ -0,0 +1,142 @@ +/* + * Copyright (C) 2003 Tommi Maekitalo + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * As a special exception, you may use this file as part of a free + * software library without restriction. 
Specifically, if other files + * instantiate templates or use macros or inline functions from this + * file, or you compile this file and link it with other files to + * produce an executable, this file does not by itself cause the + * resulting executable to be covered by the GNU General Public + * License. This exception does not however invalidate any other + * reasons why the executable file might be covered by the GNU Library + * General Public License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "md5stream.h" +#include "md5.h" +#include + +namespace zim +{ + +//////////////////////////////////////////////////////////////////////// +// Md5streambuf +// +Md5streambuf::Md5streambuf() + : context(new zim_MD5_CTX()) +{ + zim_MD5Init(context); +} + +Md5streambuf::~Md5streambuf() +{ + delete context; +} + +std::streambuf::int_type Md5streambuf::overflow( + std::streambuf::int_type ch) +{ + if (pptr() == 0) + { + // output buffer is empty - initialize + zim_MD5Init(context); + } + else + { + // consume the characters from the buffer + zim_MD5Update(context, + (const unsigned char*)pbase(), + pptr() - pbase()); + } + + // set up the output buffer + setp(buffer, buffer + bufsize); + + if (ch != traits_type::eof()) + { + // put the character that triggered the overflow + // into the buffer.
+ *pptr() = traits_type::to_char_type(ch); + pbump(1); + } + + return 0; +} + +std::streambuf::int_type Md5streambuf::underflow() +{ + // output stream only + return traits_type::eof(); +} + +int Md5streambuf::sync() +{ + if (pptr() != pbase()) + { + // consume the characters from the buffer + zim_MD5Update(context, (const unsigned char*)pbase(), pptr() - pbase()); + + // empty the output buffer + setp(buffer, buffer + bufsize); + } + + return 0; +} + +void Md5streambuf::getDigest(unsigned char digest_[16]) +{ + if (pptr()) + { + if (pptr() != pbase()) + { + // consume the characters from the buffer + zim_MD5Update(context, (const unsigned char*)pbase(), pptr() - pbase()); + } + + // de-initialize the output buffer + setp(0, 0); + } + else + { + zim_MD5Init(context); + } + + zim_MD5Final(digest, context); + + std::memcpy(digest_, digest, 16); +} + +//////////////////////////////////////////////////////////////////////// +// Md5stream +// +const char* Md5stream::getHexDigest() +{ + static const char hex[] = "0123456789abcdef"; + unsigned char md5[16]; + getDigest(md5); + int i; + char* p = hexdigest; + for (i = 0; i < 16; ++i) + { + *p++ = hex[md5[i] >> 4]; + *p++ = hex[md5[i] & 0xf]; + } + *p = '\0'; + return hexdigest; +} + +} diff --git a/src/md5stream.h b/src/md5stream.h new file mode 100644 index 0000000..95bcf55 --- /dev/null +++ b/src/md5stream.h @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2003 Tommi Maekitalo + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * As a special exception, you may use this file as part of a free + * software library without restriction.
Specifically, if other files
+ * instantiate templates or use macros or inline functions from this
+ * file, or you compile this file and link it with other files to
+ * produce an executable, this file does not by itself cause the
+ * resulting executable to be covered by the GNU General Public
+ * License. This exception does not however invalidate any other
+ * reasons why the executable file might be covered by the GNU Library
+ * General Public License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef ZIM_MD5STREAM_H
+#define ZIM_MD5STREAM_H
+
+#include <iostream>
+
+struct zim_MD5_CTX;
+
+namespace zim
+{
+
+/**
+ Output stream buffer that feeds the characters written to it into an
+ MD5 calculation.
+
+ The buffer allocates its MD5 context in the constructor and deletes it
+ in the destructor, therefore it must not be copied.
+ */
+class Md5streambuf : public std::streambuf
+{
+  public:
+    Md5streambuf();
+    ~Md5streambuf();
+
+    // 'context' is an owning raw pointer; a member-wise copy would make
+    // two buffers delete the same context.
+    Md5streambuf(const Md5streambuf&) = delete;
+    Md5streambuf& operator=(const Md5streambuf&) = delete;
+
+    /// ends md5-calculation and copies the 16 bytes digest to 'digest'
+    void getDigest(unsigned char digest[16]);
+
+  private:
+    static const unsigned int bufsize = 64;
+    char buffer[bufsize];        // put area handed to setp()
+    zim_MD5_CTX* context;        // owned; created in the constructor
+    unsigned char digest[16];    // filled by getDigest()
+
+    std::streambuf::int_type overflow(std::streambuf::int_type ch);
+    std::streambuf::int_type underflow();
+    int sync();
+};
+
+/**
+ This is an easy and safe interface to MD5-calculation.
+
+ To get a MD5-sum of data, instantiate a md5stream, copy your data
+ into it and read the digest.
+
+ After calling getDigest or getHexDigest, the class can be reused
+ for another md5-calculation. The algorithm is automatically
+ reinitialized when the first character is received.
+
+ example:
+ \code
+ int main(int argc, char* argv[])
+ {
+   Md5stream s;
+   for (int i = 1; i < argc; ++i)
+   {
+     std::ifstream in(argv[i]);
+     if (in)
+     {
+       s << in.rdbuf();
+       std::cout << s.getHexDigest() << " " << argv[i] << std::endl;
+     }
+   }
+ }
+ \endcode
+ */
+class Md5stream : public std::ostream
+{
+  public:
+    typedef std::ostreambuf_iterator<char> iterator;
+
+  private:
+    Md5streambuf streambuf;
+    char hexdigest[33];          // 32 hex characters plus terminating '\0'
+
+  public:
+    /// initializes md5-calculation
+    Md5stream()
+      : std::ostream(0)
+      {
+        init(&streambuf);
+      }
+
+    /// ends md5-calculation and returns 16 bytes digest
+    void getDigest(unsigned char digest[16])
+      { streambuf.getDigest(digest); }
+    /// ends md5-calculation and returns the digest as 32 bytes hex
+    const char* getHexDigest();
+
+    /// returns output-iterator to Md5stream
+    iterator begin()
+      { return iterator(&streambuf); }
+};
+
+}
+
+#endif // ZIM_MD5STREAM_H
diff --git a/src/meson.build b/src/meson.build
new file mode 100644
index 0000000..671a23f
--- /dev/null
+++ b/src/meson.build
@@ -0,0 +1,78 @@
+
+configure_file(output : 'config.h',
+               configuration : conf,
+               input : 'config.h.in')
+
+src_directory = include_directories('.')
+
+common_sources = [
+#    'config.h',
+    'article.cpp',
+    'cluster.cpp',
+    'dirent.cpp',
+    'envvalue.cpp',
+    'file.cpp',
+    'fileheader.cpp',
+    'fileimpl.cpp',
+    'file_compound.cpp',
+    'file_reader.cpp',
+    'blob.cpp',
+    'buffer.cpp',
+    'md5.c',
+    'md5stream.cpp',
+    'search.cpp',
+    'search_iterator.cpp',
+    'template.cpp',
+    'uuid.cpp',
+    'levenshtein.cpp',
+    'tools.cpp',
+    'writer/zimcreator.cpp',
+    'writer/lzmastream.cpp',
+    'writer/article.cpp',
+    'writer/cluster.cpp',
+    'writer/dirent.cpp',
+    'writer/xapianIndexer.cpp',
+    'writer/tee.cpp'
+]
+
+if host_machine.system() == 'windows'
+    common_sources += 'fs_windows.cpp'
+else
+    common_sources += 'fs_unix.cpp'
+endif
+
+zlib_sources = [
+    'writer/deflatestream.cpp'
+]
+
+xapian_sources = [
+    'xapian/htmlparse.cc',
+    'xapian/myhtmlparse.cc'
+]
+
+sources = common_sources
+deps = [thread_dep, lzma_dep]
+
+if zlib_dep.found()
+    sources += zlib_sources
+    deps += [zlib_dep]
+endif
+
+if xapian_dep.found()
+    sources += xapian_sources
+    sources += lib_resources
+    deps += [xapian_dep, icu_dep]
+endif
+
+libzim = library('zim',
+                 sources,
+                 include_directories : inc,
+                 dependencies : deps,
+                 link_args : extra_link_args,
+                 cpp_args : extra_cpp_args,
+                 version: meson.project_version(),
+                 install : true,
+                 build_rpath : join_paths(get_option('prefix'), get_option('libdir')),
+                 install_rpath: '$ORIGIN')
+libzim_dep = declare_dependency(link_with: libzim,
+                                include_directories: include_directory)
diff --git a/src/search.cpp b/src/search.cpp
new file mode 100644
index 0000000..01bf4b7
--- /dev/null
+++ b/src/search.cpp
@@ -0,0 +1,423 @@
+/*
+ * Copyright (C) 2007 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include
+#include
+#include "search_internal.h"
+#include "levenshtein.h"
+#include "fs.h"
+
+#include
+
+#include
+#include
+#if !defined(_WIN32)
+# include
+#else
+# include
+#endif
+#include
+
+#if defined(ENABLE_XAPIAN)
+#include "xapian.h"
+#include
+#endif
+
+#define MAX_MATCHES_TO_SORT 10000
+
+namespace zim
+{
+
+#if defined(ENABLE_XAPIAN)
+namespace {
+/* Split string in a token array.
+   Runs of characters from `delims` separate the tokens; empty tokens are
+   never produced. */
+std::vector<std::string> split(const std::string & str,
+                               const std::string & delims=" *-")
+{
+  std::string::size_type lastPos = str.find_first_not_of(delims, 0);
+  std::string::size_type pos = str.find_first_of(delims, lastPos);
+  std::vector<std::string> tokens;
+
+  while (std::string::npos != pos || std::string::npos != lastPos)
+  {
+    tokens.push_back(str.substr(lastPos, pos - lastPos));
+    lastPos = str.find_first_not_of(delims, pos);
+    pos = str.find_first_of(delims, lastPos);
+  }
+
+  return tokens;
+}
+
+/* Parse a "name:index;name:index;..." string into a name -> index map.
+   Entries that do not contain both a name and an index are skipped
+   instead of being read out of bounds. */
+std::map<std::string, int> read_valuesmap(const std::string &s) {
+  std::map<std::string, int> result;
+  std::vector<std::string> elems = split(s, ";");
+  for(std::vector<std::string>::iterator elem = elems.begin();
+      elem != elems.end();
+      ++elem)
+  {
+    std::vector<std::string> tmp_elems = split(*elem, ":");
+    if (tmp_elems.size() < 2) {
+      // malformed entry (no ':' or nothing after it)
+      continue;
+    }
+    result.insert( std::pair<std::string, int>(tmp_elems[0], atoi(tmp_elems[1].c_str())) );
+  }
+  return result;
+}
+
+
+/* Configure `queryparser` for `database`: stemming for `language` (when
+   not empty) and a stopper built from the newline separated `stopwords`
+   (when not empty). */
+void
+setup_queryParser(Xapian::QueryParser* queryparser,
+                  Xapian::Database& database,
+                  const std::string& language,
+                  const std::string& stopwords) {
+  queryparser->set_database(database);
+  if ( ! language.empty() )
+  {
+    /* Build ICU Local object to retrieve ISO-639 language code (from
+       ISO-639-3) */
+    icu::Locale languageLocale(language.c_str());
+
+    /* Configuring language based stemming */
+    try {
+      Xapian::Stem stemmer = Xapian::Stem(languageLocale.getLanguage());
+      queryparser->set_stemmer(stemmer);
+      queryparser->set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
+    } catch (...) {
+      std::cout << "No steemming for language '" << languageLocale.getLanguage() << "'" << std::endl;
+    }
+  }
+
+  if ( ! stopwords.empty() )
+  {
+    std::string stopWord;
+    std::istringstream file(stopwords);
+    Xapian::SimpleStopper* stopper = new Xapian::SimpleStopper();
+    while (std::getline(file, stopWord, '\n')) {
+      stopper->add(stopWord);
+    }
+    // release() is called before the stopper is handed to the parser so
+    // that Xapian takes over its lifetime (see Xapian::Stopper::release).
+    stopper->release();
+    queryparser->set_stopper(stopper);
+  }
+}
+
+/* Sort key: Levenshtein distance between the query and the document value
+   stored at `value_index`. */
+class LevenshteinDistanceMaker : public Xapian::KeyMaker {
+  public:
+    LevenshteinDistanceMaker(const std::string& query, size_t value_index):
+      query(query),
+      value_index(value_index) {}
+    ~LevenshteinDistanceMaker() = default;
+
+    virtual std::string operator() (const Xapian::Document &doc) const {
+      auto document_value = doc.get_value(value_index);
+      return Xapian::sortable_serialise(
+        levenshtein_distance(document_value, query));
+    }
+  private:
+    std::string query;
+    size_t value_index;
+};
+
+}
+#endif
+
+Search::Search(const std::vector<const File*> zimfiles) :
+    internal(new InternalData),
+    zimfiles(zimfiles),
+    prefixes(""), query(""),
+    latitude(0), longitude(0), distance(0),
+    range_start(0), range_end(0),
+    suggestion_mode(false),
+    geo_query(false),
+    search_started(false),
+    has_database(false),
+    verbose(false),
+    estimated_matches_number(0)
+{}
+
+Search::Search(const File* zimfile) :
+    internal(new InternalData),
+    prefixes(""), query(""),
+    latitude(0), longitude(0), distance(0),
+    range_start(0), range_end(0),
+    suggestion_mode(false),
+    geo_query(false),
+    search_started(false),
+    has_database(false),
+    verbose(false),
+    estimated_matches_number(0)
+{
+  zimfiles.push_back(zimfile);
+}
+
+/* Copies the search parameters only; the copy starts with fresh internal
+   data and has to run the search again. */
+Search::Search(const Search& it) :
+    internal(new InternalData),
+    zimfiles(it.zimfiles),
+    prefixes(it.prefixes),
+    query(it.query),
+    latitude(it.latitude), longitude(it.longitude), distance(it.distance),
+    range_start(it.range_start), range_end(it.range_end),
+    suggestion_mode(it.suggestion_mode),
+    geo_query(it.geo_query),
+    search_started(false),
+    has_database(false),
+    verbose(it.verbose),
+    estimated_matches_number(0)
+{ }
+
+Search& Search::operator=(const Search& it)
+{
+  // Like the copy constructor: drop the old databases/results but keep a
+  // valid InternalData, begin() dereferences `internal` unconditionally.
+  internal.reset(new InternalData);
+  zimfiles = it.zimfiles;
+  prefixes = it.prefixes;
+  query = it.query;
+  latitude = it.latitude;
+  longitude = it.longitude;
+  distance = it.distance;
+  range_start = it.range_start;
+  range_end = it.range_end;
+  suggestion_mode = it.suggestion_mode;
+  geo_query = it.geo_query;
+  search_started = false;
+  has_database = false;
+  verbose = it.verbose;
+  estimated_matches_number = 0;
+  return *this;
+}
+
+Search::Search(Search&& it) = default;
+Search& Search::operator=(Search&& it) = default;
+Search::~Search() = default;
+
+void Search::set_verbose(bool verbose) {
+  std::cout << "set verbose" << std::endl;
+  this->verbose = verbose;
+}
+
+Search& Search::add_zimfile(const File* zimfile) {
+  zimfiles.push_back(zimfile);
+  return *this;
+}
+
+Search& Search::set_query(const std::string& query) {
+  this->query = query;
+  return *this;
+}
+
+Search& Search::set_georange(float latitude, float longitude, float distance) {
+  this->latitude = latitude;
+  this->longitude = longitude;
+  this->distance = distance;
+  geo_query = true;
+  return *this;
+}
+
+Search& Search::set_range(int start, int end) {
+  this->range_start = start;
+  this->range_end = end;
+  return *this;
+}
+
+Search& Search::set_suggestion_mode(const bool suggestion_mode) {
+  this->suggestion_mode = suggestion_mode;
+  return *this;
+}
+
+/* Runs the search on first use (opening the xapian database embedded in
+   every zim file that has one) and returns an iterator on the first
+   result. Returns a null iterator when there is no database, when the
+   query cannot be parsed or when xapian support is disabled. */
+Search::iterator Search::begin() const {
+#if defined(ENABLE_XAPIAN)
+  if ( this->search_started ) {
+    return new search_iterator::InternalData(this, internal->results.begin());
+  }
+
+  std::vector<const File*>::const_iterator it;
+  bool first = true;
+  std::string language;
+  std::string stopwords;
+  for(it=zimfiles.begin(); it!=zimfiles.end(); it++)
+  {
+    const File* zimfile = *it;
+    if (zimfile->is_multiPart()) {
+      continue;
+    }
+    zim::Article xapianArticle = zimfile->getArticle('X', "fulltext/xapian");
+    if (!xapianArticle.good()) {
+      xapianArticle = zimfile->getArticle('Z', "/fulltextIndex/xapian");
+    }
+    if (!xapianArticle.good()) {
+      continue;
+    }
+    auto dbOffset = xapianArticle.getOffset();
+    if (dbOffset == 0) {
+      continue;
+    }
+    std::cerr << "Try to open " << zimfile->getFilename() << " at offset " << dbOffset << std::endl;
+    DEFAULTFS::FD databasefd;
+    try {
+      databasefd = DEFAULTFS::openFile(zimfile->getFilename());
+    } catch (...) {
+      std::cerr << "Impossible to open " << zimfile->getFilename() << std::endl;
+      std::cerr << strerror(errno) << std::endl;
+      continue;
+    }
+    if (!databasefd.seek(offset_t(dbOffset))) {
+      std::cerr << "Something went wrong seeking databasedb "
+                << zimfile->getFilename() << std::endl;
+      std::cerr << "dbOffest = " << dbOffset << std::endl;
+      continue;
+    }
+    Xapian::Database database;
+    try {
+      database = Xapian::Database(databasefd.release());
+    } catch( Xapian::DatabaseError& e) {
+      std::cerr << "Something went wrong opening xapian database for zimfile "
+                << zimfile->getFilename() << std::endl;
+      std::cerr << "dbOffest = " << dbOffset << std::endl;
+      std::cerr << "error = " << e.get_msg() << std::endl;
+      continue;
+    }
+
+    if ( first ) {
+      this->valuesmap = read_valuesmap(database.get_metadata("valuesmap"));
+      language = database.get_metadata("language");
+      if (language.empty() ) {
+        // Database created before 2017/03 has no language metadata.
+        // However, terms were stemmed anyway and we need to stem our
+        // search query the same way the database was created.
+        // So we need a language, let's use the one of the zim.
+        // If zimfile has no language metadata, we can't do a lot more here :/
+        auto article = zimfile->getArticle('M', "Language");
+        if ( article.good() ) {
+          language = article.getData();
+        }
+      }
+      stopwords = database.get_metadata("stopwords");
+      this->prefixes = database.get_metadata("prefixes");
+      // Only the first opened database provides the settings; without this
+      // the consistency branch below could never be reached.
+      first = false;
+    } else {
+      std::map<std::string, int> valuesmap = read_valuesmap(database.get_metadata("valuesmap"));
+      if (this->valuesmap != valuesmap ) {
+        // [TODO] Ignore the database, raise a error ?
+      }
+    }
+    internal->xapian_databases.push_back(database);
+    internal->database.add_database(database);
+    has_database = true;
+  }
+
+  if ( ! has_database ) {
+    if (verbose) {
+      std::cout << "No database, no result" << std::endl;
+    }
+    estimated_matches_number = 0;
+    return nullptr;
+  }
+
+  // Automatic storage: released on every return path, including the
+  // parse error below.
+  Xapian::QueryParser queryParser;
+  if (verbose) {
+    std::cout << "Setup queryparser using language " << language << std::endl;
+  }
+  queryParser.set_default_op(Xapian::Query::op::OP_AND);
+  setup_queryParser(&queryParser, internal->database, language, stopwords);
+
+  std::string prefix = "";
+  unsigned flags = Xapian::QueryParser::FLAG_DEFAULT;
+  if (suggestion_mode) {
+    if (verbose) {
+      std::cout << "Mark query as 'partial'" << std::endl;
+    }
+    flags |= Xapian::QueryParser::FLAG_PARTIAL;
+    if (this->prefixes.find("S") != std::string::npos ) {
+      if (verbose) {
+        std::cout << "Searching in title namespace" << std::endl;
+      }
+      prefix = "S";
+    }
+  }
+  Xapian::Query query;
+  try {
+    query = queryParser.parse_query(this->query, flags, prefix);
+  } catch (Xapian::QueryParserError& e) {
+    estimated_matches_number = 0;
+    return nullptr;
+  }
+  if (verbose) {
+    std::cout << "Parsed query '" << this->query << "' to " << query.get_description() << std::endl;
+  }
+
+  Xapian::Enquire enquire(internal->database);
+
+  if (geo_query && valuesmap.find("geo.position") != valuesmap.end()) {
+    Xapian::GreatCircleMetric metric;
+    Xapian::LatLongCoord centre(latitude, longitude);
+    Xapian::LatLongDistancePostingSource ps(valuesmap["geo.position"], centre, metric, distance);
+    if ( this->query.empty()) {
+      query = Xapian::Query(&ps);
+    } else {
+      query = Xapian::Query(Xapian::Query::OP_FILTER, query, Xapian::Query(&ps));
+    }
+  }
+
+  enquire.set_query(query);
+
+  // In suggestion mode the results are sorted by Levenshtein distance to
+  // the query, unless there are too many matches to sort.
+  size_t value_index = 0;
+  bool sort_by_distance = false;
+  if (suggestion_mode) {
+    bool has_custom_distance_maker = true;
+    if ( !valuesmap.empty() ) {
+      if ( valuesmap.find("title") != valuesmap.end() ) {
+        value_index = valuesmap["title"];
+      } else {
+        // This should not happen as valuesmap has a title entry, but let's
+        // be tolerant.
+        has_custom_distance_maker = false;
+      }
+    }
+    auto temp_results = enquire.get_mset(0,0);
+    sort_by_distance = has_custom_distance_maker
+        && temp_results.get_matches_estimated() <= MAX_MATCHES_TO_SORT;
+  }
+  // Declared after `enquire` and kept alive until get_mset() has run; being
+  // a local object it is also released if get_mset() throws.
+  LevenshteinDistanceMaker keyMaker(this->query, value_index);
+  if (sort_by_distance) {
+    enquire.set_sort_by_key(&keyMaker, false);
+  }
+
+  internal->results = enquire.get_mset(this->range_start, this->range_end-this->range_start);
+  search_started = true;
+  estimated_matches_number = internal->results.get_matches_estimated();
+  return new search_iterator::InternalData(this, internal->results.begin());
+#else
+  estimated_matches_number = 0;
+  return nullptr;
+#endif
+}
+
+/* Returns the past-the-end iterator of the results; a null iterator when
+   no database was opened or xapian support is disabled. */
+Search::iterator Search::end() const {
+#if defined(ENABLE_XAPIAN)
+  if ( ! has_database ) {
+    return nullptr;
+  }
+  return new search_iterator::InternalData(this, internal->results.end());
+#else
+  return nullptr;
+#endif
+}
+
+int Search::get_matches_estimated() const {
+  // Ensure that the search has begun
+  begin();
+  return estimated_matches_number;
+}
+
+} //namespace zim
diff --git a/src/search_internal.h b/src/search_internal.h
new file mode 100644
index 0000000..8781463
--- /dev/null
+++ b/src/search_internal.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_SEARCH_INTERNAL_H +#define ZIM_SEARCH_INTERNAL_H + +#include "config.h" + +#if defined(ENABLE_XAPIAN) +#include +#endif + +namespace zim { + +struct Search::InternalData { +#if defined(ENABLE_XAPIAN) + std::vector xapian_databases; + Xapian::Database database; + Xapian::MSet results; +#endif +}; + +struct search_iterator::InternalData { +#if defined(ENABLE_XAPIAN) + const Search* search; + Xapian::MSetIterator iterator; + Xapian::Document _document; + bool document_fetched; +#endif + Article _article; + bool article_fetched; + + +#if defined(ENABLE_XAPIAN) + InternalData(const Search* search, Xapian::MSetIterator iterator) : + search(search), + iterator(iterator), + document_fetched(false), + article_fetched(false) + {}; + + Xapian::Document get_document() { + if ( !document_fetched ) { + if (iterator != search->internal->results.end()) { + _document = iterator.get_document(); + } + document_fetched = true; + } + return _document; + } +#endif + + int get_databasenumber() { +#if defined(ENABLE_XAPIAN) + Xapian::docid docid = *iterator; + return (docid - 1) % search->zimfiles.size(); // NOTE(review): the result indexes zimfiles, but Search::begin() skips zim files without a xapian index - presumably only correct when every zim file contributed a database; confirm +#endif + return 0; + } + + Article& get_article() { +#if defined(ENABLE_XAPIAN) + if ( !article_fetched ) { + int databasenumber = get_databasenumber(); + const File* file = search->zimfiles[databasenumber]; + if ( !
file ) + _article = Article(); + else + _article = file->getArticleByUrl(get_document().get_data()); + article_fetched = true; + } +#endif + return _article; + } +}; + + + +}; //namespace zim + +#endif //ZIM_SEARCH_INTERNAL_H diff --git a/src/search_iterator.cpp b/src/search_iterator.cpp new file mode 100644 index 0000000..c950305 --- /dev/null +++ b/src/search_iterator.cpp @@ -0,0 +1,239 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "xapian/myhtmlparse.h" +#include +#include +#include +#include "search_internal.h" + +namespace zim { + + +search_iterator::~search_iterator() = default; +search_iterator::search_iterator(search_iterator&& it) = default; +search_iterator& search_iterator::operator=(search_iterator&& it) = default; + +search_iterator::search_iterator() : search_iterator(nullptr) +{}; + +search_iterator::search_iterator(InternalData* internal_data) + : internal(internal_data) +{} + +search_iterator::search_iterator(const search_iterator& it) + : internal(nullptr) +{ + if (it.internal) internal = std::unique_ptr(new InternalData(*it.internal)); +} + +search_iterator & search_iterator::operator=(const search_iterator& it) { + if ( ! it.internal ) internal.reset(); + else if ( !
internal ) internal = std::unique_ptr(new InternalData(*it.internal)); + else *internal = *it.internal; + + return *this; +} + +bool search_iterator::operator==(const search_iterator& it) const { +#if defined(ENABLE_XAPIAN) + if ( ! internal && ! it.internal) + return true; + if ( ! internal || ! it.internal) + return false; + return (internal->search == it.internal->search + && internal->iterator == it.internal->iterator); +#else + // If there is no xapian, there is no search. There is only one iterator: end. + // So all iterators are equal. + return true; +#endif +} + +bool search_iterator::operator!=(const search_iterator& it) const { + return ! (*this == it); +} + +search_iterator& search_iterator::operator++() { +#if defined(ENABLE_XAPIAN) + if ( ! internal ) { + return *this; + } + ++(internal->iterator); + internal->document_fetched = false; + internal->article_fetched = false; +#endif + return *this; +} + +search_iterator search_iterator::operator++(int) { + search_iterator it = *this; + operator++(); + return it; +} + +search_iterator& search_iterator::operator--() { +#if defined(ENABLE_XAPIAN) + if ( ! internal ) { + return *this; + } + --(internal->iterator); + internal->document_fetched = false; + internal->article_fetched = false; +#endif + return *this; +} + +search_iterator search_iterator::operator--(int) { + search_iterator it = *this; + operator--(); + return it; +} + +std::string search_iterator::get_url() const { +#if defined(ENABLE_XAPIAN) + if ( ! internal ) { + return ""; + } + return internal->get_document().get_data(); +#else + return ""; +#endif +} + +std::string search_iterator::get_title() const { +#if defined(ENABLE_XAPIAN) + if ( ! internal ) { + return ""; + } + if ( internal->search->valuesmap.empty() ) + { + /* This is the old legacy version.
Guess and try */ + return internal->get_document().get_value(0); + } + else if ( internal->search->valuesmap.find("title") != internal->search->valuesmap.end() ) + { + return internal->get_document().get_value(internal->search->valuesmap["title"]); + } +#endif + return ""; +} + +int search_iterator::get_score() const { +#if defined(ENABLE_XAPIAN) + if ( ! internal ) { + return 0; + } + return internal->iterator.get_percent(); +#else + return 0; +#endif +} + +std::string search_iterator::get_snippet() const { +#if defined(ENABLE_XAPIAN) + if ( ! internal ) { + return ""; + } + if ( internal->search->valuesmap.empty() ) + { + /* This is the old legacy version. Guess and try */ + std::string stored_snippet = internal->get_document().get_value(1); + if ( ! stored_snippet.empty() ) + return stored_snippet; + /* Let's continue here, and see if we can generate one */ + } + else if ( internal->search->valuesmap.find("snippet") != internal->search->valuesmap.end() ) + { + return internal->get_document().get_value(internal->search->valuesmap["snippet"]); + } + /* No reader, no snippet */ + Article& article = internal->get_article(); + if ( ! article.good() ) + return ""; + /* Get the content of the article to generate a snippet. + We parse it and use the html dump to avoid remove html tags in the + content and be able to nicely cut the text at random place. */ + zim::MyHtmlParser htmlParser; + std::string content = article.getData(); + try { + htmlParser.parse_html(content, "UTF-8", true); + } catch (...) {} + return internal->search->internal->results.snippet(htmlParser.dump, 500); +#else + return ""; +#endif +} + +int search_iterator::get_size() const { +#if defined(ENABLE_XAPIAN) + if ( ! internal ) { + return -1; + } + if ( internal->search->valuesmap.empty() ) + { + /* This is the old legacy version. Guess and try */ + return internal->get_document().get_value(2).empty() == true ?
-1 : atoi(internal->get_document().get_value(2).c_str()); + } + else if ( internal->search->valuesmap.find("size") != internal->search->valuesmap.end() ) + { + return atoi(internal->get_document().get_value(internal->search->valuesmap["size"]).c_str()); + } +#endif + /* The size is never used. Do we really want to get the content and + calculate the size ? */ + return -1; +} + +int search_iterator::get_wordCount() const { +#if defined(ENABLE_XAPIAN) + if ( ! internal ) { + return -1; + } + if ( internal->search->valuesmap.empty() ) + { + /* This is the old legacy version. Guess and try */ + return internal->get_document().get_value(3).empty() == true ? -1 : atoi(internal->get_document().get_value(3).c_str()); + } + else if ( internal->search->valuesmap.find("wordcount") != internal->search->valuesmap.end() ) + { + return atoi(internal->get_document().get_value(internal->search->valuesmap["wordcount"]).c_str()); + } +#endif + return -1; +} + +int search_iterator::get_fileIndex() const { +#if defined(ENABLE_XAPIAN) + if ( internal ) { + return internal->get_databasenumber(); + } +#endif + return 0; +} + +search_iterator::reference search_iterator::operator*() const { + return internal->get_article(); // NOTE(review): unlike the getters above, `internal` is not checked for null here +} + +search_iterator::pointer search_iterator::operator->() const { + return &internal->get_article(); // NOTE(review): same as operator*, no null check on `internal` +} + +} // namespace zim diff --git a/src/template.cpp b/src/template.cpp new file mode 100644 index 0000000..75e4bb8 --- /dev/null +++ b/src/template.cpp @@ -0,0 +1,142 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "template.h"
+
+// The parser is a character driven state machine. Plain text is collected
+// in `data`; "<%token%>" is reported through onToken and "<%/N/title%>"
+// (N being a single namespace character) through onLink. The text
+// collected in front of a token or link is reported through onData first.
+
+namespace zim
+{
+  // Plain text; a '<' may start a token, its position is kept in `save`.
+  void TemplateParser::state_data(char ch)
+  {
+    data += ch;
+
+    if (ch == '<')
+    {
+      state = &TemplateParser::state_lt;
+      save = data.size() - 1;
+    }
+  }
+
+  // Character following a '<'.
+  void TemplateParser::state_lt(char ch)
+  {
+    data += ch;
+
+    if (ch == '%')
+      state = &TemplateParser::state_token0;
+    else if (ch == '<')
+    {
+      // "<<%": the previous '<' is plain text and this one may open the
+      // token, so restart from here instead of dropping back to state_data.
+      save = data.size() - 1;
+    }
+    else
+      state = &TemplateParser::state_data;
+  }
+
+  // First character after "<%": '/' introduces a link, anything else is
+  // the first character of a token name.
+  void TemplateParser::state_token0(char ch)
+  {
+    data += ch;
+
+    if (ch == '/')
+      state = &TemplateParser::state_link0;
+    else
+    {
+      token = data.size() - 1;
+      state = &TemplateParser::state_token;
+    }
+  }
+
+  // Inside a token name, waiting for the closing '%'.
+  void TemplateParser::state_token(char ch)
+  {
+    data += ch;
+
+    if (ch == '%')
+      state = &TemplateParser::state_token_end;
+  }
+
+  // Character after the '%' that may close a token.
+  void TemplateParser::state_token_end(char ch)
+  {
+    if (ch == '>')
+    {
+      if (event)
+      {
+        event->onData(data.substr(0, save));
+        event->onToken(data.substr(token, data.size() - token - 1));
+      }
+
+      // cleared even without an event sink, as state_title_end does
+      data.clear();
+      state = &TemplateParser::state_data;
+    }
+    else
+    {
+      data += ch;
+      state = &TemplateParser::state_data;
+    }
+  }
+
+  // Namespace character of a link ("<%/N").
+  void TemplateParser::state_link0(char ch)
+  {
+    data += ch;
+
+    ns = ch;
+    state = &TemplateParser::state_link;
+  }
+
+  // The namespace character must be followed by '/'.
+  void TemplateParser::state_link(char ch)
+  {
+    data += ch;
+
+    if (ch == '/')
+    {
+      token = data.size();
+      state = &TemplateParser::state_title;
+    }
+    else
+      state = &TemplateParser::state_data;
+  }
+
+  // Inside the link title, waiting for the closing '%'.
+  void TemplateParser::state_title(char ch)
+  {
+    data += ch;
+
+    if (ch == '%')
+    {
+      token_e = data.size() - 1;
+      state = &TemplateParser::state_title_end;
+    }
+  }
+
+  // After the '%' that ended the title; the link is reported on '>'.
+  void TemplateParser::state_title_end(char ch)
+  {
+    data += ch;
+
+    if (ch == '>')
+    {
+      if (event)
+      {
+        event->onData(data.substr(0, save));
+        event->onLink(ns, data.substr(token, token_e - token));
+      }
+
+      data.clear();
+      state = &TemplateParser::state_data;
+    }
+  }
+
+  // Reports the text collected so far as data and restarts the parser.
+  void TemplateParser::flush()
+  {
+    if (event)
+      event->onData(data);
+    data.clear();
+    state = &TemplateParser::state_data;
+  }
+}
diff --git a/src/template.h b/src/template.h
new file mode 100644
index 0000000..2c01a42
--- /dev/null
+++ b/src/template.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
/**
 * Character-driven state machine that splits template text into literal
 * data, "<%name%>" tokens and "<%/n/title%>" links.
 *
 * Input is fed with parse(); recognised pieces are delivered to the Event
 * handler given at construction.  flush() reports whatever is still
 * buffered as data and resets the parser.  The handler pointer is not owned
 * and may be null, in which case nothing is reported.
 */
class TemplateParser
{
  public:
    /** Receiver of the pieces recognised by the parser. */
    class Event
    {
      public:
        /// Literal text preceding a marker, or left over at flush().
        virtual void onData(const std::string& data) = 0;
        /// Name found inside a "<%name%>" marker.
        virtual void onToken(const std::string& token) = 0;
        /// Namespace character and title of a "<%/n/title%>" marker.
        virtual void onLink(char ns, const std::string& url) = 0;
        virtual ~Event() = default;
    };

  private:
    Event* event;

    std::string data;                       // text collected so far
    // The positions below are only meaningful while a marker is being
    // parsed; they are zero-initialised so that no member is ever read
    // indeterminate.
    std::string::size_type save = 0;        // index of the opening '<'
    std::string::size_type token = 0;       // start of token name / title
    std::string::size_type token_e = 0;     // one past the end of the title
    char ns = '\0';                         // namespace character of a link
    typedef void (TemplateParser::*state_type)(char);

    state_type state;                       // handler for the next character

    void state_data(char ch);
    void state_lt(char ch);
    void state_token0(char ch);
    void state_token(char ch);
    void state_token_end(char ch);
    void state_link0(char ch);
    void state_link(char ch);
    void state_title(char ch);
    void state_title_end(char ch);

  public:
    /// @param ev  handler receiving the parse events (not owned, may be null)
    explicit TemplateParser(Event* ev)
      : event(ev),
        state(&TemplateParser::state_data)
      { }

    /// Feeds a single character to the state machine.
    void parse(char ch)
    {
      (this->*state)(ch);
    }

    /// Feeds every character of @p s, in order.
    void parse(const std::string& s)
    {
      for (char ch : s)
        parse(ch);
    }

    /// Reports the buffered text as data and returns to the initial state.
    void flush();

};
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#include "tools.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef _WIN32 +# include +# include +# include +# include +# define SEPARATOR "\\" +#else +# include +# define SEPARATOR "/" +#endif + + +std::string zim::removeAccents(const std::string& text) +{ + ucnv_setDefaultName("UTF-8"); + static UErrorCode status = U_ZERO_ERROR; + static std::unique_ptr removeAccentsTrans(icu::Transliterator::createInstance( + "Lower; NFD; [:M:] remove; NFC", UTRANS_FORWARD, status)); + icu::UnicodeString ustring(text.c_str()); + removeAccentsTrans->transliterate(ustring); + std::string unaccentedText; + ustring.toUTF8String(unaccentedText); + return unaccentedText; +} diff --git a/src/tools.h b/src/tools.h new file mode 100644 index 0000000..294a44b --- /dev/null +++ b/src/tools.h @@ -0,0 +1,32 @@ +/* + * Copyright 2013-2016 Emmanuel Engelhart + * Copyright 2016 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef OPENZIM_LIBZIM_TOOLS_H +#define OPENZIM_LIBZIM_TOOLS_H + +#include + +namespace zim { + + std::string removeAccents(const std::string& text); + +} + +#endif // OPENZIM_LIBZIM_TOOLS_H diff --git a/src/uuid.cpp b/src/uuid.cpp new file mode 100644 index 0000000..b14d16b --- /dev/null +++ b/src/uuid.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include +#include // necessary to have the new types +#include "log.h" +#include "md5stream.h" + +#ifdef _WIN32 + +# include +# include +int gettimeofday(struct timeval* tp, void* tzp) { + DWORD t; + t = timeGetTime(); + tp->tv_sec = t / 1000; + tp->tv_usec = t % 1000; + return 0; +} + +#define getpid GetCurrentProcessId + +#else +# include +#endif + +log_define("zim.uuid") + +namespace zim +{ + namespace + { + char hex[] = "0123456789abcdef"; + inline char hi(char v) + { return hex[(v >> 4) & 0xf]; } + + inline char lo(char v) + { return hex[v & 0xf]; } + } + + Uuid Uuid::generate(std::string value) + { + Uuid ret; + Md5stream m; + + if ( value.empty() ) { + struct timeval tv; + gettimeofday(&tv, 0); + + clock_t c = clock(); + + m << c << tv.tv_sec << tv.tv_usec; + } else { + m << value; + } + m.getDigest(reinterpret_cast(&ret.data[0])); + + log_debug("generated uuid: " << ret.data); + + return ret; + } + + std::ostream& operator<< (std::ostream& out, const Uuid& uuid) + { + for (unsigned n = 0; n < 4; ++n) + out << hi(uuid.data[n]) << lo(uuid.data[n]); + out << '-'; + for (unsigned n = 4; n < 6; ++n) + out << hi(uuid.data[n]) << lo(uuid.data[n]); + out << '-'; + for (unsigned n = 6; n < 8; ++n) + out << hi(uuid.data[n]) << lo(uuid.data[n]); + out << '-'; + for (unsigned n = 8; n < 10; ++n) + out << hi(uuid.data[n]) << lo(uuid.data[n]); + out << '-'; + for (unsigned n = 10; n < 16; ++n) + out << hi(uuid.data[n]) << lo(uuid.data[n]); + return out; + } + +} diff --git a/src/writer/_dirent.h b/src/writer/_dirent.h new file mode 100644 index 0000000..65ad0d9 --- /dev/null +++ b/src/writer/_dirent.h @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it 
and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_DIRENT_H +#define ZIM_WRITER_DIRENT_H + +#include "../_dirent.h" +#include "cluster.h" + +namespace zim +{ + namespace writer + { + class Dirent : public zim::Dirent + { + Cluster* cluster = nullptr; + std::string aid; + std::string redirectAid; + article_index_t idx = article_index_t(0); + + public: + Dirent() {} + + Dirent(const std::string& aid_) + : aid(aid_) + {} + + Dirent(char ns, const std::string& url) + { setUrl(ns, url); } + + void setAid(const std::string& aid_) { aid = aid_; } + const std::string& getAid() const { return aid; } + + void setRedirectAid(const std::string& aid_) { redirectAid = aid_; } + const std::string& getRedirectAid() const { return redirectAid; } + + void setIdx(article_index_t idx_) { idx = idx_; } + article_index_t getIdx() const { return idx; } + + void setCluster(zim::writer::Cluster* _cluster) + { cluster = _cluster; blobNumber = _cluster->count(); } + + cluster_index_t getClusterNumber() const { return cluster ? 
cluster->getClusterIndex() : clusterNumber; } + }; + + std::ostream& operator<< (std::ostream& out, const Dirent& d); + + inline bool compareUrl(const Dirent& d1, const Dirent& d2) + { + return d1.getNamespace() < d2.getNamespace() + || (d1.getNamespace() == d2.getNamespace() + && d1.getUrl() < d2.getUrl()); + } + + inline bool compareAid(const Dirent& d1, const Dirent& d2) + { + return d1.getAid() < d2.getAid(); + } + + } +} + +#endif // ZIM_WRITER_DIRENT_H + diff --git a/src/writer/article.cpp b/src/writer/article.cpp new file mode 100644 index 0000000..14aac4c --- /dev/null +++ b/src/writer/article.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include + +namespace zim +{ + namespace writer + { + bool Article::isLinktarget() const + { + return false; + } + + bool Article::isDeleted() const + { + return false; + } + std::string Article::getParameter() const + { + return std::string(); + } + + std::string Article::getNextCategory() + { + return std::string(); + } + + } +} diff --git a/src/writer/cluster.cpp b/src/writer/cluster.cpp new file mode 100644 index 0000000..08d30f3 --- /dev/null +++ b/src/writer/cluster.cpp @@ -0,0 +1,296 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "cluster.h" +#include "../log.h" +#include "../endian_tools.h" +#include "../debug.h" + +#include +#include + +#if defined(ENABLE_ZLIB) +#include "deflatestream.h" +#endif + +#include "lzmastream.h" + +#ifdef _WIN32 +#define SEPARATOR "\\" +#else +#define SEPARATOR "/" +#endif + +namespace zim { +namespace writer { + +Cluster::Cluster(CompressionType compression) + : compression(compression), + isExtended(false), + _size(0) +{ + offsets.push_back(offset_t(0)); + pthread_mutex_init(&m_closedMutex,NULL); +} + +void Cluster::clear() { + offsets.clear(); + _data.clear(); +} + +void Cluster::close() { + pthread_mutex_lock(&m_closedMutex); + closed = true; + pthread_mutex_unlock(&m_closedMutex); +} + +bool Cluster::isClosed() const{ + bool v; + pthread_mutex_lock(&m_closedMutex); + v = closed; + pthread_mutex_unlock(&m_closedMutex); + return v; +} + +zsize_t Cluster::size() const +{ + if (isClosed()) { + throw std::runtime_error("oups"); + } + if (isExtended) { + return zsize_t(offsets.size() * sizeof(uint64_t)) + _size; + } else { + return zsize_t(offsets.size() * sizeof(uint32_t)) + _size; + } +} + +zsize_t Cluster::getFinalSize() const +{ + return finalSize; +} + +template +void Cluster::write_offsets(std::ostream& out) const +{ + size_type delta = offsets.size() * sizeof(OFFSET_TYPE); + for (auto offset : offsets) + { + offset.v += delta; + char out_buf[sizeof(OFFSET_TYPE)]; + toLittleEndian(static_cast(offset.v), out_buf); + out.write(out_buf, sizeof(OFFSET_TYPE)); + } +} + +void Cluster::write_final(std::ostream& out) const +{ + if(getCompression() == zim::zimcompNone) + { + dump(out); + } else { + std::ifstream clustersFile(tmp_filename, std::ios::binary); + out << clustersFile.rdbuf(); + } + if (!out) { + throw 
std::runtime_error("failed to write cluster"); + } +} + +void Cluster::dump_tmp(const std::string& directoryPath) +{ + if(getCompression() == zim::zimcompNone) + { + //No real dump, store inmemory data in file + size_t file_index = 0; + for (auto& data: _data) + { + ASSERT(data.value.empty(), ==, false); + if (data.type == DataType::plain) { + std::ostringstream ss; + ss << directoryPath << SEPARATOR << "file_" << index << "_" << file_index << ".tmp"; + auto filename = ss.str(); + { + std::ofstream out(filename, std::ios::binary); + out << data.value; + if (!out) { + throw std::runtime_error( + std::string("failed to write temporary cluster file ") + + filename); + } + } + data.type = DataType::file; + data.value = filename; + } + file_index++; + } + finalSize = zsize_t(size().v+1); + } else { + std::ostringstream ss; + ss << directoryPath << SEPARATOR << "cluster_" << index << ".clt"; + tmp_filename = ss.str(); + std::ofstream out(tmp_filename, std::ios::binary); + dump(out); + if (!out) { + throw std::runtime_error( + std::string("failed to write temporary cluster file ") + + tmp_filename); + } + finalSize = zsize_t(out.tellp()); + clear(); + } +} + +void Cluster::write(std::ostream& out) const +{ + if (isExtended) { + write_offsets(out); + } else { + write_offsets(out); + } + write_data(out); +} + +void Cluster::dump(std::ostream& out) const +{ + // write clusterInfo + char clusterInfo = 0; + if (isExtended) { + clusterInfo = 0x10; + } + clusterInfo += getCompression(); + out.put(clusterInfo); + + // Open a comprestion stream if needed + switch(getCompression()) + { + case zim::zimcompDefault: + case zim::zimcompNone: + write(out); + break; + + case zim::zimcompZip: + { +#if defined(ENABLE_ZLIB) + log_debug("compress data (zlib)"); + zim::writer::DeflateStream os(out); + os.exceptions(std::ios::failbit | std::ios::badbit); + write(os); + os.flush(); + os.end(); +#else + throw std::runtime_error("zlib not enabled in this library"); +#endif + break; + } + + case 
zim::zimcompBzip2: + { + throw std::runtime_error("bzip2 not enabled in this library"); + break; + } + + case zim::zimcompLzma: + { + uint32_t lzmaPreset = 3 | LZMA_PRESET_EXTREME; + /** + * read lzma preset from environment + * ZIM_LZMA_PRESET is a number followed optionally by a + * suffix 'e'. The number gives the preset and the suffix tells, + * if LZMA_PRESET_EXTREME should be set. + * e.g.: + * ZIM_LZMA_LEVEL=9 => 9 + * ZIM_LZMA_LEVEL=3e => 3 + extreme + */ + const char* e = ::getenv("ZIM_LZMA_LEVEL"); + if (e) + { + char flag = '\0'; + std::istringstream s(e); + s >> lzmaPreset >> flag; + if (flag == 'e') + lzmaPreset |= LZMA_PRESET_EXTREME; + } + + log_debug("compress data (lzma, " << std::hex << lzmaPreset << ")"); + zim::writer::LzmaStream os(out, lzmaPreset); + os.exceptions(std::ios::failbit | std::ios::badbit); + write(os); + os.end(); + break; + } + + default: + std::ostringstream msg; + msg << "invalid compression flag " << getCompression(); + log_error(msg.str()); + throw std::runtime_error(msg.str()); + } +} + +void Cluster::addArticle(const zim::writer::Article* article) +{ + auto filename = article->getFilename(); + auto size = article->getSize(); + _size += size; + offsets.push_back(offset_t(_size.v)); + isExtended |= (size>UINT32_MAX); + if (size == 0) + return; + + if (filename.empty()) { + _data.emplace_back(DataType::plain, article->getData()); + } + else { + _data.emplace_back(DataType::file, filename); + } +} + +void Cluster::addData(const char* data, zsize_t size) +{ + _size += size; + offsets.push_back(offset_t(_size.v)); + isExtended |= (size.v>UINT32_MAX); + if (size.v == 0) + return; + + _data.emplace_back(DataType::plain, data, size.v); +} + +void Cluster::write_data(std::ostream& out) const +{ + for (auto& data: _data) + { + ASSERT(data.value.empty(), ==, false); + if (data.type == DataType::plain) { + out << data.value; + } else { + std::ifstream stream(data.value, std::ios::binary); + if (!stream) { + throw 
std::runtime_error(std::string("cannot open ") + data.value); + } + out << stream.rdbuf(); + if (!out) { + throw std::runtime_error(std::string("failed to write file ") + data.value); + } + } + } +} + +} // writer +} // zim diff --git a/src/writer/cluster.h b/src/writer/cluster.h new file mode 100644 index 0000000..6d6b01f --- /dev/null +++ b/src/writer/cluster.h @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_CLUSTER_H_ +#define ZIM_WRITER_CLUSTER_H_ + +#include +#include +#include +#include +#include + +#include +#include "../zim_types.h" + +namespace zim { + +namespace writer { + +enum class DataType { plain, file }; +struct Data { + Data(zim::writer::DataType type, const std::string& value) : + type(type), value(value) {} + Data(zim::writer::DataType type, const char* data, zim::size_type size) : + type(type), value(data, size) {} + DataType type; + std::string value; +}; + +class Cluster { + typedef std::vector Offsets; + typedef std::vector ClusterData; + + + public: + Cluster(CompressionType compression); + virtual ~Cluster() { pthread_mutex_destroy(&m_closedMutex);} + + void setCompression(CompressionType c) { compression = c; } + CompressionType 
getCompression() const { return compression; } + + void addArticle(const zim::writer::Article* article); + void addData(const char* data, zsize_t size); + + blob_index_t count() const { return blob_index_t(offsets.size() - 1); } + zsize_t size() const; + zsize_t getFinalSize() const; + bool is_extended() const { return isExtended; } + void clear(); + void close(); + bool isClosed() const; + + void setClusterIndex(cluster_index_t idx) { index = idx; } + cluster_index_t getClusterIndex() const { return index; } + + zsize_t getBlobSize(blob_index_t n) const + { return zsize_t(offsets[blob_index_type(n)+1].v - offsets[blob_index_type(n)].v); } + + void write_final(std::ostream& out) const; + void dump_tmp(const std::string& directoryPath); + void dump(std::ostream& out) const; + + protected: + CompressionType compression; + cluster_index_t index; + bool isExtended; + Offsets offsets; + zsize_t _size; + zsize_t finalSize; + ClusterData _data; + std::string tmp_filename; + mutable pthread_mutex_t m_closedMutex; + bool closed = false; + + private: + void write(std::ostream& out) const; + template + void write_offsets(std::ostream& out) const; + void write_data(std::ostream& out) const; + +}; + +}; + +}; + + +#endif //ZIM_WRITER_CLUSTER_H_ diff --git a/src/writer/deflatestream.cpp b/src/writer/deflatestream.cpp new file mode 100644 index 0000000..4a96521 --- /dev/null +++ b/src/writer/deflatestream.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (C) 2003-2005 Tommi Maekitalo + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + + +#include "deflatestream.h" +#include "log.h" +#include +#include + +log_define("zim.deflatestream") + +namespace zim +{ + +namespace writer +{ + namespace + { + int checkError(int ret, z_stream& stream) + { + if (ret != Z_OK && ret != Z_STREAM_END) + { + log_error("DeflateError " << ret << ": \"" << (stream.msg ? stream.msg : "") << '"'); + std::ostringstream msg; + msg << "deflate-error " << ret; + if (stream.msg) + msg << ": " << stream.msg; + throw DeflateError(ret, msg.str()); + } + return ret; + } + } + + DeflateStreamBuf::DeflateStreamBuf(std::streambuf* sink_, int level, unsigned bufsize_) + : obuffer(bufsize_), + sink(sink_) + { + memset(&stream, 0, sizeof(z_stream)); + stream.zalloc = Z_NULL; + stream.zfree = Z_NULL; + stream.opaque = 0; + stream.total_out = 0; + stream.total_in = 0; + stream.next_in = Z_NULL; + stream.next_out = Z_NULL; + stream.avail_in = 0; + stream.avail_out = 0; + + checkError(::deflateInit(&stream, level), stream); + setp(&obuffer[0], &obuffer[0] + obuffer.size()); + } + + DeflateStreamBuf::~DeflateStreamBuf() + { + ::deflateEnd(&stream); + } + + DeflateStreamBuf::int_type DeflateStreamBuf::overflow(int_type c) + { + // initialize input-stream + stream.next_in = reinterpret_cast(&obuffer[0]); + stream.avail_in = pptr() - &obuffer[0]; + + // initialize zbuffer for deflated data + char zbuffer[8192]; + stream.next_out = reinterpret_cast(zbuffer); + stream.avail_out = sizeof(zbuffer); + + // deflate + checkError(::deflate(&stream, Z_NO_FLUSH), stream); + + // copy zbuffer to sink / consume deflated data + std::streamsize count = sizeof(zbuffer) - stream.avail_out; + if (count > 0) + { + std::streamsize n = sink->sputn(zbuffer, count); + 
if (n < count) + return traits_type::eof(); + } + + // move remaining characters to start of obuffer + if (stream.avail_in > 0) + memmove(&obuffer[0], stream.next_in, stream.avail_in); + + // reset outbuffer + setp(&obuffer[0] + stream.avail_in, &obuffer[0] + obuffer.size()); + if (c != traits_type::eof()) + sputc(traits_type::to_char_type(c)); + + return 0; + } + + DeflateStreamBuf::int_type DeflateStreamBuf::underflow() + { + return traits_type::eof(); + } + + int DeflateStreamBuf::sync() + { + // initialize input-stream for + stream.next_in = reinterpret_cast(&obuffer[0]); + stream.avail_in = pptr() - &obuffer[0]; + char zbuffer[8192]; + while (stream.avail_in > 0) + { + // initialize zbuffer + stream.next_out = (Bytef*)zbuffer; + stream.avail_out = sizeof(zbuffer); + + checkError(::deflate(&stream, Z_SYNC_FLUSH), stream); + + // copy zbuffer to sink + std::streamsize count = sizeof(zbuffer) - stream.avail_out; + if (count > 0) + { + std::streamsize n = sink->sputn(zbuffer, count); + if (n < count) + return -1; + } + }; + + // reset outbuffer + setp(&obuffer[0], &obuffer[0] + obuffer.size()); + return 0; + } + + int DeflateStreamBuf::end() + { + char zbuffer[8192]; + // initialize input-stream for + stream.next_in = reinterpret_cast(&obuffer[0]); + stream.avail_in = pptr() - &obuffer[0]; + while (true) + { + // initialize zbuffer + stream.next_out = (Bytef*)zbuffer; + stream.avail_out = sizeof(zbuffer); + + int ret = checkError(::deflate(&stream, Z_FINISH), stream); + + // copy zbuffer to sink + std::streamsize count = sizeof(zbuffer) - stream.avail_out; + if (count > 0) + { + std::streamsize n = sink->sputn(zbuffer, count); + if (n < count) + throw DeflateError(0, "failed to send compressed data to sink in deflatestream"); + } + if (ret == Z_STREAM_END) + break; + }; + + // reset outbuffer + setp(&obuffer[0], &obuffer[0] + obuffer.size()); + return 0; + } + + void DeflateStream::end() + { + if (streambuf.end() != 0) + setstate(failbit); + } + +} +} diff --git 
/**
 * Exception raised when a zlib deflate operation fails.
 *
 * Carries the zlib return code in addition to the message that is
 * available through what().
 */
class DeflateError : public std::runtime_error
{
  public:
    /// @param zRet_  zlib return code
    /// @param msg    human readable description
    DeflateError(int zRet_, const std::string& msg)
      : std::runtime_error(msg)
      , code_(zRet_)
    { }

    /// zlib return code given at construction.
    int getRet() const
    {
      return code_;
    }

  private:
    int code_;
};
streambuf; + + public: + explicit DeflateStream(std::streambuf* sink, int level = Z_DEFAULT_COMPRESSION) + : std::ostream(0), + streambuf(sink, level) + { init(&streambuf); } + explicit DeflateStream(std::ostream& sink, int level = Z_DEFAULT_COMPRESSION) + : std::ostream(0), + streambuf(sink.rdbuf(), level) + { init(&streambuf); } + + void end(); + void setSink(std::streambuf* sink) { streambuf.setSink(sink); } + void setSink(std::ostream& sink) { streambuf.setSink(sink.rdbuf()); } + uLong getAdler() const { return streambuf.getAdler(); } + }; + +} +} + +#endif // ZIM_WRITER_DEFLATESTREAM_H + diff --git a/src/writer/dirent.cpp b/src/writer/dirent.cpp new file mode 100644 index 0000000..8125e50 --- /dev/null +++ b/src/writer/dirent.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "_dirent.h" +#include +#include "buffer.h" +#include "endian_tools.h" +#include "log.h" +#include +#include + +log_define("zim.dirent") + +std::ostream& zim::writer::operator<< (std::ostream& out, const zim::writer::Dirent& dirent) +{ + union + { + char d[16]; + long a; + } header; + zim::toLittleEndian(dirent.getMimeType(), header.d); + header.d[2] = static_cast(dirent.getParameter().size()); + header.d[3] = dirent.getNamespace(); + + log_debug("title=" << dirent.getTitle() << " title.size()=" << dirent.getTitle().size()); + + zim::toLittleEndian(dirent.getVersion(), header.d + 4); + + if (dirent.isRedirect()) + { + zim::toLittleEndian(dirent.getRedirectIndex().v, header.d + 8); + out.write(header.d, 12); + } + else if (dirent.isLinktarget() || dirent.isDeleted()) + { + out.write(header.d, 8); + } + else + { + zim::toLittleEndian(zim::cluster_index_type(dirent.getClusterNumber()), header.d + 8); + zim::toLittleEndian(zim::blob_index_type(dirent.getBlobNumber()), header.d + 12); + out.write(header.d, 16); + } + + out << dirent.getUrl() << '\0'; + + std::string t = dirent.getTitle(); + if (t != dirent.getUrl()) + out << t; + out << '\0' << dirent.getParameter(); + + return out; +} diff --git a/src/writer/lzmastream.cpp b/src/writer/lzmastream.cpp new file mode 100644 index 0000000..403ad7d --- /dev/null +++ b/src/writer/lzmastream.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "lzmastream.h" +#include +#include "log.h" +#include +#include + +log_define("zim.lzma.compress") + +namespace zim +{ + +namespace writer +{ + namespace + { + lzma_ret checkError(lzma_ret ret) + { + if (ret != LZMA_OK && ret != LZMA_STREAM_END) + { + std::ostringstream msg; + msg << "lzma-error " << ret; + switch (ret) + { + case LZMA_OK: msg << ": LZMA_OK"; break; + case LZMA_STREAM_END: msg << ": LZMA_STREAM_END"; break; + case LZMA_NO_CHECK: msg << ": LZMA_NO_CHECK"; break; + case LZMA_UNSUPPORTED_CHECK: msg << ": LZMA_UNSUPPORTED_CHECK"; break; + case LZMA_GET_CHECK: msg << ": LZMA_GET_CHECK"; break; + case LZMA_MEM_ERROR: msg << ": LZMA_MEM_ERROR"; break; + case LZMA_MEMLIMIT_ERROR: msg << ": LZMA_MEMLIMIT_ERROR"; break; + case LZMA_FORMAT_ERROR: msg << ": LZMA_FORMAT_ERROR"; break; + case LZMA_OPTIONS_ERROR: msg << ": LZMA_OPTIONS_ERROR"; break; + case LZMA_DATA_ERROR: msg << ": LZMA_DATA_ERROR"; break; + case LZMA_BUF_ERROR: msg << ": LZMA_BUF_ERROR"; break; + case LZMA_PROG_ERROR: msg << ": LZMA_PROG_ERROR"; break; + } + log_error(msg.str()); + throw LzmaError(ret, msg.str()); + } + return ret; + } + } + + LzmaStreamBuf::LzmaStreamBuf(std::streambuf* sink_, uint32_t preset, lzma_check check, unsigned bufsize_) + : obuffer(bufsize_), + sink(sink_) + { + std::memset(reinterpret_cast(&stream), 0, sizeof(stream)); + + checkError( + ::lzma_easy_encoder(&stream, preset, check)); + + setp(&obuffer[0], &obuffer[0] + obuffer.size()); + } + + 
LzmaStreamBuf::~LzmaStreamBuf() + { + ::lzma_end(&stream); + } + + LzmaStreamBuf::int_type LzmaStreamBuf::overflow(int_type c) + { + // initialize input-stream + stream.next_in = reinterpret_cast(&obuffer[0]); + stream.avail_in = pptr() - &obuffer[0]; + + // initialize zbuffer for compressed data + char zbuffer[8192]; + stream.next_out = reinterpret_cast(zbuffer); + stream.avail_out = sizeof(zbuffer); + + // compress + checkError(::lzma_code(&stream, LZMA_RUN)); + + // copy zbuffer to sink / consume deflated data + std::streamsize count = sizeof(zbuffer) - stream.avail_out; + if (count > 0) + { + std::streamsize n = sink->sputn(zbuffer, count); + if (n < count) + return traits_type::eof(); + } + + // move remaining characters to start of obuffer + if (stream.avail_in > 0) + memmove(&obuffer[0], stream.next_in, stream.avail_in); + + // reset outbuffer + setp(&obuffer[0] + stream.avail_in, &obuffer[0] + obuffer.size()); + if (c != traits_type::eof()) + sputc(traits_type::to_char_type(c)); + + return 0; + } + + LzmaStreamBuf::int_type LzmaStreamBuf::underflow() + { + return traits_type::eof(); + } + + int LzmaStreamBuf::sync() + { + // initialize input-stream for + stream.next_in = reinterpret_cast(&obuffer[0]); + stream.avail_in = pptr() - &obuffer[0]; + char zbuffer[8192]; + while (stream.avail_in > 0) + { + // initialize zbuffer + stream.next_out = (uint8_t*)zbuffer; + stream.avail_out = sizeof(zbuffer); + + checkError(::lzma_code(&stream, LZMA_FINISH)); + + // copy zbuffer to sink + std::streamsize count = sizeof(zbuffer) - stream.avail_out; + if (count > 0) + { + std::streamsize n = sink->sputn(zbuffer, count); + if (n < count) + return -1; + } + }; + + // reset outbuffer + setp(&obuffer[0], &obuffer[0] + obuffer.size()); + return 0; + } + + int LzmaStreamBuf::end() + { + char zbuffer[8192]; + // initialize input-stream for + stream.next_in = reinterpret_cast(&obuffer[0]); + stream.avail_in = pptr() - &obuffer[0]; + lzma_ret ret; + do + { + // initialize zbuffer 
+ stream.next_out = (uint8_t*)zbuffer; + stream.avail_out = sizeof(zbuffer); + + ret = checkError(::lzma_code(&stream, LZMA_FINISH)); + + // copy zbuffer to sink + std::streamsize count = sizeof(zbuffer) - stream.avail_out; + if (count > 0) + { + std::streamsize n = sink->sputn(zbuffer, count); + if (n < count) + throw LzmaError(static_cast(0), "failed to send compressed data to sink in lzmastream"); + } + } while (ret != LZMA_STREAM_END); + + // reset outbuffer + setp(&obuffer[0], &obuffer[0] + obuffer.size()); + return 0; + } + + void LzmaStream::end() + { + if (streambuf.end() != 0) + setstate(failbit); + } +} +} diff --git a/src/writer/lzmastream.h b/src/writer/lzmastream.h new file mode 100644 index 0000000..ccf9ded --- /dev/null +++ b/src/writer/lzmastream.h @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_LZMASTREAM_H +#define ZIM_WRITER_LZMASTREAM_H + +#include +#include +#include +#include + +namespace zim +{ +namespace writer +{ + class LzmaError : public std::runtime_error + { + lzma_ret ret; + + public: + LzmaError(lzma_ret ret_, const std::string& msg) + : std::runtime_error(msg), + ret(ret_) + { } + + lzma_ret getRetcode() const { return ret; } + }; + + class LzmaStreamBuf : public std::streambuf + { + lzma_stream stream; + std::vector obuffer; + std::streambuf* sink; + + public: + LzmaStreamBuf(std::streambuf* sink_, + uint32_t preset = 3 | LZMA_PRESET_EXTREME, + lzma_check check = LZMA_CHECK_CRC32 /* LZMA_CHECK_NONE */, + unsigned bufsize = 8192); + ~LzmaStreamBuf(); + + /// see std::streambuf + int_type overflow(int_type c); + /// see std::streambuf + int_type underflow(); + /// see std::streambuf + int sync(); + /// end stream + int end(); + + void setSink(std::streambuf* sink_) { sink = sink_; } + }; + + class LzmaStream : public std::ostream + { + LzmaStreamBuf streambuf; + + public: + explicit LzmaStream(std::streambuf* sink, + uint32_t preset = 3 | LZMA_PRESET_EXTREME, + lzma_check check = LZMA_CHECK_CRC32 /* LZMA_CHECK_NONE */, + unsigned bufsize = 8192) + : std::ostream(0), + streambuf(sink, preset, check, bufsize) + { init(&streambuf); } + explicit LzmaStream(std::ostream& sink, + uint32_t preset = 3 | LZMA_PRESET_EXTREME, + lzma_check check = LZMA_CHECK_CRC32 /* LZMA_CHECK_NONE */, + unsigned bufsize = 8192) + : std::ostream(0), + streambuf(sink.rdbuf(), preset, check, bufsize) + { init(&streambuf); } + + void end(); + void setSink(std::streambuf* sink) { streambuf.setSink(sink); } + void setSink(std::ostream& sink) { streambuf.setSink(sink.rdbuf()); } + }; +} +} + +#endif // ZIM_WRITER_LZMASTREAM_H diff 
#ifndef OPENZIM_LIBZIM_QUEUE_H
#define OPENZIM_LIBZIM_QUEUE_H

#define MAX_QUEUE_SIZE 10

#include <pthread.h>
#include <chrono>
#include <queue>
#include <thread>

/**
 * Mutex-protected FIFO with a soft upper bound.
 *
 * pushToQueue() blocks (polling with a growing sleep) while the queue
 * holds more than MAX_QUEUE_SIZE elements; popFromQueue() never blocks.
 */
template<typename T>
class Queue {
    public:
        Queue() { pthread_mutex_init(&m_queueMutex, NULL); }
        virtual ~Queue() { pthread_mutex_destroy(&m_queueMutex); }

        /// @return true when the queue currently holds no element.
        virtual bool isEmpty();

        /// Append a copy of `element`, waiting while the queue is over its bound.
        virtual void pushToQueue(const T& element);

        /// Move the oldest element into `filename`.
        /// @return false (leaving `filename` untouched) when the queue is empty.
        virtual bool popFromQueue(T &filename);

    protected:
        /// Holds the mutex for the lifetime of the object, so it is released
        /// on every exit path, including an exception thrown by T.
        class ScopedLock {
            public:
                explicit ScopedLock(pthread_mutex_t& mutex)
                  : m_mutex(mutex)
                { pthread_mutex_lock(&m_mutex); }
                ~ScopedLock() { pthread_mutex_unlock(&m_mutex); }

            private:
                pthread_mutex_t& m_mutex;
                ScopedLock(const ScopedLock&);
                ScopedLock& operator=(const ScopedLock&);
        };

        std::queue<T>   m_realQueue;
        pthread_mutex_t m_queueMutex;

    private:
        // Make this queue non copyable
        Queue(const Queue&);
        Queue& operator=(const Queue&);
};

template<typename T>
bool Queue<T>::isEmpty() {
    ScopedLock lock(m_queueMutex);
    return m_realQueue.empty();
}

template<typename T>
void Queue<T>::pushToQueue(const T &element) {
    unsigned int wait = 0;

    for (;;) {
        std::this_thread::sleep_for(std::chrono::microseconds(wait));
        {
            // The size check and the push happen under ONE lock acquisition;
            // checking and pushing under separate locks lets several
            // producers pass the check together and overshoot the bound.
            ScopedLock lock(m_queueMutex);
            if (m_realQueue.size() <= MAX_QUEUE_SIZE) {
                m_realQueue.push(element);
                return;
            }
        }
        // back off a little longer on every failed attempt
        wait += 10;
    }
}

template<typename T>
bool Queue<T>::popFromQueue(T &element) {
    ScopedLock lock(m_queueMutex);
    if (m_realQueue.empty()) {
        return false;
    }

    element = m_realQueue.front();
    m_realQueue.pop();
    return true;
}

#endif // OPENZIM_LIBZIM_QUEUE_H
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "tee.h" + +namespace zim +{ + +std::streambuf::int_type Teestreambuf::overflow(std::streambuf::int_type ch) +{ + if(ch != traits_type::eof()) + { + if(streambuf1 && streambuf1->sputc(ch) == traits_type::eof()) + return traits_type::eof(); + + if(streambuf2 && streambuf2->sputc(ch) == traits_type::eof()) + return traits_type::eof(); + } + + return 0; +} + +std::streambuf::int_type Teestreambuf::underflow() +{ + return traits_type::eof(); +} + +int Teestreambuf::sync() +{ + if(streambuf1 && streambuf1->pubsync() == traits_type::eof()) + return traits_type::eof(); + + if(streambuf2 && streambuf2->pubsync() == traits_type::eof()) + return traits_type::eof(); + + return 0; +} + +///////////////////////////////////////////////////////////////////////////// +void Tee::assign(std::ostream& s1, std::ostream& s2) +{ + Teestreambuf* buf = dynamic_cast(rdbuf()); + if(buf) + buf->tie(s1.rdbuf(), s2.rdbuf()); +} + +void Tee::assign_single(std::ostream& s) +{ + Teestreambuf* buf = dynamic_cast(rdbuf()); + if(buf) + buf->tie(s.rdbuf()); +} + +} diff --git a/src/writer/tee.h b/src/writer/tee.h new file mode 100644 index 0000000..b26005c --- /dev/null +++ b/src/writer/tee.h @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2003 Tommi Maekitalo + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * As a special exception, you may use this file as part of a free + * software library without restriction. 
Specifically, if other files + * instantiate templates or use macros or inline functions from this + * file, or you compile this file and link it with other files to + * produce an executable, this file does not by itself cause the + * resulting executable to be covered by the GNU General Public + * License. This exception does not however invalidate any other + * reasons why the executable file might be covered by the GNU Library + * General Public License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef ZIM_TEE_H +#define ZIM_TEE_H + +#include + +namespace zim +{ + +class Teestreambuf : public std::streambuf +{ + public: + Teestreambuf(std::streambuf* buf1 = 0, std::streambuf* buf2 = 0) + : streambuf1(buf1), + streambuf2(buf2) + { setp(0, 0); } + + void tie(std::streambuf* buf1, std::streambuf* buf2 = 0) + { + streambuf1 = buf1; + streambuf2 = buf2; + } + + private: + std::streambuf::int_type overflow(std::streambuf::int_type ch); + std::streambuf::int_type underflow(); + int sync(); + + std::streambuf* streambuf1; + std::streambuf* streambuf2; +}; + +///////////////////////////////////////////////////////////////////////////// + +class Tee : public std::ostream +{ + typedef std::ostream base_class; + Teestreambuf streambuf; + + public: + Tee() + : std::ostream(0), + streambuf(std::cout.rdbuf()) + { + init(&streambuf); + } + Tee(std::ostream& s1, std::ostream& s2) + : std::ostream(0), + streambuf(s1.rdbuf(), s2.rdbuf()) + { + init(&streambuf); + } + Tee(std::ostream& s) + : std::ostream(0), + streambuf(s.rdbuf(), 
std::cout.rdbuf()) + { + init(&streambuf); + } + + void assign(std::ostream& s1, std::ostream& s2); + void assign(std::ostream& s) + { assign(s, std::cout); } + void assign_single(std::ostream& s); +}; + +} + +#endif // ZIM_TEE_H diff --git a/src/writer/xapianIndexer.cpp b/src/writer/xapianIndexer.cpp new file mode 100644 index 0000000..6c8af05 --- /dev/null +++ b/src/writer/xapianIndexer.cpp @@ -0,0 +1,196 @@ +/* + * Copyright 2011 Emmanuel Engelhart + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. 
+ */ + +#include "xapianIndexer.h" +#include "libzim-resources.h" +#include "fs.h" +#include "tools.h" +#include +#include +#include + +/* Count word */ +unsigned int countWords(const string& text) +{ + unsigned int numWords = 1; + unsigned int length = text.size(); + + for (unsigned int i = 0; i < length;) { + while (i < length && text[i] != ' ') { + i++; + } + numWords++; + i++; + } + + return numWords; +} + +/* Constructor */ +XapianIndexer::XapianIndexer(const std::string& language, const bool verbose) + : language(language) +{ + /* Build ICU Local object to retrieve ISO-639 language code (from + ISO-639-3) */ + icu::Locale languageLocale(language.c_str()); + + /* Configuring language base steemming */ + try { + this->stemmer = Xapian::Stem(languageLocale.getLanguage()); + this->indexer.set_stemmer(this->stemmer); + this->indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_ALL); + } catch (...) { + std::cout << "No steemming for language '" << languageLocale.getLanguage() + << "'" << std::endl; + } + + /* Read the stopwords */ + std::string stopWord; + try { + this->stopwords = getResource("stopwords/" + language); + } catch(ResourceNotFound& e) {} + std::istringstream file(this->stopwords); + while (std::getline(file, stopWord, '\n')) { + this->stopper.add(stopWord); + } + + this->indexer.set_stopper(&(this->stopper)); + this->indexer.set_stopper_strategy(Xapian::TermGenerator::STOP_ALL); +} + +XapianIndexer::~XapianIndexer() +{ + if (!indexPath.empty()) { + try { +#ifndef _WIN32 +//[TODO] Implement remove for windows + zim::DEFAULTFS::remove(indexPath + ".tmp"); + zim::DEFAULTFS::remove(indexPath); +#endif + } catch (...) 
{ + /* Do not raise */ + } + } +} + +void XapianIndexer::indexingPrelude(const string indexPath_) +{ + indexPath = indexPath_; + this->writableDatabase = Xapian::WritableDatabase( + indexPath + ".tmp", Xapian::DB_CREATE_OR_OVERWRITE); + this->writableDatabase.set_metadata("valuesmap", "title:0;wordcount:1;geo.position:2"); + this->writableDatabase.set_metadata("language", language); + this->writableDatabase.set_metadata("stopwords", stopwords); + this->writableDatabase.set_metadata("prefixes", "S"); + this->writableDatabase.begin_transaction(true); +} + +void XapianIndexer::index(const zim::writer::Article* article) +{ + /* Put the data in the document */ + Xapian::Document currentDocument; + currentDocument.clear_values(); + currentDocument.set_data(std::string(1, article->getNamespace()) + "/" + article->getUrl()); + indexer.set_document(currentDocument); + zim::MyHtmlParser htmlParser; + + try { + htmlParser.parse_html(article->getData(), "UTF-8", true); + } catch (...) { + } + + if (htmlParser.dump.find("NOINDEX") != string::npos) + { + return; + } + + std::string accentedTitle = (htmlParser.title.empty() ? 
article->getTitle() : htmlParser.title); + std::string title = zim::removeAccents(accentedTitle); + std::string keywords = zim::removeAccents(htmlParser.keywords); + std::string content = zim::removeAccents(htmlParser.dump); + + currentDocument.add_value(0, title); + + std::stringstream countWordStringStream; + countWordStringStream << countWords(htmlParser.dump); + currentDocument.add_value(1, countWordStringStream.str()); + + if (htmlParser.has_geoPosition) { + auto geoPosition = Xapian::LatLongCoord( + htmlParser.latitude, htmlParser.longitude).serialise(); + currentDocument.add_value(2, geoPosition); + } + + /* Index the title */ + if (!title.empty()) { + this->indexer.index_text_without_positions( + title, this->getTitleBoostFactor(content.size())); + this->indexer.index_text(title, 1, "S"); + } + + /* Index the keywords */ + if (!keywords.empty()) { + this->indexer.index_text_without_positions(keywords, keywordsBoostFactor); + } + + /* Index the content */ + if (!content.empty()) { + this->indexer.index_text_without_positions(content); + } + + /* add to the database */ + this->writableDatabase.add_document(currentDocument); +} + +void XapianIndexer::flush() +{ + this->writableDatabase.commit_transaction(); + this->writableDatabase.begin_transaction(true); +} + +void XapianIndexer::indexingPostlude() +{ + this->flush(); + this->writableDatabase.commit_transaction(); + this->writableDatabase.commit(); + this->writableDatabase.compact(indexPath, Xapian::DBCOMPACT_SINGLE_FILE); + this->writableDatabase.close(); +} + +XapianMetaArticle* XapianIndexer::getMetaArticle() +{ + return new XapianMetaArticle(this); +} + +zim::size_type XapianMetaArticle::getSize() const +{ + std::ifstream in(indexer->getIndexPath(), std::ios::binary|std::ios::ate); + return in.tellg(); +} + +std::string XapianMetaArticle::getFilename() const +{ + return indexer->getIndexPath(); +} + +zim::Blob XapianMetaArticle::getData() const +{ + throw std::logic_error("We should not pass here."); + 
return zim::Blob(); +} diff --git a/src/writer/xapianIndexer.h b/src/writer/xapianIndexer.h new file mode 100644 index 0000000..3b6be49 --- /dev/null +++ b/src/writer/xapianIndexer.h @@ -0,0 +1,85 @@ +/* + * Copyright 2011 Emmanuel Engelhart + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef LIBZIM_WRITER_XAPIANINDEXER_H +#define LIBZIM_WRITER_XAPIANINDEXER_H + +#include +#include + +#include +#include +#include +#include "xapian/myhtmlparse.h" + +class XapianIndexer; + +class XapianMetaArticle : public zim::writer::Article +{ + private: + XapianIndexer* indexer; + mutable std::string data; + + public: + XapianMetaArticle(XapianIndexer* indexer) : indexer(indexer) + {} + virtual ~XapianMetaArticle() = default; + virtual zim::Blob getData() const; + virtual std::string getAid() const { return "/fulltextIndex/xapian"; } + virtual char getNamespace() const { return 'Z';} + virtual std::string getUrl() const { return "/fulltextIndex/xapian"; } + virtual std::string getTitle() const { return "Xapian Fulltext Index"; } + virtual std::string getMimeType() const { return "application/octet-stream+xapian"; } + virtual bool isRedirect() const { return false; } + virtual bool shouldIndex() const { return false; } + virtual bool shouldCompress() const { return false; } + virtual std::string 
getRedirectAid() const { return ""; } + virtual zim::size_type getSize() const; + virtual std::string getFilename() const; +}; + +class XapianIndexer +{ + public: + XapianIndexer(const std::string& language, bool verbose); + virtual ~XapianIndexer(); + std::string getIndexPath() { return indexPath; } + void indexingPrelude(const string indexPath); + void index(const zim::writer::Article* article); + void flush(); + void indexingPostlude(); + XapianMetaArticle* getMetaArticle(); + + protected: + unsigned int keywordsBoostFactor; + inline unsigned int getTitleBoostFactor(const unsigned int contentLength) + { + return contentLength / 500 + 1; + } + + Xapian::WritableDatabase writableDatabase; + Xapian::Stem stemmer; + Xapian::SimpleStopper stopper; + Xapian::TermGenerator indexer; + std::string indexPath; + std::string language; + std::string stopwords; +}; + +#endif // LIBZIM_WRITER_XAPIANINDEXER_H diff --git a/src/writer/zimcreator.cpp b/src/writer/zimcreator.cpp new file mode 100644 index 0000000..fef797d --- /dev/null +++ b/src/writer/zimcreator.cpp @@ -0,0 +1,713 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "config.h" + +#include "zimcreatordata.h" +#include "cluster.h" +#include +#include +#include "../endian_tools.h" +#include +#include + +#if defined(ENABLE_XAPIAN) + #include "xapianIndexer.h" +#endif + +#ifdef _WIN32 +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include "md5stream.h" +#include "tee.h" +#include "log.h" +#include "../fs.h" + +log_define("zim.writer.creator") + +#define INFO(e) \ + do { \ + log_info(e); \ + std::cout << e << std::endl; \ + } while(false) + +namespace +{ + class CompareTitle + { + zim::writer::ZimCreatorData::DirentsType& dirents; + + public: + explicit CompareTitle(zim::writer::ZimCreatorData::DirentsType& dirents_) + : dirents(dirents_) + { } + bool operator() (zim::article_index_t titleIdx1, zim::article_index_t titleIdx2) const + { + auto d1 = dirents[zim::article_index_type(titleIdx1)]; + auto d2 = dirents[zim::article_index_type(titleIdx2)]; + return d1.getNamespace() < d2.getNamespace() + || (d1.getNamespace() == d2.getNamespace() + && d1.getTitle() < d2.getTitle()); + } + }; +} + +namespace zim +{ + namespace writer + { + void* ZimCreator::clusterWriter(void* arg) { + auto zimCreator = static_cast(arg); + zim::writer::Cluster* clusterToWrite; + unsigned int wait = 0; + + while(true) { + std::this_thread::sleep_for(std::chrono::microseconds(wait)); + if (zimCreator->data->clustersToWrite.popFromQueue(clusterToWrite)) { + wait = 0; + clusterToWrite->dump_tmp(zimCreator->data->tmpfname); + clusterToWrite->close(); + continue; + } + wait += 10; + } + return nullptr; + } + + ZimCreator::ZimCreator(bool verbose) + : verbose(verbose) + {} + + ZimCreator::~ZimCreator() = default; + + void ZimCreator::startZimCreation(const std::string& fname) + { + 
data = std::unique_ptr(new ZimCreatorData(fname, verbose, withIndex, indexingLanguage)); + data->setMinChunkSize(minChunkSize); + + for(unsigned i=0; irunningWriters.push_back(thread); + } + } + + void ZimCreator::addArticle(const Article& article) + { + Dirent dirent = data->createDirentFromArticle(&article); + data->addDirent(dirent, &article); + data->nbArticles++; + if (article.shouldCompress()) + data->nbCompArticles++; + else + data->nbUnCompArticles++; + if (!article.getFilename().empty()) + data->nbFileArticles++; + if (article.shouldIndex()) + data->nbIndexArticles++; + + if (verbose && data->nbArticles%1000 == 0){ + std::cout << "A:" << data->nbArticles + << "; CA:" << data->nbCompArticles + << "; UA:" << data->nbUnCompArticles + << "; FA:" << data->nbFileArticles + << "; IA:" << data->nbIndexArticles + << "; C:" << data->nbClusters + << "; CC:" << data->nbCompClusters + << "; UC:" << data->nbUnCompClusters + << std::endl; + } + +#if defined(ENABLE_XAPIAN) + if(withIndex && article.shouldIndex()) { + data->indexer->index(&article); + } +#endif + } + + void ZimCreator::finishZimCreation() + { + if (verbose) { + std::cout << "A:" << data->nbArticles + << "; CA:" << data->nbCompArticles + << "; UA:" << data->nbUnCompArticles + << "; FA:" << data->nbFileArticles + << "; IA:" << data->nbIndexArticles + << "; C:" << data->nbClusters + << "; CC:" << data->nbCompClusters + << "; UC:" << data->nbUnCompClusters + << std::endl; + } + +#if defined(ENABLE_XAPIAN) + if (withIndex) { + data->indexer->indexingPostlude(); + std::this_thread::sleep_for(std::chrono::microseconds(100)); + auto article = data->indexer->getMetaArticle(); + Dirent dirent = data->createDirentFromArticle(article); + data->addDirent(dirent, article); + delete article; + } +#endif + + // When we've seen all articles, write any remaining clusters. 
+ if (data->compCluster->count()) + data->closeCluster(true); + + if (data->uncompCluster->count()) + data->closeCluster(false); + + // wait all cluster writing has been done + unsigned int wait = 0; + do { + std::this_thread::sleep_for(std::chrono::microseconds(wait)); + wait += 10; + } while(!data->clustersToWrite.isEmpty()); + + // Be sure that all cluster are closed + wait = 0; + bool closed = true; + do { + closed = true; + std::this_thread::sleep_for(std::chrono::microseconds(wait)); + wait += 10; + for(auto cluster: data->clustersList) { + if (!cluster->isClosed()) { + closed = false; + break; + } + } + } while(!closed); + +// [FIXME] pthread_cancel is not defined in android NDK. +// As we don't create zim on android platform, +// let's simply skip this code to still allow +// compilation of libzim on android. +#if !defined(__ANDROID__) + for(auto& thread: data->runningWriters) + { + pthread_cancel(thread); + } +#endif + + data->generateClustersOffsets(); + + data->removeInvalidRedirects(); + data->setArticleIndexes(); + data->resolveRedirectIndexes(); + + data->resolveMimeTypes(); + + INFO("create title index"); + data->createTitleIndex(); + INFO(data->dirents.size() << " title index created"); + INFO(data->clusterOffsets.size() << " clusters created"); + + INFO("fill header"); + Fileheader header; + fillHeader(&header); + + // sort + log_debug("sort " << dirents.size() << " directory entries (url)"); + std::sort(data->dirents.begin(), data->dirents.end(), compareUrl); + + INFO("write zimfile"); + write(header, data->basename + ".zim.tmp"); + zim::DEFAULTFS::rename(data->basename + ".zim.tmp", data->basename + ".zim"); + + INFO("ready"); + } + + void ZimCreator::fillHeader(Fileheader* header) + { + std::string mainAid = getMainPage(); + std::string layoutAid = getLayoutPage(); + + log_debug("main aid=" << mainAid << " layout aid=" << layoutAid); + + if (data->isExtended) { + header->setMajorVersion(Fileheader::zimExtendedMajorVersion); + } else { + 
header->setMajorVersion(Fileheader::zimClassicMajorVersion); + } + header->setMinorVersion(Fileheader::zimMinorVersion); + header->setMainPage(std::numeric_limits::max()); + header->setLayoutPage(std::numeric_limits::max()); + + if (!mainAid.empty() || !layoutAid.empty()) + { + for (auto& dirent: data->dirents) + { + if (mainAid == dirent.getAid()) + { + log_debug("main idx=" << dirent.getIdx()); + header->setMainPage(article_index_type(dirent.getIdx())); + } + + if (layoutAid == dirent.getAid()) + { + log_debug("layout idx=" << dirent.getIdx()); + header->setLayoutPage(article_index_type(dirent.getIdx())); + } + } + } + + header->setUuid( getUuid() ); + header->setArticleCount( data->dirents.size() ); + + offset_type offset(Fileheader::size); + header->setMimeListPos( offset ); + + offset += data->mimeListSize().v; + header->setUrlPtrPos( offset ); + + offset += data->urlPtrSize().v; + header->setTitleIdxPos( offset ); + header->setClusterCount( data->clusterOffsets.size() ); + + offset += data->titleIdxSize().v + data->indexSize().v; + header->setClusterPtrPos( offset ); + + offset += data->clusterPtrSize().v + data->clustersSize.v; + header->setChecksumPos( offset ); + } + + void ZimCreator::write(const Fileheader& header, const std::string& fname) const + { + std::ofstream zimfile(fname); + Md5stream md5; + Tee out(zimfile, md5); + + out << header; + + log_debug("after writing header - pos=" << zimfile.tellp()); + + // write mime type list + for(auto& mimeType: data->mimeTypesList) + { + out << mimeType << '\0'; + } + + out << '\0'; + + // write url ptr list + offset_t off(header.getTitleIdxPos() + data->titleIdxSize().v); + for (auto& dirent: data->dirents) + { + char tmp_buff[sizeof(offset_type)]; + toLittleEndian(off.v, tmp_buff); + out.write(tmp_buff, sizeof(offset_type)); + off += dirent.getDirentSize(); + } + + log_debug("after writing direntPtr - pos=" << out.tellp()); + + // write title index + for (auto titleid: data->titleIdx) + { + char 
tmp_buff[sizeof(article_index_type)]; + toLittleEndian(titleid.v, tmp_buff); + out.write(tmp_buff, sizeof(article_index_type)); + } + + log_debug("after writing fileIdxList - pos=" << out.tellp()); + + // write directory entries + for (auto& dirent: data->dirents) + { + out << dirent; + log_debug("write " << dirent.getTitle() << " dirent.size()=" << dirent.getDirentSize() << " pos=" << out.tellp()); + } + + log_debug("after writing dirents - pos=" << out.tellp()); + + // write cluster offset list + off += data->clusterPtrSize(); + for (auto clusterOffset : data->clusterOffsets) + { + offset_t o(off + clusterOffset); + char tmp_buff[sizeof(offset_type)]; + toLittleEndian(o.v, tmp_buff); + out.write(tmp_buff, sizeof(offset_type)); + } + + log_debug("after writing clusterOffsets - pos=" << out.tellp()); + + // write cluster data + if (!data->isEmpty) + { + for(auto& cluster: data->clustersList) + { + ASSERT(cluster->isClosed(), ==, true); + cluster->write_final(out); + } + } + else + log_warn("no data found"); + + if (!out) + throw std::runtime_error("failed to write zimfile"); + + log_debug("after writing clusterData - pos=" << out.tellp()); + unsigned char digest[16]; + md5.getDigest(digest); + zimfile.write(reinterpret_cast(digest), 16); + } + + ZimCreatorData::ZimCreatorData(const std::string& fname, + bool verbose, + bool withIndex, + std::string language) + : withIndex(withIndex), + indexingLanguage(language), + verbose(verbose) + { + basename = (fname.size() > 4 && fname.compare(fname.size() - 4, 4, ".zim") == 0) + ? fname.substr(0, fname.size() - 4) + : fname; + tmpfname = basename + ".tmp"; + if(!DEFAULTFS::makeDirectory(tmpfname)) { + throw std::runtime_error( + std::string("failed to create temporary directory ") + + tmpfname); + } + + // We keep both a "compressed cluster" and an "uncompressed cluster" + // because we don't know which one will fill up first. 
We also need + // to track the dirents currently in each, so we can fix up the + // cluster index if the other one ends up written first. + compCluster = new Cluster(compression); + uncompCluster = new Cluster(zimcompNone); + +#if defined(ENABLE_XAPIAN) + if (withIndex) { + indexer = new XapianIndexer(indexingLanguage, true); + indexer->indexingPrelude(tmpfname+".idx"); + } +#endif + } + + ZimCreatorData::~ZimCreatorData() + { + if (compCluster) + delete compCluster; + if (uncompCluster) + delete uncompCluster; + for(auto& cluster: clustersList) { + delete cluster; + } +#ifndef _WIN32 +//[TODO] Implement remove for windows + DEFAULTFS::remove(tmpfname); +#endif +#if defined(ENABLE_XAPIAN) + if (indexer) + delete indexer; +#endif + } + + void ZimCreatorData::addDirent(const Dirent& dirent, const Article* article) + { + dirents.push_back(dirent); + + // If this is a redirect, we're done: there's no blob to add. + if (dirent.isRedirect()) + { + return; + } + + // Add blob data to compressed or uncompressed cluster. + auto articleSize = article->getSize(); + if (articleSize > 0) + { + isEmpty = false; + } + + Cluster *cluster; + if (article->shouldCompress()) + { + cluster = compCluster; + } + else + { + cluster = uncompCluster; + } + + // If cluster will be too large, write it to dis, and open a new + // one for the content. 
+ if ( cluster->count() + && cluster->size().v+articleSize >= minChunkSize * 1024 + ) + { + log_info("cluster with " << cluster->count() << " articles, " << + cluster->size() << " bytes; current title \"" << + dirent.getTitle() << '\"'); + cluster = closeCluster(article->shouldCompress()); + } + + dirents.back().setCluster(cluster); + cluster->addArticle(article); + } + + Dirent ZimCreatorData::createDirentFromArticle(const Article* article) + { + Dirent dirent; + dirent.setAid(article->getAid()); + dirent.setUrl(article->getNamespace(), article->getUrl()); + dirent.setTitle(article->getTitle()); + dirent.setParameter(article->getParameter()); + + log_debug("article " << dirent.getLongUrl() << " fetched"); + + if (article->isRedirect()) + { + dirent.setRedirect(article_index_t(0)); + dirent.setRedirectAid(article->getRedirectAid()); + log_debug("is redirect to " << dirent.getRedirectAid()); + } + else if (article->isLinktarget()) + { + dirent.setLinktarget(); + } + else if (article->isDeleted()) + { + dirent.setDeleted(); + } + else + { + auto mimetype = article->getMimeType(); + if (mimetype.empty()) { + std::cerr << "Warning, " << article->getUrl() << " have empty mimetype." 
<< std::endl; + mimetype = "application/octet-stream"; + } + dirent.setMimeType(getMimeTypeIdx(mimetype)); + log_debug("is article; mimetype " << dirent.getMimeType()); + } + return dirent; + } + + Cluster* ZimCreatorData::closeCluster(bool compressed) + { + Cluster *cluster; + nbClusters++; + if (compressed ) + { + cluster = compCluster; + nbCompClusters++; + } else { + cluster = uncompCluster; + nbUnCompClusters++; + } + cluster->setClusterIndex(cluster_index_t(clustersList.size())); + clustersList.push_back(cluster); + clustersToWrite.pushToQueue(cluster); + + log_debug("cluster written"); + if (cluster->is_extended() ) + isExtended = true; + if (compressed) + { + cluster = compCluster = new Cluster(compression); + } else { + cluster = uncompCluster = new Cluster(zimcompNone); + } + return cluster; + } + + void ZimCreatorData::generateClustersOffsets() + { + clustersSize = zsize_t(0); + for(auto& cluster: clustersList) + { + clusterOffsets.push_back(offset_t(clustersSize.v)); + clustersSize += cluster->getFinalSize(); + } + } + + void ZimCreatorData::removeInvalidRedirects() + { + // sort + INFO("sort " << dirents.size() << " directory entries (aid)"); + std::sort(dirents.begin(), dirents.end(), compareAid); + + // remove invalid redirects + INFO("remove invalid redirects from " << dirents.size() << " directory entries"); + ZimCreatorData::DirentsType::size_type di = 0; + while (di < dirents.size()) + { + if (di % 10000 == 0) + INFO(di << "/" << dirents.size() << " directory entries checked for invalid redirects"); + + if (dirents[di].isRedirect()) + { + log_debug("check " << dirents[di].getTitle() << " redirect to " << dirents[di].getRedirectAid() << " (" << di << '/' << dirents.size() << ')'); + + if (!std::binary_search(dirents.begin(), dirents.end(), Dirent(dirents[di].getRedirectAid()), compareAid)) + { + INFO("remove invalid redirection " << dirents[di].getUrl() << " redirecting to (missing) " << dirents[di].getRedirectAid()); + 
dirents.erase(dirents.begin() + di); + continue; + } + } + + ++di; + } + } + + void ZimCreatorData::setArticleIndexes() + { + // sort + INFO("sort " << dirents.size() << " directory entries (url)"); + std::sort(dirents.begin(), dirents.end(), compareUrl); + + // set index + INFO("set index"); + article_index_t idx(0); + for (auto& dirent: dirents) { + dirent.setIdx(idx); + idx += 1; + } + } + + void ZimCreatorData::resolveRedirectIndexes() + { + // sort + log_debug("sort " << dirents.size() << " directory entries (aid)"); + std::sort(dirents.begin(), dirents.end(), compareAid); + + // translate redirect aid to index + INFO("translate redirect aid to index"); + for (auto& di: dirents) + { + if (di.isRedirect()) + { + auto ddi = std::lower_bound(dirents.begin(), dirents.end(), di.getRedirectAid(), compareAid); + if (ddi != dirents.end() && ddi->getAid() == di.getRedirectAid()) + { + log_debug("redirect aid=" << ddi->getAid() << " redirect index=" << ddi->getIdx()); + di.setRedirect(ddi->getIdx()); + } + else + { + std::ostringstream msg; + msg << "internal error: redirect aid " << di.getRedirectAid() << " not found"; + log_fatal(msg.str()); + throw std::runtime_error(msg.str()); + } + } + } + } + + void ZimCreatorData::createTitleIndex() + { + // Sort works on dirents sorted by url. 
+ std::sort(dirents.begin(), dirents.end(), compareUrl); + titleIdx.resize(0); + titleIdx.reserve(dirents.size()); + for (auto dirent: dirents) + titleIdx.push_back(dirent.getIdx()); + + CompareTitle compareTitle(dirents); + std::sort(titleIdx.begin(), titleIdx.end(), compareTitle); + } + + void ZimCreatorData::resolveMimeTypes() + { + std::vector oldMImeList; + std::vector mapping; + + for (auto& rmimeType: rmimeTypesMap) + { + oldMImeList.push_back(rmimeType.second); + mimeTypesList.push_back(rmimeType.second); + } + + mapping.resize(oldMImeList.size()); + std::sort(mimeTypesList.begin(), mimeTypesList.end()); + + for (unsigned i=0; i(j); + } + } + + for (auto& dirent: dirents) + { + if (dirent.isArticle()) + dirent.setMimeType(mapping[dirent.getMimeType()]); + } + } + + uint16_t ZimCreatorData::getMimeTypeIdx(const std::string& mimeType) + { + auto it = mimeTypesMap.find(mimeType); + if (it == mimeTypesMap.end()) + { + if (nextMimeIdx >= std::numeric_limits::max()) + throw std::runtime_error("too many distinct mime types"); + mimeTypesMap[mimeType] = nextMimeIdx; + rmimeTypesMap[nextMimeIdx] = mimeType; + return nextMimeIdx++; + } + + return it->second; + } + + const std::string& ZimCreatorData::getMimeType(uint16_t mimeTypeIdx) const + { + auto it = rmimeTypesMap.find(mimeTypeIdx); + if (it == rmimeTypesMap.end()) + throw std::runtime_error("mime type index not found"); + return it->second; + } + + zsize_t ZimCreatorData::mimeListSize() const + { + size_type ret = 1; + for (auto& rmimeType: rmimeTypesMap) + ret += (rmimeType.second.size() + 1); + return zsize_t(ret); + } + + zsize_t ZimCreatorData::indexSize() const + { + size_type s = 0; + + for (auto& dirent: dirents) + s += dirent.getDirentSize(); + + return zsize_t(s); + } + + } +} diff --git a/src/writer/zimcreatordata.h b/src/writer/zimcreatordata.h new file mode 100644 index 0000000..538c6d7 --- /dev/null +++ b/src/writer/zimcreatordata.h @@ -0,0 +1,140 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + 
* This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_ZIMCREATOR_DATA_H +#define ZIM_WRITER_ZIMCREATOR_DATA_H + +#include +#include +#include "queue.h" +#include "_dirent.h" +#include "xapianIndexer.h" +#include +#include +#include +#include "config.h" + +#if defined(ENABLE_XAPIAN) + class XapianIndexer; +#endif + +namespace zim +{ + namespace writer + { + class Cluster; + class ZimCreatorData + { + public: + typedef std::vector DirentsType; + typedef std::vector ArticleIdxVectorType; + typedef std::vector OffsetsType; + typedef std::map MimeTypesMap; + typedef std::map RMimeTypesMap; + typedef std::vector MimeTypesList; + typedef std::vector ClusterList; + typedef Queue ClusterQueue; + typedef std::vector ThreadList; + + ZimCreatorData(const std::string& fname, bool verbose, + bool withIndex, std::string language); + virtual ~ZimCreatorData(); + + void addDirent(const Dirent& dirent, const Article* article); + Dirent createDirentFromArticle(const Article* article); + Cluster* closeCluster(bool compressed); + + void generateClustersOffsets(); + void removeInvalidRedirects(); + void setArticleIndexes(); + void resolveRedirectIndexes(); + void createTitleIndex(); + void resolveMimeTypes(); + + uint16_t getMimeTypeIdx(const std::string& 
mimeType); + const std::string& getMimeType(uint16_t mimeTypeIdx) const; + + size_t minChunkSize = 1024-64; + + DirentsType dirents; + ArticleIdxVectorType titleIdx; + OffsetsType clusterOffsets; + + MimeTypesMap mimeTypesMap; + RMimeTypesMap rmimeTypesMap; + MimeTypesList mimeTypesList; + uint16_t nextMimeIdx = 0; + + ClusterList clustersList; + ClusterQueue clustersToWrite; + ThreadList runningWriters; + CompressionType compression = zimcompLzma; + std::string basename; + bool isEmpty = true; + bool isExtended = false; + zsize_t clustersSize; + Cluster *compCluster = nullptr; + Cluster *uncompCluster = nullptr; + std::string tmpfname; + + bool withIndex; + std::string indexingLanguage; +#if defined(ENABLE_XAPIAN) + XapianIndexer* indexer = nullptr; +#endif + + // Some stats + bool verbose; + article_index_type nbArticles; + article_index_type nbCompArticles; + article_index_type nbUnCompArticles; + article_index_type nbFileArticles; + article_index_type nbIndexArticles; + cluster_index_type nbClusters; + cluster_index_type nbCompClusters; + cluster_index_type nbUnCompClusters; + + cluster_index_t clusterCount() const + { return cluster_index_t(clusterOffsets.size()); } + + article_index_t articleCount() const + { return article_index_t(dirents.size()); } + + zsize_t mimeListSize() const; + + zsize_t urlPtrSize() const + { return zsize_t(article_index_type(articleCount()) * sizeof(offset_type)); } + + zsize_t titleIdxSize() const + { return zsize_t(article_index_type(articleCount()) * sizeof(article_index_type)); } + + zsize_t indexSize() const; + + zsize_t clusterPtrSize() const + { return zsize_t(cluster_index_type(clusterCount()) * sizeof(offset_type)); } + + size_t getMinChunkSize() { return minChunkSize; } + void setMinChunkSize(size_t s) { minChunkSize = s; } + }; + + } + +} + +#endif // ZIM_WRITER_ZIMCREATOR_DATA_H diff --git a/src/xapian/htmlparse.cc b/src/xapian/htmlparse.cc new file mode 100644 index 0000000..a6515c7 --- /dev/null +++ 
b/src/xapian/htmlparse.cc @@ -0,0 +1,373 @@ +/* htmlparse.cc: simple HTML parser for omega indexer + * + * Copyright 1999,2000,2001 BrightStation PLC + * Copyright 2001 Ananova Ltd + * Copyright 2002,2006,2007,2008 Olly Betts + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +// #include + +#include "htmlparse.h" + +#include + +// #include "utf8convert.h" + +#include + +#include +#include +#include +#include + +using namespace std; + +inline void +lowercase_string(string &str) +{ + for (string::iterator i = str.begin(); i != str.end(); ++i) { + *i = tolower(static_cast(*i)); + } +} + +map zim::HtmlParser::named_ents; + +inline static bool +p_notdigit(char c) +{ + return !isdigit(static_cast(c)); +} + +inline static bool +p_notxdigit(char c) +{ + return !isxdigit(static_cast(c)); +} + +inline static bool +p_notalnum(char c) +{ + return !isalnum(static_cast(c)); +} + +inline static bool +p_notwhitespace(char c) +{ + return !isspace(static_cast(c)); +} + +inline static bool +p_nottag(char c) +{ + return !isalnum(static_cast(c)) && + c != '.' && c != '-' && c != ':'; // ':' for XML namespaces. 
+} + +inline static bool +p_whitespacegt(char c) +{ + return isspace(static_cast(c)) || c == '>'; +} + +inline static bool +p_whitespaceeqgt(char c) +{ + return isspace(static_cast(c)) || c == '=' || c == '>'; +} + +bool +zim::HtmlParser::get_parameter(const string & param, string & value) +{ + map::const_iterator i = parameters.find(param); + if (i == parameters.end()) return false; + value = i->second; + return true; +} + +zim::HtmlParser::HtmlParser() +{ + static const struct ent { const char *n; unsigned int v; } ents[] = { +#include "namedentities.h" + { NULL, 0 } + }; + if (named_ents.empty()) { + const struct ent *i = ents; + while (i->n) { + named_ents[string(i->n)] = i->v; + ++i; + } + } +} + +void +zim::HtmlParser::decode_entities(string &s) +{ + // We need a const_iterator version of s.end() - otherwise the + // find() and find_if() templates don't work... + string::const_iterator amp = s.begin(), s_end = s.end(); + while ((amp = find(amp, s_end, '&')) != s_end) { + unsigned int val = 0; + string::const_iterator end, p = amp + 1; + if (p != s_end && *p == '#') { + p++; + if (p != s_end && (*p == 'x' || *p == 'X')) { + // hex + p++; + end = find_if(p, s_end, p_notxdigit); + sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val); + } else { + // number + end = find_if(p, s_end, p_notdigit); + val = atoi(s.substr(p - s.begin(), end - p).c_str()); + } + } else { + end = find_if(p, s_end, p_notalnum); + string code = s.substr(p - s.begin(), end - p); + map::const_iterator i; + i = named_ents.find(code); + if (i != named_ents.end()) val = i->second; + } + if (end < s_end && *end == ';') end++; + if (val) { + string::size_type amp_pos = amp - s.begin(); + if (val < 0x80) { + s.replace(amp_pos, end - amp, 1u, char(val)); + } else { + // Convert unicode value val to UTF-8. 
+ char seq[4]; + unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq); + s.replace(amp_pos, end - amp, seq, len); + } + s_end = s.end(); + // We've modified the string, so the iterators are no longer + // valid... + amp = s.begin() + amp_pos + 1; + } else { + amp = end; + } + } +} + +void +zim::HtmlParser::parse_html(const string &body) +{ + in_script = false; + + parameters.clear(); + string::const_iterator start = body.begin(); + + while (true) { + // Skip through until we find an HTML tag, a comment, or the end of + // document. Ignore isolated occurrences of `<' which don't start + // a tag or comment. + string::const_iterator p = start; + while (true) { + p = find(p, body.end(), '<'); + if (p == body.end()) break; + unsigned char ch = *(p + 1); + + // Tag, closing tag, or comment (or SGML declaration). + if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break; + + if (ch == '?') { + // PHP code or XML declaration. + // XML declaration is only valid at the start of the first line. + // FIXME: need to deal with BOMs... + if (p != body.begin() || body.size() < 20) break; + + // XML declaration looks something like this: + // + if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break; + if (strchr(" \t\r\n", p[5]) == NULL) break; + + string::const_iterator decl_end = find(p + 6, body.end(), '?'); + if (decl_end == body.end()) break; + + // Default charset for XML is UTF-8. 
+ charset = "UTF-8"; + + string decl(p + 6, decl_end); + size_t enc = decl.find("encoding"); + if (enc == string::npos) break; + + enc = decl.find_first_not_of(" \t\r\n", enc + 8); + if (enc == string::npos || enc == decl.size()) break; + + if (decl[enc] != '=') break; + + enc = decl.find_first_not_of(" \t\r\n", enc + 1); + if (enc == string::npos || enc == decl.size()) break; + + if (decl[enc] != '"' && decl[enc] != '\'') break; + + char quote = decl[enc++]; + size_t enc_end = decl.find(quote, enc); + + if (enc != string::npos) + charset = decl.substr(enc, enc_end - enc); + + break; + } + p++; + } + + // Process text up to start of tag. + if (p > start) { + string text = body.substr(start - body.begin(), p - start); + // convert_to_utf8(text, charset); + decode_entities(text); + process_text(text); + } + + if (p == body.end()) break; + + start = p + 1; + + if (start == body.end()) break; + + if (*start == '!') { + if (++start == body.end()) break; + if (++start == body.end()) break; + // comment or SGML declaration + if (*(start - 1) == '-' && *start == '-') { + ++start; + string::const_iterator close = find(start, body.end(), '>'); + // An unterminated comment swallows rest of document + // (like Netscape, but unlike MSIE IIRC) + if (close == body.end()) break; + + p = close; + // look for --> + while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-')) + p = find(p + 1, body.end(), '>'); + + if (p != body.end()) { + // Check for htdig's "ignore this bit" comments. + if (p - start == 15 && string(start, p - 2) == "htdig_noindex") { + string::size_type i; + i = body.find("", p + 1 - body.begin()); + if (i == string::npos) break; + start = body.begin() + i + 21; + continue; + } + // If we found --> skip to there. + start = p; + } else { + // Otherwise skip to the first > we found (as Netscape does). 
+ start = close; + } + } else { + // just an SGML declaration, perhaps giving the DTD - ignore it + start = find(start - 1, body.end(), '>'); + if (start == body.end()) break; + } + ++start; + } else if (*start == '?') { + if (++start == body.end()) break; + // PHP - swallow until ?> or EOF + start = find(start + 1, body.end(), '>'); + + // look for ?> + while (start != body.end() && *(start - 1) != '?') + start = find(start + 1, body.end(), '>'); + + // unterminated PHP swallows rest of document (rather arbitrarily + // but it avoids polluting the database when things go wrong) + if (start != body.end()) ++start; + } else { + // opening or closing tag + int closing = 0; + + if (*start == '/') { + closing = 1; + start = find_if(start + 1, body.end(), p_notwhitespace); + } + + p = start; + start = find_if(start, body.end(), p_nottag); + string tag = body.substr(p - body.begin(), start - p); + // convert tagname to lowercase + lowercase_string(tag); + + if (closing) { + closing_tag(tag); + if (in_script && tag == "script") in_script = false; + + /* ignore any bogus parameters on closing tags */ + p = find(start, body.end(), '>'); + if (p == body.end()) break; + start = p + 1; + } else { + // FIXME: parse parameters lazily. 
+ while (start < body.end() && *start != '>') { + string name, value; + + p = find_if(start, body.end(), p_whitespaceeqgt); + + name.assign(body, start - body.begin(), p - start); + + p = find_if(p, body.end(), p_notwhitespace); + + start = p; + if (start != body.end() && *start == '=') { + start = find_if(start + 1, body.end(), p_notwhitespace); + + p = body.end(); + + int quote = *start; + if (quote == '"' || quote == '\'') { + start++; + p = find(start, body.end(), quote); + } + + if (p == body.end()) { + // unquoted or no closing quote + p = find_if(start, body.end(), p_whitespacegt); + } + value.assign(body, start - body.begin(), p - start); + start = find_if(p, body.end(), p_notwhitespace); + + if (!name.empty()) { + // convert parameter name to lowercase + lowercase_string(name); + // in case of multiple entries, use the first + // (as Netscape does) + parameters.insert(make_pair(name, value)); + } + } + } +#if 0 + cout << "<" << tag; + map::const_iterator x; + for (x = parameters.begin(); x != parameters.end(); x++) { + cout << " " << x->first << "=\"" << x->second << "\""; + } + cout << ">\n"; +#endif + opening_tag(tag); + parameters.clear(); + + // In