use std::env;
use std::fmt;
use std::fs::{self, File};
+use std::mem;
use std::path::{Path, PathBuf};
+use std::process::Command;
+use curl::easy::{Easy, List};
use git2::{self, ObjectType};
use serde::ser::{self, Serialize};
use url::Url;
pub fn checkout(&self, into: &Path, cargo_config: &Config) -> CargoResult<GitDatabase> {
let repo = match git2::Repository::open(into) {
- Ok(repo) => {
- self.fetch_into(&repo, cargo_config).chain_err(|| {
+ Ok(mut repo) => {
+ self.fetch_into(&mut repo, cargo_config).chain_err(|| {
format!("failed to fetch into {}", into.display())
})?;
repo
})
}
- fn fetch_into(&self, dst: &git2::Repository, cargo_config: &Config) -> CargoResult<()> {
+ fn fetch_into(&self, dst: &mut git2::Repository, cargo_config: &Config) -> CargoResult<()> {
// Create a local anonymous remote in the repository to fetch the url
- let url = self.url.to_string();
let refspec = "refs/heads/*:refs/heads/*";
- fetch(dst, &url, refspec, cargo_config)
+ fetch(dst, &self.url, refspec, cargo_config)
}
fn clone_into(&self, dst: &Path, cargo_config: &Config) -> CargoResult<git2::Repository> {
- let url = self.url.to_string();
if fs::metadata(&dst).is_ok() {
fs::remove_dir_all(dst)?;
}
fs::create_dir_all(dst)?;
- let repo = git2::Repository::init_bare(dst)?;
- fetch(&repo, &url, "refs/heads/*:refs/heads/*", cargo_config)?;
+ let mut repo = git2::Repository::init_bare(dst)?;
+ fetch(&mut repo, &self.url, "refs/heads/*:refs/heads/*", cargo_config)?;
Ok(repo)
}
}
-> CargoResult<GitCheckout> {
let checkout = match git2::Repository::open(dest) {
Ok(repo) => {
- let checkout = GitCheckout::new(dest, self, rev, repo);
+ let mut checkout = GitCheckout::new(dest, self, rev, repo);
if !checkout.is_fresh() {
checkout.fetch(cargo_config)?;
checkout.reset()?;
let url = source.to_url()?;
let url = url.to_string();
- let repo = git2::Repository::clone(&url, into)
+ let repo = git2::Repository::clone(&url, into)
.chain_err(|| {
internal(format!("failed to clone {} into {}", source.display(),
into.display()))
}
}
- fn fetch(&self, cargo_config: &Config) -> CargoResult<()> {
+ fn fetch(&mut self, cargo_config: &Config) -> CargoResult<()> {
info!("fetch {}", self.repo.path().display());
let url = self.database.path.to_url()?;
- let url = url.to_string();
let refspec = "refs/heads/*:refs/heads/*";
- fetch(&self.repo, &url, refspec, cargo_config)?;
+ fetch(&mut self.repo, &url, refspec, cargo_config)?;
Ok(())
}
let target = repo.head()?.target();
Ok((target, repo))
});
- let repo = match head_and_repo {
+ let mut repo = match head_and_repo {
Ok((head, repo)) => {
if child.head_id() == head {
return Ok(())
// Fetch data from origin and reset to the head commit
let refspec = "refs/heads/*:refs/heads/*";
- fetch(&repo, url, refspec, cargo_config).chain_err(|| {
+ let url = url.to_url()?;
+ fetch(&mut repo, &url, refspec, cargo_config).chain_err(|| {
internal(format!("failed to fetch submodule `{}` from {}",
child.name().unwrap_or(""), url))
})?;
})
}
-pub fn fetch(repo: &git2::Repository,
- url: &str,
+pub fn fetch(repo: &mut git2::Repository,
+ url: &Url,
refspec: &str,
config: &Config) -> CargoResult<()> {
if !config.network_allowed() {
was specified")
}
- with_authentication(url, &repo.config()?, |f| {
+ // If we're fetching from github, attempt github's special fast path for
+ // testing if we've already got an up-to-date copy of the repository
+ if url.host_str() == Some("github.com") {
+ if let Ok(oid) = repo.refname_to_id("refs/remotes/origin/master") {
+ let mut handle = config.http()?.borrow_mut();
+ debug!("attempting github fast path for {}", url);
+ if github_up_to_date(&mut handle, url, &oid) {
+ return Ok(())
+ } else {
+ debug!("fast path failed, falling back to a git fetch");
+ }
+ }
+ }
+
+ // We reuse repositories quite a lot, so before we go through and update the
+ // repo, check to see if it's a little too old and could benefit from a gc.
+ // In theory this shouldn't be too expensive compared to the network
+ // request we're about to issue.
+ maybe_gc_repo(repo)?;
+
+ debug!("doing a fetch for {}", url);
+ with_authentication(url.as_str(), &repo.config()?, |f| {
let mut cb = git2::RemoteCallbacks::new();
cb.credentials(f);
// Create a local anonymous remote in the repository to fetch the url
- let mut remote = repo.remote_anonymous(url)?;
+ let mut remote = repo.remote_anonymous(url.as_str())?;
let mut opts = git2::FetchOptions::new();
opts.remote_callbacks(cb)
.download_tags(git2::AutotagOption::All);
network::with_retry(config, || {
+ debug!("initiating fetch of {} from {}", refspec, url);
remote.fetch(&[refspec], Some(&mut opts), None)
.map_err(CargoError::from)
})?;
Ok(())
})
}
+
+/// Cargo has a bunch of long-lived git repositories in its global cache and
+/// some, like the index, are updated very frequently. Right now each update
+/// creates a new "pack file" inside the git database, and over time this can
+/// cause bad performance and bad behavior in libgit2.
+///
+/// One pathological use case today is where libgit2 opens hundreds of file
+/// descriptors, getting us dangerously close to blowing out the OS limits of
+/// how many fds we can have open. This is detailed in #4403.
+///
+/// To combat this problem we attempt a `git gc` here. Note, though, that
+/// `git` may not even be installed on the system! As a result we only
+/// opportunistically try a `git gc` when the pack directory looks too big, and
+/// failing that we just blow away the repository and start over.
+fn maybe_gc_repo(repo: &mut git2::Repository) -> CargoResult<()> {
+ // Here we arbitrarily declare that if you have more than 100 files in your
+ // `pack` folder then we need to do a gc.
+ let entries = match repo.path().join("objects/pack").read_dir() {
+ Ok(e) => e.count(),
+ Err(_) => {
+ debug!("skipping gc as pack dir appears gone");
+ return Ok(())
+ }
+ };
+ let max = env::var("__CARGO_PACKFILE_LIMIT").ok()
+ .and_then(|s| s.parse::<usize>().ok())
+ .unwrap_or(100);
+ if entries < max {
+ debug!("skipping gc as there are only {} pack files", entries);
+ return Ok(())
+ }
+
+ // First up, try a literal `git gc` by shelling out to git. This is pretty
+ // likely to fail though as we may not have `git` installed. Note that
+ // libgit2 doesn't currently implement the gc operation, so there's no
+ // equivalent there.
+ match Command::new("git").arg("gc").current_dir(repo.path()).output() {
+ Ok(out) => {
+ debug!("git-gc status: {}\n\nstdout ---\n{}\nstderr ---\n{}",
+ out.status,
+ String::from_utf8_lossy(&out.stdout),
+ String::from_utf8_lossy(&out.stderr));
+ if out.status.success() {
+ let new = git2::Repository::open(repo.path())?;
+ mem::replace(repo, new);
+ return Ok(())
+ }
+ }
+ Err(e) => debug!("git-gc failed to spawn: {}", e),
+ }
+
+ // Alright, all else failed, so let's start over.
+ //
+ // Here we want to drop the current repository object pointed to by `repo`,
+ // so we initialize a temporary repository in a sub-folder, blow away the
+ // existing git folder, and then recreate the git repo. Finally we blow away
+ // the `tmp` folder we allocated.
+ let path = repo.path().to_path_buf();
+ let tmp = path.join("tmp");
+ mem::replace(repo, git2::Repository::init(&tmp)?);
+ for entry in path.read_dir()? {
+ let entry = entry?;
+ if entry.file_name().to_str() == Some("tmp") {
+ continue
+ }
+ let path = entry.path();
+ drop(fs::remove_file(&path).or_else(|_| fs::remove_dir_all(&path)));
+ }
+ if repo.is_bare() {
+ mem::replace(repo, git2::Repository::init_bare(path)?);
+ } else {
+ mem::replace(repo, git2::Repository::init(path)?);
+ }
+ fs::remove_dir_all(&tmp).chain_err(|| {
+ format!("failed to remove {:?}", tmp)
+ })?;
+ Ok(())
+}
+
+/// Updating the index is done pretty regularly so we want it to be as fast as
+/// possible. For registries hosted on github (like the crates.io index) there's
+/// a fast path available [1] to tell us when there are no updates to be
+/// made.
+///
+/// This function will attempt to hit that fast path and verify that the `oid`
+/// is actually the current head of the `master` branch of the repository. If `true` is
+/// returned then no update needs to be performed, but if `false` is returned
+/// then the standard update logic still needs to happen.
+///
+/// [1]: https://developer.github.com/v3/repos/commits/#get-the-sha-1-of-a-commit-reference
+///
+/// Note that this function should never cause an actual failure because it's
+/// just a fast path. As a result all errors are ignored in this function and we
+/// just return a `bool`. Any real errors will be reported through the normal
+/// update path above.
+fn github_up_to_date(handle: &mut Easy, url: &Url, oid: &git2::Oid) -> bool {
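+ // A local `try!` that bails out of this fast path with `false` on `None`
+ // rather than propagating an error.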
+ macro_rules! try {
+ ($e:expr) => (match $e {
+ Some(e) => e,
+ None => return false,
+ })
+ }
+
+ // This expects github urls in the form `github.com/user/repo` and nothing
+ // else
+ let mut pieces = try!(url.path_segments());
+ let username = try!(pieces.next());
+ let repo = try!(pieces.next());
+ if pieces.next().is_some() {
+ return false
+ }
+
+ let url = format!("https://api.github.com/repos/{}/{}/commits/master",
+ username, repo);
+ try!(handle.get(true).ok());
+ try!(handle.url(&url).ok());
+ try!(handle.useragent("cargo").ok());
+ let mut headers = List::new();
+ try!(headers.append("Accept: application/vnd.github.3.sha").ok());
+ try!(headers.append(&format!("If-None-Match: \"{}\"", oid)).ok());
+ try!(handle.http_headers(headers).ok());
+ try!(handle.perform().ok());
+
+ try!(handle.response_code().ok()) == 304
+}
use std::mem;
use std::path::Path;
-use curl::easy::{Easy, List};
use git2;
use hex::ToHex;
use serde_json;
-use url::Url;
use core::{PackageId, SourceId};
use ops;
cache_path: Filesystem,
source_id: SourceId,
config: &'cfg Config,
- handle: LazyCell<RefCell<Easy>>,
tree: RefCell<Option<git2::Tree<'static>>>,
repo: LazyCell<git2::Repository>,
head: Cell<Option<git2::Oid>>,
source_id: source_id.clone(),
config: config,
tree: RefCell::new(None),
- handle: LazyCell::new(),
repo: LazyCell::new(),
head: Cell::new(None),
}
}
- fn easy(&self) -> CargoResult<&RefCell<Easy>> {
- self.handle.get_or_try_init(|| {
- ops::http_handle(self.config).map(RefCell::new)
- })
- }
-
fn repo(&self) -> CargoResult<&git2::Repository> {
self.repo.get_or_try_init(|| {
let path = self.index_path.clone().into_path_unlocked();
// hit the index, which may not actually read this configuration.
ops::http_handle(self.config)?;
- let repo = self.repo()?;
+ self.repo()?;
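+ // Drop any cached head/tree before fetching; they may point at the old
+ // commit of the index.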
+ self.head.set(None);
+ *self.tree.borrow_mut() = None;
let _lock = self.index_path.open_rw(Path::new(INDEX_LOCK),
self.config,
"the registry index")?;
self.config.shell().status("Updating",
format!("registry `{}`", self.source_id.url()))?;
- let mut needs_fetch = true;
-
- if self.source_id.url().host_str() == Some("github.com") {
- if let Ok(oid) = self.head() {
- let mut handle = self.easy()?.borrow_mut();
- debug!("attempting github fast path for {}",
- self.source_id.url());
- if github_up_to_date(&mut handle, self.source_id.url(), &oid) {
- needs_fetch = false;
- } else {
- debug!("fast path failed, falling back to a git fetch");
- }
- }
- }
-
- if needs_fetch {
- // git fetch origin master
- let url = self.source_id.url().to_string();
- let refspec = "refs/heads/master:refs/remotes/origin/master";
- git::fetch(&repo, &url, refspec, self.config).chain_err(|| {
- format!("failed to fetch `{}`", url)
- })?;
- }
- self.head.set(None);
- *self.tree.borrow_mut() = None;
+ // git fetch origin master
+ let url = self.source_id.url();
+ let refspec = "refs/heads/master:refs/remotes/origin/master";
+ let repo = self.repo.borrow_mut().unwrap();
+ git::fetch(repo, url, refspec, self.config).chain_err(|| {
+ format!("failed to fetch `{}`", url)
+ })?;
Ok(())
}
// download we should resume either from the start or the middle
// on the next time
let url = url.to_string();
- let mut handle = self.easy()?.borrow_mut();
+ let mut handle = self.config.http()?.borrow_mut();
handle.get(true)?;
handle.url(&url)?;
handle.follow_location(true)?;
self.tree.borrow_mut().take();
}
}
-
-/// Updating the index is done pretty regularly so we want it to be as fast as
-/// possible. For registries hosted on github (like the crates.io index) there's
-/// a fast path available to use [1] to tell us that there's no updates to be
-/// made.
-///
-/// This function will attempt to hit that fast path and verify that the `oid`
-/// is actually the current `master` branch of the repository. If `true` is
-/// returned then no update needs to be performed, but if `false` is returned
-/// then the standard update logic still needs to happen.
-///
-/// [1]: https://developer.github.com/v3/repos/commits/#get-the-sha-1-of-a-commit-reference
-///
-/// Note that this function should never cause an actual failure because it's
-/// just a fast path. As a result all errors are ignored in this function and we
-/// just return a `bool`. Any real errors will be reported through the normal
-/// update path above.
-fn github_up_to_date(handle: &mut Easy, url: &Url, oid: &git2::Oid) -> bool {
- macro_rules! try {
- ($e:expr) => (match $e {
- Some(e) => e,
- None => return false,
- })
- }
-
- // This expects github urls in the form `github.com/user/repo` and nothing
- // else
- let mut pieces = try!(url.path_segments());
- let username = try!(pieces.next());
- let repo = try!(pieces.next());
- if pieces.next().is_some() {
- return false
- }
-
- let url = format!("https://api.github.com/repos/{}/{}/commits/master",
- username, repo);
- try!(handle.get(true).ok());
- try!(handle.url(&url).ok());
- try!(handle.useragent("cargo").ok());
- let mut headers = List::new();
- try!(headers.append("Accept: application/vnd.github.3.sha").ok());
- try!(headers.append(&format!("If-None-Match: \"{}\"", oid)).ok());
- try!(handle.http_headers(headers).ok());
- try!(handle.perform().ok());
-
- try!(handle.response_code().ok()) == 304
-}
--- /dev/null
+extern crate cargotest;
+extern crate git2;
+extern crate hamcrest;
+extern crate url;
+
+use std::env;
+use std::ffi::OsStr;
+use std::path::PathBuf;
+use std::process::Command;
+
+use cargotest::support::{execs, project};
+use cargotest::support::registry::Package;
+use cargotest::support::paths;
+use cargotest::support::git;
+use hamcrest::assert_that;
+
+use url::Url;
+
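+// Return the path of the registry index checkout under the test home's
+// `.cargo/registry/index` directory.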
+fn find_index() -> PathBuf {
+ let dir = paths::home().join(".cargo/registry/index");
+ dir.read_dir().unwrap().next().unwrap().unwrap().path()
+}
+
+fn run_test(path_env: Option<&OsStr>) {
+ const N: usize = 50;
+
+ let foo = project("foo")
+ .file("Cargo.toml", r#"
+ [package]
+ name = "foo"
+ version = "0.0.1"
+ authors = []
+
+ [dependencies]
+ bar = "*"
+ "#)
+ .file("src/lib.rs", "");
+ Package::new("bar", "0.1.0").publish();
+
+ assert_that(foo.cargo_process("build"),
+ execs().with_status(0));
+
+ let index = find_index();
+ let path = paths::home().join("tmp");
+ let url = Url::from_file_path(&path).unwrap().to_string();
+ let repo = git2::Repository::init(&path).unwrap();
+ let index = git2::Repository::open(&index).unwrap();
+ let mut cfg = repo.config().unwrap();
+ cfg.set_str("user.email", "foo@bar.com").unwrap();
+ cfg.set_str("user.name", "Foo Bar").unwrap();
+ let mut cfg = index.config().unwrap();
+ cfg.set_str("user.email", "foo@bar.com").unwrap();
+ cfg.set_str("user.name", "Foo Bar").unwrap();
+
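+ // Churn the index: each iteration commits to the scratch repo and fetches
+ // it into the registry index, leaving behind a new packfile every time.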
+ for _ in 0..N {
+ git::commit(&repo);
+ index.remote_anonymous(&url).unwrap()
+ .fetch(&["refs/heads/master:refs/remotes/foo/master"],
+ None,
+ None).unwrap();
+ }
+ drop((repo, index));
+ Package::new("bar", "0.1.1").publish();
+
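+ // Every fetch above should have left a packfile behind, so the index's
+ // pack directory now contains more than N entries.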
+ let before = find_index().join(".git/objects/pack")
+ .read_dir().unwrap()
+ .count();
+ assert!(before > N);
+
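+ // Lower the packfile limit so `cargo update` is forced to gc the index,
+ // then check that the pack directory actually shrank.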
+ let mut cmd = foo.cargo("update");
+ cmd.env("__CARGO_PACKFILE_LIMIT", "10");
+ if let Some(path) = path_env {
+ cmd.env("PATH", path);
+ }
+ cmd.env("RUST_LOG", "trace");
+ assert_that(cmd, execs().with_status(0));
+ let after = find_index().join(".git/objects/pack")
+ .read_dir().unwrap()
+ .count();
+ assert!(after < before,
+ "packfiles before: {}\n\
+ packfiles after: {}", before, after);
+}
+
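+// Exercise the `git gc` path; silently skip when no `git` binary is available.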
+#[test]
+fn use_git_gc() {
+ if Command::new("git").arg("--version").output().is_err() {
+ return
+ }
+ run_test(None);
+}
+
+#[test]
+// It looks like this test passes on some Windows machines but not others,
+// notably not on AppVeyor's machines. Sounds like another bug for another day.
+#[cfg_attr(windows, ignore)]
+fn avoid_using_git() {
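+ // Remove every PATH entry containing a `git` binary so Cargo can't shell
+ // out to `git gc` and must fall back to recreating the repository.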
+ let path = env::var_os("PATH").unwrap_or(Default::default());
+ let mut paths = env::split_paths(&path).collect::<Vec<_>>();
+ let idx = paths.iter().position(|p| {
+ p.join("git").exists() || p.join("git.exe").exists()
+ });
+ match idx {
+ Some(i) => { paths.remove(i); }
+ None => return,
+ }
+ run_test(Some(&env::join_paths(&paths).unwrap()));
+}