diff --git a/src/applications/diffusion/query/DiffusionCommitQuery.php b/src/applications/diffusion/query/DiffusionCommitQuery.php index 05c908e838..cb5ae37c41 100644 --- a/src/applications/diffusion/query/DiffusionCommitQuery.php +++ b/src/applications/diffusion/query/DiffusionCommitQuery.php @@ -1,190 +1,193 @@ identifiers = $identifiers; return $this; } /** * If a default repository is provided, ambiguous commit identifiers will * be assumed to belong to the default repository. * * For example, "r123" appearing in a commit message in repository X is * likely to be unambiguously "rX123". Normally the reference would be * considered ambiguous, but if you provide a default repository it will * be correctly resolved. */ public function withDefaultRepository(PhabricatorRepository $repository) { $this->defaultRepository = $repository; return $this; } public function withIDs(array $ids) { $this->ids = $ids; return $this; } public function withPHIDs(array $phids) { $this->phids = $phids; return $this; } protected function loadPage() { $table = new PhabricatorRepositoryCommit(); $conn_r = $table->establishConnection('r'); $data = queryfx_all( $conn_r, 'SELECT * FROM %T %Q %Q %Q', $table->getTableName(), $this->buildWhereClause($conn_r), $this->buildOrderClause($conn_r), $this->buildLimitClause($conn_r)); return $table->loadAllFromArray($data); } public function willFilterPage(array $commits) { if (!$commits) { return array(); } $repository_ids = mpull($commits, 'getRepositoryID', 'getRepositoryID'); $repos = id(new PhabricatorRepositoryQuery()) ->setViewer($this->getViewer()) ->withIDs($repository_ids) ->execute(); foreach ($commits as $key => $commit) { $repo = idx($repos, $commit->getRepositoryID()); if ($repo) { $commit->attachRepository($repo); } else { unset($commits[$key]); } } return $commits; } private function buildWhereClause(AphrontDatabaseConnection $conn_r) { $where = array(); if ($this->identifiers) { $min_unqualified = PhabricatorRepository::MINIMUM_UNQUALIFIED_HASH; $min_qualified = PhabricatorRepository::MINIMUM_QUALIFIED_HASH; $refs = array(); $bare = array(); foreach ($this->identifiers as $identifier) { $matches = null; preg_match('/^(?:r([A-Z]+))?(.*)$/', $identifier, $matches); $repo = nonempty($matches[1], null); $identifier = nonempty($matches[2], null); if ($repo === null) { if ($this->defaultRepository) { $repo = $this->defaultRepository->getCallsign(); } } if ($repo === null) { if (strlen($identifier) < $min_unqualified) { continue; } $bare[] = $identifier; } else { $refs[] = array( 'callsign' => $repo, 'identifier' => $identifier, ); } } $sql = array(); foreach ($bare as $identifier) { $sql[] = qsprintf( $conn_r, '(commitIdentifier LIKE %> AND LENGTH(commitIdentifier) = 40)', $identifier); } if ($refs) { $callsigns = ipull($refs, 'callsign'); $repos = id(new PhabricatorRepositoryQuery()) ->setViewer($this->getViewer()) ->withCallsigns($callsigns) ->execute(); $repos = mpull($repos, null, 'getCallsign'); foreach ($refs as $key => $ref) { $repo = idx($repos, $ref['callsign']); if (!$repo) { continue; } if ($repo->isSVN()) { if (!ctype_digit($ref['identifier'])) { continue; } $sql[] = qsprintf( $conn_r, - '(repositoryID = %d AND commitIdentifier = %d)', + '(repositoryID = %d AND commitIdentifier = %s)', $repo->getID(), - $ref['identifier']); + // NOTE: Because the 'commitIdentifier' column is a string, MySQL + // ignores the index if we hand it an integer. Hand it a string. + // See T3377. + (int)$ref['identifier']); } else { if (strlen($ref['identifier']) < $min_qualified) { continue; } $sql[] = qsprintf( $conn_r, '(repositoryID = %d AND commitIdentifier LIKE %>)', $repo->getID(), $ref['identifier']); } } } if (!$sql) { // If we discarded all possible identifiers (e.g., they all referenced // bogus repositories or were all too short), make sure the query finds // nothing. throw new PhabricatorEmptyQueryException('No commit identifiers.'); } $where[] = '('.implode(' OR ', $sql).')'; } if ($this->ids) { $where[] = qsprintf( $conn_r, 'id IN (%Ld)', $this->ids); } if ($this->phids) { $where[] = qsprintf( $conn_r, 'phid IN (%Ls)', $this->phids); } return $this->formatWhereClause($where); } } diff --git a/src/applications/repository/daemon/PhabricatorRepositoryPullLocalDaemon.php b/src/applications/repository/daemon/PhabricatorRepositoryPullLocalDaemon.php index 5fc45162f6..cac1946506 100644 --- a/src/applications/repository/daemon/PhabricatorRepositoryPullLocalDaemon.php +++ b/src/applications/repository/daemon/PhabricatorRepositoryPullLocalDaemon.php @@ -1,709 +1,709 @@ repair = $repair; return $this; } /* -( Pulling Repositories )----------------------------------------------- */ /** * @task pull */ public function run() { $argv = $this->getArgv(); array_unshift($argv, __CLASS__); $args = new PhutilArgumentParser($argv); $args->parse( array( array( 'name' => 'no-discovery', 'help' => 'Pull only, without discovering commits.', ), array( 'name' => 'not', 'param' => 'repository', 'repeat' => true, 'help' => 'Do not pull __repository__.', ), array( 'name' => 'repositories', 'wildcard' => true, 'help' => 'Pull specific __repositories__ instead of all.', ), )); $no_discovery = $args->getArg('no-discovery'); $repo_names = $args->getArg('repositories'); $exclude_names = $args->getArg('not'); // Each repository has an individual pull frequency; after we pull it, // wait that long to pull it again. When we start up, try to pull everything // serially. $retry_after = array(); $min_sleep = 15; while (true) { $repositories = $this->loadRepositories($repo_names); if ($exclude_names) { $exclude = $this->loadRepositories($exclude_names); $repositories = array_diff_key($repositories, $exclude); } // Shuffle the repositories, then re-key the array since shuffle() // discards keys. This is mostly for startup, we'll use soft priorities // later. shuffle($repositories); $repositories = mpull($repositories, null, 'getID'); // If any repositories were deleted, remove them from the retry timer map // so we don't end up with a retry timer that never gets updated and // causes us to sleep for the minimum amount of time. $retry_after = array_select_keys( $retry_after, array_keys($repositories)); // Assign soft priorities to repositories based on how frequently they // should pull again. asort($retry_after); $repositories = array_select_keys( $repositories, array_keys($retry_after)) + $repositories; foreach ($repositories as $id => $repository) { $after = idx($retry_after, $id, 0); if ($after > time()) { continue; } $tracked = $repository->isTracked(); if (!$tracked) { continue; } try { $callsign = $repository->getCallsign(); $this->log("Updating repository '{$callsign}'."); id(new PhabricatorRepositoryPullEngine()) ->setRepository($repository) ->pullRepository(); if (!$no_discovery) { // TODO: It would be nice to discover only if we pulled something, // but this isn't totally trivial. $lock_name = get_class($this).':'.$callsign; $lock = PhabricatorGlobalLock::newLock($lock_name); $lock->lock(); try { $this->discoverRepository($repository); } catch (Exception $ex) { $lock->unlock(); throw $ex; } $lock->unlock(); } $sleep_for = $repository->getDetail('pull-frequency', $min_sleep); $retry_after[$id] = time() + $sleep_for; } catch (PhutilLockException $ex) { $retry_after[$id] = time() + $min_sleep; $this->log("Failed to acquire lock."); } catch (Exception $ex) { $retry_after[$id] = time() + $min_sleep; phlog($ex); } $this->stillWorking(); } if ($retry_after) { $sleep_until = max(min($retry_after), time() + $min_sleep); } else { $sleep_until = time() + $min_sleep; } $this->sleep($sleep_until - time()); } } /** * @task pull */ protected function loadRepositories(array $names) { if (!count($names)) { return id(new PhabricatorRepository())->loadAll(); } else { return PhabricatorRepository::loadAllByPHIDOrCallsign($names); } } public function discoverRepository(PhabricatorRepository $repository) { $vcs = $repository->getVersionControlSystem(); switch ($vcs) { case PhabricatorRepositoryType::REPOSITORY_TYPE_GIT: return $this->executeGitDiscover($repository); case PhabricatorRepositoryType::REPOSITORY_TYPE_SVN: $refs = $this->getDiscoveryEngine($repository) ->discoverCommits(); break; case PhabricatorRepositoryType::REPOSITORY_TYPE_MERCURIAL: return $this->executeHgDiscover($repository); default: throw new Exception("Unknown VCS '{$vcs}'!"); } foreach ($refs as $ref) { $this->recordCommit( $repository, $ref->getIdentifier(), $ref->getEpoch(), $ref->getBranch()); } return (bool)count($refs); } private function getDiscoveryEngine(PhabricatorRepository $repository) { $id = $repository->getID(); if (empty($this->discoveryEngines[$id])) { $engine = id(new PhabricatorRepositoryDiscoveryEngine()) ->setRepository($repository) ->setVerbose($this->getVerbose()) ->setRepairMode($this->repair); $this->discoveryEngines[$id] = $engine; } return $this->discoveryEngines[$id]; } private function isKnownCommit( PhabricatorRepository $repository, $target) { if ($this->getCache($repository, $target)) { return true; } if ($this->repair) { // In repair mode, rediscover the entire repository, ignoring the // database state. We can hit the local cache above, but if we miss it // stop the script from going to the database cache. return false; } $commit = id(new PhabricatorRepositoryCommit())->loadOneWhere( - 'repositoryID = %s AND commitIdentifier = %s', + 'repositoryID = %d AND commitIdentifier = %s', $repository->getID(), $target); if (!$commit) { return false; } $this->setCache($repository, $target); while (count($this->commitCache) > 2048) { array_shift($this->commitCache); } return true; } private function isKnownCommitOnAnyAutocloseBranch( PhabricatorRepository $repository, $target) { $commit = id(new PhabricatorRepositoryCommit())->loadOneWhere( - 'repositoryID = %s AND commitIdentifier = %s', + 'repositoryID = %d AND commitIdentifier = %s', $repository->getID(), $target); if (!$commit) { $callsign = $repository->getCallsign(); $console = PhutilConsole::getConsole(); $console->writeErr( "WARNING: Repository '%s' is missing commits ('%s' is missing from ". "history). Run '%s' to repair the repository.\n", $callsign, $target, "bin/repository discover --repair {$callsign}"); return false; } $data = $commit->loadCommitData(); if (!$data) { return false; } if ($repository->shouldAutocloseCommit($commit, $data)) { return true; } return false; } private function recordCommit( PhabricatorRepository $repository, $commit_identifier, $epoch, $branch = null) { $commit = new PhabricatorRepositoryCommit(); $commit->setRepositoryID($repository->getID()); $commit->setCommitIdentifier($commit_identifier); $commit->setEpoch($epoch); $data = new PhabricatorRepositoryCommitData(); if ($branch) { $data->setCommitDetail('seenOnBranches', array($branch)); } try { $commit->openTransaction(); $commit->save(); $data->setCommitID($commit->getID()); $data->save(); $commit->saveTransaction(); $event = new PhabricatorTimelineEvent( 'cmit', array( 'id' => $commit->getID(), )); $event->recordEvent(); $this->insertTask($repository, $commit); queryfx( $repository->establishConnection('w'), 'INSERT INTO %T (repositoryID, size, lastCommitID, epoch) VALUES (%d, 1, %d, %d) ON DUPLICATE KEY UPDATE size = size + 1, lastCommitID = IF(VALUES(epoch) > epoch, VALUES(lastCommitID), lastCommitID), epoch = IF(VALUES(epoch) > epoch, VALUES(epoch), epoch)', PhabricatorRepository::TABLE_SUMMARY, $repository->getID(), $commit->getID(), $epoch); if ($this->repair) { // Normally, the query should throw a duplicate key exception. If we // reach this in repair mode, we've actually performed a repair. $this->log("Repaired commit '{$commit_identifier}'."); } $this->setCache($repository, $commit_identifier); PhutilEventEngine::dispatchEvent( new PhabricatorEvent( PhabricatorEventType::TYPE_DIFFUSION_DIDDISCOVERCOMMIT, array( 'repository' => $repository, 'commit' => $commit, ))); } catch (AphrontQueryDuplicateKeyException $ex) { $commit->killTransaction(); // Ignore. This can happen because we discover the same new commit // more than once when looking at history, or because of races or // data inconsistency or cosmic radiation; in any case, we're still // in a good state if we ignore the failure. $this->setCache($repository, $commit_identifier); } } private function updateCommit( PhabricatorRepository $repository, $commit_identifier, $branch) { $commit = id(new PhabricatorRepositoryCommit())->loadOneWhere( - 'repositoryID = %s AND commitIdentifier = %s', + 'repositoryID = %d AND commitIdentifier = %s', $repository->getID(), $commit_identifier); if (!$commit) { // This can happen if the phabricator DB doesn't have the commit info, // or the commit is so big that phabricator couldn't parse it. In this // case we just ignore it. return; } $data = id(new PhabricatorRepositoryCommitData())->loadOneWhere( 'commitID = %d', $commit->getID()); if (!$data) { $data = new PhabricatorRepositoryCommitData(); $data->setCommitID($commit->getID()); } $branches = $data->getCommitDetail('seenOnBranches', array()); $branches[] = $branch; $data->setCommitDetail('seenOnBranches', $branches); $data->save(); $this->insertTask( $repository, $commit, array( 'only' => true )); } private function insertTask( PhabricatorRepository $repository, PhabricatorRepositoryCommit $commit, $data = array()) { $vcs = $repository->getVersionControlSystem(); switch ($vcs) { case PhabricatorRepositoryType::REPOSITORY_TYPE_GIT: $class = 'PhabricatorRepositoryGitCommitMessageParserWorker'; break; case PhabricatorRepositoryType::REPOSITORY_TYPE_SVN: $class = 'PhabricatorRepositorySvnCommitMessageParserWorker'; break; case PhabricatorRepositoryType::REPOSITORY_TYPE_MERCURIAL: $class = 'PhabricatorRepositoryMercurialCommitMessageParserWorker'; break; default: throw new Exception("Unknown repository type '{$vcs}'!"); } $data['commitID'] = $commit->getID(); PhabricatorWorker::scheduleTask($class, $data); } private function setCache( PhabricatorRepository $repository, $commit_identifier) { $key = $this->getCacheKey($repository, $commit_identifier); $this->commitCache[$key] = true; } private function getCache( PhabricatorRepository $repository, $commit_identifier) { $key = $this->getCacheKey($repository, $commit_identifier); return idx($this->commitCache, $key, false); } private function getCacheKey( PhabricatorRepository $repository, $commit_identifier) { return $repository->getID().':'.$commit_identifier; } /* -( Git Implementation )------------------------------------------------- */ /** * @task git */ private function executeGitDiscover( PhabricatorRepository $repository) { list($remotes) = $repository->execxLocalCommand( 'remote show -n origin'); $matches = null; if (!preg_match('/^\s*Fetch URL:\s*(.*?)\s*$/m', $remotes, $matches)) { throw new Exception( "Expected 'Fetch URL' in 'git remote show -n origin'."); } self::executeGitVerifySameOrigin( $matches[1], $repository->getRemoteURI(), $repository->getLocalPath()); list($stdout) = $repository->execxLocalCommand( 'branch -r --verbose --no-abbrev'); $branches = DiffusionGitBranch::parseRemoteBranchOutput( $stdout, $only_this_remote = DiffusionBranchInformation::DEFAULT_GIT_REMOTE); $callsign = $repository->getCallsign(); $tracked_something = false; $this->log("Discovering commits in repository '{$callsign}'..."); foreach ($branches as $name => $commit) { $this->log("Examining branch '{$name}', at {$commit}."); if (!$repository->shouldTrackBranch($name)) { $this->log("Skipping, branch is untracked."); continue; } $tracked_something = true; if ($this->isKnownCommit($repository, $commit)) { $this->log("Skipping, HEAD is known."); continue; } $this->log("Looking for new commits."); $this->executeGitDiscoverCommit($repository, $commit, $name, false); } if (!$tracked_something) { $repo_name = $repository->getName(); $repo_callsign = $repository->getCallsign(); throw new Exception( "Repository r{$repo_callsign} '{$repo_name}' has no tracked branches! ". "Verify that your branch filtering settings are correct."); } $this->log("Discovering commits on autoclose branches..."); foreach ($branches as $name => $commit) { $this->log("Examining branch '{$name}', at {$commit}'."); if (!$repository->shouldTrackBranch($name)) { $this->log("Skipping, branch is untracked."); continue; } if (!$repository->shouldAutocloseBranch($name)) { $this->log("Skipping, branch is not autoclose."); continue; } if ($this->isKnownCommitOnAnyAutocloseBranch($repository, $commit)) { $this->log("Skipping, commit is known on an autoclose branch."); continue; } $this->log("Looking for new autoclose commits."); $this->executeGitDiscoverCommit($repository, $commit, $name, true); } } /** * @task git */ private function executeGitDiscoverCommit( PhabricatorRepository $repository, $commit, $branch, $autoclose) { $discover = array($commit); $insert = array($commit); $seen_parent = array(); $stream = new PhabricatorGitGraphStream($repository, $commit); while (true) { $target = array_pop($discover); $parents = $stream->getParents($target); foreach ($parents as $parent) { if (isset($seen_parent[$parent])) { // We end up in a loop here somehow when we parse Arcanist if we // don't do this. TODO: Figure out why and draw a pretty diagram // since it's not evident how parsing a DAG with this causes the // loop to stop terminating. continue; } $seen_parent[$parent] = true; if ($autoclose) { $known = $this->isKnownCommitOnAnyAutocloseBranch( $repository, $parent); } else { $known = $this->isKnownCommit($repository, $parent); } if (!$known) { $this->log("Discovered commit '{$parent}'."); $discover[] = $parent; $insert[] = $parent; } } if (empty($discover)) { break; } } $n = count($insert); if ($autoclose) { $this->log("Found {$n} new autoclose commits on branch '{$branch}'."); } else { $this->log("Found {$n} new commits on branch '{$branch}'."); } while (true) { $target = array_pop($insert); $epoch = $stream->getCommitDate($target); $epoch = trim($epoch); if ($autoclose) { $this->updateCommit($repository, $target, $branch); } else { $this->recordCommit($repository, $target, $epoch, $branch); } if (empty($insert)) { break; } } } /** * @task git */ public static function executeGitVerifySameOrigin($remote, $expect, $where) { $remote_path = self::getPathFromGitURI($remote); $expect_path = self::getPathFromGitURI($expect); $remote_match = self::executeGitNormalizePath($remote_path); $expect_match = self::executeGitNormalizePath($expect_path); if ($remote_match != $expect_match) { throw new Exception( "Working copy at '{$where}' has a mismatched origin URL. It has ". "origin URL '{$remote}' (with remote path '{$remote_path}'), but the ". "configured URL '{$expect}' (with remote path '{$expect_path}') is ". "expected. Refusing to proceed because this may indicate that the ". "working copy is actually some other repository."); } } private static function getPathFromGitURI($raw_uri) { $uri = new PhutilURI($raw_uri); if ($uri->getProtocol()) { return $uri->getPath(); } $uri = new PhutilGitURI($raw_uri); if ($uri->getDomain()) { return $uri->getPath(); } return $raw_uri; } /** * @task git */ private static function executeGitNormalizePath($path) { // Strip away "/" and ".git", so similar paths correctly match. $path = trim($path, '/'); $path = preg_replace('/\.git$/', '', $path); return $path; } /* -( Mercurial Implementation )------------------------------------------- */ private function executeHgDiscover(PhabricatorRepository $repository) { // NOTE: "--debug" gives us 40-character hashes. list($stdout) = $repository->execxLocalCommand('--debug branches'); $branches = ArcanistMercurialParser::parseMercurialBranches($stdout); $got_something = false; foreach ($branches as $name => $branch) { $commit = $branch['rev']; if ($this->isKnownCommit($repository, $commit)) { continue; } else { $this->executeHgDiscoverCommit($repository, $commit); $got_something = true; } } return $got_something; } private function executeHgDiscoverCommit( PhabricatorRepository $repository, $commit) { $discover = array($commit); $insert = array($commit); $seen_parent = array(); $stream = new PhabricatorMercurialGraphStream($repository); // For all the new commits at the branch heads, walk backward until we // find only commits we've aleady seen. while ($discover) { $target = array_pop($discover); $parents = $stream->getParents($target); foreach ($parents as $parent) { if (isset($seen_parent[$parent])) { continue; } $seen_parent[$parent] = true; if (!$this->isKnownCommit($repository, $parent)) { $discover[] = $parent; $insert[] = $parent; } } } foreach ($insert as $target) { $epoch = $stream->getCommitDate($target); $this->recordCommit($repository, $target, $epoch); } } } diff --git a/src/applications/repository/worker/commitchangeparser/PhabricatorRepositorySvnCommitChangeParserWorker.php b/src/applications/repository/worker/commitchangeparser/PhabricatorRepositorySvnCommitChangeParserWorker.php index cff194e04f..56b05b388c 100644 --- a/src/applications/repository/worker/commitchangeparser/PhabricatorRepositorySvnCommitChangeParserWorker.php +++ b/src/applications/repository/worker/commitchangeparser/PhabricatorRepositorySvnCommitChangeParserWorker.php @@ -1,789 +1,789 @@ getDetail('remote-uri'); $svn_commit = $commit->getCommitIdentifier(); $callsign = $repository->getCallsign(); $full_name = 'r'.$callsign.$svn_commit; echo "Parsing {$full_name}...\n"; if ($this->isBadCommit($full_name)) { echo "This commit is marked bad!\n"; return; } // Pull the top-level path changes out of "svn log". This is pretty // straightforward; just parse the XML log. $log = $this->getSVNLogXMLObject($uri, $svn_commit, $verbose = true); $entry = $log->logentry[0]; if (!$entry->paths) { // TODO: Explicitly mark this commit as broken elsewhere? This isn't // supposed to happen but we have some cases like rE27 and rG935 in the // Facebook repositories where things got all clowned up. return; } $raw_paths = array(); foreach ($entry->paths->path as $path) { $name = trim((string)$path); $raw_paths[$name] = array( 'rawPath' => $name, 'rawTargetPath' => (string)$path['copyfrom-path'], 'rawChangeType' => (string)$path['action'], 'rawTargetCommit' => (string)$path['copyfrom-rev'], ); } $copied_or_moved_map = array(); $deleted_paths = array(); $add_paths = array(); foreach ($raw_paths as $path => $raw_info) { if ($raw_info['rawTargetPath']) { $copied_or_moved_map[$raw_info['rawTargetPath']][] = $raw_info; } switch ($raw_info['rawChangeType']) { case 'D': $deleted_paths[$path] = $raw_info; break; case 'A': $add_paths[$path] = $raw_info; break; } } // If a path was deleted, we need to look in the repository history to // figure out where the former valid location for it is so we can figure out // if it was a directory or not, among other things. $lookup_here = array(); foreach ($raw_paths as $path => $raw_info) { if ($raw_info['rawChangeType'] != 'D') { continue; } // If a change copies a directory and then deletes something from it, // we need to look at the old location for information about the path, not // the new location. This workflow is pretty ridiculous -- so much so that // Trac gets it wrong. See Facebook rO6 for an example, if you happen to // work at Facebook. $parents = $this->expandAllParentPaths($path, $include_self = true); foreach ($parents as $parent) { if (isset($add_paths[$parent])) { $relative_path = substr($path, strlen($parent)); $lookup_here[$path] = array( 'rawPath' => $add_paths[$parent]['rawTargetPath'].$relative_path, 'rawCommit' => $add_paths[$parent]['rawTargetCommit'], ); continue 2; } } // Otherwise we can just look at the previous revision. $lookup_here[$path] = array( 'rawPath' => $path, 'rawCommit' => $svn_commit - 1, ); } $lookup = array(); foreach ($raw_paths as $path => $raw_info) { if ($raw_info['rawChangeType'] == 'D') { $lookup[$path] = $lookup_here[$path]; } else { // For everything that wasn't deleted, we can just look it up directly. $lookup[$path] = array( 'rawPath' => $path, 'rawCommit' => $svn_commit, ); } } $effects = array(); $path_file_types = $this->lookupPathFileTypes($repository, $lookup); foreach ($raw_paths as $path => $raw_info) { if ($raw_info['rawChangeType'] == 'D' && $path_file_types[$path] == DifferentialChangeType::FILE_DIRECTORY) { // Bad. Child paths aren't enumerated in "svn log" so we need // to go fishing. $list = $this->lookupRecursiveFileList( $repository, $lookup[$path]); foreach ($list as $deleted_path => $path_file_type) { $deleted_path = rtrim($path.'/'.$deleted_path, '/'); if (!empty($raw_paths[$deleted_path])) { // We somehow learned about this deletion explicitly? // TODO: Unclear how this is possible. continue; } $effect_type = DifferentialChangeType::TYPE_DELETE; $effect_target_path = null; if (isset($copied_or_moved_map[$deleted_path])) { $effect_target_path = $path; if (count($copied_or_moved_map[$deleted_path]) > 1) { $effect_type = DifferentialChangeType::TYPE_MULTICOPY; } else { $effect_type = DifferentialChangeType::TYPE_MOVE_AWAY; } } $effects[$deleted_path] = array( 'rawPath' => $deleted_path, 'rawTargetPath' => $effect_target_path, 'rawTargetCommit' => null, 'rawDirect' => true, 'changeType' => $effect_type, 'fileType' => $path_file_type, ); $deleted_paths[$deleted_path] = $effects[$deleted_path]; } } } $resolved_types = array(); $supplemental = array(); foreach ($raw_paths as $path => $raw_info) { if (isset($resolved_types[$path])) { $type = $resolved_types[$path]; } else { switch ($raw_info['rawChangeType']) { case 'D': if (isset($copied_or_moved_map[$path])) { if (count($copied_or_moved_map[$path]) > 1) { $type = DifferentialChangeType::TYPE_MULTICOPY; } else { $type = DifferentialChangeType::TYPE_MOVE_AWAY; } } else { $type = DifferentialChangeType::TYPE_DELETE; } break; case 'A': $copy_from = $raw_info['rawTargetPath']; $copy_rev = $raw_info['rawTargetCommit']; if (!strlen($copy_from)) { $type = DifferentialChangeType::TYPE_ADD; } else { if (isset($deleted_paths[$copy_from])) { $type = DifferentialChangeType::TYPE_MOVE_HERE; $other_type = DifferentialChangeType::TYPE_MOVE_AWAY; } else { $type = DifferentialChangeType::TYPE_COPY_HERE; $other_type = DifferentialChangeType::TYPE_COPY_AWAY; } $source_file_type = $this->lookupPathFileType( $repository, $copy_from, array( 'rawPath' => $copy_from, 'rawCommit' => $copy_rev, )); if ($source_file_type == DifferentialChangeType::FILE_DELETED) { throw new Exception( "Something is wrong; source of a copy must exist."); } if ($source_file_type != DifferentialChangeType::FILE_DIRECTORY) { if (isset($raw_paths[$copy_from]) || isset($effects[$copy_from])) { break; } $effects[$copy_from] = array( 'rawPath' => $copy_from, 'rawTargetPath' => null, 'rawTargetCommit' => null, 'rawDirect' => false, 'changeType' => $other_type, 'fileType' => $source_file_type, ); } else { // ULTRADISASTER. We've added a directory which was copied // or moved from somewhere else. This is the most complex and // ridiculous case. $list = $this->lookupRecursiveFileList( $repository, array( 'rawPath' => $copy_from, 'rawCommit' => $copy_rev, )); foreach ($list as $from_path => $from_file_type) { $full_from = rtrim($copy_from.'/'.$from_path, '/'); $full_to = rtrim($path.'/'.$from_path, '/'); if (empty($raw_paths[$full_to])) { $effects[$full_to] = array( 'rawPath' => $full_to, 'rawTargetPath' => $full_from, 'rawTargetCommit' => $copy_rev, 'rawDirect' => true, 'changeType' => $type, 'fileType' => $from_file_type, ); } else { // This means we picked the file up explicitly elsewhere. // If the file as modified, SVN will drop the copy // information. We need to restore it. $supplemental[$full_to]['rawTargetPath'] = $full_from; $supplemental[$full_to]['rawTargetCommit'] = $copy_rev; if ($raw_paths[$full_to]['rawChangeType'] == 'M') { $resolved_types[$full_to] = $type; } } if (empty($raw_paths[$full_from]) && empty($effects[$full_from])) { if ($other_type == DifferentialChangeType::TYPE_COPY_AWAY) { // Add an indirect effect for the copied file, if we // don't already have an entry for it (e.g., a separate // change). $effects[$full_from] = array( 'rawPath' => $full_from, 'rawTargetPath' => null, 'rawTargetCommit' => null, 'rawDirect' => false, 'changeType' => $other_type, 'fileType' => $from_file_type, ); } } } } } break; // This is "replaced", caused by "svn rm"-ing a file, putting another // in its place, and then "svn add"-ing it. We do not distinguish // between this and "M". case 'R': case 'M': if (isset($copied_or_moved_map[$path])) { $type = DifferentialChangeType::TYPE_COPY_AWAY; } else { $type = DifferentialChangeType::TYPE_CHANGE; } break; } } $resolved_types[$path] = $type; } foreach ($raw_paths as $path => $raw_info) { $raw_paths[$path]['changeType'] = $resolved_types[$path]; if (isset($supplemental[$path])) { foreach ($supplemental[$path] as $key => $value) { $raw_paths[$path][$key] = $value; } } } foreach ($raw_paths as $path => $raw_info) { $effects[$path] = array( 'rawPath' => $path, 'rawTargetPath' => $raw_info['rawTargetPath'], 'rawTargetCommit' => $raw_info['rawTargetCommit'], 'rawDirect' => true, 'changeType' => $raw_info['changeType'], 'fileType' => $path_file_types[$path], ); } $parents = array(); foreach ($effects as $path => $effect) { foreach ($this->expandAllParentPaths($path) as $parent_path) { $parents[$parent_path] = true; } } $parents = array_keys($parents); foreach ($parents as $parent) { if (isset($effects[$parent])) { continue; } $effects[$parent] = array( 'rawPath' => $parent, 'rawTargetPath' => null, 'rawTargetCommit' => null, 'rawDirect' => false, 'changeType' => DifferentialChangeType::TYPE_CHILD, 'fileType' => DifferentialChangeType::FILE_DIRECTORY, ); } $lookup_paths = array(); foreach ($effects as $effect) { $lookup_paths[$effect['rawPath']] = true; if ($effect['rawTargetPath']) { $lookup_paths[$effect['rawTargetPath']] = true; } } $lookup_paths = array_keys($lookup_paths); $lookup_commits = array(); foreach ($effects as $effect) { if ($effect['rawTargetCommit']) { $lookup_commits[$effect['rawTargetCommit']] = true; } } $lookup_commits = array_keys($lookup_commits); $path_map = $this->lookupOrCreatePaths($lookup_paths); $commit_map = $this->lookupSvnCommits($repository, $lookup_commits); $this->writeChanges($repository, $commit, $effects, $path_map, $commit_map); $this->writeBrowse($repository, $commit, $effects, $path_map); $this->finishParse(); } private function writeChanges( PhabricatorRepository $repository, PhabricatorRepositoryCommit $commit, array $effects, array $path_map, array $commit_map) { $conn_w = $repository->establishConnection('w'); $sql = array(); foreach ($effects as $effect) { $sql[] = qsprintf( $conn_w, '(%d, %d, %d, %nd, %nd, %d, %d, %d, %d)', $repository->getID(), $path_map[$effect['rawPath']], $commit->getID(), $effect['rawTargetPath'] ? $path_map[$effect['rawTargetPath']] : null, $effect['rawTargetCommit'] ? $commit_map[$effect['rawTargetCommit']] : null, $effect['changeType'], $effect['fileType'], $effect['rawDirect'] ? 1 : 0, $commit->getCommitIdentifier()); } queryfx( $conn_w, 'DELETE FROM %T WHERE commitID = %d', PhabricatorRepository::TABLE_PATHCHANGE, $commit->getID()); foreach (array_chunk($sql, 512) as $sql_chunk) { queryfx( $conn_w, 'INSERT INTO %T (repositoryID, pathID, commitID, targetPathID, targetCommitID, changeType, fileType, isDirect, commitSequence) VALUES %Q', PhabricatorRepository::TABLE_PATHCHANGE, implode(', ', $sql_chunk)); } } private function writeBrowse( PhabricatorRepository $repository, PhabricatorRepositoryCommit $commit, array $effects, array $path_map) { $conn_w = $repository->establishConnection('w'); $sql = array(); foreach ($effects as $effect) { $type = $effect['changeType']; if (!$effect['rawDirect']) { if ($type == DifferentialChangeType::TYPE_COPY_AWAY) { // Don't write COPY_AWAY to the filesystem table if it isn't a direct // event. continue; } if ($type == DifferentialChangeType::TYPE_CHILD) { // Don't write CHILD to the filesystem table. Although doing these // writes has the nice property of letting you see when a directory's // contents were last changed, it explodes the table tremendously // and makes Diffusion far slower. continue; } } if ($effect['rawPath'] == '/') { // Don't write any events on '/' to the filesystem table; in // particular, it doesn't have a meaningful parentID. continue; } $existed = !DifferentialChangeType::isDeleteChangeType($type); $sql[] = qsprintf( $conn_w, '(%d, %d, %d, %d, %d, %d)', $repository->getID(), $path_map[$this->getParentPath($effect['rawPath'])], $commit->getCommitIdentifier(), $path_map[$effect['rawPath']], $existed ? 1 : 0, $effect['fileType']); } queryfx( $conn_w, 'DELETE FROM %T WHERE repositoryID = %d AND svnCommit = %d', PhabricatorRepository::TABLE_FILESYSTEM, $repository->getID(), $commit->getCommitIdentifier()); foreach (array_chunk($sql, 512) as $sql_chunk) { queryfx( $conn_w, 'INSERT INTO %T (repositoryID, parentID, svnCommit, pathID, existed, fileType) VALUES %Q', PhabricatorRepository::TABLE_FILESYSTEM, implode(', ', $sql_chunk)); } } private function lookupSvnCommits( PhabricatorRepository $repository, array $commits) { if (!$commits) { return array(); } $commit_table = new PhabricatorRepositoryCommit(); $commit_data = queryfx_all( $commit_table->establishConnection('w'), 'SELECT id, commitIdentifier FROM %T - WHERE repositoryID = %d AND commitIdentifier in (%Ld)', + WHERE repositoryID = %d AND commitIdentifier in (%Ls)', $commit_table->getTableName(), $repository->getID(), $commits); $commit_map = ipull($commit_data, 'id', 'commitIdentifier'); $need = array(); foreach ($commits as $commit) { if (empty($commit_map[$commit])) { $need[] = $commit; } } // If we are parsing a Subversion repository and have been configured to // import only some subdirectory of it, we may find commits which reference // other foreign commits outside of the directory (for instance, because of // a move or copy). Rather than trying to execute full parses on them, just // create stub commits and identify the stubs as foreign commits. if ($need) { $subpath = $repository->getDetail('svn-subpath'); if (!$subpath) { $commits = implode(', ', $need); throw new Exception( "Missing commits ({$need}) in a SVN repository which is not ". "configured for subdirectory-only parsing!"); } foreach ($need as $foreign_commit) { $commit = new PhabricatorRepositoryCommit(); $commit->setRepositoryID($repository->getID()); $commit->setCommitIdentifier($foreign_commit); $commit->setEpoch(0); $commit->save(); $data = new PhabricatorRepositoryCommitData(); $data->setCommitID($commit->getID()); $data->setAuthorName(''); $data->setCommitMessage(''); $data->setCommitDetails( array( 'foreign-svn-stub' => true, // Denormalize this to make it easier to debug cases where someone // did half a parse and then changed the subdirectory or something // like that. 'svn-subpath' => $subpath, )); $data->save(); $commit_map[$foreign_commit] = $commit->getID(); } } return $commit_map; } private function lookupPathFileType( PhabricatorRepository $repository, $path, array $path_info) { $result = $this->lookupPathFileTypes( $repository, array( $path => $path_info, )); return $result[$path]; } private function lookupPathFileTypes( PhabricatorRepository $repository, array $paths) { $result_map = array(); $repository_uri = $repository->getDetail('remote-uri'); if (isset($paths['/'])) { $result_map['/'] = DifferentialChangeType::FILE_DIRECTORY; unset($paths['/']); } $parents = array(); $path_mapping = array(); foreach ($paths as $path => $lookup) { $parent = dirname($lookup['rawPath']); $parent = ltrim($parent, '/'); $parent = $this->encodeSVNPath($parent); $parent = $repository_uri.$parent.'@'.$lookup['rawCommit']; $parent = escapeshellarg($parent); $parents[$parent] = true; $path_mapping[$parent][] = dirname($path); } // Reverse this list so we can pop $path_mapping, as that's more efficient // than shifting it. We need to associate these maps positionally because // a change can copy the same source path from multiple revisions via // "svn cp path@1 a; svn cp path@2 b;" and the XML output gives us no way // to distinguish which revision we're looking at except based on its // position in the document. $all_paths = array_reverse(array_keys($parents)); foreach (array_chunk($all_paths, 64) as $path_chunk) { list($raw_xml) = $repository->execxRemoteCommand( '--xml ls %C', implode(' ', $path_chunk)); $xml = new SimpleXMLElement($raw_xml); foreach ($xml->list as $list) { $list_path = (string)$list['path']; // SVN is a big mess. See Facebook rG8 (a revision which adds files // with spaces in their names) for an example. $list_path = rawurldecode($list_path); if ($list_path == $repository_uri) { $base = '/'; } else { $base = substr($list_path, strlen($repository_uri)); } $mapping = array_pop($path_mapping); foreach ($list->entry as $entry) { $val = $this->getFileTypeFromSVNKind($entry['kind']); foreach ($mapping as $base_path) { // rtrim() causes us to handle top-level directories correctly. $key = rtrim($base_path, '/').'/'.$entry->name; $result_map[$key] = $val; } } } } foreach ($paths as $path => $lookup) { if (empty($result_map[$path])) { $result_map[$path] = DifferentialChangeType::FILE_DELETED; } } return $result_map; } private function encodeSVNPath($path) { $path = rawurlencode($path); $path = str_replace('%2F', '/', $path); return $path; } private function getFileTypeFromSVNKind($kind) { $kind = (string)$kind; switch ($kind) { case 'dir': return DifferentialChangeType::FILE_DIRECTORY; case 'file': return DifferentialChangeType::FILE_NORMAL; default: throw new Exception("Unknown SVN file kind '{$kind}'."); } } private function lookupRecursiveFileList( PhabricatorRepository $repository, array $info) { $path = $info['rawPath']; $rev = $info['rawCommit']; $path = $this->encodeSVNPath($path); $hashkey = md5($repository->getDetail('remote-uri').$path.'@'.$rev); // This method is quite horrible. The underlying challenge is that some // commits in the Facebook repository are enormous, taking multiple hours // to 'ls -R' out of the repository and producing XML files >1GB in size. // If we try to SimpleXML them, the object exhausts available memory on a // 64G machine. Instead, cache the XML output and then parse it line by line // to limit space requirements. $cache_loc = sys_get_temp_dir().'/diffusion.'.$hashkey.'.svnls'; if (!Filesystem::pathExists($cache_loc)) { $tmp = new TempFile(); $repository->execxRemoteCommand( '--xml ls -R %s%s@%d > %s', $repository->getDetail('remote-uri'), $path, $rev, $tmp); execx( 'mv %s %s', $tmp, $cache_loc); } $map = $this->parseRecursiveListFileData($cache_loc); Filesystem::remove($cache_loc); return $map; } private function parseRecursiveListFileData($file_path) { $map = array(); $mode = 'xml'; $done = false; $entry = null; foreach (new LinesOfALargeFile($file_path) as $lno => $line) { switch ($mode) { case 'entry': if ($line == '') { $entry = implode('', $entry); $pattern = '@^\s+kind="(file|dir)">'. '(.*?)'. '((.*?))?@'; $matches = null; if (!preg_match($pattern, $entry, $matches)) { throw new Exception("Unable to parse entry!"); } $map[html_entity_decode($matches[2])] = $this->getFileTypeFromSVNKind($matches[1]); $mode = 'entry-or-end'; } else { $entry[] = $line; } break; case 'entry-or-end': if ($line == '') { $done = true; break 2; } else if ($line == ' or = 1) { array_pop($parts); $parents[] = '/'.implode('/', $parts); } return $parents; } }