diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py
index b4ad922..cfec64c 100644
--- a/bitshift/crawler/__init__.py
+++ b/bitshift/crawler/__init__.py
@@ -4,7 +4,7 @@
 Contains functions for initializing all subsidiary, threaded crawlers.
 """

-import os, Queue
+import logging, logging.handlers, os, Queue

 from bitshift.crawler import crawler, indexer

@@ -20,6 +20,8 @@ def crawl():
         3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`.
     """

+    _configure_logging()
+
     MAX_URL_QUEUE_SIZE = 5e3

     repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
@@ -29,3 +31,24 @@ def crawl():

     for thread in threads:
         thread.start()
+
+def _configure_logging():
+    LOG_FILE_DIR = "log"
+
+    if not os.path.exists(LOG_FILE_DIR):
+        os.mkdir(LOG_FILE_DIR)
+
+    logging.getLogger("requests").setLevel(logging.WARNING)
+
+    formatter = logging.Formatter(
+            fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s"
+                " %(message)s"), datefmt="%y-%m-%d %H:%M:%S")
+
+    handler = logging.handlers.TimedRotatingFileHandler(
+            "%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1,
+            backupCount=20)
+    handler.setFormatter(formatter)
+
+    root_logger = logging.getLogger()
+    root_logger.addHandler(handler)
+    root_logger.setLevel(logging.NOTSET)
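
Note on the handler configured above: records rotate through log/app.log once per hour with the twenty most recent files kept, and because the root logger is left at NOTSET, loggers created anywhere in the package simply propagate into that handler. A minimal sketch of the consumer side, assuming _configure_logging() has already run; the dotted logger name is only an example of what "%s.%s" % (__name__, self.__class__.__name__) expands to in the crawler classes below:

import logging

# No per-module handler setup is needed: records propagate up to the root
# logger, which _configure_logging() pointed at the rotating log/app.log handler.
logger = logging.getLogger("bitshift.crawler.crawler.GitHubCrawler")
logger.info("Starting.")
logger.warning("API %s call failed: %s: %s", "http://example.invalid",
        "ConnectionError", "connection refused")
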
diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py
index e4b4929..785ac61 100644
--- a/bitshift/crawler/crawler.py
+++ b/bitshift/crawler/crawler.py
@@ -4,7 +4,7 @@
 Contains all website/framework-specific Class crawlers.
 """

-import requests, time, threading
+import logging, requests, time, threading

 from bitshift.crawler import indexer

@@ -22,6 +22,7 @@ class GitHubCrawler(threading.Thread):
     :ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository`
         with repository metadata retrieved by :class:`GitHubCrawler`, and other
         Git crawlers, to be processed by :class:`indexer.GitIndexer`.
+    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
     """

     AUTHENTICATION = {
@@ -39,6 +40,9 @@ class GitHubCrawler(threading.Thread):
         """

         self.clone_queue = clone_queue
+        self._logger = logging.getLogger("%s.%s" %
+                (__name__, self.__class__.__name__))
+        self._logger.info("Starting.")
         super(GitHubCrawler, self).__init__(name=self.__class__.__name__)

     def run(self):
@@ -61,11 +65,17 @@ class GitHubCrawler(threading.Thread):
             try:
                 response = requests.get(next_api_url,
                         params=self.AUTHENTICATION)
-            except ConnectionError as exception:
+            except ConnectionError as excep:
+                self._logger.warning("API %s call failed: %s: %s",
+                        next_api_url, excep.__class__.__name__, excep)
+                time.sleep(0.5)
                 continue

             queue_percent_full = (float(self.clone_queue.qsize()) /
                     self.clone_queue.maxsize) * 100
+            self._logger.info("API call made. Queue size: %d/%d, %d%%." %
+                    ((self.clone_queue.qsize(), self.clone_queue.maxsize,
+                    queue_percent_full)))

             for repo in response.json():
                 while self.clone_queue.full():
@@ -73,15 +83,15 @@ class GitHubCrawler(threading.Thread):

                 self.clone_queue.put(indexer.GitRepository(
                         repo["html_url"], repo["full_name"].replace("/", ""),
-                        "GitHub"))
+                        "GitHub",
+                        #self._get_repo_stars(repo["full_name"]))
+                        0))

             if int(response.headers["x-ratelimit-remaining"]) == 0:
                 time.sleep(int(response.headers["x-ratelimit-reset"]) -
                         time.time())

             next_api_url = response.headers["link"].split(">")[0][1:]
-            with open(".github_api.log", "w") as log_file:
-                log_file.write("%s\n" % next_api_url)

             sleep_time = api_request_interval - (time.time() - start_time)
             if sleep_time > 0:
@@ -105,7 +115,6 @@ class GitHubCrawler(threading.Thread):

         API_URL = "https://api.github.com/search/repositories"

-
         params = self.AUTHENTICATION
         params["q"] = "repo:%s" % repo_name

@@ -116,9 +125,18 @@ class GitHubCrawler(threading.Thread):
             })

         if int(resp.headers["x-ratelimit-remaining"]) == 0:
-            time.sleep(int(resp.headers["x-ratelimit-reset"]) - time.time())
+            sleep_time = int(resp.headers["x-ratelimit-reset"]) - time.time()
+            if sleep_time > 0:
+                logging.info("API quota exceeded. Sleep time: %d." % sleep_time)
+                time.sleep(sleep_time)

-        return int(resp.json()["items"][0]["stargazers_count"])
+        if "items" not in resp.json() or len(resp.json()["items"]) == 0:
+            self._logger.critical("No API result: %s. Result: %s" % (resp.url,
+                    str(resp.json())))
+            return 0
+        else:
+            rank = float(resp.json()["items"][0]["stargazers_count"]) / 1000
+            return rank if rank < 1.0 else 1.0

 class BitbucketCrawler(threading.Thread):
     """
@@ -131,6 +149,7 @@ class BitbucketCrawler(threading.Thread):

     :ivar clone_queue: (:class:`Queue.Queue`) The shared queue to insert
         :class:`indexer.GitRepository` repository urls into.
+    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
     """

     def __init__(self, clone_queue):
@@ -143,6 +162,9 @@ class BitbucketCrawler(threading.Thread):
         """

         self.clone_queue = clone_queue
+        self._logger = logging.getLogger("%s.%s" %
+                (__name__, self.__class__.__name__))
+        self._logger.info("Starting.")
         super(BitbucketCrawler, self).__init__(name=self.__class__.__name__)

     def run(self):
@@ -162,10 +184,15 @@ class BitbucketCrawler(threading.Thread):
                 response = requests.get(next_api_url).json()
             except ConnectionError as exception:
                 time.sleep(0.5)
+                self._logger.warning("API %s call failed: %s: %s",
+                        next_api_url, exception.__class__.__name__, exception)
                 continue

             queue_percent_full = (float(self.clone_queue.qsize()) /
                     self.clone_queue.maxsize) * 100
+            self._logger.info("API call made. Queue size: %d/%d, %d%%." %
+                    ((self.clone_queue.qsize(), self.clone_queue.maxsize,
+                    queue_percent_full)))

             for repo in response["values"]:
                 if repo["scm"] == "git":
@@ -181,7 +208,4 @@ class BitbucketCrawler(threading.Thread):
                         clone_url, repo["full_name"], "Bitbucket"))

             next_api_url = response["next"]
-            with open(".bitbucket_api.log", "w") as log_file:
-                log_file.write("%s\n" % next_api_url)
-
             time.sleep(0.2)
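
The reworked _get_repo_stars() above now returns a rank in [0.0, 1.0] rather than a raw stargazer count (run() temporarily passes a hard-coded 0 instead of calling it). A self-contained restatement of that normalization for reference; the helper name is illustrative and not part of the module:

def stars_to_rank(stargazers_count):
    # Scale a star count onto [0.0, 1.0]; 1000 or more stars saturates at 1.0.
    rank = float(stargazers_count) / 1000
    return rank if rank < 1.0 else 1.0

assert stars_to_rank(0) == 0.0
assert stars_to_rank(250) == 0.25
assert stars_to_rank(5000) == 1.0
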
""" -import bs4, os, Queue, re, shutil, string, subprocess, time, threading +import bs4, datetime, logging, os, Queue, re, shutil, string, subprocess, time,\ + threading from ..database import Database from ..codelet import Codelet @@ -11,6 +12,9 @@ from ..codelet import Codelet GIT_CLONE_DIR = "/tmp/bitshift" THREAD_QUEUE_SLEEP = 0.5 +import pymongo #debug +db = pymongo.MongoClient().bitshift #debug + class GitRepository(object): """ A representation of a Git repository's metadata. @@ -19,24 +23,29 @@ class GitRepository(object): :ivar name: (str) The name of the repository. :ivar framework_name: (str) The name of the online Git framework that the repository belongs to (eg, GitHub, BitBucket). + :ivar rank: (float) The rank of the repository, as assigned by + :class:`crawler.GitHubCrawler`. """ - def __init__(self, url, name, framework_name): + def __init__(self, url, name, framework_name, rank): """ Create a GitRepository instance. :param url: see :attr:`GitRepository.url` :param name: see :attr:`GitRepository.name` :param framework_name: see :attr:`GitRepository.framework_name` + :param rank: see :attr:`GitRepository.rank` :type url: str :type name: str :type framework_name: str + :type rank: float """ self.url = url self.name = name self.framework_name = framework_name + self.rank = rank class GitIndexer(threading.Thread): """ @@ -50,6 +59,7 @@ class GitIndexer(threading.Thread): cloned by :class:`_GitCloner`, which are to be indexed. :ivar git_cloner: (:class:`_GitCloner`) The corresponding repository cloner, which feeds :class:`GitIndexer`. + :ivar _logger: (:class:`logging.Logger`) A class-specific logger object. """ def __init__(self, clone_queue): @@ -66,6 +76,9 @@ class GitIndexer(threading.Thread): self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE) self.git_cloner = _GitCloner(clone_queue, self.index_queue) self.git_cloner.start() + self._logger = logging.getLogger("%s.%s" % + (__name__, self.__class__.__name__)) + self._logger.info("Starting.") if not os.path.exists(GIT_CLONE_DIR): os.makedirs(GIT_CLONE_DIR) @@ -88,52 +101,43 @@ class GitIndexer(threading.Thread): repo = self.index_queue.get() self.index_queue.task_done() - try: - self._index_repository(repo.url, repo.name, repo.framework_name) - except Exception as exception: - pass + # try: + self._index_repository(repo) + # except Exception as excep: + # self._logger.warning("%s: %s.", excep.__class__.__name__, excep) - def _index_repository(self, repo_url, repo_name, framework_name): + def _index_repository(self, repo): """ Clone and index (create and insert Codeletes for) a Git repository. - `git clone` the Git repository located at **repo_url**, call - _insert_repository_codelets, then remove said repository. + `git clone` the Git repository located at **repo.url**, call + `_insert_repository_codelets()`, then remove said repository. - :param repo_url: The url the Git repository was cloned from. - :param repo_name: The name of the repository. - :param framework_name: The name of the framework the repository is from. + :param repo_url: The metadata of the repository to be indexed. 
@@ -88,52 +101,43 @@ class GitIndexer(threading.Thread):
             repo = self.index_queue.get()
             self.index_queue.task_done()

-            try:
-                self._index_repository(repo.url, repo.name, repo.framework_name)
-            except Exception as exception:
-                pass
+            # try:
+            self._index_repository(repo)
+            # except Exception as excep:
+            #     self._logger.warning("%s: %s.", excep.__class__.__name__, excep)

-    def _index_repository(self, repo_url, repo_name, framework_name):
+    def _index_repository(self, repo):
         """
         Clone and index (create and insert Codeletes for) a Git repository.

-        `git clone` the Git repository located at **repo_url**, call
-        _insert_repository_codelets, then remove said repository.
+        `git clone` the Git repository located at **repo.url**, call
+        `_insert_repository_codelets()`, then remove said repository.

-        :param repo_url: The url the Git repository was cloned from.
-        :param repo_name: The name of the repository.
-        :param framework_name: The name of the framework the repository is from.
+        :param repo: The metadata of the repository to be indexed.

-        :type repo_url: str
-        :type repo_name: str
-        :type framework_name: str
+        :type repo: :class:`GitRepository`
         """

-        with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo_name)) as repository_dir:
-            try:
-                self._insert_repository_codelets(repo_url, repo_name,
-                        framework_name)
-            except Exception as exception:
-                pass
+        with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.name)) as repository_dir:
+            # try:
+            self._insert_repository_codelets(repo)
+            # except Exception as excep:
+            #     self._logger.warning("%s: %s.", excep.__class__.__name__, excep)

-        if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)):
-            shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
+        if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
+            shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))

-    def _insert_repository_codelets(self, repo_url, repo_name, framework_name):
+    def _insert_repository_codelets(self, repo):
         """
         Create and insert a Codelet for the files inside a Git repository.

-        Create a new Codelet, and insert it into the Database singleton, for every
-        file inside the current working directory's default branch (usually
-        *master*).
+        Create a new Codelet, and insert it into the Database singleton, for
+        every file inside the current working directory's default branch
+        (usually *master*).

-        :param repo_url: The url the Git repository was cloned from.
-        :param repo_name: The name of the repository.
-        :param framework_name: The name of the framework the repository is from.
+        :param repo: The metadata of the repository to be indexed.

-        :type repo_url: str
-        :type repo_name: str
-        :type framework_name: str
+        :type repo: :class:`GitRepository`
         """

         commits_meta = _get_commits_metadata()
@@ -142,7 +146,6 @@ class GitIndexer(threading.Thread):

         for filename in commits_meta.keys():
             try:
-                source = ""
                 with open(filename) as source_file:
                     source = _decode(source_file.read())
                     if source is None:
@@ -152,13 +155,14 @@ class GitIndexer(threading.Thread):
             authors = [(_decode(author),) for author in \
                     commits_meta[filename]["authors"]]

-            codelet = Codelet("%s:%s" % (repo_name, filename), source, filename,
-                    None, authors, _generate_file_url(filename, repo_url,
-                        framework_name),
+            codelet = Codelet("%s:%s" % (repo.name, filename), source, filename,
+                    None, authors, _generate_file_url(filename,
+                        repo.url, repo.framework_name),
                     commits_meta[filename]["time_created"],
-                    commits_meta[filename]["time_last_modified"])
+                    commits_meta[filename]["time_last_modified"],
+                    repo.rank)

-            # Database.insert(codelet)
+            db.codelets.insert(codelet.__dict__) #debug

 class _GitCloner(threading.Thread):
     """
@@ -171,6 +175,7 @@ class _GitCloner(threading.Thread):
         :attr:`crawler.GitHubCrawler.clone_queue`.
     :ivar index_queue: (:class:`Queue.Queue`) see
         :attr:`GitIndexer.index_queue`.
+    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
     """

     def __init__(self, clone_queue, index_queue):
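
The db.codelets.insert(codelet.__dict__) line above is a temporary debug path: rather than going through the commented-out Database singleton, each Codelet's attribute dictionary is written straight into a local MongoDB collection. A rough sketch of that round trip, assuming a mongod on localhost and the older pymongo insert() API; the field names are illustrative, not the actual Codelet attributes:

import pymongo

db = pymongo.MongoClient().bitshift   # same connection the #debug lines use

db.codelets.insert({"name": "example-repo:some/file.py", "rank": 0.25})
print db.codelets.find_one({"name": "example-repo:some/file.py"})
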
""" def __init__(self, clone_queue, index_queue): @@ -186,6 +191,9 @@ class _GitCloner(threading.Thread): self.clone_queue = clone_queue self.index_queue = index_queue + self._logger = logging.getLogger("%s.%s" % + (__name__, self.__class__.__name__)) + self._logger.info("Starting.") super(_GitCloner, self).__init__(name=self.__class__.__name__) def run(self): @@ -339,11 +347,11 @@ def _get_git_commits(): sample_returned_array = [ { "author" : (str) "author" - "timestamp" : (int) 1396919293, + "timestamp" : (`datetime.datetime`) , "filenames" : (str array) ["file1", "file2"] } ] - :rtype: dictionary + :rtype: array of dictionaries """ git_log = subprocess.check_output(("git --no-pager log --name-only" @@ -355,7 +363,7 @@ def _get_git_commits(): if len(fields) > 2: commits.append({ "author" : fields[0], - "timestamp" : int(fields[1]), + "timestamp" : datetime.datetime.fromtimestamp(int(fields[1])), "filenames" : fields[2].split("\x00")[:-2] }) @@ -374,28 +382,14 @@ def _get_tracked_files(): :rtype: str array """ - GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"] - GIT_IGNORE_EXTENSIONS = ["t[e]?xt(ile)?", "m(ark)?down", "mkd[n]?", - "md(wn|t[e]?xt)?", "rst"] - files = [] for dirname, subdir_names, filenames in os.walk("."): for filename in filenames: path = os.path.join(dirname, filename) if _is_ascii(path): - files.append(path) - - valuable_files = [] - for filename in files: - filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE) - for pattern in GIT_IGNORE_FILES]) - extension = filename.split(".")[-1] - extension_match = any([re.match(pattern, filename, flags=re.IGNORECASE) - for pattern in GIT_IGNORE_EXTENSIONS]) + files.append(path[2:]) - if not (filename_match or extension_match): - valuable_files.append(filename[2:]) - return valuable_files + return files def _get_commits_metadata(): """ @@ -407,11 +401,11 @@ def _get_commits_metadata(): sample_returned_dict = { "my_file" : { "authors" : (str array) ["author1", "author2"], - "time_created" : (int) 1395939566, - "time_last_modified" : (int) 1396920409 + "time_created" : (`datetime.datetime`) , + "time_last_modified" : (`datetime.datetime`) } } - :rtype: dictionary + :rtype: dictionary of dictionaries """ commits = _get_git_commits()