
Re-add logging, remove file filters.

Add:
    bitshift/
        __init__.py
            -add `_configure_logging()`, which sets up a more robust logging
            infrastructure than was previously used: log files are rotated once
            per hour, and have some additional formatting rules (see the sketch
            after this list).

        (crawler, indexer).py
            -add hierarchically-descending loggers to individual threaded
            classes (`GitHubCrawler`, `GitIndexer`, etc.); add logging calls.

        indexer.py
            -remove the file-filtering regex matches from `_get_tracked_files()`,
            as non-code files will be discarded by the parsers.
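
A minimal sketch (not the committed code) of the logging scheme described
above, using illustrative names: an hourly-rotating handler attached to the
root logger, plus per-class child loggers whose dotted names descend from the
crawler modules and therefore propagate records up to the root handler.

    import logging, logging.handlers, os

    LOG_FILE_DIR = "log"

    def configure_logging():
        # Rotate log/app.log once per hour, keeping 20 old files.
        if not os.path.exists(LOG_FILE_DIR):
            os.mkdir(LOG_FILE_DIR)
        handler = logging.handlers.TimedRotatingFileHandler(
                os.path.join(LOG_FILE_DIR, "app.log"), when="H", interval=1,
                backupCount=20)
        handler.setFormatter(logging.Formatter(
                fmt="%(asctime)s %(levelname)s %(name)s %(funcName)s %(message)s",
                datefmt="%y-%m-%d %H:%M:%S"))
        root = logging.getLogger()
        root.addHandler(handler)
        root.setLevel(logging.NOTSET)  # root filters nothing; handlers decide

    class GitHubCrawler(object):
        def __init__(self):
            # Hierarchically-descending logger name, e.g.
            # "bitshift.crawler.crawler.GitHubCrawler".
            self._logger = logging.getLogger("%s.%s" %
                    (__name__, self.__class__.__name__))
            self._logger.info("Starting.")
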
tags/v1.0^2
Severyn Kozak, parent commit 6762c1fa3d
3 changed files with 114 additions and 73 deletions:
  1. bitshift/crawler/__init__.py  (+24 -1)
  2. bitshift/crawler/crawler.py   (+35 -11)
  3. bitshift/crawler/indexer.py   (+55 -61)

bitshift/crawler/__init__.py  (+24 -1)

@@ -4,7 +4,7 @@
 Contains functions for initializing all subsidiary, threaded crawlers.
 """
 
-import os, Queue
+import logging, logging.handlers, os, Queue
 
 from bitshift.crawler import crawler, indexer
 
@@ -20,6 +20,8 @@ def crawl():
         3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`.
     """
 
+    _configure_logging()
+
     MAX_URL_QUEUE_SIZE = 5e3
 
     repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
@@ -29,3 +31,24 @@ def crawl():
 
     for thread in threads:
         thread.start()
+
+def _configure_logging():
+    LOG_FILE_DIR = "log"
+
+    if not os.path.exists(LOG_FILE_DIR):
+        os.mkdir(LOG_FILE_DIR)
+
+    logging.getLogger("requests").setLevel(logging.WARNING)
+
+    formatter = logging.Formatter(
+            fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s"
+            " %(message)s"), datefmt="%y-%m-%d %H:%M:%S")
+
+    handler = logging.handlers.TimedRotatingFileHandler(
+            "%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1,
+            backupCount=20)
+    handler.setFormatter(formatter)
+
+    root_logger = logging.getLogger()
+    root_logger.addHandler(handler)
+    root_logger.setLevel(logging.NOTSET)

bitshift/crawler/crawler.py  (+35 -11)

@@ -4,7 +4,7 @@
 Contains all website/framework-specific Class crawlers.
 """
 
-import requests, time, threading
+import logging, requests, time, threading
 
 from bitshift.crawler import indexer
 
@@ -22,6 +22,7 @@ class GitHubCrawler(threading.Thread):
     :ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository`
         with repository metadata retrieved by :class:`GitHubCrawler`, and other Git
         crawlers, to be processed by :class:`indexer.GitIndexer`.
+    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
     """
 
     AUTHENTICATION = {
@@ -39,6 +40,9 @@ class GitHubCrawler(threading.Thread):
         """
 
         self.clone_queue = clone_queue
+        self._logger = logging.getLogger("%s.%s" %
+                (__name__, self.__class__.__name__))
+        self._logger.info("Starting.")
         super(GitHubCrawler, self).__init__(name=self.__class__.__name__)
 
     def run(self):
@@ -61,11 +65,17 @@ class GitHubCrawler(threading.Thread):
            try:
                response = requests.get(next_api_url,
                        params=self.AUTHENTICATION)
-            except ConnectionError as exception:
+            except ConnectionError as excep:
+                self._logger.warning("API %s call failed: %s: %s",
+                        next_api_url, excep.__class__.__name__, excep)
+                time.sleep(0.5)
                continue
 
            queue_percent_full = (float(self.clone_queue.qsize()) /
                    self.clone_queue.maxsize) * 100
+            self._logger.info("API call made. Queue size: %d/%d, %d%%." %
+                    ((self.clone_queue.qsize(), self.clone_queue.maxsize,
+                    queue_percent_full)))
 
            for repo in response.json():
                while self.clone_queue.full():
@@ -73,15 +83,15 @@ class GitHubCrawler(threading.Thread):
 
                self.clone_queue.put(indexer.GitRepository(
                        repo["html_url"], repo["full_name"].replace("/", ""),
-                        "GitHub"))
+                        "GitHub",
+                        #self._get_repo_stars(repo["full_name"]))
+                        0))
 
            if int(response.headers["x-ratelimit-remaining"]) == 0:
                time.sleep(int(response.headers["x-ratelimit-reset"]) -
                        time.time())
 
            next_api_url = response.headers["link"].split(">")[0][1:]
-            with open(".github_api.log", "w") as log_file:
-                log_file.write("%s\n" % next_api_url)
 
            sleep_time = api_request_interval - (time.time() - start_time)
            if sleep_time > 0:
@@ -105,7 +115,6 @@ class GitHubCrawler(threading.Thread):
 
        API_URL = "https://api.github.com/search/repositories"
 
-
        params = self.AUTHENTICATION
        params["q"] = "repo:%s" % repo_name
 
@@ -116,9 +125,18 @@ class GitHubCrawler(threading.Thread):
        })
 
        if int(resp.headers["x-ratelimit-remaining"]) == 0:
-            time.sleep(int(resp.headers["x-ratelimit-reset"]) - time.time())
+            sleep_time = int(resp.headers["x-ratelimit-reset"]) - time.time()
+            if sleep_time > 0:
+                logging.info("API quota exceeded. Sleep time: %d." % sleep_time)
+                time.sleep(sleep_time)
 
-        return int(resp.json()["items"][0]["stargazers_count"])
+        if "items" not in resp.json() or len(resp.json()["items"]) == 0:
+            self._logger.critical("No API result: %s. Result: %s" % (resp.url,
+                    str(resp.json())))
+            return 0
+        else:
+            rank = float(resp.json()["items"][0]["stargazers_count"]) / 1000
+            return rank if rank < 1.0 else 1.0
 
 class BitbucketCrawler(threading.Thread):
    """
@@ -131,6 +149,7 @@ class BitbucketCrawler(threading.Thread):
 
    :ivar clone_queue: (:class:`Queue.Queue`) The shared queue to insert
        :class:`indexer.GitRepository` repository urls into.
+    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """
 
    def __init__(self, clone_queue):
@@ -143,6 +162,9 @@ class BitbucketCrawler(threading.Thread):
        """
 
        self.clone_queue = clone_queue
+        self._logger = logging.getLogger("%s.%s" %
+                (__name__, self.__class__.__name__))
+        self._logger.info("Starting.")
        super(BitbucketCrawler, self).__init__(name=self.__class__.__name__)
 
    def run(self):
@@ -162,10 +184,15 @@ class BitbucketCrawler(threading.Thread):
                response = requests.get(next_api_url).json()
            except ConnectionError as exception:
                time.sleep(0.5)
+                self._logger.warning("API %s call failed: %s: %s",
+                        next_api_url, exception.__class__.__name__, exception)
                continue
 
            queue_percent_full = (float(self.clone_queue.qsize()) /
                    self.clone_queue.maxsize) * 100
+            self._logger.info("API call made. Queue size: %d/%d, %d%%." %
+                    ((self.clone_queue.qsize(), self.clone_queue.maxsize,
+                    queue_percent_full)))
 
            for repo in response["values"]:
                if repo["scm"] == "git":
@@ -181,7 +208,4 @@ class BitbucketCrawler(threading.Thread):
                            clone_url, repo["full_name"], "Bitbucket"))
 
                next_api_url = response["next"]
-                with open(".bitbucket_api.log", "w") as log_file:
-                    log_file.write("%s\n" % next_api_url)
-
                time.sleep(0.2)

bitshift/crawler/indexer.py  (+55 -61)

@@ -3,7 +3,8 @@
 repositories.
 """
 
-import bs4, os, Queue, re, shutil, string, subprocess, time, threading
+import bs4, datetime, logging, os, Queue, re, shutil, string, subprocess, time,\
+        threading
 
 from ..database import Database
 from ..codelet import Codelet
@@ -11,6 +12,9 @@ from ..codelet import Codelet
 GIT_CLONE_DIR = "/tmp/bitshift"
 THREAD_QUEUE_SLEEP = 0.5
 
+import pymongo #debug
+db = pymongo.MongoClient().bitshift #debug
+
 class GitRepository(object):
    """
    A representation of a Git repository's metadata.
@@ -19,24 +23,29 @@ class GitRepository(object):
    :ivar name: (str) The name of the repository.
    :ivar framework_name: (str) The name of the online Git framework that the
        repository belongs to (eg, GitHub, BitBucket).
+    :ivar rank: (float) The rank of the repository, as assigned by
+        :class:`crawler.GitHubCrawler`.
    """
 
-    def __init__(self, url, name, framework_name):
+    def __init__(self, url, name, framework_name, rank):
        """
        Create a GitRepository instance.
 
        :param url: see :attr:`GitRepository.url`
        :param name: see :attr:`GitRepository.name`
        :param framework_name: see :attr:`GitRepository.framework_name`
+        :param rank: see :attr:`GitRepository.rank`
 
        :type url: str
        :type name: str
        :type framework_name: str
+        :type rank: float
        """
 
        self.url = url
        self.name = name
        self.framework_name = framework_name
+        self.rank = rank
 
 class GitIndexer(threading.Thread):
    """
@@ -50,6 +59,7 @@ class GitIndexer(threading.Thread):
        cloned by :class:`_GitCloner`, which are to be indexed.
    :ivar git_cloner: (:class:`_GitCloner`) The corresponding repository cloner,
        which feeds :class:`GitIndexer`.
+    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """
 
    def __init__(self, clone_queue):
@@ -66,6 +76,9 @@ class GitIndexer(threading.Thread):
        self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE)
        self.git_cloner = _GitCloner(clone_queue, self.index_queue)
        self.git_cloner.start()
+        self._logger = logging.getLogger("%s.%s" %
+                (__name__, self.__class__.__name__))
+        self._logger.info("Starting.")
 
        if not os.path.exists(GIT_CLONE_DIR):
            os.makedirs(GIT_CLONE_DIR)
@@ -88,52 +101,43 @@ class GitIndexer(threading.Thread):
 
            repo = self.index_queue.get()
            self.index_queue.task_done()
-            try:
-                self._index_repository(repo.url, repo.name, repo.framework_name)
-            except Exception as exception:
-                pass
+            # try:
+            self._index_repository(repo)
+            # except Exception as excep:
+            #     self._logger.warning("%s: %s.", excep.__class__.__name__, excep)
 
-    def _index_repository(self, repo_url, repo_name, framework_name):
+    def _index_repository(self, repo):
        """
        Clone and index (create and insert Codelets for) a Git repository.
 
-        `git clone` the Git repository located at **repo_url**, call
-        _insert_repository_codelets, then remove said repository.
+        `git clone` the Git repository located at **repo.url**, call
+        `_insert_repository_codelets()`, then remove said repository.
 
-        :param repo_url: The url the Git repository was cloned from.
-        :param repo_name: The name of the repository.
-        :param framework_name: The name of the framework the repository is from.
+        :param repo: The metadata of the repository to be indexed.
 
-        :type repo_url: str
-        :type repo_name: str
-        :type framework_name: str
+        :type repo: :class:`GitRepository`
        """
 
-        with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo_name)) as repository_dir:
-            try:
-                self._insert_repository_codelets(repo_url, repo_name,
-                        framework_name)
-            except Exception as exception:
-                pass
+        with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.name)) as repository_dir:
+            # try:
+            self._insert_repository_codelets(repo)
+            # except Exception as excep:
+            #     self._logger.warning("%s: %s.", excep.__class__.__name__, excep)
 
-        if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)):
-            shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
+        if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
+            shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))
 
-    def _insert_repository_codelets(self, repo_url, repo_name, framework_name):
+    def _insert_repository_codelets(self, repo):
        """
        Create and insert a Codelet for the files inside a Git repository.
 
-        Create a new Codelet, and insert it into the Database singleton, for every
-        file inside the current working directory's default branch (usually
-        *master*).
+        Create a new Codelet, and insert it into the Database singleton, for
+        every file inside the current working directory's default branch
+        (usually *master*).
 
-        :param repo_url: The url the Git repository was cloned from.
-        :param repo_name: The name of the repository.
-        :param framework_name: The name of the framework the repository is from.
+        :param repo: The metadata of the repository to be indexed.
 
-        :type repo_url: str
-        :type repo_name: str
-        :type framework_name: str
+        :type repo: :class:`GitRepository`
        """
 
        commits_meta = _get_commits_metadata()
@@ -142,7 +146,6 @@ class GitIndexer(threading.Thread):
 
        for filename in commits_meta.keys():
            try:
-                source = ""
                with open(filename) as source_file:
                    source = _decode(source_file.read())
                    if source is None:
@@ -152,13 +155,14 @@ class GitIndexer(threading.Thread):
 
            authors = [(_decode(author),) for author in \
                    commits_meta[filename]["authors"]]
-            codelet = Codelet("%s:%s" % (repo_name, filename), source, filename,
-                    None, authors, _generate_file_url(filename, repo_url,
-                    framework_name),
+            codelet = Codelet("%s:%s" % (repo.name, filename), source, filename,
+                    None, authors, _generate_file_url(filename,
+                    repo.url, repo.framework_name),
                    commits_meta[filename]["time_created"],
-                    commits_meta[filename]["time_last_modified"])
+                    commits_meta[filename]["time_last_modified"],
+                    repo.rank)
 
-            # Database.insert(codelet)
+            db.codelets.insert(codelet.__dict__) #debug
 
 class _GitCloner(threading.Thread):
    """
@@ -171,6 +175,7 @@ class _GitCloner(threading.Thread):
        :attr:`crawler.GitHubCrawler.clone_queue`.
    :ivar index_queue: (:class:`Queue.Queue`) see
        :attr:`GitIndexer.index_queue`.
+    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """
 
    def __init__(self, clone_queue, index_queue):
@@ -186,6 +191,9 @@ class _GitCloner(threading.Thread):
 
        self.clone_queue = clone_queue
        self.index_queue = index_queue
+        self._logger = logging.getLogger("%s.%s" %
+                (__name__, self.__class__.__name__))
+        self._logger.info("Starting.")
        super(_GitCloner, self).__init__(name=self.__class__.__name__)
 
    def run(self):
@@ -339,11 +347,11 @@ def _get_git_commits():
        sample_returned_array = [
            {
                "author" : (str) "author"
-                "timestamp" : (int) 1396919293,
+                "timestamp" : (`datetime.datetime`) <object>,
                "filenames" : (str array) ["file1", "file2"]
            }
        ]
-    :rtype: dictionary
+    :rtype: array of dictionaries
    """
 
    git_log = subprocess.check_output(("git --no-pager log --name-only"
@@ -355,7 +363,7 @@ def _get_git_commits():
        if len(fields) > 2:
            commits.append({
                "author" : fields[0],
-                "timestamp" : int(fields[1]),
+                "timestamp" : datetime.datetime.fromtimestamp(int(fields[1])),
                "filenames" : fields[2].split("\x00")[:-2]
            })
 
@@ -374,28 +382,14 @@ def _get_tracked_files():
    :rtype: str array
    """
 
-    GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"]
-    GIT_IGNORE_EXTENSIONS = ["t[e]?xt(ile)?", "m(ark)?down", "mkd[n]?",
-            "md(wn|t[e]?xt)?", "rst"]
-
    files = []
    for dirname, subdir_names, filenames in os.walk("."):
        for filename in filenames:
            path = os.path.join(dirname, filename)
            if _is_ascii(path):
-                files.append(path)
-
-    valuable_files = []
-    for filename in files:
-        filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE)
-                for pattern in GIT_IGNORE_FILES])
-        extension = filename.split(".")[-1]
-        extension_match = any([re.match(pattern, filename, flags=re.IGNORECASE)
-                for pattern in GIT_IGNORE_EXTENSIONS])
+                files.append(path[2:])
 
-        if not (filename_match or extension_match):
-            valuable_files.append(filename[2:])
-    return valuable_files
+    return files
 
 def _get_commits_metadata():
    """
@@ -407,11 +401,11 @@ def _get_commits_metadata():
        sample_returned_dict = {
            "my_file" : {
                "authors" : (str array) ["author1", "author2"],
-                "time_created" : (int) 1395939566,
-                "time_last_modified" : (int) 1396920409
+                "time_created" : (`datetime.datetime`) <object>,
+                "time_last_modified" : (`datetime.datetime`) <object>
            }
        }
-    :rtype: dictionary
+    :rtype: dictionary of dictionaries
    """
 
    commits = _get_git_commits()

