WARNING: pip is being invoked by an old script wrapper. This will fail in a future version of pip.
Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.

!apt-get update -qq
!apt-get install chromium-chromedriver -y -qq

_img_sizes = {'>400*300':'isz:lt,islt:qsvga','>640*480':'isz:lt,islt:vga','>800*600':'isz:lt,islt:svga',
              '>1024*768':'visz:lt,islt:xga', '>2MP':'isz:lt,islt:2mp','>4MP':'isz:lt,islt:4mp','>6MP':'isz:lt,islt:6mp',
              '>8MP':'isz:lt,islt:8mp', '>10MP':'isz:lt,islt:10mp','>12MP':'isz:lt,islt:12mp','>15MP':'isz:lt,islt:15mp',
              '>20MP':'isz:lt,islt:20mp','>40MP':'isz:lt,islt:40mp','>70MP':'isz:lt,islt:70mp'}


class ImageDownloader:
    def __init__(self, data_path, dataset_name):
        """The ImageDownloader helps download images from the google image search page"""
        self._path = Path(data_path)
        self._dataset_path = self._path/dataset_name
        self._dataset_name = dataset_name
        
        os.makedirs(self._path, exist_ok=True)
        os.makedirs(self._dataset_path, exist_ok=True)
        
    def add_images_to_class(self, class_name, google_query, n_images=1000):
        """Add new images to the image class with a Google search query."""
        class_path = self._dataset_path/class_name
        url = _search_url(google_query)
        html = self.get_google_image_html(url)
        img_urls = self.get_img_urls_from_html(html)
        print(f'{len(img_urls)} image links found on Google image search for the query "{google_query}".')
        img_fnames = _download_images(class_path, img_urls)
        print(f'{len(img_fnames)} images now available in class {class_name}.')
        
        
    def get_google_image_html(self, url):
        """Get the html code of the Google Image Search."""
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')        

        try: 
            driver = webdriver.Chrome(options=options)
        except: 
            print("""Error initializing chromedriver. 
                  Check if it's in your path by running `which chromedriver`""")
        driver.set_window_size(1440, 900)
        driver.get(url)
        old_height = 0
        for i in range(10):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1.0 + random.random())
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == old_height:
                try:
                    button = driver.find_elements_by_xpath("//input[@type='button' and @value='Show more results']")[0]
                    button.click()
                except:
                    pass    
            old_height = new_height
        return driver.page_source
    
    def get_img_urls_from_html(self, html):
        """Extract image urls from html code."""
        bs = BeautifulSoup(html, 'html.parser')
        img_tags = bs.find_all('img')
        urls = []
        for tag in img_tags:
            if tag.has_attr('data-src'):
                urls.append(tag['data-src'])
        return urls

def _download_images(label_path:PathOrStr, img_urls:list, max_workers:int=defaults.cpus, timeout:int=4) -> FilePathList:
    """
    Downloads images in `img_tuples` to `label_path`. 
    If the directory doesn't exist, it'll be created automatically.
    Uses `parallel` to speed things up in `max_workers` when the system has enough CPU cores.
    If something doesn't work, try setting up `max_workers=0` to debug.
    """
    os.makedirs(Path(label_path), exist_ok=True)
    parallel( partial(_download_single_image, label_path, timeout=timeout), img_urls, max_workers=max_workers)
    return get_image_files(label_path)

def _download_single_image(label_path:Path, img_url:tuple, i:int, timeout:int=4) -> None:
    """
    Downloads a single image from Google Search results to `label_path`
    given an `img_tuple` that contains `(fname, url)` of an image to download.
    `i` is just an iteration number `int`. 
    """
    fname = img_url[-30:]+'.png'
    try:
        download_url(img_url, label_path/fname, timeout=timeout)
    except:
        print(f"Could not download image {img_url}")
    
def _search_url(search_term:str, size:str='>400*300', format:str='jpg') -> str:
    "Return a Google Images Search URL for a given search term."
    return ('https://www.google.com/search?q=' + quote(search_term) +
            '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch' +
            _url_params(size, format) + '&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg')

def _url_params(size:str='>400*300', format:str='jpg') -> str:
    "Build Google Images Search Url params and return them as a string."
    _fmts = {'jpg':'ift:jpg','gif':'ift:gif','png':'ift:png','bmp':'ift:bmp', 'svg':'ift:svg','webp':'webp','ico':'ift:ico'}
    if size not in _img_sizes: 
        raise RuntimeError(f"""Unexpected size argument value: {size}.
                    See `widgets.image_downloader._img_sizes` for supported sizes.""") 
    if format not in _fmts: 
        raise RuntimeError(f"Unexpected image file format: {format}. Use jpg, gif, png, bmp, svg, webp, or ico.")
    return "&tbs=" + _img_sizes[size] + "," + _fmts[format]

Example: Cats vs dogs

In this example a new dataset is created and images for cats and dogs are downloaded.

data_path = Path('../data/')
dataset_name = 'cats_vs_dogs'

img_dl = ImageDownloader(data_path, dataset_name)

class_name = 'cat'
search_query = 'cat'

img_dl.add_images_to_class(class_name, search_query)

562 image links found on Google image search for the query "cat".

567 images now available in class cat.

search_query = 'cat funny'

img_dl.add_images_to_class(class_name, search_query)

525 image links found on Google image search for the query "cat funny".

1091 images now available in class cat.

class_name = 'dog'
search_query = 'dog'
img_dl.add_images_to_class(class_name, search_query)

490 image links found on Google image search for the query "dog".

487 images now available in class dog.

class ImageDownloader[source]

Example: Cats vs dogs

`class` `ImageDownloader`[source]