BadZipFile: File is not a zip file

19,451

in both zip archive open statements:

with zipfile.ZipFile(os.path.join(data_dir,folder),mode='r')

and

with zipfile.ZipFile(os.path.join(temp_dir,sub_folder),mode='r')

nothing (at least nothing that we can check) guarantees that the file names you're passing are actually .zip files. It could be a directory, an already extracted file, some file that was already there...

I suggest that you check the file extension prior to extracting, for instance:

import fnmatch
zfn = os.path.join(temp_dir,sub_folder)
if fnmatch.fnmatch(zfn,"*.zip"):
    with zipfile.ZipFile(zfn,mode='r') as whatever:

Some .zip files could be corrupt, but that's less likely. Also, if you wanted to extract .jar and other zip-structured files with a different extension, replace the fnmatch by

if zfn.lower().endswith(('.zip','.jar','.docx')):
Share:
19,451
BuddyCool
Author by

BuddyCool

Updated on June 09, 2022

Comments

  • BuddyCool
    BuddyCool almost 2 years

    This is my code. I get the error when I try to execute this script

     Error    raise BadZipFile("File is not a zip file")  
              BadZipFile: File is not a zip file
    

    This is my source directorypath

    data_dir = r'L:\DataQA\Python Unzip Files\Source Zipped'

    I have multiple zipped folders inside the (uncompressed) ‘Source Zipped’ folder. The same code works when I zip all the subfolders of ‘Source Zipped’ into a single zipped folder, but I don’t want to use that approach.

    import os
    import zipfile
    import shutil
    import json
    import logging
    import logging.config
    import time
    
    def my_start_time():
        """Start the job timer and log the start timestamp.

        Sets three module-level globals:
        start_time       -- epoch seconds at the moment of the call
        start_time_stamp -- the same instant in local time, 'YYYYMMDD HH:MM:SS'
        cumulative_time  -- elapsed seconds accounted for so far (reset to zero)
        """
        global start_time, cumulative_time, start_time_stamp
        start_time = time.time()
        # The first six struct_time fields are year, month, day, hour, minute, second.
        local_fields = time.localtime(start_time)[:6]
        start_time_stamp = '{:4d}{:02d}{:02d} {:02d}:{:02d}:{:02d}'.format(*local_fields)
        # Nothing has elapsed yet.
        cumulative_time = 0.0
        logging.info('Initial Setup: {:s}'.format(start_time_stamp))
    
    def my_time():
        """Log the elapsed time since the timer was started.

        Reads the globals start_time and start_time_stamp, and updates the
        global cumulative_time to the total seconds elapsed since start_time.
        The logged "Incremental" figure is the time since the previous call
        (or since the timer was started, on the first call).
        """
        global cumulative_time
        elapsed = time.time() - start_time
        since_last_call = elapsed - cumulative_time
        cumulative_time = elapsed
        message = "Started: %s  Complete:  Cumulative: %.4f s  Incremental: %.4f s\n" % (
            start_time_stamp,
            elapsed,
            since_last_call,
        )
        logging.info(message)
    
    # Send every record (DEBUG and above) to a log file in the working directory.
    # This must run before my_start_time(), which already writes a log line.
    logging.basicConfig(
        level=logging.DEBUG,
        filename='myunzip_task_log.txt',
    )
    my_start_time()

    logging.info('Initial Setup...')
    
    def write_to_json(data, file):
        """Write *data* to the path *file* as JSON with sorted keys.

        The previous version passed the output of json.dumps() to json.dump(),
        which encoded the data twice: the file ended up holding one quoted,
        escaped JSON *string* instead of the object itself. The data is now
        serialized exactly once, so json.load() on the file returns *data*.

        Parameters:
            data -- any JSON-serializable object
            file -- path of the file to create or overwrite

        Returns:
            True once the data has been written.

        Raises:
            OSError if the file cannot be opened or written; TypeError if
            *data* is not JSON-serializable.
        """
        # The with-statement closes the file; no explicit close() is needed.
        with open(file, 'w') as f:
            json.dump(data, f, sort_keys=True)
        return True
    
    
    # Directories used by the two-level unzip job:
    #   data_dir  -- holds the outer archives to process
    #   temp_dir  -- scratch area for the contents of one outer archive
    #   new_dir   -- scratch area for the contents of one nested archive
    #   final_dir -- receives the numbered copies of the two XML files
    data_dir = r'L:\DataQA\Python Unzip Files\Source Zipped'
    temp_dir =  r'L:\DataQA\Python Unzip Files\temp1'
    new_dir = r'L:\DataQA\Python Unzip Files\temp2'
    final_dir = r'L:\DataQA\Python Unzip Files\Destination Unzipped files'

    big_list = os.listdir(data_dir)

    # archive_count counts outer archives actually unzipped; it starts at zero
    # so that the "cumulative total" log line below is accurate.
    archive_count = 0
    # Number given to the next pair of output files.
    file_count = 152865
    basename1 = os.path.join(final_dir,'GENERIC_ROUGHDRAFT')
    basename2 = os.path.join(final_dir,'XACTDOC')

    my_time()
    logging.info('Unzipping {} archives...'.format(len(big_list)))
    for folder in big_list:
        archive_path = os.path.join(data_dir, folder)
        # os.listdir() returns every entry, including sub-directories and
        # non-zip files. Opening one of those with ZipFile raises
        # "BadZipFile: File is not a zip file", so skip them.
        if not zipfile.is_zipfile(archive_path):
            logging.warning('Skipping {}: not a zip file'.format(folder))
            continue

        prior_count = file_count
        logging.info('Starting: {}'.format(folder))

        # Start every outer archive with an empty scratch directory.
        try:
            shutil.rmtree(temp_dir)
        except FileNotFoundError:
            pass
        os.mkdir(temp_dir)

        with zipfile.ZipFile(archive_path, mode='r') as a_zip:
            a_zip.extractall(path = temp_dir)
        archive_count += 1
        logging.info('Cumulative total of {} archive(s) unzipped'.format(archive_count))

        bigger_list = os.listdir(temp_dir)
        logging.info('Current archive contains {} subfolders'.format(len(bigger_list)))
        for sub_folder in bigger_list:
            nested_path = os.path.join(temp_dir, sub_folder)
            # The outer archive may also contain plain files or folders.
            if not zipfile.is_zipfile(nested_path):
                logging.warning('Skipping {}: not a zip file'.format(sub_folder))
                continue
            # NOTE(review): new_dir is not emptied between nested archives, so
            # if one lacks either XML file, the copy left by a previous archive
            # would be picked up instead -- confirm whether that can happen.
            with zipfile.ZipFile(nested_path, mode='r') as b_zip:
                b_zip.extractall(path = new_dir)
            file1 = "%s (%d).%s" % (basename1, file_count, 'xml')
            file2 = "%s (%d).%s" % (basename2, file_count, 'xml')
            shutil.copy(os.path.join(new_dir, 'GENERIC_ROUGHDRAFT.xml'), file1)
            shutil.copy(os.path.join(new_dir, 'XACTDOC.xml'), file2)
            file_count += 1
        logging.info('{} subfolders unzipped'.format(file_count - prior_count))

        # Remove only the archive that was just processed. The earlier code
        # deleted and recreated the whole of data_dir here, inside the loop,
        # which destroyed every archive that had not been processed yet.
        # The archive is closed by now, so Windows allows it to be deleted.
        os.remove(archive_path)
        my_time()
    logging.info('Total of {0} files -- {1} pairs -- should be in {2}'.format(2*(file_count-1), file_count-1, final_dir))

    time.sleep(1)

    my_time()
    

    File Explorer Capture