BadZipFile: File is not a zip file
in both zip archive open statements:
with zipfile.ZipFile(os.path.join(data_dir,folder),mode='r')
and
with zipfile.ZipFile(os.path.join(temp_dir,sub_folder),mode='r')
nothing (at least nothing that we can check) guarantees that the file names you're passing are actually .zip
files. It could be a directory, an already extracted file, some file that was already there...
I suggest that you check the file extension prior to extracting, for instance:
import fnmatch
zfn = os.path.join(temp_dir,sub_folder)
if fnmatch.fnmatch(zfn,"*.zip"):
with zipfile.ZipFile(zfn,mode='r') as whatever:
Some .zip files could be corrupt, but that's less likely. Also, if you wanted to extract .jar
and other zip-structured files with a different extension, replace the fnmatch
by
if zfn.lower().endswith(('.zip','.jar','.docx')):
BuddyCool
Updated on June 09, 2022Comments
-
BuddyCool almost 2 years
This is my code. I get the error when I try to execute this script
Error raise BadZipFile("File is not a zip file") BadZipFile: File is not a zip file
This is my source directorypath
data_dir = r'L:\DataQA\Python Unzip Files\Source Zipped'
I have multiple zipped folders within ‘Source Zipped’(uncompressed) folder. The same code works when I zip all the subfolder of source Zipped into single zipped folder. But I don’t want this approach.
import os import zipfile import shutil import json import logging import logging.config import time def my_start_time(): global start_time, cumulative_time, start_time_stamp start_time = time.time() this_time = time.localtime(start_time) start_time_stamp = '{:4d}{:02d}{:02d} {:02d}:{:02d}:{:02d}'.format(\ this_time.tm_year, this_time.tm_mon, this_time.tm_mday,\ this_time.tm_hour, this_time.tm_min, this_time.tm_sec) cumulative_time = start_time - start_time logging.info('Initial Setup: {:s}'.format(start_time_stamp)) def my_time(): global cumulative_time time_taken = time.time() - start_time incremental_time = time_taken - cumulative_time cumulative_time = time_taken logging.info("Started: %s Complete: Cumulative: %.4f s Incremental: %.4f s\n" \ % (start_time_stamp, cumulative_time, incremental_time) ) logging.basicConfig(filename='myunzip_task_log.txt',level=logging.DEBUG) my_start_time() logging.info('Initial Setup...') def write_to_json(data, file): value = False with open(file, 'w') as f: json.dump(json.dumps(data, sort_keys=True),f) f.close() value = True return value data_dir = r'L:\DataQA\Python Unzip Files\Source Zipped' temp_dir = r'L:\DataQA\Python Unzip Files\temp1' new_dir = r'L:\DataQA\Python Unzip Files\temp2' final_dir = r'L:\DataQA\Python Unzip Files\Destination Unzipped files' big_list = os.listdir(data_dir) archive_count = 0 file_count = 152865 basename1 = os.path.join(final_dir,'GENERIC_ROUGHDRAFT') basename2 = os.path.join(final_dir,'XACTDOC') my_time() archive_count = len(big_list) logging.info('Unzipping {} archives...'.format(archive_count)) for folder in big_list: prior_count = file_count logging.info('Starting: {}'.format(folder)) try: shutil.rmtree(temp_dir) except FileNotFoundError: pass os.mkdir(temp_dir) with zipfile.ZipFile(os.path.join(data_dir,folder),mode='r') as a_zip: a_zip.extractall(path = temp_dir) archive_count += 1 logging.info('Cumulative total of {} archive(s) unzipped'.format(archive_count)) bigger_list = os.listdir(temp_dir) logging.info('Current archive contains {} subfolders'.format(len(bigger_list))) for sub_folder in bigger_list: with zipfile.ZipFile(os.path.join(temp_dir,sub_folder),mode='r') as b_zip: b_zip.extractall(path = new_dir) file1 = "%s (%d).%s" % (basename1, file_count, 'xml') file2 = "%s (%d).%s" % (basename2, file_count, 'xml') shutil.copy(os.path.join(new_dir, 'GENERIC_ROUGHDRAFT.xml'), file1) shutil.copy(os.path.join(new_dir, 'XACTDOC.xml'), file2) file_count += 1 logging.info('{} subfolders unzipped'.format(file_count - prior_count)) #os.remove(data_dir) shutil.rmtree(data_dir) os.mkdir(data_dir) #os.unlink(data_dir) my_time() logging.info('Total of {0} files -- {1} pairs -- should be in {2}'.format(2*(file_count-1), file_count-1, final_dir)) time.sleep(1) my_time()