Importing an Excel Sheet and Validate the Imported Data with Loosely Coupled

15,868

I have done the same thing on a project. The difference is that I didn't have to import Excel sheets, but CSV files. I created a CSVValueProvider. And, therefore, the CSV data was bound to my IEnumerable model automatically.

As for validation, I figured that going through all rows, and cells, and validating them one by one is not very efficient, especially when the CSV file has thousands of records. So, what I did was that I created some validation methods that went through the CSV data column by column, instead of row by row, and did a linq query on each column and returned the row numbers of the cells with invalid data. Then, added the invalid row number/column names to ModelState.

UPDATE:

Here is what I have done...

CSVReader Class:

// A class that can read and parse the data in a CSV file.
public class CSVReader
{
    // Regex expression that's used to parse the data in a line of a CSV file
    private const string ESCAPE_SPLIT_REGEX = "({1}[^{1}]*{1})*(?<Separator>{0})({1}[^{1}]*{1})*";
    // String array to hold the headers (column names)
    private string[] _headers;
    // List of string arrays to hold the data in the CSV file. Each string array in the list represents one line (row).
    private List<string[]> _rows;
    // The StreamReader class that's used to read the CSV file.
    private StreamReader _reader;

    public CSVReader(StreamReader reader)
    {
        _reader = reader;

        Parse();
    }

    // Reads and parses the data from the CSV file
    private void Parse()
    {
        _rows = new List<string[]>();
        string[] row;
        int rowNumber = 1;

        var headerLine = "RowNumber," + _reader.ReadLine();
        _headers = GetEscapedSVs(headerLine);
        rowNumber++;

        while (!_reader.EndOfStream)
        {
            var line = rowNumber + "," + _reader.ReadLine();
            row = GetEscapedSVs(line);
            _rows.Add(row);
            rowNumber++;
        }

        _reader.Close();
    }

    private string[] GetEscapedSVs(string data)
    {
        if (!data.EndsWith(","))
            data = data + ",";

        return GetEscapedSVs(data, ",", "\"");
    }

    // Parses each row by using the given separator and escape characters
    private string[] GetEscapedSVs(string data, string separator, string escape)
    {
        string[] result = null;

        int priorMatchIndex = 0;
        MatchCollection matches = Regex.Matches(data, string.Format(ESCAPE_SPLIT_REGEX, separator, escape));

        // Skip empty rows...
        if (matches.Count > 0) 
        {
            result = new string[matches.Count];

            for (int index = 0; index <= result.Length - 2; index++)
            {
                result[index] = data.Substring(priorMatchIndex, matches[index].Groups["Separator"].Index - priorMatchIndex);
                priorMatchIndex = matches[index].Groups["Separator"].Index + separator.Length;
            }
            result[result.Length - 1] = data.Substring(priorMatchIndex, data.Length - priorMatchIndex - 1);

            for (int index = 0; index <= result.Length - 1; index++)
            {
                if (Regex.IsMatch(result[index], string.Format("^{0}.*[^{0}]{0}$", escape))) 
                    result[index] = result[index].Substring(1, result[index].Length - 2);

                result[index] = result[index].Replace(escape + escape, escape);

                if (result[index] == null || result[index] == escape) 
                    result[index] = "";
            }
        }

        return result;
    }

    // Returns the number of rows
    public int RowCount
    {
        get
        {
            if (_rows == null)
                return 0;
            return _rows.Count;
        }
    }

    // Returns the number of headers (columns)
    public int HeaderCount
    {
        get
        {
            if (_headers == null)
                return 0;
            return _headers.Length;
        }
    }

    // Returns the value in a given column name and row index
    public object GetValue(string columnName, int rowIndex)
    {
        if (rowIndex >= _rows.Count)
        {
            return null;
        }

        var row = _rows[rowIndex];

        int colIndex = GetColumnIndex(columnName);

        if (colIndex == -1 || colIndex >= row.Length)
        {
            return null;
        }

        var value = row[colIndex];
        return value;
    }

    // Returns the column index of the provided column name
    public int GetColumnIndex(string columnName)
    {
        int index = -1;

        for (int i = 0; i < _headers.Length; i++)
        {
            if (_headers[i].Replace(" ","").Equals(columnName, StringComparison.CurrentCultureIgnoreCase))
            {
                index = i;
                return index;
            }
        }

        return index;
    }
}

CSVValueProviderFactory Class:

public class CSVValueProviderFactory : ValueProviderFactory
{
    public override IValueProvider GetValueProvider(ControllerContext controllerContext)
    {
        var uploadedFiles = controllerContext.HttpContext.Request.Files;

        if (uploadedFiles.Count > 0)
        {
            var file = uploadedFiles[0];
            var extension = file.FileName.Split('.').Last();

            if (extension.Equals("csv", StringComparison.CurrentCultureIgnoreCase))
            {
                if (file.ContentLength > 0)
                {
                    var stream = file.InputStream;
                    var csvReader = new CSVReader(new StreamReader(stream, Encoding.Default, true));

                    return new CSVValueProvider(controllerContext, csvReader);
                }
            }
        }

        return null;
    }
}

CSVValueProvider Class:

// Represents a value provider for the data in an uploaded CSV file.
public class CSVValueProvider : IValueProvider
{
    private CSVReader _csvReader;

    public CSVValueProvider(ControllerContext controllerContext, CSVReader csvReader)
    {
        if (controllerContext == null)
        {
            throw new ArgumentNullException("controllerContext");
        }

        if (csvReader == null)
        {
            throw new ArgumentNullException("csvReader");
        }

        _csvReader = csvReader;
    }

    public bool ContainsPrefix(string prefix)
    {
        if (prefix.Contains('[') && prefix.Contains(']'))
        {
            if (prefix.Contains('.'))
            {
                var header = prefix.Split('.').Last();
                if (_csvReader.GetColumnIndex(header) == -1)
                {
                    return false;
                }
            }

            int index = int.Parse(prefix.Split('[').Last().Split(']').First());
            if (index >= _csvReader.RowCount)
            {
                return false;
            }
        }

        return true;
    }

    public ValueProviderResult GetValue(string key)
    {
        if (!key.Contains('[') || !key.Contains(']') || !key.Contains('.'))
        {
            return null;
        }

        object value = null;
        var header = key.Split('.').Last();

        int index = int.Parse(key.Split('[').Last().Split(']').First());
        value = _csvReader.GetValue(header, index);

        if (value == null)
        {
            return null;
        }

        return new ValueProviderResult(value, value.ToString(), CultureInfo.CurrentCulture);
    }
}

For the validation, as I mentioned before, I figured that it would not be efficient to do it using DataAnnotation attributes. A row by row validation of the data would take a long time for CSV files with thousands of rows. So, I decided to validate the data in the Controller after the Model Binding is done. I should also mention that I needed to validate the data in the CSV file against some data in the database. If you just need to validate things like Email Address or Phone Number, you might as well just use DataAnnotation.

Here is a sample method for validating the Email Address column:

private void ValidateEmailAddress(IEnumerable<CSVViewModel> csvData)
{
    var invalidRows = csvData.Where(d => ValidEmail(d.EmailAddress) == false).ToList();

    foreach (var invalidRow in invalidRows)
    {
        var key = string.Format("csvData[{0}].{1}", invalidRow.RowNumber - 2, "EmailAddress");
        ModelState.AddModelError(key, "Invalid Email Address"); 
    }        
}

private static bool ValidEmail(string email)
{
    if(email == "")
        return false;
    else
        return new System.Text.RegularExpressions.Regex(@"^[\w-\.]+@([\w-]+\.)+[\w-]{2,6}$").IsMatch(email);
}

UPDATE 2:

For validation using DataAnnotaion, you just use DataAnnotation attributes in your CSVViewModel like below (the CSVViewModel is the class that your CSV data will be bound to in your Controller Action):

public class CSVViewModel
{
    // User proper names for your CSV columns, these are just examples...   

    [Required]
    public int Column1 { get; set; } 
    [Required]
    [StringLength(30)]
    public string Column2 { get; set; }
}
Share:
15,868
Tarik
Author by

Tarik

I am a software engineer with interests in different technologies.

Updated on June 05, 2022

Comments

  • Tarik
    Tarik almost 2 years

    I am trying to develop a module which will read excel sheets (possibly from other data sources too, so it should be loosely coupled) and convert them into Entities so to save.

    The logic will be this:

    1. The excel sheet can be in different format, for example column names in Excel sheet can be different so my system needs to be able to map different fields to my entities.
    2. For now I will be assuming the format defined above will be same and hardcoded for now instead of coming from database dynamically after set on a configuration mapping UI kinda thing.
    3. The data needs to be validated before even get mapped. So I should be able validate it beforehand against something. We're not using like XSD or something else so I should validate it against the object structure I am using as a template for importing.

    The problem is, I put together some things together but I don't say I liked what I did. My Question is how I can improve the code below and make things more modular and fix the validation issues.

    The code below is a mock-up and is not expected to work, just to see some structure of the design.

    This is code I've come up with so far, and I've realized one thing that I need to improve my design patterns skills but for now I need your help, if you could help me:

    //The Controller, a placeholder
    class UploadController
    {
        //Somewhere here we call appropriate class and methods in order to convert
        //excel sheet to dataset
    }
    

    After we uploaded file using an MVC Controller, there could be different controllers specialized to import certain behaviors, in this example I will uploading person related tables,

    interface IDataImporter
    {
        void Import(DataSet dataset);
    }
    

    //We can use many other importers besides PersonImporter class PersonImporter : IDataImporter { //We divide dataset to approprate data tables and call all the IImportActions //related to Person data importing //We call inserting to database functions here of the DataContext since this way //we can do less db roundtrip.

    public string PersonTableName {get;set;}
    public string DemographicsTableName {get;set;}
    
    public Import(Dataset dataset)
    {
        CreatePerson();
        CreateDemograhics();
    }
    
    //We put different things in different methods to clear the field. High cohesion.
    private void CreatePerson(DataSet dataset)
    {   
        var personDataTable = GetDataTable(dataset,PersonTableName);
        IImportAction addOrUpdatePerson = new AddOrUpdatePerson();
        addOrUpdatePerson.MapEntity(personDataTable);
    }
    
    private void CreateDemograhics(DataSet dataset)
    {
        var demographicsDataTable = GetDataTable(dataset,DemographicsTableName);
        IImportAction demoAction = new AddOrUpdateDemographic(demographicsDataTable);
        demoAction.MapEntity();
    }
    
    private DataTable GetDataTable(DataSet dataset, string tableName)
    {
        return dataset.Tables[tableName];
    }
    

    }

    I have IDataImporter and specialized concrete class PersonImporter. However, I am not sure it looks good so far since things should be SOLID so basically easy to extend later in the project cycle, this will be a foundation for future improvements, lets keep going:

    IImportActions are where the magic mostly happens. Instead of designing things table based, I am developing it behavior based so one can call any of them to import things in more modular model. For example a table may have 2 different actions.

    interface IImportAction
    {
        void MapEntity(DataTable table);
    }
    
    //A sample import action, AddOrUpdatePerson
    class AddOrUpdatePerson : IImportAction
    {
        //Consider using default values as well?
        public string FirstName {get;set;}
        public string LastName {get;set;}
        public string EmployeeId {get;set;}
        public string Email {get;set;}
    
        public void MapEntity(DataTable table)
        {
            //Each action is producing its own data context since they use
            //different actions.
            using(var dataContext = new DataContext())
            {
                foreach(DataRow row in table.Rows)
                {
                    if(!emailValidate(row[Email]))
                    {
                        LoggingService.LogWarning(emailValidate.ValidationMessage);
                    }
    
                    var person = new Person(){
                        FirstName = row[FirstName],
                        LastName = row[LastName],
                        EmployeeId = row[EmployeeId],
                        Email = row[Email]
                    };
    
                    dataContext.SaveObject(person);
                }
    
                dataContext.SaveChangesToDatabase();
            }
        }   
    }
    
    class AddOrUpdateDemographic: IImportAction
    {
        static string Name {get;set;}
        static string EmployeeId {get;set;}
    
        //So here for example, we will need to save dataContext first before passing it in 
        //to get the PersonId from Person (we're assuming that we need PersonId for Demograhics)    
        public void MapEntity(DataTable table)
        {
            using(var dataContext = new DataCOntext())
            {
                foreach(DataRow row in table.Rows)
                {
                    var demograhic = new Demographic(){
                        Name = row[Name],
                        PersonId = dataContext.People.First(t => t.EmployeeId = int.Parse(row["EmpId"]))
                    };
    
                    dataContext.SaveObject(person);
                }
    
                dataContext.SaveChangesToDatabase();
            }
        }
    }
    

    And the validation, which mostly where I suck at unfortunately. The validation needs to be easy to extend and loosely coupled and also I need to be able to call this validation beforehand instead of adding everything.

    public static class ValidationFactory
    {
        public static Lazy<IFieldValidation> PhoneValidation = new Lazy<IFieldValidation>(()=>new PhoneNumberValidation());
        public static Lazy<IFieldValidation> EmailValidation = new Lazy<IFieldValidation>(()=>new EmailValidation());
        //etc.
    }
    
    interface IFieldValidation
    {
        string ValidationMesage{get;set;}
        bool Validate(object value);
    }
    
    class PhoneNumberValidation : IFieldValidation
    {
        public string ValidationMesage{get;set;}
        public bool Validate(object value)
        {   
            var validated = true; //lets say...
            var innerValue = (string) value;
            //validate innerValue using Regex or something
            //if validation fails, then set ValidationMessage propert for logging.
            return validated;
        }
    }
    
    class EmailValidation : IFieldValidation
    {
        public string ValidationMesage{get;set;}
        public bool Validate(object value)
        {   
            var validated = true; //lets say...
            var innerValue = (string) value;
            //validate innerValue using Regex or something
            //if validation fails, then set ValidationMessage propert for logging.
            return validated;
        }
    }
    
  • user1447718
    user1447718 over 10 years
    @atarvati, hi, am trying to design a similar thing, can you please more details about the design? thanks.
  • ataravati
    ataravati over 10 years
    Do you need sample code for the CSVValueProvider or validation?
  • user1447718
    user1447718 over 10 years
    both of them will help me.
  • user1447718
    user1447718 over 10 years
    Hi Atarvati, can you throw some light on validation using DataAnnotation? any online samples?
  • ataravati
    ataravati over 10 years
    See Update 2 in the answer.