Integrate your Data in Boxalino BigQuery Data Science Eco-System
- 1 DATA INTEGRATION STEPS - OVERVIEW
- 1.1 Introduction
- 1.2 4 Key Steps
- 1.3 System overview
- 2 STEP 1: SET-UP YOUR GOOGLE ENVIRONMENT
- 3 STEP 2: EXPORT YOUR DATA TO GOOGLE CLOUD STORAGE
- 4 STEP 3: SET UP YOUR DATA PROCESSING
- 5 STEP 4: INTEGRATE YOUR CORE DATA WITH BOXALINO
- 5.1 Summary
- 6 EXECUTION PLAN
- 7 GLOSSARY
- 8 ANNEX #1: WHAT DATA DO WE NEED?
- 8.1 Summary
- 9 ANNEX #2: DATA QUESTIONNAIRE
- 9.1 Summary
DATA INTEGRATION STEPS - OVERVIEW
Introduction
Boxalino provides a complete Data Science Eco-system built upon Google BigQuery and fully integrated with Boxalino Real-Time Platform.
In this section, we present the 4 key steps of a Data Integration project, which are then explained in detail in the following sections of the document.
4 Key Steps
To integrate new data, the key steps are:
Step 1: Set up your Google Cloud Environment
You will need an Organization, a Project, a Billing Account, a Service Account, a Storage Bucket and BigQuery Datasets (Stage and Core Datasets).
Step 2: Export your data to Google Cloud Storage
Set up an Automatic Uploading Process for your Full historical data and your Partial daily data as CSV or JSON files in Google Cloud Storage.
Step 3: Set up your Data Processing
Create your Core Dataset Tables as an Operational Data Store and set up an Automatic Data Processing that loads your files into your Stage Dataset and updates your Core Dataset.
Watch the tutorial video: https://youtu.be/yYz_HmeX4T8
Step 4: Integrate your Core Data with Boxalino
Boxalino integrates the data from your Core Dataset into the Boxalino Data Science Eco-System.
For an optimal integration, your data will go through 3 phases: Google Storage as files, BigQuery stage as direct representation of your files and BigQuery core as consolidated data. The process is typically executed once per day, but can be adjusted to be executed more frequently.
System overview
The Operational System with Data (the system which manages the data to be exported) typically exports the data as files to a Google Cloud Storage bucket using the Google Cloud SDK:
https://cloud.google.com/appengine/docs/standard/go/download
As a variant, another machine (e.g. a VM hosted in the same secured location as the Operational System) can be used to send the files to Google Cloud Storage with the Google Cloud SDK.
This variant is typically not required and is not recommended: the Google Cloud SDK can be installed safely on any machine, and sending the files directly from the Operational System keeps temporary copies of the data to a minimum, which ensures optimal security.
STEP 1: SET-UP YOUR GOOGLE ENVIRONMENT
Summary
In this step, you complete all the required preparation of your Google Environment.
Even if you do not have a Google Cloud account to start with, this can typically be done in about 2 hours.
You will need the required credentials (including payment details) to complete the set-up.
The key parts of this step are:
Create a Google Cloud Organization
Create a Google Cloud Project
Create a Google Billing Account
Create a Google Cloud Storage Bucket
Create a Google Cloud Service Account
Create STAGE and CORE BigQuery Datasets
Create a Google Group for Users
Create A Google Cloud Organization
This is the resource representing your company in Google Cloud. An Organization resource is available for Google G Suite and Cloud Identity customers.
You can create a Google Cloud Organization in two ways: Cloud Identity or GSuite.
The information to decide which one is the right one for you and how to do it are described here:
https://cloud.google.com/resource-manager/docs/creating-managing-organization
Then you can view and manage billing accounts and projects under your organization here:
https://cloud.google.com/resource-manager/docs/organization-resource-management
Create A Google Cloud Project
Google Cloud Platform (GCP) projects form the basis for creating, enabling, and using all GCP services including managing APIs, enabling billing, adding and removing collaborators, and managing permissions for GCP resources. This is the main container for all of our project activities, both related to the uploading of files and their processing in BigQuery.
Please follow the instructions here:
https://cloud.google.com/resource-manager/docs/creating-managing-projects
Create A Google Billing Account
https://cloud.google.com/billing/docs/how-to/manage-billing-account
Create A Google Cloud Storage Bucket
Google Cloud Storage is a RESTful online file storage web service for storing and accessing data on Google Cloud Platform infrastructure. The service combines the performance and scalability of Google's cloud with advanced security and sharing capabilities.
https://cloud.google.com/storage/
Google Cloud Storage Bucket is the “folder” which will contain all your exported files.
Please make sure to use the region europe-west1.
https://cloud.google.com/storage/docs/creating-buckets
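As an alternative to the Cloud Console, the bucket can also be created from the command line with gsutil (a minimal sketch; the bucket name my-bucket is a placeholder for your own):
# create a regional bucket in europe-west1
gsutil mb -l europe-west1 gs://my-bucket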
Create A Google Service Account
This is the account to be used for the automated processes in Steps 2 and 4. The Service account is the account the machine will use to perform the data loading and transformation tasks. The service account must have access to your Storage Bucket and your BigQuery Project.
https://cloud.google.com/iam/docs/service-accounts
Make sure to give this service account access rights to the Google Cloud Storage Bucket, and provide the authentication information (the service account key) to the people in charge of the set-up in Step 2.
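For reference, here is a sketch of how the service account could be created and granted access from the command line (the names etl-loader, my-project and my-bucket are placeholders; adjust the roles to your own security policy):
# create the service account
gcloud iam service-accounts create etl-loader --project=my-project
# allow it to create tables and run load/query jobs in BigQuery
gcloud projects add-iam-policy-binding my-project --member=serviceAccount:etl-loader@my-project.iam.gserviceaccount.com --role=roles/bigquery.dataEditor
gcloud projects add-iam-policy-binding my-project --member=serviceAccount:etl-loader@my-project.iam.gserviceaccount.com --role=roles/bigquery.jobUser
# allow it to read and write objects in the storage bucket
gsutil iam ch serviceAccount:etl-loader@my-project.iam.gserviceaccount.com:roles/storage.objectAdmin gs://my-bucket
# download a key file to be used for authentication in Step 2
gcloud iam service-accounts keys create key.json --iam-account=etl-loader@my-project.iam.gserviceaccount.com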
Create STAGE and CORE BigQuery Datasets
Please use the location EU as below.
Then please share your datasets with Boxalino by adding the following accounts as members, with both the Viewer and BigQuery Data Viewer roles, as per the screenshot below:
Google BigQuery
BigQuery is a RESTful web service that enables interactive analysis of massive datasets working in conjunction with Google Storage. It is a serverless Software-as-a-Service that may be used complementarily with MapReduce.
https://cloud.google.com/bigquery
Google BigQuery Dataset
A dataset is contained within a specific project. Datasets are top-level containers that are used to organize and control access to your tables and views. A table or view must belong to a dataset, so you need to create at least one dataset before loading data into BigQuery.
In other words, a dataset is the BigQuery equivalent of a MySQL “database”.
Here is a general introduction into BigQuery Datasets.
https://cloud.google.com/bigquery/docs/datasets-intro
Create two datasets called [account]_stage and [account]_core, where [account] is your account name. The account name is typically derived from the name of the system providing the data, possibly extended with a specific account identifier in case several different accounts are exported from the same system in parallel:
https://cloud.google.com/bigquery/docs/datasets
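The two datasets can be created in the BigQuery web UI or with the bq command line tool (a sketch assuming the account name boxalino and the project my-project as placeholders):
bq mk --location=EU --dataset my-project:boxalino_stage
bq mk --location=EU --dataset my-project:boxalino_core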
Create a Google Group
Create a Google Group and add all users that should have access to BigQuery (you can then manage who has access directly in the group in the future).
https://support.google.com/cloudidentity/answer/33343?hl=en
Assign the relevant access roles to the new user group; see the links below:
https://cloud.google.com/iam/docs/understanding-roles#bigquery-roles
https://cloud.google.com/bigquery/docs/dataset-access-controls
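As an illustration, a project-level BigQuery role can be granted to the group with gcloud (a sketch; the group address and project name are placeholders, and you may prefer to grant access per dataset instead, as per the second link above):
gcloud projects add-iam-policy-binding my-project --member=group:bigquery-users@example.com --role=roles/bigquery.user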
STEP 2: EXPORT YOUR DATA TO GOOGLE CLOUD STORAGE
Summary
Before proceeding further, make sure that your Google Environment has been prepared as per earlier mentioned instructions.
As a result, you should now have ready the following information:
Google Service Account (an e-mail address) and a certificate with a private key
Address of the Cloud Storage Bucket (e.g.: gs://my-bucket)
The key activities in this step are:
Install Google Cloud SDK
Authenticate with the Service Account
Set up an export process generating CSV or JSON files
Upload the generated files into a Google Cloud Storage Bucket
Automate the process
Install Google Cloud SDK
Install the Google Cloud SDK in the environment exporting your data as per the below instructions:
https://cloud.google.com/appengine/docs/standard/go/download
Authenticate with your Google Service Account
Authenticate with the Google Service Account created in Step 1 (including the certificate with the private key) using the SDK command gcloud auth as per the instructions here:
https://cloud.google.com/sdk/gcloud/reference/auth/activate-service-account
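A minimal sketch of the authentication call (the service account address and key file are placeholders matching the artifacts created in Step 1):
gcloud auth activate-service-account etl-loader@my-project.iam.gserviceaccount.com --key-file=key.json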
Set up an export process generating CSV or JSON files
The process is not described in detail as it is client-specific.
Its output should be a set of files containing the exported data. There can be two kinds of exports: the full data (typically customer masterdata) and the partial data (typically transaction data for the chosen period, such as transactions from the day before).
Transaction data
By transaction data we mean data connecting customers to products, such as the purchase history. While the purchase history is the main (and typically most important) use case, other transaction data could also be considered (sales leads, support requests, etc.).
This data typically comes structured in the following two ways (examples given for the CSV format)
either with one line per order with the required internal order and customer identifiers as well as an external order identifier (which should be used to map the data to pre-existing order data in the Boxalino Data Science Eco-system and which might be the same as the internal order identifier) and descriptive columns about the order (date, status, total value, etc.)
or with one line per product per order with the required product identifier as well as descriptive columns about the ordered product (quantity, price before and after discounts, etc.).
For details look at the Annex: WHAT DATA DO WE NEED?
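For illustration, the header row of an order-level file and of an order-line file could look as follows (a hypothetical sketch: the column names are examples only and should be adapted to your system as per the Annex):
order_id,external_order_id,customer_id,external_customer_id,order_date,order_status,order_amount,order_currency
order_id,product_id,external_product_id,quantity,price_before_discount,price_after_discount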
Customer data
By customer data we mean data describing a customer in the system exporting the data (depending on the system, a customer can be defined as a unique physical person, a company, an account or using another logic). This data is typically contained in one CSV file, with one line per customer containing the required customer id as referred to in the transaction data, one column for the external identifier (which should be used to map the data to pre-existing customer data in the Boxalino Data Science Eco-system and might be the same as the internal customer identifier) as well as descriptive columns about the customer (e.g. gender, zip code, date of birth, etc.). We recommend exporting anonymized customer data only, unless there is a specific compelling reason to do otherwise.
For details look at the Annex: WHAT DATA DO WE NEED?
Full data
The full data includes all information available at, and consistent with, a chosen point in time (typically the moment of the extraction of the data from its source system). It is also known as a “full snapshot” and is one of the two basic kinds of data provided by the Automatic Uploading Process.
Partial (aka DELTA) data
The partial data is the other basic kind of data provided by the Automatic Uploading Process. The partial daily data (aka DAILY DELTA) only contains the data that has changed over a given period (typically all changes from the day before), but weekly or monthly deltas can also be generated. If the data is not easily exported as a delta (customer data is often more difficult to export as a delta than transactions), it is possible to export the full data instead. The files can be provided either in CSV or JSON.
CSV files should have column names in the first row.
JSON files should use the Google-compliant format with one JSON record per row (and not a single JSON object with all the data in it).
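As an example of the expected newline-delimited JSON format, with one record per line (hypothetical fields):
{"customer_id": "1001", "zip_code": "8004", "gender": "f"}
{"customer_id": "1002", "zip_code": "3011", "gender": "m"}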
If possible (and to be compatible with the standard SQL and Shell-scripts samples provided in Step 4) the files should follow the following naming pattern:
The first part of the file name should be the date (+ time) to support wildcard (“*”) matching
The second part should be F (for full data) or D (for delta data)
The third part should indicate the contents of the export (customer, transactions, product, etc.), optionally followed by a numerical post-fix before the file extension (e.g. when the export is spread over several files)
e.g.: 20190822T071553_D_products.json
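A small shell sketch that builds such a file name (the export name products is a placeholder):
# e.g. 20190822T071553_D_products.json
FILENAME="$(date +%Y%m%dT%H%M%S)_D_products.json"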
Copy the files to Google Cloud Storage Bucket
Copy the files to the Google Cloud Storage Bucket defined in Step 1 using the command gsutil cp as described here:
https://cloud.google.com/storage/docs/gsutil/commands/cp
It is recommended to put the files into a folder structure with 3 levels (starting from the root):
Your bucket name
data (fixed name)
Type of contents (e.g.: “customers” or “orders”)
Optionally, the files can be compressed using gzip. An example copy command is shown below.
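A minimal sketch of the copy step for a customer export, assuming the bucket name my-bucket:
gsutil cp 20190822T071553_F_customers.csv gs://my-bucket/data/customers/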
Automate the process
Automatically run the export of the Partial daily data (typically once per day at a set hour during the night). Set up an easy facility to run (manually) the export of the full data.
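On a Linux host this is typically done with a cron entry; a sketch assuming a hypothetical export script export_and_upload.sh that generates the files and copies them with gsutil cp:
# run the daily delta export and upload at 02:30
30 2 * * * /opt/export/export_and_upload.sh >>/var/log/export_and_upload.log 2>&1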
STEP 3: SET UP YOUR DATA PROCESSING
Watch the tutorial video: https://youtu.be/yYz_HmeX4T8
Summary
This step describes the creation of Core Dataset Tables that act as an Operational Data Store and the set-up of an Automatic Data Processing within Google Cloud to load the last files provided by the Automatic Uploading Process Run.
There are many ways to set up such a process. We give an example of a very simple, standard copy-and-paste implementation using a Google VM Instance:
Google Cloud Virtual Machine Instance (VM Instance)
A VM Instance is a virtual machine hosted on Google Compute Engine. In this set-up, it hosts and runs the Automatic Data Processing that loads the files generated by the last Automatic Uploading Process Run into the STAGE Dataset and updates the CORE Dataset.
https://cloud.google.com/compute/docs/instances/
We suggest creating a process capable of handling both the full and the delta exports.
To do so, the following tasks should be completed:
Create a VM Instance
Set up your VM Instance with a Shell Script
Run a Setup Script with chosen parameters
Set up a Cron-job (for automated execution of the data processing)
Automatic Data Processing
An Automatic Data Processing loads the files generated by the last Automatic Uploading Process Run into the Stage Dataset (direct mapping: the files are loaded as they are into tables whose names are derived from the names of the files) and then updates the Core Dataset based on these generated stage tables.
Automatic Data Processing Run
An Automatic Data Processing Run is a specific execution (with its timestamp) of an Automatic Data Processing. It should typically be scheduled to run shortly after the Automatic Uploading Process Run. Alternatively (though not required), it could be triggered by the end of the Automatic Uploading Process Run instead of being scheduled independently, but this requires an additional step in the set-up of the Automatic Uploading Process.
Create a VM Instance
In this part, we describe how to create a VM instance in Google Cloud. It can be done in the Google Cloud Console under: Compute Engine -> VM Instances -> Create Instance
https://cloud.google.com/compute/docs/instances/create-start-instance
Here are some of the important parameters for the VM:
VM machine type: g1-small is usually sufficient
Location must be europe-west1
Make sure your Service Account has access to the Cloud Storage and BigQuery APIs and can connect to the VM
You can then simply start the VM Instance (it normally starts automatically after creation) and open an SSH session in a browser window to perform the next steps.
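For reference, an equivalent VM can be created from the command line (a sketch; the instance name, zone, project and service account address are placeholders):
gcloud compute instances create etl-vm \
  --project=my-project \
  --zone=europe-west1-b \
  --machine-type=g1-small \
  --service-account=etl-loader@my-project.iam.gserviceaccount.com \
  --scopes=bigquery,storage-rw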
Setup of your VM with a Shell Script
Copy the Setup Script from the following section into your home/user directory
Make the script files executable using command chmod u+x FILENAME
Execute the script with the correct parameters from inside your home/user directory. The command should follow this pattern: bash configure.sh [STAGE and CORE dataset prefix] [Your GCS bucket name] [Table name],[Column1],[Column2],[Column3],[Column4],[Column5]...
Here is an example of how it could look like: bash configure.sh boxalino boxalino-data transactions,order_id,order_date,order_status,order_amount,order_currency
If you want to create multiple tables, append their definitions to the end of the command and separate each table definition with a space. Without the separating space, everything is treated as a single table definition; an example with two tables is shown below.
Be careful: run the setup script only once. If you have already loaded data into one of your tables and run the script again, every table will be overwritten and you will lose your current data, so make sure you add all the necessary tables from the start.
Install the system package gettext-base in case it is missing on your system (using the yum, dnf or apt package manager, depending on your Linux distribution).
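As referenced above, a sketch of a call that defines two tables (table and column names are examples only; the first column of each definition is treated as the primary key):
bash configure.sh boxalino boxalino-data transactions,order_id,order_date,order_status,order_amount,order_currency customer,customer_id,gender,zip_code,date_of_birth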
Setup Script
This script has to be run only once (see point 5.6 below). It will generate several directories, two SQL files and the shell script for the regular automated data processing:
A file with DDL statements for the creation of the CORE tables, which needs to be executed manually once (create-core.sql)
A file with SQL code for the regular (daily) loads (stage2core.sql) that takes data from the STAGE, transforms it and stores it in the CORE area
A Bash shell script that activates and steers the data processing (process.sh, here also referred to as the “Automation Script”). This file can be called from the Linux cron daemon to guarantee a reliable schedule-based execution.
The full code follows.
#!/usr/bin/env bash
# good shell scripting practices
set -o errexit # exit when a command fails
set -o nounset # exit when an undeclared variable is used
set -o pipefail # return exit status of the last command that threw a non-zero exit code even when piped together
# set -o xtrace # enable debugging
# functions
usage () {
cat 1>&2 <<HELP_USAGE
Usage:
$0 CLIENT GCS_BUCKET EXPORT_DEFINITION...
CLIENT is the name that prefixes the STAGE and CORE datasets in BigQuery
GCS_BUCKET is the name of the storage bucket in Google Cloud Storage where
source systems deliver exported data. The script expects the source files
and their location to conform to the following pattern:
gs://GCS_BUCKET/data/<export_name>/YYYYMMDD*_<export_name>.(csv|json)
EXPORT_DEFINITION any number of definitions can be provided separated
by spaces
a single export definition has the form of EXPORT_NAME,COLUMN1,COLUMN2,...
the export name determines the expected filenames, names of STAGE tables
and the names of the tables in the CORE and for this reason ONLY
names containing the following character classes [a-zA-Z0-9_]
are allowed
it is assumed that the first COLUMN is the primary key used for
matching records between STAGE and CORE
NOTE that the order of the export definitions determines the order
of the loading / processing the data
Artifacts created:
logs
sql/${DDLSQL}
sql/${DMLSQL}
tmp
${ETLSCRIPT}
Example:
$0 [STAGE and CORE dataset prefix] [Your GCS bucket name] [Table name],[Column1], /
[Column2],[Column3],[Column4],[Column5]...
the above call will create a process to load files named
"YYYYMMDD*[Table name].(csv|json)" into BigQuery STAGE table
"[STAGE dataset prefix]_stage.YYYYMMDD*[Table name]" and during the CORE load
into "[CORE dataset prefix]_core.[Table name]"
HELP_USAGE
}
# global configuration
ROOTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TOPDIRS="logs sql tmp"
DDLSQL="create-core.sql"
DMLSQL="stage2core.sql"
ETLSCRIPT="process.sh"
if [[ ${#} -lt 3 ]]; then
usage
exit 22
fi
# what is our client
CLIENT=${1}
CLIENT_BUCKET=${2}
CLIENT_STAGE=${CLIENT}_stage
CLIENT_CORE=${CLIENT}_core
shift 2
# create directories
for d in ${TOPDIRS}; do
newdir=${ROOTDIR}/${d}
mkdir -p ${newdir}
echo "created directory ${newdir}"
done; echo
# initiate artifacts
echo -e "-- autogenerated DDL for creation of core tables\n" >${ROOTDIR}/sql/${DDLSQL}
echo -e "-- autogenerated SQL for stage2core load\n" >${ROOTDIR}/sql/${DMLSQL}
cat >${ROOTDIR}/${ETLSCRIPT} <<EOF
#!/usr/bin/env bash
# this script was autogenerated on $(date +%Y-%m-%d\ %T)
set -o errexit # exit when a command fails
set -o nounset # exit when an undeclared variable is used
set -o pipefail # return exit status of the last command that threw a non-zero exit code even when piped together
# set -o xtrace # enable debugging
export ETLDATE=\$1
if [[ -z \${ETLDATE} || ! \$ETLDATE =~ ^20[1-2][0-9][0-1][0-9][0-3][0-9]$ ]]; then
echo "Missing argument ETLDATE or format mismatch (expected 20YYMMDD)"
exit 22
fi
DEFAULT_CSV_DELIMITER=,
EOF
for ed; do
# start processing export definitions provided by the user
exportname=${ed%%,*}
echo "Started processing export ${exportname}"
# parse input arguments
columnstring=${ed#$exportname,}
declare -a columns
columns=($(cut -d ',' -f 1- - --output-delimiter=' ' <<< "${columnstring}"))
# start generating DDL for the export
echo -e "CREATE OR REPLACE TABLE \`${CLIENT_CORE}.${exportname}\` (" >>${ROOTDIR}/sql/${DDLSQL}
for c in "${columns[@]}"; do
if [[ ${c} = "${columns[0]}" ]]; then
echo -e " ${c} STRING NOT NULL" >>${ROOTDIR}/sql/${DDLSQL}
else
echo -e " ,${c} STRING" >>${ROOTDIR}/sql/${DDLSQL}
fi
done;
echo -e " ,create_tm DATETIME NOT NULL,update_tm DATETIME NOT NULL,src_cd STRING" >>${ROOTDIR}/sql/${DDLSQL}
echo -e ") OPTIONS(description=\"autogenerated\");\n" >>${ROOTDIR}/sql/${DDLSQL}
echo "DDL for ${exportname} written"
# append commands to stage2core SQL for the export
echo -e "-- ${exportname}" >>${ROOTDIR}/sql/${DMLSQL}
echo -e "DELETE FROM \`${CLIENT_CORE}.${exportname}\`" >>${ROOTDIR}/sql/${DMLSQL}
echo -e "WHERE ${columns[0]} IN (SELECT CAST(${columns[0]} AS STRING) FROM \`${CLIENT_STAGE}.\${${exportname}_stage_table}\`);" >>${ROOTDIR}/sql/${DMLSQL}
echo -e "INSERT INTO \`${CLIENT_CORE}.${exportname}\` (${columnstring},create_tm,update_tm,src_cd)" >>${ROOTDIR}/sql/${DMLSQL}
echo -e "SELECT " >>${ROOTDIR}/sql/${DMLSQL}
for c in "${columns[@]}"; do
if [[ ${c} = "${columns[0]}" ]]; then
echo -e " CAST(${c} AS STRING) AS ${c}" >>${ROOTDIR}/sql/${DMLSQL}
else
echo -e " ,CAST(${c} AS STRING) AS ${c}" >>${ROOTDIR}/sql/${DMLSQL}
fi
done;
echo -e " ,current_datetime AS create_tm, current_datetime AS update_tm, '${exportname}' AS src_cd" >>${ROOTDIR}/sql/${DMLSQL}
echo -e "FROM \`${CLIENT_STAGE}.\${${exportname}_stage_table}\`;\n" >>${ROOTDIR}/sql/${DMLSQL}
echo "SQL for ${exportname} written"
# continue generating ETLSCRIPT contents
echo -e "# set up variables for ${exportname} processing" >>${ROOTDIR}/${ETLSCRIPT}
echo -e "${exportname}_file=\$(gsutil ls gs://${CLIENT_BUCKET}/data/${exportname}/\${ETLDATE}*${exportname}.* | grep -iE 'csv|json' | tail -n 1)" >>${ROOTDIR}/${ETLSCRIPT}
echo -e "${exportname}_file_type=\${${exportname}_file##*.}" >>${ROOTDIR}/${ETLSCRIPT}
echo -e "export ${exportname}_stage_table=\$(basename \${${exportname}_file%.*})" >>${ROOTDIR}/${ETLSCRIPT}
echo -e "export ${exportname}_core_table=${exportname}" >>${ROOTDIR}/${ETLSCRIPT}
echo -e "# load ${exportname} data from storage into BigQuery STAGE dataset" >>${ROOTDIR}/${ETLSCRIPT}
echo -e "if [[ \$${exportname}_file_type = CSV || \$${exportname}_file_type = csv ]]; then bq load --source_format CSV --skip_leading_rows 1 --field_delimiter=\${DEFAULT_CSV_DELIMITER} --autodetect --replace ${CLIENT_STAGE}.\${${exportname}_stage_table} \$${exportname}_file; fi" >>${ROOTDIR}/${ETLSCRIPT}
echo -e "if [[ \$${exportname}_file_type = JSON || \$${exportname}_file_type = json ]]; then bq load --source_format NEWLINE_DELIMITED_JSON --autodetect --replace ${CLIENT_STAGE}.\${${exportname}_stage_table} \$${exportname}_file; fi\n\n" >>${ROOTDIR}/${ETLSCRIPT}
echo -e "Finished processing export ${exportname}\n"
done
# write the remaining portion of ETLSCRIPT contents
echo -e "# STAGE to CORE queries from ${DMLSQL} - template and execute in synchronous fashion" >>${ROOTDIR}/${ETLSCRIPT}
echo -e "live_sql_file=\$(date +%Y%m%dT%H%M%S)_${DMLSQL}" >>${ROOTDIR}/${ETLSCRIPT}
echo -e "envsubst <sql/${DMLSQL} >tmp/\${live_sql_file}" >>${ROOTDIR}/${ETLSCRIPT}
echo -e "bq query --nouse_legacy_sql <tmp/\${live_sql_file}" >>${ROOTDIR}/${ETLSCRIPT}
# make generated processing script executable
chmod ug+x ${ROOTDIR}/${ETLSCRIPT}
# print cron instructions
echo -e "To enable automatic daily execution type the following command"
echo -e "\tcrontab -e"
echo -e "and add the following line"
echo -e "\t0 H * * * ${ROOTDIR}/${ETLSCRIPT} \$(date --date '1 day ago' +%Y%m%d) >${ROOTDIR}/logs/\$(date +%Y%m%dT%H%M%S)_${ETLSCRIPT%%.*}.log 2>&1"
echo -e "where 'H' stands for the HOUR of automatic execution (we recommend running very early in the morning)\n"
echo -e "ALL DONE"
exit 0
Automation Script
Here we include a sample of the shell script referred to in the prior section, which is executed by cron. This shell script is generated during the configuration and its contents depend on the parameters (name of the client, the exports and their structure) entered by the user.
#!/usr/bin/env bash
# this script was autogenerated on 2019-11-11 11:06:57
set -o errexit # exit when a command fails
set -o nounset # exit when an undeclared variable is used
set -o pipefail # return exit status of the last command that threw a non-zero exit code even when piped together
# set -o xtrace # enable debugging
export ETLDATE=$1
if [[ -z ${ETLDATE} || ! $ETLDATE =~ ^20[1-2][0-9][0-1][0-9][0-3][0-9]$ ]]; then
echo "Missing argument ETLDATE or format mismatch (expected 20YYMMDD)"
exit 22
fi
DEFAULT_CSV_DELIMITER=,
# set up variables for customer processing
customer_file=$(gsutil ls gs://integration-bucket/data/customer/${ETLDATE}*customer.* | grep -iE 'csv|json' | tail -n 1)
customer_file_type=${customer_file##*.}
export customer_stage_table=$(basename ${customer_file%.*})
export customer_core_table=customer
# load customer data from storage into BigQuery STAGE dataset
if [[ $customer_file_type = CSV || $customer_file_type = csv ]]; then bq load --source_format CSV --skip_leading_rows 1 --field_delimiter=${DEFAULT_CSV_DELIMITER} --autodetect --replace kevinllado_stage.${customer_stage_table} $customer_file; fi
if [[ $customer_file_type = JSON || $customer_file_type = json ]]; then bq load --source_format NEWLINE_DELIMITED_JSON --autodetect --replace kevinllado_stage.${customer_stage_table} $customer_file; fi
# STAGE to CORE queries from stage2core.sql - template and execute in synchronous fashion
live_sql_file=$(date +%Y%m%dT%H%M%S)_stage2core.sql
envsubst <sql/stage2core.sql >tmp/${live_sql_file}
bq query --nouse_legacy_sql <tmp/${live_sql_file}
Create the Core tables
After you have executed the Setup Shell-Script, submit the SQL file with the code for the table creation (“create-core.sql“ under the directory “sql”) using the BigQuery command line utility bq. Make sure the CORE dataset already exists in BigQuery!
Type the following command in your $USER_HOME directory to create all necessary CORE tables:
bq query --nouse_legacy_sql <sql/create-core.sql
Alternatively you can open the SQL file with CREATE statements using a linux text editor of your choice and copy / paste / execute the SQL statements one after another in your BigQuery Admin Console, here: https://console.cloud.google.com/bigquery
CREATE statements for table creation are documented at:
https://cloud.google.com/bigquery/docs/tables
Before executing the generated CORE tables creation script you are welcome to review the code and validate the definitions. You may decide to change the column names, options or data types of the CORE tables. We particularly recommend the use of partitioned and clustered tables in the CORE so as to reduce BigQuery costs when querying the data.
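As an illustration, a manually adjusted CORE table using partitioning and clustering could look as follows (a sketch only, assuming the transactions export from the earlier example and that order_date is changed to a DATE column; adapt names and types to your own data):
CREATE OR REPLACE TABLE `boxalino_core.transactions` (
  order_id STRING NOT NULL
  ,order_date DATE
  ,order_status STRING
  ,order_amount NUMERIC
  ,order_currency STRING
  ,create_tm DATETIME NOT NULL
  ,update_tm DATETIME NOT NULL
  ,src_cd STRING
)
PARTITION BY order_date
CLUSTER BY order_id
OPTIONS(description="autogenerated, manually adjusted");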
Review the STAGE to CORE load
Google BigQuery CORE Dataset is a standard BigQuery Dataset (typically named [account]_core with [account] as in your account name) which will contain a set of tables with the consolidated data from all Automatic Uploading Process runs that went through the Automatic Data Processing. Typically there will be as many tables as there are files in your Google Storage for one complete run of your Automatic Uploading Process.
The Setup Script generates both commands (in “process.sh“) and SQL statements to perform the load of the raw files into STAGE and then from there into the CORE. The file with the SQL code for the STAGE to CORE load will be generated as “stage2core.sql“ under the directory “sql”. This file will work out-of-the-box, however, it may need to be modified manually for the following reasons:
The automation script assumes that the persistent identifier is in the first column. This means that, before inserting new data into the core, it will delete previously existing data with the same identifier value. If the identifier is not in the first column (or requires a more complex logic to be fulfilled) the SQL code must be modified accordingly.
You can change the format of the fields wherever necessary (by default, your core tables will be created with all columns being of String type). The following data types are available in BigQuery: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
Please set the desired type using the CAST function. The generated SQL commands already include it: CAST(<column_name> AS STRING). Both the DELETE and the INSERT statements need to be adjusted consistently if you change the type of the first column of the table, which we consider that table’s Primary Key by convention.
You may also choose to transform the flat data coming from a CSV using complex or nested types such as ARRAYs and STRUCTs, or combine information coming from several CSV files into one table (e.g. several different files with different types of customer properties, with each type becoming an ARRAY or a STRUCT in the final desired CORE table). For more information see:
https://cloud.google.com/bigquery/docs/nested-repeated
If you changed the data type of any column in any of the CORE tables in the prior section, you need to adjust the corresponding CAST(column_name AS STRING) to match the right format. The same applies to advanced transformations that map flat data into complex / nested data types in the CORE. A sketch of such an adjustment follows below.
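A minimal sketch of the corresponding manual adjustment in stage2core.sql, assuming the partitioned transactions table from the previous example (column names are placeholders; note that the DELETE and the INSERT are adjusted together):
DELETE FROM `boxalino_core.transactions`
WHERE order_id IN (SELECT CAST(order_id AS STRING) FROM `boxalino_stage.${transactions_stage_table}`);
INSERT INTO `boxalino_core.transactions` (order_id,order_date,order_status,order_amount,order_currency,create_tm,update_tm,src_cd)
SELECT
  CAST(order_id AS STRING) AS order_id
  ,CAST(order_date AS DATE) AS order_date
  ,CAST(order_status AS STRING) AS order_status
  ,CAST(order_amount AS NUMERIC) AS order_amount
  ,CAST(order_currency AS STRING) AS order_currency
  ,current_datetime AS create_tm, current_datetime AS update_tm, 'transactions' AS src_cd
FROM `boxalino_stage.${transactions_stage_table}`;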
Setup the Cron-job
The final step is to set up the cron-job which will automate the daily execution of your script file. The corresponding instructions will be printed on the terminal after the execution of the Setup Shell-Script has finished.
You can test the Automation Script by typing this in the command line: bash process.sh [Date in the format YYYYMMDD].
For details, you can watch the simple how-to video:
https://www.youtube.com/watch?v=llUw3RtD-Yw
The Automation Script (“process.sh“) requires one parameter to be passed in from the cron-job: the ETLDATE in the format YYYYMMDD.
Typically, today’s load processes the data from yesterday. The correct ETLDATE value can thus be generated using the shell subcommand $(date --date '1 day ago' +%Y%m%d).
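A sketch of the resulting crontab entry for a daily run at 04:00 (paths are placeholders; note that the % character has to be escaped as \% inside a crontab):
0 4 * * * /home/user/process.sh $(date --date '1 day ago' +\%Y\%m\%d) >/home/user/logs/process.log 2>&1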
Possible errors and how to solve them
Here are the most important errors which can appear during the execution of the Automation Script.
Wrong datatype
You may see an error telling you that specific data could not be inserted because it does not match the expected datatype; a typical example is a postal code containing letters that is loaded into an INTEGER column. This happens because the autodetect function of BigQuery infers the datatype on its own: when postal codes mostly consist of numbers, it picks INTEGER and then fails on the values which contain letters. A solution is to create your own schema manually. To do that, create a JSON file containing the schema for the table which failed. The schema should have this pattern:
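A minimal sketch of such a schema file (the column names are hypothetical and must match the columns of your own export):
[
  {"name": "customer_id", "type": "STRING", "mode": "REQUIRED"},
  {"name": "zip_code", "type": "STRING", "mode": "NULLABLE"},
  {"name": "gender", "type": "STRING", "mode": "NULLABLE"}
]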
After you have created the schema file, save it and open the process.sh script. Search for the bq load line of the table you want to pass the schema to; it should look similar to the line shown below:
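For reference, here is the corresponding line from the generated sample earlier in this document (using the customer export and the kevinllado_stage dataset of that sample):
if [[ $customer_file_type = CSV || $customer_file_type = csv ]]; then bq load --source_format CSV --skip_leading_rows 1 --field_delimiter=${DEFAULT_CSV_DELIMITER} --autodetect --replace kevinllado_stage.${customer_stage_table} $customer_file; fi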
Remove the --autodetect flag and append the path to your JSON schema file at the end of this line (after the source file). This will fix the problem.
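The adjusted line would then look like this (the path to the schema file is a placeholder):
if [[ $customer_file_type = CSV || $customer_file_type = csv ]]; then bq load --source_format CSV --skip_leading_rows 1 --field_delimiter=${DEFAULT_CSV_DELIMITER} --replace kevinllado_stage.${customer_stage_table} $customer_file /home/user/customer_schema.json; fi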
Wrong separator
Another problem you might encounter is that your data is not recognised properly and is inserted incorrectly into the BigQuery tables. The cause is usually the separator of your CSV files. The Automation Script uses a comma as the standard separator; if your CSV files use semicolons, you have to change the script, otherwise your data will not be loaded correctly into the BigQuery tables. To change this, open the process.sh script. Near the top you should see the following line: DEFAULT_CSV_DELIMITER=,
The only thing you have to do is change it to: DEFAULT_CSV_DELIMITER=\;
STEP 4: INTEGRATE YOUR CORE DATA WITH BOXALINO
Summary
Boxalino will integrate your data in the Boxalino Data Science Eco-System based on its representation in your CORE dataset.
This process offers a degree of customization which can further be discussed and explained during a workshop.
More detailed information is available on demand.
As preparation, the questionnaire in Annex #2 should be filled in.
EXECUTION PLAN
Example Execution Plan
It is recommended to structure the execution of the 4 steps according to the following plan:
Phase 1 (typically 1-2 weeks):
Workshop to define (typically 2 hours with the Client, the Client Provider and Boxalino):
Content and format of the exported files
Key aspects of the BigQuery Core Dataset tables
Implementation:
Step 1 – Typically by the IT of the Client
Step 2 – Typically by the Provider with the data to export
Phase 2 (typically 1-2 weeks):
Workshop to define (typically 2 hours with the Client and Boxalino):
Full aspects of the BigQuery Core Dataset tables
Key aspects of the integration
Implementation:
Step 3 – Typically by the IT of the Client (with the support of Boxalino)
Step 4 – Typically by Boxalino (typically 1 person-day of work)
GLOSSARY
Google Cloud Organization
This is the resource representing your company in Google Cloud. An Organization resource is available for Google G Suite and Cloud Identity customers.
https://cloud.google.com/resource-manager/docs/creating-managing-organization
Google Cloud Project
Google Cloud Platform (GCP) projects form the basis for creating, enabling, and using all GCP services including managing APIs, enabling billing, adding and removing collaborators, and managing permissions for GCP resources. This is the main container for all project activities, including the upload of files and their processing in BigQuery.
https://cloud.google.com/resource-manager/docs/creating-managing-projects
Google Cloud Service Account
This is the account to be used for process automation in Steps 2 and 4. The Service account is a technical account meant for a machine / process to perform the data loading and transformation tasks. The service account must have access to your Storage Bucket and your BigQuery Project.
https://cloud.google.com/iam/docs/service-accounts
Google Cloud Storage
Google Cloud Storage is a RESTful online file storage web service for storing and accessing data on Google Cloud Platform infrastructure. The service combines the performance and scalability of Google's cloud with advanced security and sharing capabilities.
https://cloud.google.com/storage/
Google Cloud Storage Bucket
This is the “folder” which will contain all your exported files.
Please make sure to use the region europe-west1.
https://cloud.google.com/storage/docs/creating-buckets
Google BigQuery
BigQuery is a RESTful web service that enables interactive analysis of massive datasets working in conjunction with Google Storage. It is a serverless Software as a Service / Datawarehouse as a Service that may be used complementarily with MapReduce.
https://cloud.google.com/bigquery
Google BigQuery Dataset
A dataset is contained within a specific project. Datasets are top-level containers that are used to organize and control access to your tables and views. A table or view must belong to a dataset, thus you need to create at least one dataset before loading data into BigQuery.
In other words, a Dataset is the BigQuery equivalent of a MySQL “database”.
https://cloud.google.com/bigquery/docs/datasets-intro
Automatic Uploading Process
This is a process that has to be integrated into the systems from which your data should be exported. The process uploads the export as CSV or JSON files to Google Cloud Storage using the Google Cloud SDK.
Automatic Uploading Process Run
An Automatic Uploading Process Run is a specific execution (with its timestamp) of an Automatic Uploading Process.
Google BigQuery Stage Dataset
This is a standard BigQuery Dataset (typically named [account]_stage with [account] as your account name) which will contain the automatically generated tables from every Automatic Data Processing run. Typically there are as many tables as there are files in your Google Storage for one run of your Automatic Uploading Process, and each table name contains the date or datetime of the run (compliant with the datetime indicated in your file names).
Full data
The full data includes all information available at, and consistent with, a chosen point in time (typically the moment of the extraction of the data from its source system). It is also known as a “full snapshot” and is one of the two basic kinds of data provided by the Automatic Uploading Process.
Transaction data
By Transaction data we mean data connecting customers to products, like the purchase history. While the purchase history is the main (and typically most important) use case, other Transaction data could be also considered (Sales leads, support requests, etc.).
This data is typically contained in two CSV files:
one with one line per order with the required internal order and customer identifiers as well as an external order identifier (which should be used to map the data to pre-existing order data in the Boxalino Data Science Eco-system and which might be the same as the internal order identifier) and descriptive columns about the order (date, status, total value, etc.)
one with one line per product per order with the required product identifier as well as descriptive columns about the ordered product (quantity, price before and after discounts, etc.).
For details look at the Annex: WHAT DATA DO WE NEED?
Customer data
By Customer data we mean data describing a customer in the system exporting the data (depending on the system, a customer can be defined as a unique physical person, an account or another logic). This data is typically contained in one CSV file, with one line per customer containing the required customer id as referred to in the transaction data, one column for the external identifier (which should be used to map the data to pre-existing customer data in the Boxalino Data Science Eco-system and might be the same as the internal customer identifier) as well as descriptive columns about the customer (e.g.: gender, zip code, date of birth, etc.). Typically, it is recommended to export only anonymised customer data unless specifically decided otherwise.
For details look at the Annex: WHAT DATA DO WE NEED?
Partial daily data
This is one of the two possible contents of an Automatic Uploading Process (Full historical or Partial daily). The Partial daily data contains only the data which has (or might have) changed since the last Partial daily data. If the data is not easily convertible to a delta (for example, customer data is sometimes more difficult to export as a delta than transaction data), it is typically possible to export the full data for some of the files (e.g.: the customer data is exported in full, but the transaction data is only exported for the last month).
Operational Data Store
An operational data store (or "ODS") is used for operational reporting and as a source of data for the Enterprise Data Warehouse (EDW). It is a complementary element to an EDW in a decision support landscape, and is used for operational reporting, controls and decision making, as opposed to the EDW, which is used for tactical and strategic decision support. An ODS is a database designed to integrate data from multiple sources for additional operations on the data, for reporting, controls and operational decision support. Unlike a production master data store, the data is not passed back to operational systems. It may be passed for further operations and to the data warehouse for reporting.
Google BigQuery Core Dataset
This is a standard BigQuery Dataset (typically named [account]_core with [account] as your account name) which contains a set of manually defined tables holding the consolidated data from all Automatic Uploading Process runs that went through the Automatic Data Processing. Typically there are as many tables as there are files in your Google Storage for one complete run of your Automatic Uploading Process.
Automatic Data Processing
An Automatic Data Processing loads the files generated by the last Automatic Uploading Process Run into the Stage Dataset (direct mapping: the files are loaded as they are into tables whose names are derived from the names of the files) and then updates the Core Dataset based on these generated stage tables.
Automatic Data Processing Run
An Automatic Data Processing Run is a specific execution (with its timestamp) of an Automatic Data Processing. It should typically be scheduled to run shortly after the Automatic Uploading Process Run. Alternatively (though not required), it could be triggered by the end of the Automatic Uploading Process Run instead of being scheduled independently, but this requires an additional step in the set-up of the Automatic Uploading Process.
Google Cloud Virtual Machine Instance (VM Instance)
A VM Instance is a virtual machine hosted on Google Compute Engine. In this set-up, it hosts and runs the Automatic Data Processing described above.
https://cloud.google.com/compute/docs/instances/
Boxalino Data Science Eco-System
The Boxalino Data Science Eco-System creates a highly standardized “nucleus” of a Data Warehouse in Google BigQuery and connects the results of the data science analytics process (report and lab) so they are automatically uploaded into the Boxalino Real-Time Platform. The process integrates the base standard data from the e-shops (including product data, behavioral data and sometimes also customer and transaction data) in addition to any additional data source set up through the process described in this document.
Please note that the words “stage” and “core” in this diagram refer to Boxalino’s internal stage and core datasets, and not to the datasets specific to the process described in this document.
ANNEX #1: WHAT DATA DO WE NEED?
Summary
The data to be exported are typically of two types: Transaction and Customer data.
Transaction data
By Transaction data we mean data connecting customers to products, such as the purchase history. While the purchase history is the main (and typically most important) use case, other Transaction data could also be considered (sales leads, support requests, etc.).
This data is typically contained in two CSV files (make sure the fields are separated by commas, not semicolons):
one with one line per order with the required internal order and customer identifiers as well as an external order identifier (which should be used to map the data to pre-existing order data in the Boxalino Data Science Eco-system and which might be the same as the internal order identifier) and descriptive columns about the order (date, status, total value, etc.)
one with one line per product per order with the required product identifier as well as descriptive columns about the ordered product (quantity, price before and after discounts, etc.).
The key information needed in Transaction Data are:
Internal ID (unique per order)
External ID (can be the same as Internal ID) matching the order ID of the E-shop system (to detect duplicate orders present in both systems)
External Product ID matching the product ID or a product attribute value of the E-shop (please indicate what attribute)
Internal Customer ID (matching the internal id of the customer data below)
External Customer ID matching the customer ID or a customer attribute value of the E-shop (please indicate what attribute)
A list of Order Attributes with their values, such as:
Order date
Order status
Order full order price
Quantity and price before and after discount of each purchased product
System (Store-id, online store, etc.)
… (key relevant order attributes should be added)
Customer data
By Customer data we mean data describing a customer in the system exporting the data (depending on the system, a customer can be defined as a unique physical person, an account or another logic). This data is typically contained in one CSV file, with one line per customer containing the required customer id as referred to in the transaction data, one column for the external identifier (which should be used to map the data to pre-existing customer data in the Boxalino Data Science Eco-system and might be the same as the internal customer identifier) as well as descriptive columns about the customer (e.g.: gender, zip code, date of birth, etc.). Typically, it is recommended to export only anonymised customer data unless specifically decided otherwise.
The key information needed in Customer Data are:
Internal ID (unique per customer)
External ID (can be the same as Internal ID) matching the customer ID or a customer attribute value of the E-shop (please indicate what attribute)
A hashed customer ID or hashed e-mail can be used
A list of Customer Attributes with their values, such as:
Gender
Date of Birth
Zip Code
Loyalty points information
Newsletter subscriptions
… (key relevant customer attributes should be added)
ANNEX #2: DATA QUESTIONNAIRE
Summary
Transaction data
How many files are there (please provide their names and a short description)?
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
If you have only 1 file, is there one row per order and per product? If not, what is the logic of the rows?
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
What are the column names of each file (please provide their names, format, and a short description)?
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
Which columns of which file contain the internal and external order ids?
By Internal id, we mean the unique id of the order in the system providing the data.
By external id, we mean the id of the e-shop data (e.g.: the Magento order id).
If the external id does not match directly the id of the e-shop data, please indicate the name of the attribute of the e-shop data which contains this value.
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
Same question for the internal and external customer ids?
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
Same question for the internal and external product ids?
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
Which columns of which files contain the order date and what is the exact format?
___________________________________________________________________________________________
___________________________________________________________________________________________
Which columns of which files contain the order status and what are the possible values and their meaning?
___________________________________________________________________________________________
___________________________________________________________________________________________
What condition makes an order or a specific product of an order successfully bought?
___________________________________________________________________________________________
___________________________________________________________________________________________
Which columns of which files contain the billing and delivery address (including zip code)?
___________________________________________________________________________________________
___________________________________________________________________________________________
If any, which columns of which files contain customer data (except the Internal / External ids)?
___________________________________________________________________________________________
___________________________________________________________________________________________
If any, which columns of which files contain product data (except the Internal / External ids)?
___________________________________________________________________________________________
___________________________________________________________________________________________
Customer data
How many files are there (please provide their names and a short description)?
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
What are the column names of each file (please provide their names, format, and a short description)?
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
Which columns of which file contain the internal and external customer ids?
By Internal id, we mean the unique id of the customer in the system providing the data.
By external id, we mean the id of the e-shop data (e.g.: the Magento customer id).
If the external id does not match directly the id of the e-shop data, please indicate the name of the attribute of the e-shop data which contains this value.
___________________________________________________________________________________________
___________________________________________________________________________________________
___________________________________________________________________________________________
Which columns of which files contain the birth date and what is the exact format?
___________________________________________________________________________________________
___________________________________________________________________________________________
Which columns of which files contain the gender and what are the possible values and their meaning?
___________________________________________________________________________________________
___________________________________________________________________________________________
Which columns of which files contain the e-mail, and can there be more than 1 per internal id?
___________________________________________________________________________________________
___________________________________________________________________________________________
Which columns of which files contain the billing and delivery address (including zip code)?
___________________________________________________________________________________________
___________________________________________________________________________________________